Skip to main content

Instagram

Setup requirements

pip install httpx playwright

Login to Instagram

There are two ways to work with the Surfsky cloud browser: a one-time profile and a persistent profile.

A one-time profile doesn't save its state on close, which makes it perfect for scraping, but we will show how it works here too.

A persistent profile, by contrast, saves its state on close, so you can stop it and continue working on the next start with all previously opened pages and stored cookies.

After starting either a one-time or a persistent profile, you will get a CDP URL for connecting a browser automation tool to the browser. You can see the start-profile response on our API page.

import os

import httpx

# Surfsky API token; sent as the X-Cloud-Api-Token header on every request.
API_TOKEN = os.environ['API_TOKEN']
# Proxy setting forwarded as the 'proxy' field when starting a one-time profile.
PROXY = os.environ['PROXY']


async def start_one_time_profile():
    """Start a Surfsky one-time profile and return the API's JSON reply.

    The request is authenticated with ``API_TOKEN`` and carries ``PROXY`` as
    the ``proxy`` field of the JSON payload.

    Raises:
        httpx.HTTPStatusError: if the API answers with an error status.
    """
    client_options = {
        'base_url': 'https://api-public.surfsky.io',
        'headers': {'X-Cloud-Api-Token': API_TOKEN},
        'timeout': 60.0,
    }
    payload = {'proxy': PROXY}

    async with httpx.AsyncClient(**client_options) as client:
        response = await client.post('/profiles/one_time', json=payload)
        # Fail loudly on 4xx/5xx instead of returning an error body.
        response.raise_for_status()

    return response.json()

After getting the CDP URL, you can use any automation tool as if the browser were running on your local machine.

Here is a function that logs in to Instagram and accepts the CDP URL as an argument.

import asyncio
import os

from playwright.async_api import async_playwright

# Instagram credentials typed into the login form; both must be set in the environment.
# NOTE(review): some systems (e.g. Windows) already define USERNAME for the OS
# account -- confirm it holds the Instagram login before running.
USERNAME = os.environ['USERNAME']
PASSWORD = os.environ['PASSWORD']

async def login_to_instagram(cdp_url):
    """Sign in to Instagram in the browser reachable at ``cdp_url``.

    Connects Playwright to the browser over CDP, submits the login form with
    ``USERNAME`` and ``PASSWORD``, pauses for ten seconds and then saves a
    screenshot to ``instagram.jpg``.

    Args:
        cdp_url: CDP endpoint of the browser to drive.
    """
    async with async_playwright() as playwright:
        remote_browser = await playwright.chromium.connect_over_cdp(cdp_url)
        async with remote_browser:
            tab = await remote_browser.new_page()
            await tab.goto('https://instagram.com')

            username_field = tab.locator('//input[@name="username"]')
            password_field = tab.locator('//input[@name="password"]')
            submit_button = tab.locator('//form[@id="loginForm"]//button[@type="submit"]')

            # Wait until the username input is present before filling the form.
            await username_field.wait_for()
            await username_field.fill(USERNAME)
            await password_field.fill(PASSWORD)

            await submit_button.wait_for()
            await submit_button.click()

            # Fixed pause before the screenshot -- presumably to let the
            # post-login page finish loading.
            await asyncio.sleep(10)
            await tab.screenshot(path='instagram.jpg')
Full example
import asyncio
import os

import httpx
from playwright.async_api import async_playwright

# Instagram credentials typed into the login form.
USERNAME = os.environ['USERNAME']
PASSWORD = os.environ['PASSWORD']

# Surfsky API token; sent as the X-Cloud-Api-Token header.
API_TOKEN = os.environ['API_TOKEN']
# Not sent by the persistent-profile start request below; kept so existing
# environments and code that read PROXY keep working.
PROXY = os.environ['PROXY']
# UUID of the persistent profile to start. It is interpolated into the request
# path below and must be defined here, otherwise the call fails with NameError.
PROFILE_UUID = os.environ['PROFILE_UUID']


async def start_persistent_profile():
    """Start the persistent profile ``PROFILE_UUID`` and return the API's JSON reply.

    Returns:
        The decoded JSON body of the start response; callers read the CDP URL
        from its ``'ws_url'`` key.

    Raises:
        httpx.HTTPStatusError: if the API answers with an error status.
    """
    async with httpx.AsyncClient(
        base_url='https://api-public.surfsky.io',
        headers={'X-Cloud-Api-Token': API_TOKEN},
        timeout=60.0,
    ) as client:
        browser_data_resp = await client.post(f'/profiles/{PROFILE_UUID}/start')
        browser_data_resp.raise_for_status()

    return browser_data_resp.json()


async def login_to_instagram(cdp_url=None):
    """Log in to Instagram through a Surfsky browser and save a screenshot.

    Args:
        cdp_url: CDP endpoint of an already running browser. When omitted, the
            persistent profile is started and the ``'ws_url'`` from its start
            response is used. A caller-supplied URL is used as given instead
            of being overwritten.
    """
    if cdp_url is None:
        browser_data = await start_persistent_profile()
        cdp_url = browser_data['ws_url']

    async with async_playwright() as p:
        browser = await p.chromium.connect_over_cdp(cdp_url)
        async with browser:
            page = await browser.new_page()
            await page.goto('https://instagram.com')

            username = page.locator('//input[@name="username"]')
            password = page.locator('//input[@name="password"]')

            # Wait until the username input is present before filling the form.
            await username.wait_for()

            await username.fill(USERNAME)
            await password.fill(PASSWORD)

            login = page.locator('//form[@id="loginForm"]//button[@type="submit"]')
            await login.wait_for()
            await login.click()

            # Fixed pause before the screenshot -- presumably to let the
            # post-login page finish loading.
            await asyncio.sleep(10)
            await page.screenshot(path='instagram.jpg')

Collect user's followers

As in the previous example, you can start either a one-time or a persistent profile; the rest of the code works the same for both. In this example we will use a persistent profile.

import asyncio
import os
from contextlib import contextmanager

import httpx
from playwright.async_api import async_playwright

# Surfsky API token; sent as the X-Cloud-Api-Token header.
API_TOKEN = os.environ['API_TOKEN']
# UUID of the persistent profile to start; interpolated into the start request path.
PROFILE_UUID = os.environ['PROFILE_UUID']

# URL prefix used to recognise the Instagram API responses intercepted in the browser.
INSTAGRAM_API = 'https://www.instagram.com/api/v1'
# Account whose profile page and followers list are opened.
INSTAGRAM_USERNAME = os.environ['INSTAGRAM_USERNAME']


async def start_persistent_profile():
    """Start the persistent profile ``PROFILE_UUID`` and return the API's JSON reply.

    Returns:
        The decoded JSON body of the start response; ``main`` reads the CDP
        URL from its ``'ws_url'`` key.

    Raises:
        httpx.HTTPStatusError: if the API answers with an error status.
    """
    client_options = {
        'base_url': 'https://api-public.surfsky.io',
        'headers': {'X-Cloud-Api-Token': API_TOKEN},
        'timeout': 60.0,
    }
    # NOTE(review): presumably the inactivity period (in seconds) after which
    # the browser is killed -- confirm against the Surfsky API docs.
    start_options = {
        'browser_settings': {
            'inactive_kill_timeout': 240,
        },
    }
    start_path = f'/profiles/{PROFILE_UUID}/start'

    async with httpx.AsyncClient(**client_options) as client:
        response = await client.post(start_path, json=start_options)
        # Fail loudly on 4xx/5xx instead of returning an error body.
        response.raise_for_status()

    return response.json()


@contextmanager
def user_info_handler(page):
    """Capture the profile-info API response for ``INSTAGRAM_USERNAME``.

    Registers a ``'response'`` listener on ``page`` for the duration of the
    ``with`` block and yields an ``asyncio.Future``. The future is resolved
    with the ``['data']['user']`` object of the first response whose URL
    equals the ``web_profile_info`` endpoint for ``INSTAGRAM_USERNAME``.

    If decoding that response fails, the error is set on the future so the
    awaiting caller sees it instead of waiting forever. The listener is
    removed on exit even when the ``with`` body raises.

    Args:
        page: object exposing ``on(event, callback)`` and
            ``remove_listener(event, callback)`` (a Playwright page).

    Yields:
        asyncio.Future: resolves to the user object described above.
    """
    result = asyncio.Future()

    target_url = f'{INSTAGRAM_API}/users/web_profile_info/?username={INSTAGRAM_USERNAME}'

    async def handler(response):
        # Only the first matching response counts; setting a finished future
        # again would raise InvalidStateError.
        if result.done() or response.url != target_url:
            return
        try:
            user = (await response.json())['data']['user']
        except Exception as exc:
            # Forward the failure to the awaiting caller.
            if not result.done():
                result.set_exception(exc)
            return
        # The future may have been finished or cancelled while awaiting json().
        if not result.done():
            result.set_result(user)

    page.on('response', handler)
    try:
        yield result
    finally:
        page.remove_listener('response', handler)


@contextmanager
def followers_handler(user_id, page):
    """Capture the next followers API response for ``user_id``.

    Registers a ``'response'`` listener on ``page`` for the duration of the
    ``with`` block and yields an ``asyncio.Future``. The future is resolved
    with the decoded JSON of the first response whose URL starts with the
    ``friendships/<user_id>/followers`` endpoint.

    If decoding that response fails, the error is set on the future so the
    awaiting caller sees it instead of waiting forever. The listener is
    removed on exit even when the ``with`` body raises.

    Args:
        user_id: Instagram user id interpolated into the endpoint URL.
        page: object exposing ``on(event, callback)`` and
            ``remove_listener(event, callback)`` (a Playwright page).

    Yields:
        asyncio.Future: resolves to the decoded followers response.
    """
    result = asyncio.Future()

    target_url = f'{INSTAGRAM_API}/friendships/{user_id}/followers'

    async def handler(response):
        # Only the first matching response counts; setting a finished future
        # again would raise InvalidStateError.
        if result.done() or not response.url.startswith(target_url):
            return
        try:
            payload = await response.json()
        except Exception as exc:
            # Forward the failure to the awaiting caller.
            if not result.done():
                result.set_exception(exc)
            return
        # The future may have been finished or cancelled while awaiting json().
        if not result.done():
            result.set_result(payload)

    page.on('response', handler)
    try:
        yield result
    finally:
        page.remove_listener('response', handler)


async def main(response_timeout=30):
    """Scrape the followers of ``INSTAGRAM_USERNAME`` via a persistent profile.

    Starts the persistent profile, connects Playwright over CDP, opens the
    profile page to learn the user's id, then opens the followers list and
    scrolls it, collecting the ``'users'`` entries of every intercepted
    followers response while the latest response reports ``'big_list'``.

    Args:
        response_timeout: seconds to wait for each intercepted API response.
            The first two waits raise ``asyncio.TimeoutError`` on expiry. A
            timeout while scrolling ends the collection with what was
            gathered so far.

    Returns:
        list: the follower entries collected from all responses.
    """
    browser_data = await start_persistent_profile()

    cdp_url = browser_data['ws_url']
    all_followers = []
    async with async_playwright() as p:
        browser = await p.chromium.connect_over_cdp(cdp_url)
        async with browser:
            # NOTE(review): presumably the first context is the profile's own
            # context holding the stored session -- confirm.
            default_context = browser.contexts[0]
            page = await default_context.new_page()

            with user_info_handler(page) as user_info_fut:
                await page.goto(f'https://instagram.com/{INSTAGRAM_USERNAME}')
                # Bounded wait: the original awaited forever if the
                # profile-info request never appeared.
                user_info = await asyncio.wait_for(user_info_fut, response_timeout)

            user_id = user_info['id']

            with followers_handler(user_id, page) as followers_fut:
                await page.goto(f'https://instagram.com/{INSTAGRAM_USERNAME}/followers/')
                followers = await asyncio.wait_for(followers_fut, response_timeout)

            all_followers.extend(followers['users'])

            # Locators are lazy, so one instance can be reused for every scroll.
            followers_modal = page.locator('//div[@role="dialog"]//div[@role="dialog"]')
            while followers['big_list']:
                await followers_modal.wait_for()
                await followers_modal.hover()

                with followers_handler(user_id, page) as followers_fut:
                    await page.mouse.wheel(0, 15000)
                    try:
                        followers = await asyncio.wait_for(followers_fut, response_timeout)
                    except asyncio.TimeoutError:
                        # Scrolling produced no further response; caught inside
                        # the ``with`` so the listener is removed normally.
                        followers = None

                if followers is None:
                    break

                all_followers.extend(followers['users'])

    print(f'Total followers scraped: {len(all_followers)}')
    return all_followers