DataDome
Using Surfsky with DataDome Protected Sites
Surfsky provides robust capabilities for handling DataDome protected websites. Our solution includes:
- Automatic browser fingerprint randomization
- Proxy rotation support
- Optional captcha solving integration
When working with DataDome, CAPTCHAs most often appear due to proxy usage and/or repetitive and suspicious actions. To deal with them, you can use a CAPTCHA solving service such as 2Captcha. This requires an API key. You can either use your own key or contact our team for assistance. If you prefer to work without a CAPTCHA solver, you must strictly follow these measures:
- Use only the highest quality proxies, preferably residential or mobile
- If you're making requests without delays and real action emulation, you need to rotate proxies every 3-5 requests
- When increasing delays by several seconds and emulating real user actions, the number of possible requests increases
If you use a CAPTCHA solver, after completing the CAPTCHA and receiving DataDome cookies, it becomes possible to make over 25 requests.
Prerequisites
To use Surfsky, you'll need:
- API Key - A unique authentication token for accessing our services
- Assigned Hostname - Your dedicated Surfsky endpoint
- Proxies - Proxies in the format: protocol://username:password@host:port
  Supported protocols: HTTP, HTTPS, SOCKS5, SSH
- 2Captcha Key (Optional) - If you want to enable automatic captcha solving.
To obtain your API key and hostname, please contact our team.
If you need proxies, please contact our team.
Captcha Solving
By default, captcha solving is disabled. To enable it:
- Set SOLVE_CAPTCHA = True in your code
- Get an API key from 2captcha.com
- Set your 2captcha API key in the CAPTCHA_KEY variable
Alternatively, you can contact our support team for custom captcha solving solutions.
Code Examples
Here are complete examples showing how to handle DataDome protected sites with proxy rotation and optional captcha solving:
- Python
- JavaScript
First, install the required packages:
pip install playwright requests httpx beautifulsoup4
Here's a complete example for handling DataDome protected sites:
from datetime import datetime
import re
import uuid
import httpx
import asyncio
from playwright.async_api import async_playwright, Page, Browser
from bs4 import BeautifulSoup
# Surfsky API token; sent as the X-Cloud-Api-Token header when a browser session is requested.
CLOUD_API_TOKEN = "YOUR_API_TOKEN"
# Set to True to enable automatic captcha solving through 2captcha.
SOLVE_CAPTCHA = False
# 2captcha API key handed to DataDomeCaptchaSolver.
CAPTCHA_KEY = "YOUR_2CAPTCHA_KEY"
def generate_proxy() -> str:
    """Return the proxy URL to use for the next browser session.

    A fresh random session id (SID) is generated on every call. The
    placeholder URL returned below does not reference it; presumably it is
    meant to be embedded in your real proxy credentials when you replace the
    placeholder.
    """
    # uuid4().hex is already a str, so no extra str() conversion is needed.
    proxy_sid = uuid.uuid4().hex  # noqa: F841 - unused until real proxy details are filled in
    return "socks5://username:password@host:port"  # Replace with your proxy details
class DataDomeIPBannedException(Exception):
    """Signals that DataDome banned the current IP (captcha URL contains t=bv)."""
class DataDomeCaptchaException(Exception):
    """Signals a failure somewhere in the captcha solving process."""
class DataDomeCaptchaSolver:
    """Detect a DataDome captcha on a Playwright page and solve it through 2captcha."""

    # Accepted proxy shape: protocol://login:password@host:port
    _PROXY_PATTERN = re.compile(r'^(socks5|http|https):\/\/([^:]+):([^@]+)@([^:]+):(\d+)$')

    def __init__(self, api_key: str):
        """Store the 2captcha API key that is sent with every request."""
        self.api_key = api_key

    def format_proxy_string(self, proxy_string: str) -> dict:
        """Split a proxy URL into the proxy fields of a 2captcha task.

        Args:
            proxy_string: Proxy in the form ``protocol://login:password@host:port``
                where protocol is ``socks5``, ``http`` or ``https``.

        Returns:
            A dict with the keys ``proxyType``, ``proxyAddress``, ``proxyPort``,
            ``proxyLogin`` and ``proxyPassword``; every value is a string.

        Raises:
            ValueError: If ``proxy_string`` does not match the expected form.
        """
        match = self._PROXY_PATTERN.match(proxy_string)
        if not match:
            raise ValueError('Invalid proxy string format')
        protocol, login, password, host, port = match.groups()
        return {
            "proxyType": protocol,
            "proxyAddress": host,
            "proxyPort": port,
            "proxyLogin": login,
            "proxyPassword": password,
        }

    async def solve(self, page: Page, url: str, proxy: str) -> None:
        """Solve the DataDome captcha shown on ``page``, if there is one.

        Does nothing when no captcha iframe is found. Otherwise a 2captcha task
        is created, its cookie result is written into the page and the page is
        reloaded.

        Args:
            page: Playwright page that may be showing the captcha.
            url: URL of the protected page, sent to 2captcha as ``websiteURL``.
            proxy: Proxy URL in the form accepted by ``format_proxy_string``.

        Raises:
            DataDomeIPBannedException: If the captcha URL contains ``t=bv``.
            DataDomeCaptchaException: If the captcha URL lacks ``t=fe`` or the
                2captcha task fails.
            ValueError: If ``proxy`` has an invalid format.
        """
        captcha_url = await self.maybe_get_captcha_url(page)
        if not captcha_url:
            print("DataDome captcha not found")
            return
        print("Found captcha. Trying to solve...")
        print("captcha_url:", captcha_url)
        # Check if IP is banned
        if 't=bv' in captcha_url:
            raise DataDomeIPBannedException("IP is banned by DataDome (t=bv). Need to change IP address.")
        if 't=fe' not in captcha_url:
            raise DataDomeCaptchaException("Expected t=fe in captcha URL")
        user_agent = await self.get_user_agent(page)
        proxy_obj = self.format_proxy_string(proxy)
        task_id = await self.create_datadome_task(url, captcha_url, user_agent, proxy_obj)
        print("2captcha taskId: ", task_id)
        solution_cookies = await self.get_task_result(task_id)
        print("2captcha solutionCookies: ", solution_cookies)
        await self.set_cookies(page, solution_cookies)
        await page.reload(wait_until='networkidle')

    async def set_cookies(self, page: Page, solution_cookies: str) -> None:
        """Assign the ``solution_cookies`` string to ``document.cookie`` in the page."""
        # The value is passed as an argument rather than interpolated into the
        # script source, so a backtick, "${" or backslash inside the cookie
        # cannot break or alter the evaluated JavaScript.
        await page.evaluate(
            "(cookie) => { document.cookie = cookie; }",
            solution_cookies,
        )
        print("Cookies set")

    async def maybe_get_captcha_url(self, page: Page) -> str | None:
        """Return the ``src`` of the DataDome captcha iframe, or None if absent."""
        return await page.evaluate('''() => {
            const frame = document.querySelector('iframe[src*="captcha-delivery.com/captcha/"]');
            return frame ? frame.src : null;
        }''')

    async def get_user_agent(self, page: Page) -> str:
        """Return ``navigator.userAgent`` as reported by the page."""
        return await page.evaluate('() => navigator.userAgent')

    async def create_datadome_task(self, website_url: str, captcha_url: str, user_agent: str, proxy_obj: dict) -> str:
        """Create a ``DataDomeSliderTask`` at 2captcha and return its task id.

        Args:
            website_url: URL of the protected page.
            captcha_url: ``src`` of the captcha iframe.
            user_agent: User agent of the browser that received the captcha.
            proxy_obj: Proxy fields as produced by ``format_proxy_string``.

        Returns:
            The ``taskId`` value from the 2captcha response.

        Raises:
            DataDomeCaptchaException: If the response carries no ``taskId``.
        """
        task_data = {
            "type": "DataDomeSliderTask",
            "websiteURL": website_url,
            "captchaUrl": captcha_url,
            "userAgent": user_agent,
            "proxyType": proxy_obj["proxyType"],
            "proxyAddress": proxy_obj["proxyAddress"],
            # 2captcha documents proxyPort as an integer field.
            "proxyPort": int(proxy_obj["proxyPort"]),
            "proxyLogin": proxy_obj["proxyLogin"],
            "proxyPassword": proxy_obj["proxyPassword"],
        }
        data = {
            "clientKey": self.api_key,
            "task": task_data,
        }
        async with httpx.AsyncClient() as client:
            response = await client.post('https://api.2captcha.com/createTask', json=data)
        response_data = response.json()
        # Surface the API's own error payload instead of an opaque KeyError.
        if not isinstance(response_data, dict) or "taskId" not in response_data:
            raise DataDomeCaptchaException(f"Failed to create 2captcha task: {response_data}")
        return response_data["taskId"]

    async def get_task_result(self, task_id: str, poll_interval: float = 5, max_attempts: int = 60) -> str:
        """Poll 2captcha until the task is solved and return the cookie string.

        Args:
            task_id: Id returned by ``create_datadome_task``.
            poll_interval: Seconds to wait before each poll.
            max_attempts: Number of polls before giving up.

        Returns:
            Everything after the ``OK|`` prefix of the response.

        Raises:
            DataDomeCaptchaException: If 2captcha answers with anything other
                than ``CAPCHA_NOT_READY`` or ``OK|...``, or if the task is still
                not ready after ``max_attempts`` polls.
        """
        # One client for the whole polling loop avoids a new connection per poll.
        async with httpx.AsyncClient() as client:
            for _ in range(max_attempts):
                await asyncio.sleep(poll_interval)
                response = await client.get('https://2captcha.com/res.php', params={
                    "key": self.api_key,
                    "action": "get",
                    "id": task_id,
                })
                result = response.text
                if result == 'CAPCHA_NOT_READY':
                    print('2captcha task not ready yet. Waiting...')
                    continue
                print(result)
                if not result.startswith('OK|'):
                    raise DataDomeCaptchaException(f"Failed to get 2captcha result: {result}")
                # Strip only the prefix so a '|' inside the payload is preserved.
                return result.removeprefix('OK|')
        raise DataDomeCaptchaException(
            f"2captcha task {task_id} not ready after {max_attempts} attempts"
        )
class SurfskyBrowser:
    """A one-time Surfsky browser session that Playwright attaches to over CDP."""

    def __init__(self, playwright, proxy: str):
        """Remember the Playwright handle and proxy; nothing is started yet."""
        self.playwright = playwright
        self.proxy = proxy
        # Both stay None until setup() has connected.
        self.browser: Browser | None = None
        self.page: Page | None = None

    async def setup(self) -> None:
        """Start a remote session, connect to it and pick the page to work on."""
        ws_endpoint = await self._start_browser()
        print(f"Connecting to browser at {ws_endpoint}")
        self.browser = await self.playwright.chromium.connect_over_cdp(ws_endpoint)

        # Reuse the first existing context/page; create them only if missing.
        existing_contexts = self.browser.contexts
        if existing_contexts:
            context = existing_contexts[0]
        else:
            context = await self.browser.new_context()

        existing_pages = context.pages
        if existing_pages:
            self.page = existing_pages[0]
        else:
            self.page = await context.new_page()
        print("Browser setup complete")

    async def _start_browser(self) -> str:
        """Request a one-time profile from the Surfsky API and return its ``ws_url``."""
        payload = {
            "browser_settings": {
                "inactive_kill_timeout": 60,
            },
            "fingerprint": {
                "os": "mac",
            },
        }
        # The proxy key is only sent when a proxy was configured.
        if self.proxy:
            payload["proxy"] = self.proxy

        request_headers = {
            "Content-Type": "application/json",
            "X-Cloud-Api-Token": CLOUD_API_TOKEN,
        }
        async with httpx.AsyncClient(timeout=60) as client:
            response = await client.post(
                "https://api-public.surfsky.io/profiles/one_time",
                headers=request_headers,
                json=payload,
            )
            response.raise_for_status()
            session = response.json()
            devtools_url = session["inspector"]["pages"][0]["devtools_url"]
            print("Devtools URL:\n", devtools_url)
            return session["ws_url"]

    async def close(self) -> None:
        """Close the browser if one is connected; failures are only printed."""
        if not self.browser:
            return
        try:
            await self.browser.close()
        except Exception as e:
            print(f"Error closing browser: {str(e)}")
        else:
            print("Browser closed successfully")
class UseCase:
    """Scrape a single URL with the given browser, handling DataDome on the way."""

    def __init__(self, browser: SurfskyBrowser, solver: DataDomeCaptchaSolver, solve_captcha: bool = False):
        """Store the collaborators.

        Args:
            browser: Browser wrapper whose ``page`` and ``proxy`` are used.
            solver: Captcha solver invoked when ``solve_captcha`` is true.
            solve_captcha: Whether captcha solving is attempted at all.
        """
        self.browser = browser
        self.solver = solver
        self.solve_captcha = solve_captcha

    async def execute(self, url: str) -> bool:
        """Navigate to ``url``, handle the captcha and take a screenshot.

        Returns:
            False when DataDome banned the IP, True in every other case,
            including non-ban errors, which are printed and treated as done.

        Raises:
            RuntimeError: If the browser has no page yet.
        """
        if not self.browser.page:
            raise RuntimeError("Browser not initialized")
        try:
            await self._navigate_to_page(url)
            await self._handle_captcha(url)
            await self._save_screenshot()
            # Here you can add more specific scraping logic
            # await self._extract_reviews()
            return True
        except DataDomeIPBannedException as e:
            print(f"IP banned at URL {url}: {str(e)}")
            return False
        except Exception as e:
            print(f"Error processing URL {url}: {str(e)}")
            return True  # Non-IP-ban errors considered "successful"

    async def _navigate_to_page(self, url: str) -> None:
        """Open ``url`` and print the page title."""
        await self.browser.page.goto(url)
        html = await self.browser.page.content()
        soup = BeautifulSoup(html, 'html.parser')
        # soup.title is None when the page has no <title>. Dereferencing it
        # would raise AttributeError, which execute() swallows, so captcha
        # handling and the screenshot would be silently skipped.
        title = soup.title.string if soup.title else None
        print(f"Page title: {title}")

    async def _handle_captcha(self, url: str) -> None:
        """Run the captcha solver on the current page unless solving is disabled."""
        if not self.solve_captcha:
            print("Captcha solving disabled, skipping...")
            return
        await self.solver.solve(self.browser.page, url, self.browser.proxy)

    async def _save_screenshot(self) -> None:
        """Save a screenshot of the current page under a timestamped file name."""
        ts = datetime.now().strftime("%Y%m%d_%H%M%S")
        screenshot_filename = f"screenshot_{ts}.png"
        await self.browser.page.screenshot(path=screenshot_filename)
        print(f"Screenshot saved as {screenshot_filename}")
class DataDomeScraper:
    """Own the current browser session and replace it when a URL fails."""

    def __init__(self, solve_captcha: bool = SOLVE_CAPTCHA):
        """Create the captcha solver; the browser is created lazily on first use."""
        self.browser: SurfskyBrowser | None = None
        self.solver = DataDomeCaptchaSolver(CAPTCHA_KEY)
        self.solve_captcha = solve_captcha

    async def _rotate_browser(self, playwright) -> None:
        """Close the current browser, if any, and start a new one on a new proxy.

        Raises:
            Exception: Whatever browser creation raised; ``self.browser`` is
                left as None in that case.
        """
        # Close existing browser if any
        if self.browser:
            try:
                await self.browser.close()
                print("Previous browser closed successfully")
            except Exception as e:
                print(f"Error closing previous browser: {str(e)}")
            finally:
                self.browser = None
        # Create new browser
        try:
            proxy = generate_proxy()
            print("Creating new browser")
            self.browser = SurfskyBrowser(playwright, proxy)
            await self.browser.setup()
            print("New browser setup completed")
        except Exception as e:
            print(f"Failed to create new browser: {str(e)}")
            self.browser = None
            raise

    async def process_url(self, playwright, url: str) -> bool:
        """Process one URL, rotating the browser when it turns out to be banned.

        Returns:
            True when the URL is done, False when the IP was banned and the
            URL should be retried on the freshly rotated browser.

        Raises:
            Exception: Any unexpected error, re-raised after a rotation attempt.
        """
        if not self.browser:
            await self._rotate_browser(playwright)
        try:
            use_case = UseCase(self.browser, self.solver, self.solve_captcha)
            success = await use_case.execute(url)
        except DataDomeIPBannedException:
            print("IP banned, rotating browser...")
            await self._rotate_browser(playwright)
            return False
        except Exception as e:
            print(f"Unexpected error during URL processing: {str(e)}")
            await self._rotate_browser(playwright)
            raise
        if not success:
            # UseCase.execute catches DataDomeIPBannedException itself and
            # reports the ban by returning False, so the except branch above
            # never fires for it. Rotate here, otherwise the retry would reuse
            # the banned browser.
            print("IP banned, rotating browser...")
            await self._rotate_browser(playwright)
        return success
async def process_urls(urls: list[str], solve_captcha: bool = True, max_attempts: int = 5) -> None:
    """Process every URL in order, retrying each one a bounded number of times.

    Args:
        urls: URLs to process; the list is not modified.
        solve_captcha: Passed on to ``DataDomeScraper``.
        max_attempts: How many times one URL is tried before it is skipped,
            so a URL that keeps failing cannot stall the run forever.
    """
    async with async_playwright() as p:
        scraper = DataDomeScraper(solve_captcha)
        try:
            for position, url in enumerate(urls, start=1):
                remaining = len(urls) - position
                for _ in range(max_attempts):
                    try:
                        success = await scraper.process_url(p, url)
                    except Exception as e:
                        # process_url leaves the scraper with either a fresh
                        # browser or none at all (recreated on the next call),
                        # so no extra rotation is needed here.
                        print(f"Error processing URL {url}: {str(e)}")
                        success = False
                    if success:
                        print(f"Processed URL {url}. Remaining: {remaining}")
                        break
                else:
                    print(f"Giving up on URL {url} after {max_attempts} attempts. Remaining: {remaining}")
        finally:
            # Always close the last browser so the session is released.
            if scraper.browser:
                await scraper.browser.close()
    print("All URLs processed")
if __name__ == "__main__":
    # Pages to scrape -- replace these with your own target URLs.
    target_urls = [
        "https://www.example.com/page1",
        "https://www.example.com/page2",
    ]
    # Captcha solving follows the module-level SOLVE_CAPTCHA switch.
    asyncio.run(process_urls(target_urls, solve_captcha=SOLVE_CAPTCHA))
First, install the required packages:
npm install playwright axios cheerio uuid
Here's a complete example for handling DataDome protected sites:
const { chromium } = require('playwright');
const { v4: uuidv4 } = require('uuid');
const axios = require('axios');
const cheerio = require('cheerio');
// Surfsky API token; sent as the X-Cloud-Api-Token header when a browser session is requested.
const CLOUD_API_TOKEN = 'YOUR_API_TOKEN';
// Set to true to enable automatic captcha solving through 2captcha.
const SOLVE_CAPTCHA = false;
// 2captcha API key handed to DataDomeCaptchaSolver.
const CAPTCHA_KEY = 'YOUR_2CAPTCHA_KEY';
/**
 * Return the proxy URL to use for the next browser session.
 *
 * A fresh random session id (SID) is generated on every call. The placeholder
 * URL returned below does not reference it; presumably it is meant to be
 * embedded in your real proxy credentials when you replace the placeholder.
 *
 * @returns {string} Proxy URL in the form protocol://username:password@host:port.
 */
const generateProxy = () => {
  // eslint-disable-next-line no-unused-vars -- unused until real proxy details are filled in
  const proxySid = uuidv4();
  // Plain string: the placeholder has nothing to interpolate.
  return 'socks5://username:password@host:port'; // Replace with your proxy details
};
/** Thrown when DataDome has banned the current IP (captcha URL contains t=bv). */
class DataDomeIPBannedException extends Error {
  name = 'DataDomeIPBannedException';
}
/** Thrown when something goes wrong in the captcha solving process. */
class DataDomeCaptchaException extends Error {
  name = 'DataDomeCaptchaException';
}
/** Detects a DataDome captcha on a Playwright page and solves it through 2captcha. */
class DataDomeCaptchaSolver {
  /**
   * @param {string} apiKey 2captcha API key sent with every request.
   */
  constructor(apiKey) {
    this.apiKey = apiKey;
  }

  /**
   * Split a proxy URL into the proxy fields of a 2captcha task.
   *
   * @param {string} proxyString Proxy as protocol://login:password@host:port,
   *   where protocol is socks5, http or https.
   * @returns {{proxyType: string, proxyAddress: string, proxyPort: string,
   *   proxyLogin: string, proxyPassword: string}}
   * @throws {Error} If proxyString does not match the expected form.
   */
  formatProxyString(proxyString) {
    const regex = /^(socks5|http|https):\/\/([^:]+):([^@]+)@([^:]+):(\d+)$/;
    const match = proxyString.match(regex);
    if (!match) {
      throw new Error('Invalid proxy string format');
    }
    const [, protocol, login, password, host, port] = match;
    return {
      proxyType: protocol,
      proxyAddress: host,
      proxyPort: port,
      proxyLogin: login,
      proxyPassword: password,
    };
  }

  /**
   * Solve the DataDome captcha shown on the page, if there is one.
   *
   * Does nothing when no captcha iframe is found. Otherwise a 2captcha task is
   * created, its cookie result is written into the page and the page reloaded.
   *
   * @param {object} page Playwright page that may be showing the captcha.
   * @param {string} url URL of the protected page, sent as websiteURL.
   * @param {string} proxy Proxy URL in the form accepted by formatProxyString.
   * @throws {DataDomeIPBannedException} If the captcha URL contains t=bv.
   * @throws {DataDomeCaptchaException} If the captcha URL lacks t=fe or the
   *   2captcha task fails.
   */
  async solve(page, url, proxy) {
    const captchaUrl = await this.maybeGetCaptchaUrl(page);
    if (!captchaUrl) {
      console.log('DataDome captcha not found');
      return;
    }
    console.log('Found captcha. Trying to solve...');
    console.log('captcha_url:', captchaUrl);
    if (captchaUrl.includes('t=bv')) {
      throw new DataDomeIPBannedException('IP is banned by DataDome (t=bv). Need to change IP address.');
    }
    if (!captchaUrl.includes('t=fe')) {
      throw new DataDomeCaptchaException('Expected t=fe in captcha URL');
    }
    const userAgent = await this.getUserAgent(page);
    const proxyObj = this.formatProxyString(proxy);
    const taskId = await this.createDatadomeTask(url, captchaUrl, userAgent, proxyObj);
    console.log('2captcha taskId: ', taskId);
    const solutionCookies = await this.getTaskResult(taskId);
    console.log('2captcha solutionCookies: ', solutionCookies);
    await this.setCookies(page, solutionCookies);
    await page.reload({ waitUntil: 'networkidle' });
  }

  /**
   * Assign the solved cookie string to document.cookie in the page.
   *
   * @param {object} page Playwright page.
   * @param {string} solutionCookies Cookie string returned by 2captcha.
   */
  async setCookies(page, solutionCookies) {
    // The value is passed as an argument rather than interpolated into the
    // script source, so a quote or backslash inside the cookie cannot break
    // or alter the evaluated JavaScript.
    await page.evaluate((cookie) => {
      document.cookie = cookie;
    }, solutionCookies);
    console.log('Cookies set');
  }

  /**
   * @param {object} page Playwright page.
   * @returns {Promise<string|null>} src of the DataDome captcha iframe, or null if absent.
   */
  async maybeGetCaptchaUrl(page) {
    return page.evaluate(() => {
      const frame = document.querySelector('iframe[src*="captcha-delivery.com/captcha/"]');
      return frame ? frame.src : null;
    });
  }

  /**
   * @param {object} page Playwright page.
   * @returns {Promise<string>} navigator.userAgent as reported by the page.
   */
  async getUserAgent(page) {
    return page.evaluate(() => navigator.userAgent);
  }

  /**
   * Create a DataDomeSliderTask at 2captcha and return its task id.
   *
   * @param {string} websiteUrl URL of the protected page.
   * @param {string} captchaUrl src of the captcha iframe.
   * @param {string} userAgent User agent of the browser that got the captcha.
   * @param {object} proxyObj Proxy fields as produced by formatProxyString.
   * @returns {Promise<*>} The taskId value from the 2captcha response.
   * @throws {DataDomeCaptchaException} If the response carries no taskId.
   */
  async createDatadomeTask(websiteUrl, captchaUrl, userAgent, proxyObj) {
    const taskData = {
      type: 'DataDomeSliderTask',
      websiteURL: websiteUrl,
      captchaUrl,
      userAgent,
      ...proxyObj,
      // 2captcha documents proxyPort as an integer field.
      proxyPort: Number(proxyObj.proxyPort),
    };
    const data = {
      clientKey: this.apiKey,
      task: taskData,
    };
    const response = await axios.post('https://api.2captcha.com/createTask', data);
    const taskId = response.data ? response.data.taskId : undefined;
    // Surface the API's own error payload instead of returning undefined.
    if (taskId === undefined || taskId === null) {
      throw new DataDomeCaptchaException(
        `Failed to create 2captcha task: ${JSON.stringify(response.data)}`
      );
    }
    return taskId;
  }

  /**
   * Poll 2captcha until the task is solved and return the cookie string.
   *
   * @param {*} taskId Id returned by createDatadomeTask.
   * @param {number} [pollIntervalMs=5000] Milliseconds to wait before each poll.
   * @param {number} [maxAttempts=60] Number of polls before giving up.
   * @returns {Promise<string>} Everything after the "OK|" prefix of the response.
   * @throws {DataDomeCaptchaException} If 2captcha answers with anything other
   *   than CAPCHA_NOT_READY or OK|..., or the task is still not ready after
   *   maxAttempts polls.
   */
  async getTaskResult(taskId, pollIntervalMs = 5000, maxAttempts = 60) {
    for (let attempt = 0; attempt < maxAttempts; attempt += 1) {
      await new Promise(resolve => setTimeout(resolve, pollIntervalMs));
      const response = await axios.get('https://2captcha.com/res.php', {
        params: {
          key: this.apiKey,
          action: 'get',
          id: taskId,
        },
      });
      // axios may hand back a parsed value; normalise to text before string checks.
      const result = typeof response.data === 'string'
        ? response.data
        : JSON.stringify(response.data);
      if (result === 'CAPCHA_NOT_READY') {
        console.log('2captcha task not ready yet. Waiting...');
        continue;
      }
      console.log(result);
      if (!result.startsWith('OK|')) {
        throw new DataDomeCaptchaException(`Failed to get 2captcha result: ${result}`);
      }
      // Strip only the prefix so a '|' inside the payload is preserved.
      return result.slice('OK|'.length);
    }
    throw new DataDomeCaptchaException(
      `2captcha task ${taskId} not ready after ${maxAttempts} attempts`
    );
  }
}
/** A one-time Surfsky browser session that Playwright attaches to over CDP. */
class SurfskyBrowser {
  /**
   * @param {string} proxy Proxy URL sent to the Surfsky API when starting the session.
   */
  constructor(proxy) {
    this.proxy = proxy;
    // Both stay null until setup() has connected.
    this.browser = null;
    this.page = null;
  }

  /** Start a remote session, connect to it and pick the page to work on. */
  async setup() {
    const wsEndpoint = await this.startBrowser();
    console.log(`Connecting to browser at ${wsEndpoint}`);
    this.browser = await chromium.connectOverCDP(wsEndpoint);

    // Reuse the first existing context/page; create them only if missing.
    let context = this.browser.contexts()[0];
    if (!context) {
      context = await this.browser.newContext();
    }
    let page = context.pages()[0];
    if (!page) {
      page = await context.newPage();
    }
    this.page = page;
    console.log('Browser setup complete');
  }

  /**
   * Request a one-time profile from the Surfsky API.
   *
   * @returns {Promise<string>} The ws_url field of the API response.
   */
  async startBrowser() {
    const payload = {
      browser_settings: { inactive_kill_timeout: 60 },
      fingerprint: { os: 'mac' },
    };
    // The proxy key is only sent when a proxy was configured.
    if (this.proxy) {
      payload.proxy = this.proxy;
    }
    const requestConfig = {
      headers: {
        'Content-Type': 'application/json',
        'X-Cloud-Api-Token': CLOUD_API_TOKEN,
      },
    };
    const { data: session } = await axios.post(
      'https://api-public.surfsky.io/profiles/one_time',
      payload,
      requestConfig
    );
    console.log('Devtools URL:\n', session.inspector.pages[0].devtools_url);
    return session.ws_url;
  }

  /** Close the browser if one is connected; failures are only logged. */
  async close() {
    if (!this.browser) {
      return;
    }
    try {
      await this.browser.close();
      console.log('Browser closed successfully');
    } catch (err) {
      console.log(`Error closing browser: ${err.message}`);
    }
  }
}
/** Scrapes a single URL with the given browser, handling DataDome on the way. */
class UseCase {
  /**
   * @param {object} browser Browser wrapper whose page and proxy are used.
   * @param {object} solver Captcha solver invoked when solveCaptcha is true.
   * @param {boolean} [solveCaptcha=false] Whether captcha solving is attempted.
   */
  constructor(browser, solver, solveCaptcha = false) {
    Object.assign(this, { browser, solver, solveCaptcha });
  }

  /**
   * Navigate to the URL, optionally handle the captcha and take a screenshot.
   *
   * @param {string} url Page to process.
   * @returns {Promise<boolean>} false when DataDome banned the IP, true in every
   *   other case, including non-ban errors, which are logged and treated as done.
   * @throws {Error} If the browser has no page yet.
   */
  async execute(url) {
    if (!this.browser.page) {
      throw new Error('Browser not initialized');
    }
    try {
      await this.navigateToPage(url);
      if (this.solveCaptcha) {
        await this.handleCaptcha(url);
      }
      await this.saveScreenshot();
      // Here you can add more specific scraping logic
      // await this.extractReviews();
      return true;
    } catch (err) {
      const ipBanned = err instanceof DataDomeIPBannedException;
      console.log(
        ipBanned
          ? `IP banned at URL ${url}: ${err.message}`
          : `Error processing URL ${url}: ${err.message}`
      );
      // Non-IP-ban errors considered "successful"
      return !ipBanned;
    }
  }

  /** Open the URL and log the page title. */
  async navigateToPage(url) {
    const { page } = this.browser;
    await page.goto(url);
    const $ = cheerio.load(await page.content());
    console.log(`Page title: ${$('title').text()}`);
  }

  /** Run the captcha solver on the current page. */
  async handleCaptcha(url) {
    const { page, proxy } = this.browser;
    await this.solver.solve(page, url, proxy);
  }

  /** Save a screenshot of the current page under a timestamped file name. */
  async saveScreenshot() {
    // ':' and '.' in the ISO timestamp are replaced with '-' for the file name.
    const stamp = new Date().toISOString().replace(/[:.]/g, '-');
    const path = `screenshot_${stamp}.png`;
    await this.browser.page.screenshot({ path });
    console.log(`Screenshot saved as ${path}`);
  }
}
/** Owns the current browser session and replaces it when a URL fails. */
class DataDomeScraper {
  /**
   * @param {boolean} [solveCaptcha=false] Whether captcha solving is attempted.
   */
  constructor(solveCaptcha = false) {
    // The browser is created lazily on first use.
    this.browser = null;
    this.solver = new DataDomeCaptchaSolver(CAPTCHA_KEY);
    this.solveCaptcha = solveCaptcha;
  }

  /**
   * Close the current browser, if any, and start a new one on a new proxy.
   *
   * @throws Whatever browser creation threw; this.browser is left null then.
   */
  async rotateBrowser() {
    if (this.browser) {
      try {
        await this.browser.close();
        console.log('Previous browser closed successfully');
      } catch (e) {
        console.log(`Error closing previous browser: ${e.message}`);
      } finally {
        this.browser = null;
      }
    }
    try {
      const proxy = generateProxy();
      console.log('Creating new browser');
      this.browser = new SurfskyBrowser(proxy);
      await this.browser.setup();
      console.log('New browser setup completed');
    } catch (e) {
      console.log(`Failed to create new browser: ${e.message}`);
      this.browser = null;
      throw e;
    }
  }

  /**
   * Process one URL, rotating the browser when it turns out to be banned.
   *
   * @param {string} url Page to process.
   * @returns {Promise<boolean>} true when the URL is done, false when the IP was
   *   banned and the URL should be retried on the freshly rotated browser.
   * @throws Any unexpected error, re-thrown after a rotation attempt.
   */
  async processUrl(url) {
    if (!this.browser) {
      await this.rotateBrowser();
    }
    let success;
    try {
      const useCase = new UseCase(this.browser, this.solver, this.solveCaptcha);
      success = await useCase.execute(url);
    } catch (e) {
      if (e instanceof DataDomeIPBannedException) {
        console.log('IP banned, rotating browser...');
        await this.rotateBrowser();
        return false;
      }
      console.log(`Unexpected error during URL processing: ${e.message}`);
      await this.rotateBrowser();
      throw e;
    }
    if (!success) {
      // UseCase.execute catches DataDomeIPBannedException itself and reports
      // the ban by returning false, so the catch branch above never fires for
      // it. Rotate here, otherwise the retry would reuse the banned browser.
      console.log('IP banned, rotating browser...');
      await this.rotateBrowser();
    }
    return success;
  }
}
/**
 * Process every URL in order, retrying each one a bounded number of times.
 *
 * @param {string[]} urls URLs to process; the array is not modified.
 * @param {boolean} [solveCaptcha=true] Passed on to DataDomeScraper.
 * @param {number} [maxAttempts=5] How many times one URL is tried before it is
 *   skipped, so a URL that keeps failing cannot stall the run forever.
 */
async function processUrls(urls, solveCaptcha = true, maxAttempts = 5) {
  const scraper = new DataDomeScraper(solveCaptcha);
  try {
    for (const [index, url] of urls.entries()) {
      const remaining = urls.length - index - 1;
      let success = false;
      for (let attempt = 0; attempt < maxAttempts && !success; attempt += 1) {
        try {
          success = await scraper.processUrl(url);
        } catch (e) {
          // processUrl leaves the scraper with either a fresh browser or none
          // at all (recreated on the next call), so no extra rotation is needed.
          console.log(`Error processing URL ${url}: ${e.message}`);
        }
      }
      if (success) {
        console.log(`Processed URL ${url}. Remaining: ${remaining}`);
      } else {
        console.log(`Giving up on URL ${url} after ${maxAttempts} attempts. Remaining: ${remaining}`);
      }
    }
  } finally {
    // Always close the last browser so the session is released.
    if (scraper.browser) {
      await scraper.browser.close();
    }
  }
  console.log('All URLs processed');
}
// Example URLs ('#' does not start a comment in JavaScript, so '//' is used)
const urls = [
  // Change to your target URL
  "https://www.example.com/page1",
  "https://www.example.com/page2",
];
// Captcha solving follows the SOLVE_CAPTCHA constant; any failure is logged.
processUrls(urls, SOLVE_CAPTCHA).catch(console.error);
Important Notes
- Always remember to close the browser when you're done to release your session limit
- Inactive sessions are automatically closed after 30 seconds (configurable via inactive_kill_timeout)
- One time profile is used only once and then deleted
- A proxy is required and must be passed to the create_profile function
- You can run multiple sessions according to your subscription plan's session limit
- If captcha solving is enabled, the system will attempt to solve DataDome captchas automatically
For more advanced usage and error handling, check out our API Reference.