Skip to main content
This document explains: In modern Web data scraping scenarios, why Selenium is chosen as the page scraping framework, and the officially recommended usage pattern.

1. Positioning and Role of Selenium

Selenium is a browser automation framework based on the WebDriver protocol. By using Remote WebDriver, it can control real browsers running remotely. Its core capabilities include:
  • Control of real Chromium browsers
  • Page loading and JavaScript execution
  • DOM querying and basic event simulation
  • Support for connecting to remote fingerprint browser clusters
Selenium does not simulate browser HTTP requests. Instead, it drives a real browser to execute actual page logic via the WebDriver protocol.

1️⃣ Connecting to the Remote Fingerprint Browser

The platform provides a fingerprint browser service based on the HTTP WebDriver protocol,
which can be accessed in Remote mode.
# Read the per-task proxy credentials injected by the platform.
# NOTE(review): this snippet assumes it runs inside a function — the
# `return` below is invalid at module level; see the complete example
# for the runnable form.
try:
    Auth = os.environ.get("PROXY_AUTH")
    CafeSDK.Log.info(f"Current browser auth info: {Auth}")
except Exception as e:
    CafeSDK.Log.error(f"Failed to get browser auth info: {e}")
    Auth = None
    return

browser_url = f'http://{Auth}@chrome-http-inner.cafescraper.com'  # WebDriver endpoint of the fingerprint browser
# Result row pushed at the end; "200" is the optimistic default status.
rest_item = {"url": url, "html": "", "resp_status": "200"}

# Configure Chrome options
chrome_options = webdriver.ChromeOptions()

# Common options (typical flags for headless/containerized Chrome — the
# remote endpoint may ignore or override some of them)
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--window-size=1920,1080')

CafeSDK.Log.info(f"Requested URL: {url}")
# Open a session against the remote fingerprint browser; a failure here
# is recorded as "403" in the result row.
try:
    driver = webdriver.Remote(
        command_executor=browser_url,
        options=chrome_options
    )
except Exception as e:
    CafeSDK.Log.info(f"[Error] Failed to connect to fingerprint browser: {e}")
    rest_item['resp_status'] = "403"
    return

2️⃣ Page Navigation and Content Retrieval

# Navigate and wait (up to 3 minutes) until document.readyState reports
# "complete", then capture the fully rendered page source.
try:
    driver.get(url)
    WebDriverWait(driver, 3 * 60).until(
        lambda d: d.execute_script("return document.readyState") == "complete"
    )
    html = driver.page_source
    rest_item["html"] = html
except Exception as e:
    CafeSDK.Log.info(f"[Error] Failed to get page HTML: {e}")
    rest_item['resp_status'] = "500"  # any navigation/render failure

CafeSDK.Result.push_data(rest_item)  # emit the result row to the platform table

3. Complete Platform Script Entry Example (Recommended)

import asyncio
import os

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait

from sdk import CafeSDK

async def run():
    """Scrape one URL through the remote fingerprint browser and push the result.

    Reads the target URL from the platform input JSON and the browser auth
    from the PROXY_AUTH environment variable. Pushes exactly one result row
    ({url, html, resp_status}) in every code path:
      "200" — success, "403" — failed to connect to the browser,
      "500" — navigation/render failure.
    """
    CafeSDK.Log.info("🚀 Init...")
    CafeSDK.Log.info("====================================================")
    CafeSDK.Log.info("🚀 CafeScraper Selenium Browser Scrape Demo")
    CafeSDK.Log.info("====================================================")

    # Declare the output table schema before pushing any rows.
    headers = [
        {"label": "url", "key": "url", "format": "text"},
        {"label": "html", "key": "html", "format": "text"},
        {"label": "resp_status", "key": "resp_status", "format": "text"},
    ]
    CafeSDK.Result.set_table_header(headers)

    input_json_dict = CafeSDK.Parameter.get_input_json_dict()
    url = input_json_dict['url']

    Auth = os.environ.get("PROXY_AUTH")
    CafeSDK.Log.info(f"Current browser auth info: {Auth}")

    # WebDriver endpoint of the remote fingerprint browser.
    browser_url = f'http://{Auth}@chrome-http-inner.cafescraper.com'
    rest_item = {"url": url, "html": "", "resp_status": "200"}

    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--window-size=1920,1080')

    try:
        driver = webdriver.Remote(
            command_executor=browser_url,
            options=chrome_options
        )
    except Exception as e:
        CafeSDK.Log.info(f"[Error] Failed to connect fingerprint browser: {e}")
        rest_item['resp_status'] = "403"
        # Bug fix: push the failure row before returning — previously the
        # "403" result was silently dropped from the output table.
        CafeSDK.Result.push_data(rest_item)
        return

    try:
        driver.get(url)
        # Wait up to 3 minutes for the document to finish loading so
        # page_source contains the JavaScript-rendered content.
        WebDriverWait(driver, 3 * 60).until(
            lambda d: d.execute_script("return document.readyState") == "complete"
        )
        rest_item["html"] = driver.page_source
    except Exception as e:
        CafeSDK.Log.info(f"[Error] Failed to retrieve HTML: {e}")
        rest_item['resp_status'] = "500"
    finally:
        # Bug fix: always release the remote session; the original never
        # called quit(), leaking a fingerprint-browser instance per run.
        try:
            driver.quit()
        except Exception:
            pass  # best-effort cleanup; result row is still pushed below

    CafeSDK.Result.push_data(rest_item)

if __name__ == "__main__":
    # Platform entry point: drive the async scraper to completion.
    asyncio.run(run())

4. Dynamic Content and DOM Operations

Selecting a Single Element

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Bug fix: NoSuchElementException is caught below but was never imported.
from selenium.common.exceptions import NoSuchElementException

# Method 1: CSS selectors (recommended)
element = driver.find_element(By.CSS_SELECTOR, '.product-title')
element = driver.find_element(By.ID, 'main-content')
element = driver.find_element(By.TAG_NAME, 'h1')

# Method 2: XPath
element = driver.find_element(By.XPATH, '//div[@class="container"]')
element = driver.find_element(By.XPATH, '//button[contains(text(), "Submit")]')

# Method 3: Other locators
element = driver.find_element(By.CLASS_NAME, 'product-item')
element = driver.find_element(By.NAME, 'username')
element = driver.find_element(By.LINK_TEXT, 'Buy Now')
element = driver.find_element(By.PARTIAL_LINK_TEXT, 'Buy')

# Wait for element to appear (recommended) — polls until the element is
# present in the DOM or the 10-second timeout raises TimeoutException.
element = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, '.product-title'))
)

# Check element existence and attributes
try:
    element = driver.find_element(By.CSS_SELECTOR, '.product-title')
    text = element.text
    html = element.get_attribute('outerHTML')
    class_name = element.get_attribute('class')
    href = element.get_attribute('href')
    is_displayed = element.is_displayed()
except NoSuchElementException:
    print("Element not found")

Batch Element Processing

# Bug fix: NoSuchElementException is caught below but was never imported.
from selenium.common.exceptions import NoSuchElementException

# Get all matching elements
product_items = driver.find_elements(By.CSS_SELECTOR, '.product-item')
print(f"Found {len(product_items)} products")

def _field_or_empty(item, selector, attr=None):
    """Return the text (or `attr` value) of the first `selector` match
    under `item`, or '' when the element is absent."""
    try:
        elem = item.find_element(By.CSS_SELECTOR, selector)
    except NoSuchElementException:
        return ''
    return elem.get_attribute(attr) if attr else elem.text

# Iterate and extract (one dict per product; missing fields become '')
products_data = [
    {
        'name': _field_or_empty(item, '.name'),
        'price': _field_or_empty(item, '.price'),
        'link': _field_or_empty(item, '.link', attr='href'),
    }
    for item in product_items
]

# JavaScript-based bulk extraction (higher performance: one browser
# round-trip instead of several WebDriver calls per product)
products_data = driver.execute_script('''
    const items = document.querySelectorAll('.product-item');
    return Array.from(items).map(item => {
        const nameElem = item.querySelector('.name');
        const priceElem = item.querySelector('.price');
        const linkElem = item.querySelector('.link');
        return {
            name: nameElem ? nameElem.textContent.trim() : '',
            price: priceElem ? priceElem.textContent.trim() : '',
            link: linkElem ? linkElem.href : ''
        };
    });
''')
Characteristics:
  • Operates on a real browser DOM
  • Retrieves fully JavaScript-rendered content
  • Fully consistent with frontend rendering logic

**❌ Using `time.sleep` to wait for page loading**

time.sleep(5)
Issues:
  • Cannot guarantee JavaScript execution is complete
  • Fails on slow pages
  • Wastes time on fast pages

**❌ Using `requests` to simulate browser behavior**

requests.get(url)
Issues:
  • Incomplete page content
  • Easily detected by anti-bot systems
  • Unstable success rate