Documentation Index
Fetch the complete documentation index at: https://docs.cafescraper.com/llms.txt
Use this file to discover all available pages before exploring further.
This document explains why Selenium is chosen as the page scraping framework in modern web data scraping scenarios, and describes the officially recommended usage pattern.
1. Positioning and Role of Selenium
Selenium is a browser automation framework based on the WebDriver protocol.
By using Remote WebDriver, it can control real browsers running remotely.
Its core capabilities include:
- Control of real Chromium browsers
- Page loading and JavaScript execution
- DOM querying and basic event simulation
- Support for connecting to remote fingerprint browser clusters
Selenium does not simulate browser HTTP requests.
Instead, it drives a real browser to execute actual page logic via the WebDriver protocol.
2. Officially Recommended Usage
1️⃣ Connecting to the Remote Fingerprint Browser
The platform provides a fingerprint browser service based on the HTTP WebDriver protocol,
which can be accessed in Remote mode.
try:
Auth = os.environ.get("PROXY_AUTH")
CafeSDK.Log.info(f"Current browser auth info: {Auth}")
except Exception as e:
CafeSDK.Log.error(f"Failed to get browser auth info: {e}")
Auth = None
return
browser_url = f'http://{Auth}@chrome-http-inner.cafescraper.com' # WebDriver endpoint of the fingerprint browser
rest_item = {"url": url, "html": "", "resp_status": "200"}
# Configure Chrome options
chrome_options = webdriver.ChromeOptions()
# Common options
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--window-size=1920,1080')
CafeSDK.Log.info(f"Requested URL: {url}")
try:
driver = webdriver.Remote(
command_executor=browser_url,
options=chrome_options
)
except Exception as e:
CafeSDK.Log.info(f"[Error] Failed to connect to fingerprint browser: {e}")
rest_item['resp_status'] = "403"
return
2️⃣ Page Navigation and Content Retrieval
try:
    driver.get(url)
    # Wait (up to 3 minutes) until the document reports that it finished loading.
    WebDriverWait(driver, 3 * 60).until(
        lambda d: d.execute_script("return document.readyState") == "complete"
    )
    rest_item["html"] = driver.page_source
except Exception as e:
    CafeSDK.Log.error(f"[Error] Failed to get page HTML: {e}")
    rest_item['resp_status'] = "500"
finally:
    # Always end the remote browser session, on success and on failure,
    # so the session is not left open.
    try:
        driver.quit()
    except Exception as e:
        # A failed cleanup must not prevent the result row from being pushed.
        CafeSDK.Log.error(f"[Error] Failed to close browser session: {e}")
CafeSDK.Result.push_data(rest_item)
3. Complete Platform Script Entry Example (Recommended)
import asyncio
import os
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from sdk import CafeSDK
async def run():
    """Scrape one URL through the remote fingerprint browser and push one result row.

    Reads the target ``url`` from the platform input parameters and the
    browser credentials from the ``PROXY_AUTH`` environment variable, loads
    the page in a remote Chrome session, and pushes a single row with the
    keys ``url``, ``html`` and ``resp_status`` to the result table.

    ``resp_status`` is ``"200"`` on success, ``"403"`` when the browser
    session cannot be established (including missing credentials), and
    ``"500"`` when loading the page or reading its HTML fails.

    NOTE: the Selenium calls below are synchronous; nothing in this
    coroutine awaits.
    """
    CafeSDK.Log.info("🚀 Init...")
    CafeSDK.Log.info("====================================================")
    CafeSDK.Log.info("🚀 CafeScraper Selenium Browser Scrape Demo")
    CafeSDK.Log.info("====================================================")

    # Declare the result table columns; keys match the rest_item dict below.
    headers = [
        {"label": "url", "key": "url", "format": "text"},
        {"label": "html", "key": "html", "format": "text"},
        {"label": "resp_status", "key": "resp_status", "format": "text"},
    ]
    CafeSDK.Result.set_table_header(headers)

    input_json_dict = CafeSDK.Parameter.get_input_json_dict()
    # Presumably a dict supplied by the platform; a missing 'url' key raises KeyError.
    url = input_json_dict['url']
    rest_item = {"url": url, "html": "", "resp_status": "200"}

    # os.environ.get() returns None when the variable is missing, which would
    # otherwise be formatted into the endpoint as the literal text "None".
    auth = os.environ.get("PROXY_AUTH")
    if not auth:
        CafeSDK.Log.error("[Error] PROXY_AUTH is not set; cannot connect fingerprint browser")
        rest_item['resp_status'] = "403"
        CafeSDK.Result.push_data(rest_item)
        return
    # Never write the credential itself to the log.
    CafeSDK.Log.info("Browser auth info loaded from PROXY_AUTH")

    browser_url = f'http://{auth}@chrome-http-inner.cafescraper.com'

    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--window-size=1920,1080')

    try:
        driver = webdriver.Remote(
            command_executor=browser_url,
            options=chrome_options
        )
    except Exception as e:
        CafeSDK.Log.error(f"[Error] Failed to connect fingerprint browser: {e}")
        rest_item['resp_status'] = "403"
        # Push the failure row before leaving; otherwise the "403" status
        # set above would never reach the result table.
        CafeSDK.Result.push_data(rest_item)
        return

    try:
        driver.get(url)
        # Wait (up to 3 minutes) until the document reports that it finished loading.
        WebDriverWait(driver, 3 * 60).until(
            lambda d: d.execute_script("return document.readyState") == "complete"
        )
        rest_item["html"] = driver.page_source
    except Exception as e:
        CafeSDK.Log.error(f"[Error] Failed to retrieve HTML: {e}")
        rest_item['resp_status'] = "500"
    finally:
        # Always end the remote browser session, on success and on failure,
        # so the session is not left open.
        try:
            driver.quit()
        except Exception as e:
            # A failed cleanup must not prevent the result row from being pushed.
            CafeSDK.Log.error(f"[Error] Failed to close browser session: {e}")

    CafeSDK.Result.push_data(rest_item)


if __name__ == "__main__":
    asyncio.run(run())
4. Dynamic Content and DOM Operations
Selecting a Single Element
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Each line below is an independent example of one locator strategy.

# Method 1: CSS selectors (recommended)
element = driver.find_element(By.CSS_SELECTOR, '.product-title')
element = driver.find_element(By.ID, 'main-content')
element = driver.find_element(By.TAG_NAME, 'h1')

# Method 2: XPath
element = driver.find_element(By.XPATH, '//div[@class="container"]')
element = driver.find_element(By.XPATH, '//button[contains(text(), "Submit")]')

# Method 3: Other locators
element = driver.find_element(By.CLASS_NAME, 'product-item')
element = driver.find_element(By.NAME, 'username')
element = driver.find_element(By.LINK_TEXT, 'Buy Now')
element = driver.find_element(By.PARTIAL_LINK_TEXT, 'Buy')

# Wait for element to appear (recommended): waits with a timeout of 10
# until the element is present in the DOM.
element = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, '.product-title'))
)

# Check element existence and attributes.
# NoSuchElementException must be imported (see the imports at the top of this
# snippet); without the import the except clause itself raises NameError.
try:
    element = driver.find_element(By.CSS_SELECTOR, '.product-title')
    text = element.text
    html = element.get_attribute('outerHTML')
    class_name = element.get_attribute('class')
    href = element.get_attribute('href')
    is_displayed = element.is_displayed()
except NoSuchElementException:
    print("Element not found")
Batch Element Processing
def _read_field(parent, css_selector, reader):
    """Apply *reader* to the first match of *css_selector* under *parent*.

    Returns '' when no element matches the selector.
    """
    try:
        return reader(parent.find_element(By.CSS_SELECTOR, css_selector))
    except NoSuchElementException:
        return ''


# Collect every product card on the page.
product_items = driver.find_elements(By.CSS_SELECTOR, '.product-item')
print(f"Found {len(product_items)} products")

# Build one record per card; a field whose element is missing becomes ''.
products_data = [
    {
        'name': _read_field(card, '.name', lambda el: el.text),
        'price': _read_field(card, '.price', lambda el: el.text),
        'link': _read_field(card, '.link', lambda el: el.get_attribute('href')),
    }
    for card in product_items
]
# JavaScript-based bulk extraction (higher performance):
# a single execute_script call runs inside the page, selects every
# '.product-item' node and returns one {name, price, link} object per node,
# using '' for any field whose child element is missing.
# NOTE(review): the return value is presumably a list of dicts after Selenium
# converts the JS array — confirm against the Selenium version in use.
products_data = driver.execute_script('''
const items = document.querySelectorAll('.product-item');
return Array.from(items).map(item => {
const nameElem = item.querySelector('.name');
const priceElem = item.querySelector('.price');
const linkElem = item.querySelector('.link');
return {
name: nameElem ? nameElem.textContent.trim() : '',
price: priceElem ? priceElem.textContent.trim() : '',
link: linkElem ? linkElem.href : ''
};
});
''')
Characteristics:
- Operates on a real browser DOM
- Retrieves fully JavaScript-rendered content
- Fully consistent with frontend rendering logic
5. Officially Not Recommended Practices (Anti-Patterns)
**❌ Using `sleep` to wait for page loading**
Issues:
- Cannot guarantee JavaScript execution is complete
- Fails on slow pages
- Wastes time on fast pages
**❌ Using `requests` to simulate browser behavior**
Issues:
- Incomplete page content
- Easily detected by anti-bot systems
- Unstable success rate