import asyncio, sys, traceback, re
import random
import time
import os
from lxml import etree
from urllib.parse import urlparse, urlencode
from playwright.async_api import async_playwright
from sdk import CafeSDK
async def run():
    """Fetch one page through the remote browser and push it as a result row.

    Reads ``url`` from the SDK input parameters, connects to the remote
    Chromium endpoint over CDP using the credentials found in the
    ``PROXY_AUTH`` environment variable, loads the page and pushes a row
    with the keys ``url``, ``html`` and ``resp_status``.

    ``resp_status`` values written by this function:
        "200" - page content was retrieved,
        "403" - connecting to the remote browser failed,
        "500" - opening the page or reading its HTML failed.

    If ``PROXY_AUTH`` is unset or empty, an error is logged and the function
    returns without pushing a row.

    Raises:
        KeyError: if the input parameters contain no ``url`` entry.
    """
    CafeSDK.Log.info("🚀 Init...")
    CafeSDK.Log.info("====================================================")
    CafeSDK.Log.info("🚀 CafeScraper Playwright Browser Scrape Demo")
    CafeSDK.Log.info("====================================================")

    headers = [
        {"label": "url", "key": "url", "format": "text"},
        {"label": "html", "key": "html", "format": "text"},
        {"label": "resp_status", "key": "resp_status", "format": "text"},
    ]
    CafeSDK.Result.set_table_header(headers)

    input_json_dict = CafeSDK.Parameter.get_input_json_dict()
    CafeSDK.Log.debug(f"======input_json_dict====== {input_json_dict}")
    url = input_json_dict['url']

    # os.environ.get() never raises; a missing variable comes back as None,
    # which would otherwise be interpolated into the endpoint as "None".
    auth = os.environ.get("PROXY_AUTH")
    if not auth:
        CafeSDK.Log.error(
            "Failed to obtain browser authentication info: PROXY_AUTH is not set"
        )
        return
    # The credential itself is deliberately kept out of the log.
    CafeSDK.Log.info("Current browser authentication info: [redacted]")

    browser_url = f'ws://{auth}@chrome-ws-inner.cafescraper.com'
    rest_item = {"url": url, "html": "", "resp_status": "200"}

    async with async_playwright() as playwright:
        CafeSDK.Log.info(f"Requested URL: {url}")
        try:
            browser = await playwright.chromium.connect_over_cdp(browser_url)
        except Exception as e:
            # The error text may echo the endpoint URL, so mask the credential.
            safe_error = str(e).replace(auth, "[redacted]")
            CafeSDK.Log.info(
                f"[Error] Failed to connect fingerprint browser: {safe_error}"
            )
            rest_item['resp_status'] = "403"
            # Report the failure row; there is no browser object to close
            # here because the connection was never established.
            CafeSDK.Result.push_data(rest_item)
            await asyncio.sleep(5)
            return

        try:
            try:
                page = await browser.new_page(no_viewport=True)
                await page.goto(url, timeout=3 * 60 * 1000)  # 3 minutes, in ms
                rest_item["html"] = await page.content()
            except Exception as e:
                CafeSDK.Log.info(f"[Error] Failed to retrieve page HTML: {e}")
                rest_item['resp_status'] = "500"
            CafeSDK.Result.push_data(rest_item)
            await asyncio.sleep(5)
        finally:
            # Always release the remote browser, even if pushing the row fails.
            await browser.close()
if __name__ == "__main__":
    # Executed as a script: build the scrape coroutine and run it to completion.
    scrape_job = run()
    asyncio.run(scrape_job)