diff --git a/flake.nix b/flake.nix index 1ccb2d3..1c416c6 100644 --- a/flake.nix +++ b/flake.nix @@ -19,6 +19,7 @@ pkgs.python312.withPackages (ps: [ ps.backoff ps.beautifulsoup4 + ps.curl-cffi ps.httpx ps.ndjson ps.pydantic @@ -30,6 +31,7 @@ pkgs.python312.withPackages (ps: [ ps.backoff ps.beautifulsoup4 + ps.curl-cffi ps.httpx ps.ndjson ps.pydantic diff --git a/lua/cp/scraper.lua b/lua/cp/scraper.lua index 28705df..8f9759d 100644 --- a/lua/cp/scraper.lua +++ b/lua/cp/scraper.lua @@ -44,9 +44,7 @@ local function run_scraper(platform, subcommand, args, opts) return { success = false, error = msg } end - local needs_browser = subcommand == 'submit' - or subcommand == 'login' - or (platform == 'codeforces' and (subcommand == 'metadata' or subcommand == 'tests')) + local needs_browser = subcommand == 'submit' or subcommand == 'login' if needs_browser then utils.setup_nix_submit_env() diff --git a/pyproject.toml b/pyproject.toml index e677a57..9ffc00c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,6 +8,7 @@ dependencies = [ "backoff>=2.2.1", "beautifulsoup4>=4.13.5", "scrapling[fetchers]>=0.4", + "curl-cffi>=0.13.0", "httpx>=0.28.1", "ndjson>=0.3.1", "pydantic>=2.11.10", diff --git a/scrapers/codeforces.py b/scrapers/codeforces.py index 8ae768e..d2e7083 100644 --- a/scrapers/codeforces.py +++ b/scrapers/codeforces.py @@ -7,6 +7,7 @@ from typing import Any import requests from bs4 import BeautifulSoup, Tag +from curl_cffi import requests as curl_requests from .base import BaseScraper, extract_precision from .models import ( @@ -140,30 +141,10 @@ def _is_interactive(block: Tag) -> bool: def _fetch_problems_html(contest_id: str) -> str: - try: - from scrapling.fetchers import StealthySession # type: ignore[import-untyped,unresolved-import] - except ImportError: - raise RuntimeError("scrapling is required for Codeforces metadata") - - from .atcoder import _ensure_browser - - _ensure_browser() - url = f"{BASE_URL}/contest/{contest_id}/problems" - html = "" - - def page_action(page): - nonlocal html - html = page.content() - - with StealthySession( - headless=True, - timeout=BROWSER_SESSION_TIMEOUT, - google_search=False, - ) as session: - session.fetch(url, page_action=page_action, solve_cloudflare=True) - - return html + response = curl_requests.get(url, impersonate="chrome", timeout=HTTP_TIMEOUT) + response.raise_for_status() + return response.text def _parse_all_blocks(html: str) -> list[dict[str, Any]]: diff --git a/tests/conftest.py b/tests/conftest.py index b6ff810..deb7e3a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -10,6 +10,7 @@ from typing import Any import httpx import pytest import requests +from curl_cffi import requests as curl_requests ROOT = Path(__file__).resolve().parent.parent FIX = Path(__file__).resolve().parent / "fixtures" @@ -135,10 +136,15 @@ def run_scraper_offline(fixture_text): case "codeforces": - def _mock_fetch_problems_html(cid: str) -> str: - return _router_codeforces( - url=f"https://codeforces.com/contest/{cid}/problems" - ) + class MockCurlResponse: + def __init__(self, html: str): + self.text = html + + def raise_for_status(self): + pass + + def _mock_curl_get(url: str, **kwargs): + return MockCurlResponse(_router_codeforces(url=url)) def _mock_requests_get(url: str, **kwargs): if "api/contest.list" in url: @@ -169,7 +175,7 @@ def run_scraper_offline(fixture_text): raise AssertionError(f"Unexpected requests.get call: {url}") return { - "_fetch_problems_html": _mock_fetch_problems_html, + "curl_requests.get": _mock_curl_get, "requests.get": _mock_requests_get, } @@ -209,8 +215,23 @@ def run_scraper_offline(fixture_text): return MockResponse(data) raise AssertionError(f"No fixture for CodeChef url={url!r}") + class MockCodeChefCurlResponse: + def __init__(self, html: str): + self.text = html + + def raise_for_status(self): + pass + + def _mock_curl_get(url: str, **kwargs): + if "/problems/" in url: + problem_id = url.rstrip("/").split("/")[-1] + html = fixture_text(f"codechef/{problem_id}.html") + return MockCodeChefCurlResponse(html) + raise AssertionError(f"No fixture for CodeChef url={url!r}") + return { "__offline_get_async": __offline_get_async, + "curl_requests.get": _mock_curl_get, } case _: @@ -229,7 +250,7 @@ def run_scraper_offline(fixture_text): offline_fetches = _make_offline_fetches(scraper_name) if scraper_name == "codeforces": - ns._fetch_problems_html = offline_fetches["_fetch_problems_html"] + curl_requests.get = offline_fetches["curl_requests.get"] requests.get = offline_fetches["requests.get"] elif scraper_name == "atcoder": ns._fetch = offline_fetches["_fetch"] @@ -238,6 +259,7 @@ def run_scraper_offline(fixture_text): httpx.AsyncClient.get = offline_fetches["__offline_fetch_text"] elif scraper_name == "codechef": httpx.AsyncClient.get = offline_fetches["__offline_get_async"] + curl_requests.get = offline_fetches["curl_requests.get"] scraper_class = getattr(ns, scraper_classes[scraper_name]) scraper = scraper_class()