From 0b40e0f33e7cf43dc045734493d5f64f9f00d7d6 Mon Sep 17 00:00:00 2001 From: Barrett Ruth <62671086+barrettruth@users.noreply.github.com> Date: Fri, 6 Mar 2026 13:25:44 -0500 Subject: [PATCH] fix: replace curl_cffi with scrapling in codeforces metadata (#334) ## Problem `codeforces.py` used `curl_cffi` to bypass Cloudflare when fetching contest problem HTML, making it unavailable in the nix python env and requiring an extra dependency across `pyproject.toml` and `flake.nix`. ## Solution Rewrite `_fetch_problems_html` to use scrapling `StealthySession` with `solve_cloudflare=True`, matching the existing CF submit pattern. Extend `needs_browser` in `scraper.lua` to route CF `metadata` and `tests` through the FHS env on NixOS. Remove `curl-cffi` from `pyproject.toml`, `flake.nix`, and test mocks. --- flake.nix | 2 -- lua/cp/scraper.lua | 4 +++- pyproject.toml | 1 - scrapers/codeforces.py | 27 +++++++++++++++++++++++---- tests/conftest.py | 34 ++++++---------------------------- 5 files changed, 32 insertions(+), 36 deletions(-) diff --git a/flake.nix b/flake.nix index 1c416c6..1ccb2d3 100644 --- a/flake.nix +++ b/flake.nix @@ -19,7 +19,6 @@ pkgs.python312.withPackages (ps: [ ps.backoff ps.beautifulsoup4 - ps.curl-cffi ps.httpx ps.ndjson ps.pydantic @@ -31,7 +30,6 @@ pkgs.python312.withPackages (ps: [ ps.backoff ps.beautifulsoup4 - ps.curl-cffi ps.httpx ps.ndjson ps.pydantic diff --git a/lua/cp/scraper.lua b/lua/cp/scraper.lua index 8f9759d..28705df 100644 --- a/lua/cp/scraper.lua +++ b/lua/cp/scraper.lua @@ -44,7 +44,9 @@ local function run_scraper(platform, subcommand, args, opts) return { success = false, error = msg } end - local needs_browser = subcommand == 'submit' or subcommand == 'login' + local needs_browser = subcommand == 'submit' + or subcommand == 'login' + or (platform == 'codeforces' and (subcommand == 'metadata' or subcommand == 'tests')) if needs_browser then utils.setup_nix_submit_env() diff --git a/pyproject.toml b/pyproject.toml index 9ffc00c..e677a57 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,6 @@ dependencies = [ "backoff>=2.2.1", "beautifulsoup4>=4.13.5", "scrapling[fetchers]>=0.4", - "curl-cffi>=0.13.0", "httpx>=0.28.1", "ndjson>=0.3.1", "pydantic>=2.11.10", diff --git a/scrapers/codeforces.py b/scrapers/codeforces.py index d2e7083..8ae768e 100644 --- a/scrapers/codeforces.py +++ b/scrapers/codeforces.py @@ -7,7 +7,6 @@ from typing import Any import requests from bs4 import BeautifulSoup, Tag -from curl_cffi import requests as curl_requests from .base import BaseScraper, extract_precision from .models import ( @@ -141,10 +140,30 @@ def _is_interactive(block: Tag) -> bool: def _fetch_problems_html(contest_id: str) -> str: + try: + from scrapling.fetchers import StealthySession # type: ignore[import-untyped,unresolved-import] + except ImportError: + raise RuntimeError("scrapling is required for Codeforces metadata") + + from .atcoder import _ensure_browser + + _ensure_browser() + url = f"{BASE_URL}/contest/{contest_id}/problems" - response = curl_requests.get(url, impersonate="chrome", timeout=HTTP_TIMEOUT) - response.raise_for_status() - return response.text + html = "" + + def page_action(page): + nonlocal html + html = page.content() + + with StealthySession( + headless=True, + timeout=BROWSER_SESSION_TIMEOUT, + google_search=False, + ) as session: + session.fetch(url, page_action=page_action, solve_cloudflare=True) + + return html def _parse_all_blocks(html: str) -> list[dict[str, Any]]: diff --git a/tests/conftest.py b/tests/conftest.py index deb7e3a..b6ff810 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -10,7 +10,6 @@ from typing import Any import httpx import pytest import requests -from curl_cffi import requests as curl_requests ROOT = Path(__file__).resolve().parent.parent FIX = Path(__file__).resolve().parent / "fixtures" @@ -136,15 +135,10 @@ def run_scraper_offline(fixture_text): case "codeforces": - class MockCurlResponse: - def __init__(self, html: str): - self.text = html - - def raise_for_status(self): - pass - - def _mock_curl_get(url: str, **kwargs): - return MockCurlResponse(_router_codeforces(url=url)) + def _mock_fetch_problems_html(cid: str) -> str: + return _router_codeforces( + url=f"https://codeforces.com/contest/{cid}/problems" + ) def _mock_requests_get(url: str, **kwargs): if "api/contest.list" in url: @@ -175,7 +169,7 @@ def run_scraper_offline(fixture_text): raise AssertionError(f"Unexpected requests.get call: {url}") return { - "curl_requests.get": _mock_curl_get, + "_fetch_problems_html": _mock_fetch_problems_html, "requests.get": _mock_requests_get, } @@ -215,23 +209,8 @@ def run_scraper_offline(fixture_text): return MockResponse(data) raise AssertionError(f"No fixture for CodeChef url={url!r}") - class MockCodeChefCurlResponse: - def __init__(self, html: str): - self.text = html - - def raise_for_status(self): - pass - - def _mock_curl_get(url: str, **kwargs): - if "/problems/" in url: - problem_id = url.rstrip("/").split("/")[-1] - html = fixture_text(f"codechef/{problem_id}.html") - return MockCodeChefCurlResponse(html) - raise AssertionError(f"No fixture for CodeChef url={url!r}") - return { "__offline_get_async": __offline_get_async, - "curl_requests.get": _mock_curl_get, } case _: @@ -250,7 +229,7 @@ def run_scraper_offline(fixture_text): offline_fetches = _make_offline_fetches(scraper_name) if scraper_name == "codeforces": - curl_requests.get = offline_fetches["curl_requests.get"] + ns._fetch_problems_html = offline_fetches["_fetch_problems_html"] requests.get = offline_fetches["requests.get"] elif scraper_name == "atcoder": ns._fetch = offline_fetches["_fetch"] @@ -259,7 +238,6 @@ def run_scraper_offline(fixture_text): httpx.AsyncClient.get = offline_fetches["__offline_fetch_text"] elif scraper_name == "codechef": httpx.AsyncClient.get = offline_fetches["__offline_get_async"] - curl_requests.get = offline_fetches["curl_requests.get"] scraper_class = getattr(ns, scraper_classes[scraper_name]) scraper = scraper_class()