fix: replace curl_cffi with scrapling in codeforces metadata

Problem: `codeforces.py` used `curl_cffi` to bypass Cloudflare when
fetching contest problem HTML, making it unavailable in the nix python
env and requiring an extra dependency.

Solution: rewrite `_fetch_problems_html` to use scrapling's
`StealthySession` with `solve_cloudflare=True`, matching the existing
Codeforces submit pattern. Extend `needs_browser` in `scraper.lua` to
route Codeforces `metadata` and `tests` through the FHS env on NixOS.
Remove `curl-cffi` from `pyproject.toml`, `flake.nix`, and test mocks.
This commit is contained in:
Barrett Ruth 2026-03-06 13:18:06 -05:00
parent 543480a4fe
commit 297c71e7c7
Signed by: barrett
GPG key ID: A6C96C9349D2FC81
5 changed files with 30 additions and 35 deletions

View file

@ -19,7 +19,6 @@
pkgs.python312.withPackages (ps: [
ps.backoff
ps.beautifulsoup4
ps.curl-cffi
ps.httpx
ps.ndjson
ps.pydantic
@ -31,7 +30,6 @@
pkgs.python312.withPackages (ps: [
ps.backoff
ps.beautifulsoup4
ps.curl-cffi
ps.httpx
ps.ndjson
ps.pydantic

View file

@ -45,6 +45,7 @@ local function run_scraper(platform, subcommand, args, opts)
end
local needs_browser = subcommand == 'submit' or subcommand == 'login'
or (platform == 'codeforces' and (subcommand == 'metadata' or subcommand == 'tests'))
if needs_browser then
utils.setup_nix_submit_env()

View file

@ -8,7 +8,6 @@ dependencies = [
"backoff>=2.2.1",
"beautifulsoup4>=4.13.5",
"scrapling[fetchers]>=0.4",
"curl-cffi>=0.13.0",
"httpx>=0.28.1",
"ndjson>=0.3.1",
"pydantic>=2.11.10",

View file

@ -7,7 +7,6 @@ from typing import Any
import requests
from bs4 import BeautifulSoup, Tag
from curl_cffi import requests as curl_requests
from .base import BaseScraper, extract_precision
from .models import (
@ -141,10 +140,30 @@ def _is_interactive(block: Tag) -> bool:
def _fetch_problems_html(contest_id: str) -> str:
try:
from scrapling.fetchers import StealthySession # type: ignore[import-untyped,unresolved-import]
except ImportError:
raise RuntimeError("scrapling is required for Codeforces metadata")
from .atcoder import _ensure_browser
_ensure_browser()
url = f"{BASE_URL}/contest/{contest_id}/problems"
response = curl_requests.get(url, impersonate="chrome", timeout=HTTP_TIMEOUT)
response.raise_for_status()
return response.text
html = ""
def page_action(page):
nonlocal html
html = page.content()
with StealthySession(
headless=True,
timeout=BROWSER_SESSION_TIMEOUT,
google_search=False,
) as session:
session.fetch(url, page_action=page_action, solve_cloudflare=True)
return html
def _parse_all_blocks(html: str) -> list[dict[str, Any]]:

View file

@ -10,7 +10,6 @@ from typing import Any
import httpx
import pytest
import requests
from curl_cffi import requests as curl_requests
ROOT = Path(__file__).resolve().parent.parent
FIX = Path(__file__).resolve().parent / "fixtures"
@ -136,15 +135,10 @@ def run_scraper_offline(fixture_text):
case "codeforces":
class MockCurlResponse:
def __init__(self, html: str):
self.text = html
def raise_for_status(self):
pass
def _mock_curl_get(url: str, **kwargs):
return MockCurlResponse(_router_codeforces(url=url))
def _mock_fetch_problems_html(cid: str) -> str:
return _router_codeforces(
url=f"https://codeforces.com/contest/{cid}/problems"
)
def _mock_requests_get(url: str, **kwargs):
if "api/contest.list" in url:
@ -175,7 +169,7 @@ def run_scraper_offline(fixture_text):
raise AssertionError(f"Unexpected requests.get call: {url}")
return {
"curl_requests.get": _mock_curl_get,
"_fetch_problems_html": _mock_fetch_problems_html,
"requests.get": _mock_requests_get,
}
@ -215,23 +209,8 @@ def run_scraper_offline(fixture_text):
return MockResponse(data)
raise AssertionError(f"No fixture for CodeChef url={url!r}")
class MockCodeChefCurlResponse:
def __init__(self, html: str):
self.text = html
def raise_for_status(self):
pass
def _mock_curl_get(url: str, **kwargs):
if "/problems/" in url:
problem_id = url.rstrip("/").split("/")[-1]
html = fixture_text(f"codechef/{problem_id}.html")
return MockCodeChefCurlResponse(html)
raise AssertionError(f"No fixture for CodeChef url={url!r}")
return {
"__offline_get_async": __offline_get_async,
"curl_requests.get": _mock_curl_get,
}
case _:
@ -250,7 +229,7 @@ def run_scraper_offline(fixture_text):
offline_fetches = _make_offline_fetches(scraper_name)
if scraper_name == "codeforces":
curl_requests.get = offline_fetches["curl_requests.get"]
ns._fetch_problems_html = offline_fetches["_fetch_problems_html"]
requests.get = offline_fetches["requests.get"]
elif scraper_name == "atcoder":
ns._fetch = offline_fetches["_fetch"]
@ -259,7 +238,6 @@ def run_scraper_offline(fixture_text):
httpx.AsyncClient.get = offline_fetches["__offline_fetch_text"]
elif scraper_name == "codechef":
httpx.AsyncClient.get = offline_fetches["__offline_get_async"]
curl_requests.get = offline_fetches["curl_requests.get"]
scraper_class = getattr(ns, scraper_classes[scraper_name])
scraper = scraper_class()