fix: replace curl_cffi with scrapling in codeforces metadata

Problem: `codeforces.py` used `curl_cffi` to bypass Cloudflare when
fetching contest problem HTML, but `curl_cffi` is unavailable in the nix
python env and pulled in an extra dependency.

Solution: rewrite `_fetch_problems_html` to use scrapling
`StealthySession` with `solve_cloudflare=True`, matching the existing
CF submit pattern. Extend `needs_browser` in `scraper.lua` to route CF
`metadata` and `tests` through the FHS env on NixOS. Remove `curl-cffi`
from `pyproject.toml`, `flake.nix`, and test mocks.
This commit is contained in:
Barrett Ruth 2026-03-06 13:18:06 -05:00
parent 543480a4fe
commit 297c71e7c7
Signed by: barrett
GPG key ID: A6C96C9349D2FC81
5 changed files with 30 additions and 35 deletions

View file

@ -7,7 +7,6 @@ from typing import Any
import requests
from bs4 import BeautifulSoup, Tag
from curl_cffi import requests as curl_requests
from .base import BaseScraper, extract_precision
from .models import (
@ -141,10 +140,30 @@ def _is_interactive(block: Tag) -> bool:
def _fetch_problems_html(contest_id: str) -> str:
    """Fetch the contest's /problems page HTML, solving Cloudflare via scrapling.

    Uses a headless ``StealthySession`` with ``solve_cloudflare=True``,
    matching the existing CF submit pattern, instead of the removed
    ``curl_cffi`` request path.

    Args:
        contest_id: Codeforces contest identifier (e.g. ``"1729"``).

    Returns:
        The rendered page HTML as a string (empty if the page never loaded).

    Raises:
        RuntimeError: If ``scrapling`` is not installed.
    """
    try:
        from scrapling.fetchers import StealthySession  # type: ignore[import-untyped,unresolved-import]
    except ImportError:
        # Chain the original ImportError so the root cause stays visible.
        raise RuntimeError("scrapling is required for Codeforces metadata") from None

    # Reuse the AtCoder helper so the playwright/camoufox browser is installed.
    from .atcoder import _ensure_browser

    _ensure_browser()

    url = f"{BASE_URL}/contest/{contest_id}/problems"
    html = ""

    def page_action(page):
        # Capture the fully-rendered DOM after the Cloudflare challenge clears.
        nonlocal html
        html = page.content()

    with StealthySession(
        headless=True,
        timeout=BROWSER_SESSION_TIMEOUT,
        google_search=False,
    ) as session:
        session.fetch(url, page_action=page_action, solve_cloudflare=True)
    return html
def _parse_all_blocks(html: str) -> list[dict[str, Any]]: