fix: replace curl_cffi with scrapling in codeforces metadata

Problem: `codeforces.py` used `curl_cffi` to bypass Cloudflare when
fetching contest problem HTML, but `curl_cffi` is unavailable in the nix
python env and pulled in an extra dependency.

Solution: rewrite `_fetch_problems_html` to use scrapling
`StealthySession` with `solve_cloudflare=True`, matching the existing
CF submit pattern. Extend `needs_browser` in `scraper.lua` to route CF
`metadata` and `tests` through the FHS env on NixOS. Remove `curl-cffi`
from `pyproject.toml`, `flake.nix`, and test mocks.
This commit is contained in:
Barrett Ruth 2026-03-06 13:18:06 -05:00
parent 543480a4fe
commit 297c71e7c7
Signed by: barrett
GPG key ID: A6C96C9349D2FC81
5 changed files with 30 additions and 35 deletions

View file

@ -7,7 +7,6 @@ from typing import Any
import requests
from bs4 import BeautifulSoup, Tag
from curl_cffi import requests as curl_requests
from .base import BaseScraper, extract_precision
from .models import (
@ -141,10 +140,30 @@ def _is_interactive(block: Tag) -> bool:
def _fetch_problems_html(contest_id: str) -> str:
    """Fetch the contest's /problems page HTML, solving Cloudflare via scrapling.

    Uses a headless ``StealthySession`` with ``solve_cloudflare=True``,
    matching the existing CF submit pattern, instead of the removed
    ``curl_cffi`` request path.

    Args:
        contest_id: Codeforces contest identifier (e.g. ``"1729"``).

    Returns:
        The rendered page HTML as a string (empty if the page never loaded).

    Raises:
        RuntimeError: If ``scrapling`` is not installed.
    """
    try:
        from scrapling.fetchers import StealthySession  # type: ignore[import-untyped,unresolved-import]
    except ImportError:
        # Chain the original ImportError so the root cause stays visible.
        raise RuntimeError("scrapling is required for Codeforces metadata") from None

    # Reuse the AtCoder helper so the playwright/camoufox browser is installed.
    from .atcoder import _ensure_browser

    _ensure_browser()

    url = f"{BASE_URL}/contest/{contest_id}/problems"
    html = ""

    def page_action(page):
        # Capture the fully-rendered DOM after the Cloudflare challenge clears.
        nonlocal html
        html = page.content()

    with StealthySession(
        headless=True,
        timeout=BROWSER_SESSION_TIMEOUT,
        google_search=False,
    ) as session:
        session.fetch(url, page_action=page_action, solve_cloudflare=True)
    return html
def _parse_all_blocks(html: str) -> list[dict[str, Any]]: