fix: replace curl_cffi with scrapling in codeforces metadata (#334)

## Problem

`codeforces.py` used `curl_cffi` to bypass Cloudflare when fetching
contest problem HTML, which made the scraper unavailable in the Nix Python
env and required an extra dependency in both `pyproject.toml` and `flake.nix`.

## Solution

Rewrite `_fetch_problems_html` to use scrapling's `StealthySession` with
`solve_cloudflare=True`, matching the existing Codeforces submit pattern.
Extend `needs_browser` in `scraper.lua` to route Codeforces `metadata` and
`tests` through the FHS env on NixOS. Remove `curl-cffi` from
`pyproject.toml`, `flake.nix`, and the test mocks.
This commit is contained in:
Barrett Ruth 2026-03-06 13:25:44 -05:00 committed by GitHub
parent 6a3d2fe4f8
commit 0b40e0f33e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 32 additions and 36 deletions

View file

@ -7,7 +7,6 @@ from typing import Any
import requests
from bs4 import BeautifulSoup, Tag
from curl_cffi import requests as curl_requests
from .base import BaseScraper, extract_precision
from .models import (
@ -141,10 +140,30 @@ def _is_interactive(block: Tag) -> bool:
def _fetch_problems_html(contest_id: str) -> str:
    """Return the HTML of a contest's combined "problems" page.

    The page is loaded through a headless scrapling ``StealthySession`` with
    ``solve_cloudflare=True`` instead of a plain HTTP client.

    Args:
        contest_id: Contest identifier interpolated into
            ``{BASE_URL}/contest/{contest_id}/problems``.

    Returns:
        The page markup captured by the page action, or an empty string if
        the action never ran.

    Raises:
        RuntimeError: If scrapling cannot be imported.
    """
    # Imported lazily so the module still loads when scrapling is absent.
    try:
        from scrapling.fetchers import StealthySession  # type: ignore[import-untyped,unresolved-import]
    except ImportError as err:
        raise RuntimeError("scrapling is required for Codeforces metadata") from err

    from .atcoder import _ensure_browser

    # NOTE(review): presumably verifies/installs the browser scrapling
    # drives -- confirm against the helper in atcoder.py.
    _ensure_browser()

    url = f"{BASE_URL}/contest/{contest_id}/problems"
    html = ""

    def page_action(page):
        # Copy the page markup out of the session via the enclosing
        # variable; the return value of session.fetch() is not used.
        nonlocal html
        html = page.content()

    with StealthySession(
        headless=True,
        timeout=BROWSER_SESSION_TIMEOUT,
        google_search=False,
    ) as session:
        session.fetch(url, page_action=page_action, solve_cloudflare=True)

    return html
def _parse_all_blocks(html: str) -> list[dict[str, Any]]: