fix(codechef): rewrite contest list, drop curl_cffi

Problem: `scrape_contest_list` made O(N) requests (one per Starters
number up to ~200) to discover division sub-contests. `run_one` also
fetched problem HTML via `curl_cffi` solely for the memory limit,
unavailable in the nix python env.

Solution: use `/api/list/contests/all` directly — filter to `^START\d+$`
codes and map to `ContestSummary` in a single request. Remove
`_fetch_html_sync`, `MEMORY_LIMIT_RE`, and `_extract_memory_limit`;
hardcode `memory_mb = 256.0` and `precision = None` in `run_one`.
This commit is contained in:
Barrett Ruth 2026-03-06 13:18:05 -05:00
parent 543480a4fe
commit 59bed7b902
Signed by: barrett
GPG key ID: A6C96C9349D2FC81

View file

@ -3,13 +3,13 @@
import asyncio
import json
import re
from datetime import datetime
from pathlib import Path
from typing import Any

import httpx
from curl_cffi import requests as curl_requests
from .base import BaseScraper, extract_precision from .base import BaseScraper
from .timeouts import BROWSER_NAV_TIMEOUT, BROWSER_SESSION_TIMEOUT, HTTP_TIMEOUT
from .models import (
    ContestListResult,
@ -25,7 +25,6 @@ BASE_URL = "https://www.codechef.com"
API_CONTESTS_ALL = "/api/list/contests/all"
API_CONTEST = "/api/contests/{contest_id}"
API_PROBLEM = "/api/contests/{contest_id}/problems/{problem_id}"
PROBLEM_URL = "https://www.codechef.com/problems/{problem_id}"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}
@ -44,9 +43,6 @@ _CC_CHECK_LOGIN_JS = """() => {
    return !!document.querySelector('a[href="/logout"]') ||
        !!document.querySelector('[class*="user-name"]');
}"""
MEMORY_LIMIT_RE = re.compile(
r"Memory\s+[Ll]imit.*?([0-9.]+)\s*(MB|GB)", re.IGNORECASE | re.DOTALL
)
async def fetch_json(client: httpx.AsyncClient, path: str) -> dict[str, Any]:
@ -55,23 +51,6 @@ async def fetch_json(client: httpx.AsyncClient, path: str) -> dict[str, Any]:
    return r.json()
def _extract_memory_limit(html: str) -> float:
m = MEMORY_LIMIT_RE.search(html)
if not m:
return 256.0
value = float(m.group(1))
unit = m.group(2).upper()
if unit == "GB":
return value * 1024.0
return value
def _fetch_html_sync(url: str) -> str:
response = curl_requests.get(url, impersonate="chrome", timeout=HTTP_TIMEOUT)
response.raise_for_status()
return response.text
def _login_headless_codechef(credentials: dict[str, str]) -> LoginResult:
    try:
        from scrapling.fetchers import StealthySession  # type: ignore[import-untyped,unresolved-import]
@ -364,56 +343,29 @@ class CodeChefScraper(BaseScraper):
                data = await fetch_json(client, API_CONTESTS_ALL)
            except httpx.HTTPStatusError as e:
                return self._contests_error(f"Failed to fetch contests: {e}")
all_contests = data.get("future_contests", []) + data.get( contests: list[ContestSummary] = []
"past_contests", [] seen: set[str] = set()
for c in data.get("future_contests", []) + data.get("past_contests", []):
code = c.get("contest_code", "")
name = c.get("contest_name", code)
if not re.match(r"^START\d+$", code):
continue
if code in seen:
continue
seen.add(code)
start_time: int | None = None
iso = c.get("contest_start_date_iso")
if iso:
try:
dt = datetime.fromisoformat(iso)
start_time = int(dt.timestamp())
except Exception:
pass
contests.append(
ContestSummary(id=code, name=name, display_name=name, start_time=start_time)
) )
max_num = 0 if not contests:
for contest in all_contests: return self._contests_error("No Starters contests found")
contest_code = contest.get("contest_code", "")
if contest_code.startswith("START"):
match = re.match(r"START(\d+)", contest_code)
if match:
num = int(match.group(1))
max_num = max(max_num, num)
if max_num == 0:
return self._contests_error("No Starters contests found")
contests = []
sem = asyncio.Semaphore(CONNECTIONS)
async def fetch_divisions(i: int) -> list[ContestSummary]:
parent_id = f"START{i}"
async with sem:
try:
parent_data = await fetch_json(
client, API_CONTEST.format(contest_id=parent_id)
)
except Exception as e:
import sys
print(f"Error fetching {parent_id}: {e}", file=sys.stderr)
return []
child_contests = parent_data.get("child_contests", {})
if not child_contests:
return []
base_name = f"Starters {i}"
divisions = []
for div_key, div_data in child_contests.items():
div_code = div_data.get("contest_code", "")
div_num = div_data.get("div", {}).get("div_number", "")
if div_code and div_num:
divisions.append(
ContestSummary(
id=div_code,
name=base_name,
display_name=f"{base_name} (Div. {div_num})",
)
)
return divisions
tasks = [fetch_divisions(i) for i in range(1, max_num + 1)]
for coro in asyncio.as_completed(tasks):
divisions = await coro
contests.extend(divisions)
        return ContestListResult(success=True, error="", contests=contests)

    async def stream_tests_for_category_async(self, category_id: str) -> None:
@ -481,14 +433,9 @@ class CodeChefScraper(BaseScraper):
            ]
            time_limit_str = problem_data.get("max_timelimit", "1")
            timeout_ms = int(float(time_limit_str) * 1000)
problem_url = PROBLEM_URL.format(problem_id=problem_code) memory_mb = 256.0
loop = asyncio.get_event_loop()
html = await loop.run_in_executor(
None, _fetch_html_sync, problem_url
)
memory_mb = _extract_memory_limit(html)
            interactive = False
precision = extract_precision(html) precision = None
        except Exception:
            tests = []
            timeout_ms = 1000