From cc48c901c06d80ecd50e0bc1f2bef51402dc8f10 Mon Sep 17 00:00:00 2001 From: Barrett Ruth Date: Thu, 5 Mar 2026 18:58:27 -0500 Subject: [PATCH] fix(scrapers): harden CSES and CF submit edge cases (#295) Problem: CSES `_web_login` did bare dict indexing on the API response, causing an opaque `KeyError` if fields were absent. `_check_token` swallowed all exceptions as `False`, treating transient network errors as invalid tokens. CF wrote cookies unconditionally (login and submit), and swallowed `_solve_turnstile` failures in `submit_action`. Solution: Replace direct indexing with `.get()` + `RuntimeError` for missing CSES API fields. Re-raise `httpx` network/timeout exceptions from `_check_token`. Guard CF cookie writes behind an `X-User-Handle` check (the CF auth cookie). Propagate `_solve_turnstile` errors so failures surface rather than silently proceeding. --- scrapers/codeforces.py | 10 ++++------ scrapers/cses.py | 12 ++++++++---- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/scrapers/codeforces.py b/scrapers/codeforces.py index 7a96483..43d7328 100644 --- a/scrapers/codeforces.py +++ b/scrapers/codeforces.py @@ -401,7 +401,8 @@ def _login_headless_cf(credentials: dict[str, str]) -> LoginResult: try: browser_cookies = session.context.cookies() - cookie_cache.write_text(json.dumps(browser_cookies)) + if any(c.get("name") == "X-User-Handle" for c in browser_cookies): + cookie_cache.write_text(json.dumps(browser_cookies)) except Exception: pass @@ -478,10 +479,7 @@ def _submit_headless( if "/enter" in page.url or "/login" in page.url: needs_relogin = True return - try: - _solve_turnstile(page) - except Exception: - pass + _solve_turnstile(page) try: page.select_option( 'select[name="submittedProblemIndex"]', @@ -550,7 +548,7 @@ def _submit_headless( try: browser_cookies = session.context.cookies() - if browser_cookies: + if any(c.get("name") == "X-User-Handle" for c in browser_cookies): cookie_cache.write_text(json.dumps(browser_cookies)) except Exception: pass diff --git a/scrapers/cses.py b/scrapers/cses.py index ef5deda..cbf76e2 100644 --- a/scrapers/cses.py +++ b/scrapers/cses.py @@ -352,8 +352,12 @@ class CSESScraper(BaseScraper): f"{API_URL}/login", headers=HEADERS, timeout=HTTP_TIMEOUT ) api_data = api_resp.json() - token: str = api_data["X-Auth-Token"] - auth_url: str = api_data["authentication_url"] + token: str | None = api_data.get("X-Auth-Token") + auth_url: str | None = api_data.get("authentication_url") + if not token: + raise RuntimeError("CSES API login response missing 'X-Auth-Token'") + if not auth_url: + raise RuntimeError("CSES API login response missing 'authentication_url'") auth_page = await client.get(auth_url, headers=HEADERS, timeout=HTTP_TIMEOUT) auth_csrf = re.search(r'name="csrf_token" value="([^"]+)"', auth_page.text) @@ -388,8 +392,8 @@ class CSESScraper(BaseScraper): timeout=HTTP_TIMEOUT, ) return r.status_code == 200 - except Exception: - return False + except (httpx.ConnectError, httpx.TimeoutException, httpx.NetworkError): + raise async def submit( self,