From 08593d828db7bac9d6291bde092dc2f9a2ef7254 Mon Sep 17 00:00:00 2001 From: Barrett Ruth Date: Thu, 5 Mar 2026 01:18:09 -0500 Subject: [PATCH 1/9] docs: add table of contents to vimdoc --- doc/cp.nvim.txt | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/doc/cp.nvim.txt b/doc/cp.nvim.txt index f61d39e..2c0cc5c 100644 --- a/doc/cp.nvim.txt +++ b/doc/cp.nvim.txt @@ -3,6 +3,37 @@ Author: Barrett Ruth License: Same terms as Vim itself (see |license|) +============================================================================== +CONTENTS *cp-contents* + + 1. Introduction .................................................. |cp.nvim| + 2. Requirements ........................................ |cp-requirements| + 3. Setup ........................................................ |cp-setup| + 4. Configuration ................................................ |cp-config| + 5. Commands .................................................. |cp-commands| + 6. Mappings .................................................. |cp-mappings| + 7. Language Selection .................................. |cp-lang-selection| + 8. Workflow .................................................. |cp-workflow| + 9. Workflow Example ............................................ |cp-example| + 10. Verdict Formatting ................................. |cp-verdict-format| + 11. Picker Integration .......................................... |cp-picker| + 12. Picker Keymaps ........................................ |cp-picker-keys| + 13. Panel ........................................................ |cp-panel| + 14. Interactive Mode .......................................... |cp-interact| + 15. Stress Testing .............................................. |cp-stress| + 16. Race .......................................................... |cp-race| + 17. Credentials ............................................ |cp-credentials| + 18. Submit ...................................................... |cp-submit| + 19. ANSI Colors ................................................... |cp-ansi| + 20. Highlight Groups ........................................ |cp-highlights| + 21. Terminal Colors .................................... |cp-terminal-colors| + 22. Highlight Customization .......................... |cp-highlight-custom| + 23. Helpers .................................................... |cp-helpers| + 24. Statusline Integration .................................. |cp-statusline| + 25. Panel Keymaps .......................................... |cp-panel-keys| + 26. File Structure ................................................ |cp-files| + 27. Health Check ................................................ |cp-health| + ============================================================================== INTRODUCTION *cp.nvim* From 027fae65a477b50f416d36edd84ad3bdcb71cede Mon Sep 17 00:00:00 2001 From: Barrett Ruth Date: Thu, 5 Mar 2026 01:18:16 -0500 Subject: [PATCH 2/9] perf(cses): cache API token across submits Problem: every `:CP submit` on CSES ran the full 5-request login flow (~1.5 s overhead) even when the token from a previous submit was still valid. Solution: persist the API token in credentials via a `credentials` ndjson event. On subsequent submits, validate the cached token with a single GET before falling back to the full login. --- scrapers/cses.py | 38 +++++++++++++++++++++++++++++++++++--- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/scrapers/cses.py b/scrapers/cses.py index b2e845a..2c2c2ce 100644 --- a/scrapers/cses.py +++ b/scrapers/cses.py @@ -342,6 +342,19 @@ class CSESScraper(BaseScraper): return None return token + async def _check_token( + self, client: httpx.AsyncClient, token: str + ) -> bool: + try: + r = await client.get( + f"{API_URL}/login", + headers={"X-Auth-Token": token, **HEADERS}, + timeout=TIMEOUT_S, + ) + return r.status_code == 200 + except Exception: + return False + async def submit( self, contest_id: str, @@ -356,11 +369,30 @@ class CSESScraper(BaseScraper): return self._submit_error("Missing credentials. Use :CP login cses") async with httpx.AsyncClient(follow_redirects=True) as client: - print(json.dumps({"status": "logging_in"}), flush=True) + token = credentials.get("token") + + if token: + print(json.dumps({"status": "checking_login"}), flush=True) + if not await self._check_token(client, token): + token = None - token = await self._web_login(client, username, password) if not token: - return self._submit_error("Login failed (bad credentials?)") + print(json.dumps({"status": "logging_in"}), flush=True) + token = await self._web_login(client, username, password) + if not token: + return self._submit_error("Login failed (bad credentials?)") + print( + json.dumps( + { + "credentials": { + "username": username, + "password": password, + "token": token, + } + } + ), + flush=True, + ) print(json.dumps({"status": "submitting"}), flush=True) From f4055b071b17abdbe111abf44aecd5fd8a978a3b Mon Sep 17 00:00:00 2001 From: Barrett Ruth Date: Thu, 5 Mar 2026 01:18:23 -0500 Subject: [PATCH 3/9] feat(codeforces): implement submit via headless browser Problem: Codeforces submit was a stub returning "not yet implemented". Solution: use StealthySession (same pattern as AtCoder) to handle Cloudflare Turnstile on the login page, fill credentials, navigate to the contest submit form, upload source via file input, and cache cookies at `~/.cache/cp-nvim/codeforces-cookies.json` so repeat submits skip the login entirely. Uses a single browser page action that checks for the submit form before navigating, avoiding redundant page loads and Turnstile challenges. --- scrapers/codeforces.py | 163 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 160 insertions(+), 3 deletions(-) diff --git a/scrapers/codeforces.py b/scrapers/codeforces.py index c0495d8..7fc5c1c 100644 --- a/scrapers/codeforces.py +++ b/scrapers/codeforces.py @@ -2,7 +2,9 @@ import asyncio import json +import os import re +import tempfile from typing import Any import requests @@ -10,6 +12,7 @@ from bs4 import BeautifulSoup, Tag from curl_cffi import requests as curl_requests from .base import BaseScraper, extract_precision +from .language_ids import get_language_id from .models import ( ContestListResult, ContestSummary, @@ -289,13 +292,167 @@ class CodeforcesScraper(BaseScraper): language_id: str, credentials: dict[str, str], ) -> SubmitResult: + return await asyncio.to_thread( + _submit_headless, + contest_id, + problem_id, + source_code, + language_id, + credentials, + ) + + +def _submit_headless( + contest_id: str, + problem_id: str, + source_code: str, + language_id: str, + credentials: dict[str, str], +) -> SubmitResult: + from pathlib import Path + + try: + from scrapling.fetchers import StealthySession # type: ignore[import-untyped,unresolved-import] + except ImportError: return SubmitResult( success=False, - error="Codeforces submit not yet implemented", - submission_id="", - verdict="", + error="scrapling is required for Codeforces submit", ) + from .atcoder import _ensure_browser, _solve_turnstile + + _ensure_browser() + + cookie_cache = ( + Path.home() / ".cache" / "cp-nvim" / "codeforces-cookies.json" + ) + cookie_cache.parent.mkdir(parents=True, exist_ok=True) + saved_cookies: list[dict[str, Any]] = [] + if cookie_cache.exists(): + try: + saved_cookies = json.loads(cookie_cache.read_text()) + except Exception: + pass + + login_error: str | None = None + submit_error: str | None = None + + def do_login_and_submit(page): + nonlocal login_error, submit_error + + has_submit_form = page.evaluate( + "() => !!document.querySelector('form.submit-form')" + ) + + if not has_submit_form: + if "/enter" not in page.url: + page.goto( + f"{BASE_URL}/enter", + wait_until="domcontentloaded", + timeout=10000, + ) + + try: + _solve_turnstile(page) + except Exception: + pass + + print(json.dumps({"status": "logging_in"}), flush=True) + try: + page.fill( + 'input[name="handleOrEmail"]', + credentials.get("username", ""), + ) + page.fill( + 'input[name="password"]', + credentials.get("password", ""), + ) + page.locator( + '#enterForm input[type="submit"]' + ).click() + page.wait_for_url( + lambda url: "/enter" not in url, timeout=10000 + ) + except Exception as e: + login_error = str(e) + return + + page.goto( + f"{BASE_URL}/contest/{contest_id}/submit", + wait_until="domcontentloaded", + timeout=10000, + ) + + print(json.dumps({"status": "submitting"}), flush=True) + try: + page.select_option( + 'select[name="submittedProblemIndex"]', + problem_id.upper(), + ) + page.select_option( + 'select[name="programTypeId"]', language_id + ) + with tempfile.NamedTemporaryFile( + mode="w", suffix=".cpp", delete=False, prefix="cf_" + ) as tf: + tf.write(source_code) + tmp_path = tf.name + try: + page.set_input_files( + 'input[name="sourceFile"]', tmp_path + ) + page.wait_for_timeout(500) + except Exception: + page.fill('textarea[name="source"]', source_code) + finally: + os.unlink(tmp_path) + page.locator('form.submit-form input.submit').click() + page.wait_for_url( + lambda url: "/my" in url or "/status" in url, + timeout=10000, + ) + except Exception as e: + submit_error = str(e) + + try: + with StealthySession( + headless=True, + timeout=15000, + google_search=False, + cookies=saved_cookies, + ) as session: + print(json.dumps({"status": "checking_login"}), flush=True) + session.fetch( + f"{BASE_URL}/contest/{contest_id}/submit", + page_action=do_login_and_submit, + solve_cloudflare=True, + ) + + try: + browser_cookies = session.context.cookies() + if any( + c["name"] == "JSESSIONID" for c in browser_cookies + ): + cookie_cache.write_text(json.dumps(browser_cookies)) + except Exception: + pass + + if login_error: + return SubmitResult( + success=False, error=f"Login failed: {login_error}" + ) + if submit_error: + return SubmitResult(success=False, error=submit_error) + + return SubmitResult( + success=True, + error="", + submission_id="", + verdict="submitted", + ) + except Exception as e: + return SubmitResult(success=False, error=str(e)) + if __name__ == "__main__": CodeforcesScraper().run_cli() From 2cdde85d36d1c60ab75c6af03edb4cfa344ae9e6 Mon Sep 17 00:00:00 2001 From: Barrett Ruth Date: Thu, 5 Mar 2026 01:35:40 -0500 Subject: [PATCH 4/9] refactor: centralize timeout constants in `scrapers/timeouts.py` Problem: each scraper defined its own timeout constants (`TIMEOUT_S`, `TIMEOUT_SECONDS`) with inconsistent values (15s vs 30s) and browser timeouts were scattered as magic numbers (60000, 15000, 5000, 500). Solution: introduce `scrapers/timeouts.py` with named constants for HTTP requests, browser session/navigation/element/turnstile/settle timeouts, and submission polling. All six scrapers now import from the shared module. --- scrapers/atcoder.py | 25 ++++++++++++++++--------- scrapers/codechef.py | 6 +++--- scrapers/codeforces.py | 23 ++++++++++++++--------- scrapers/cses.py | 22 +++++++++++----------- scrapers/kattis.py | 6 +++--- scrapers/timeouts.py | 9 +++++++++ scrapers/usaco.py | 4 ++-- 7 files changed, 58 insertions(+), 37 deletions(-) create mode 100644 scrapers/timeouts.py diff --git a/scrapers/atcoder.py b/scrapers/atcoder.py index 8be75ff..719135e 100644 --- a/scrapers/atcoder.py +++ b/scrapers/atcoder.py @@ -29,11 +29,18 @@ from .models import ( TestCase, TestsResult, ) +from .timeouts import ( + BROWSER_ELEMENT_WAIT, + BROWSER_NAV_TIMEOUT, + BROWSER_SESSION_TIMEOUT, + BROWSER_SETTLE_DELAY, + BROWSER_TURNSTILE_POLL, + HTTP_TIMEOUT, +) MIB_TO_MB = 1.048576 BASE_URL = "https://atcoder.jp" ARCHIVE_URL = f"{BASE_URL}/contests/archive" -TIMEOUT_SECONDS = 30 HEADERS = { "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36" } @@ -76,7 +83,7 @@ def _retry_after_requests(details): on_backoff=_retry_after_requests, ) def _fetch(url: str) -> str: - r = _session.get(url, headers=HEADERS, timeout=TIMEOUT_SECONDS) + r = _session.get(url, headers=HEADERS, timeout=HTTP_TIMEOUT) if r.status_code in RETRY_STATUS: raise requests.HTTPError(response=r) r.raise_for_status() @@ -99,7 +106,7 @@ def _giveup_httpx(exc: Exception) -> bool: giveup=_giveup_httpx, ) async def _get_async(client: httpx.AsyncClient, url: str) -> str: - r = await client.get(url, headers=HEADERS, timeout=TIMEOUT_SECONDS) + r = await client.get(url, headers=HEADERS, timeout=HTTP_TIMEOUT) r.raise_for_status() return r.text @@ -255,7 +262,7 @@ def _solve_turnstile(page) -> None: except Exception: pass try: - page.wait_for_function(_TURNSTILE_JS, timeout=5000) + page.wait_for_function(_TURNSTILE_JS, timeout=BROWSER_TURNSTILE_POLL) return except Exception: pass @@ -331,7 +338,7 @@ def _submit_headless( page.fill('input[name="username"]', credentials.get("username", "")) page.fill('input[name="password"]', credentials.get("password", "")) page.click("#submit") - page.wait_for_url(lambda url: "/login" not in url, timeout=60000) + page.wait_for_url(lambda url: "/login" not in url, timeout=BROWSER_NAV_TIMEOUT) except Exception as e: login_error = str(e) @@ -345,7 +352,7 @@ def _submit_headless( ) page.locator( f'select[name="data.LanguageId"] option[value="{language_id}"]' - ).wait_for(state="attached", timeout=15000) + ).wait_for(state="attached", timeout=BROWSER_ELEMENT_WAIT) page.select_option('select[name="data.LanguageId"]', language_id) with tempfile.NamedTemporaryFile( mode="w", suffix=".cpp", delete=False, prefix="atcoder_" @@ -354,18 +361,18 @@ def _submit_headless( tmp_path = tf.name try: page.set_input_files("#input-open-file", tmp_path) - page.wait_for_timeout(500) + page.wait_for_timeout(BROWSER_SETTLE_DELAY) finally: os.unlink(tmp_path) page.locator('button[type="submit"]').click() - page.wait_for_url(lambda url: "/submissions/me" in url, timeout=60000) + page.wait_for_url(lambda url: "/submissions/me" in url, timeout=BROWSER_NAV_TIMEOUT) except Exception as e: submit_error = str(e) try: with StealthySession( headless=True, - timeout=60000, + timeout=BROWSER_SESSION_TIMEOUT, google_search=False, cookies=saved_cookies, ) as session: diff --git a/scrapers/codechef.py b/scrapers/codechef.py index 57ce33e..c4b9d37 100644 --- a/scrapers/codechef.py +++ b/scrapers/codechef.py @@ -9,6 +9,7 @@ import httpx from curl_cffi import requests as curl_requests from .base import BaseScraper, extract_precision +from .timeouts import HTTP_TIMEOUT from .models import ( ContestListResult, ContestSummary, @@ -26,7 +27,6 @@ PROBLEM_URL = "https://www.codechef.com/problems/{problem_id}" HEADERS = { "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" } -TIMEOUT_S = 15.0 CONNECTIONS = 8 MEMORY_LIMIT_RE = re.compile( r"Memory\s+[Ll]imit.*?([0-9.]+)\s*(MB|GB)", re.IGNORECASE | re.DOTALL @@ -34,7 +34,7 @@ MEMORY_LIMIT_RE = re.compile( async def fetch_json(client: httpx.AsyncClient, path: str) -> dict: - r = await client.get(BASE_URL + path, headers=HEADERS, timeout=TIMEOUT_S) + r = await client.get(BASE_URL + path, headers=HEADERS, timeout=HTTP_TIMEOUT) r.raise_for_status() return r.json() @@ -51,7 +51,7 @@ def _extract_memory_limit(html: str) -> float: def _fetch_html_sync(url: str) -> str: - response = curl_requests.get(url, impersonate="chrome", timeout=TIMEOUT_S) + response = curl_requests.get(url, impersonate="chrome", timeout=HTTP_TIMEOUT) response.raise_for_status() return response.text diff --git a/scrapers/codeforces.py b/scrapers/codeforces.py index 7fc5c1c..05e4ba0 100644 --- a/scrapers/codeforces.py +++ b/scrapers/codeforces.py @@ -21,10 +21,15 @@ from .models import ( SubmitResult, TestCase, ) +from .timeouts import ( + BROWSER_NAV_TIMEOUT, + BROWSER_SESSION_TIMEOUT, + BROWSER_SETTLE_DELAY, + HTTP_TIMEOUT, +) BASE_URL = "https://codeforces.com" API_CONTEST_LIST_URL = f"{BASE_URL}/api/contest.list" -TIMEOUT_SECONDS = 30 HEADERS = { "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36" } @@ -139,7 +144,7 @@ def _is_interactive(block: Tag) -> bool: def _fetch_problems_html(contest_id: str) -> str: url = f"{BASE_URL}/contest/{contest_id}/problems" - response = curl_requests.get(url, impersonate="chrome", timeout=TIMEOUT_SECONDS) + response = curl_requests.get(url, impersonate="chrome", timeout=HTTP_TIMEOUT) response.raise_for_status() return response.text @@ -226,7 +231,7 @@ class CodeforcesScraper(BaseScraper): async def scrape_contest_list(self) -> ContestListResult: try: - r = requests.get(API_CONTEST_LIST_URL, timeout=TIMEOUT_SECONDS) + r = requests.get(API_CONTEST_LIST_URL, timeout=HTTP_TIMEOUT) r.raise_for_status() data = r.json() if data.get("status") != "OK": @@ -349,7 +354,7 @@ def _submit_headless( page.goto( f"{BASE_URL}/enter", wait_until="domcontentloaded", - timeout=10000, + timeout=BROWSER_NAV_TIMEOUT, ) try: @@ -371,7 +376,7 @@ def _submit_headless( '#enterForm input[type="submit"]' ).click() page.wait_for_url( - lambda url: "/enter" not in url, timeout=10000 + lambda url: "/enter" not in url, timeout=BROWSER_NAV_TIMEOUT ) except Exception as e: login_error = str(e) @@ -380,7 +385,7 @@ def _submit_headless( page.goto( f"{BASE_URL}/contest/{contest_id}/submit", wait_until="domcontentloaded", - timeout=10000, + timeout=BROWSER_NAV_TIMEOUT, ) print(json.dumps({"status": "submitting"}), flush=True) @@ -401,7 +406,7 @@ def _submit_headless( page.set_input_files( 'input[name="sourceFile"]', tmp_path ) - page.wait_for_timeout(500) + page.wait_for_timeout(BROWSER_SETTLE_DELAY) except Exception: page.fill('textarea[name="source"]', source_code) finally: @@ -409,7 +414,7 @@ def _submit_headless( page.locator('form.submit-form input.submit').click() page.wait_for_url( lambda url: "/my" in url or "/status" in url, - timeout=10000, + timeout=BROWSER_NAV_TIMEOUT, ) except Exception as e: submit_error = str(e) @@ -417,7 +422,7 @@ def _submit_headless( try: with StealthySession( headless=True, - timeout=15000, + timeout=BROWSER_SESSION_TIMEOUT, google_search=False, cookies=saved_cookies, ) as session: diff --git a/scrapers/cses.py b/scrapers/cses.py index 2c2c2ce..fe819fc 100644 --- a/scrapers/cses.py +++ b/scrapers/cses.py @@ -9,6 +9,7 @@ from typing import Any import httpx from .base import BaseScraper, extract_precision +from .timeouts import HTTP_TIMEOUT, SUBMIT_POLL_TIMEOUT from .models import ( ContestListResult, ContestSummary, @@ -26,7 +27,6 @@ TASK_PATH = "/problemset/task/{id}" HEADERS = { "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" } -TIMEOUT_S = 15.0 CONNECTIONS = 8 CSES_LANGUAGES: dict[str, dict[str, str]] = { @@ -78,7 +78,7 @@ def snake_to_title(name: str) -> str: async def fetch_text(client: httpx.AsyncClient, path: str) -> str: - r = await client.get(BASE_URL + path, headers=HEADERS, timeout=TIMEOUT_S) + r = await client.get(BASE_URL + path, headers=HEADERS, timeout=HTTP_TIMEOUT) r.raise_for_status() return r.text @@ -290,7 +290,7 @@ class CSESScraper(BaseScraper): password: str, ) -> str | None: login_page = await client.get( - f"{BASE_URL}/login", headers=HEADERS, timeout=TIMEOUT_S + f"{BASE_URL}/login", headers=HEADERS, timeout=HTTP_TIMEOUT ) csrf_match = re.search(r'name="csrf_token" value="([^"]+)"', login_page.text) if not csrf_match: @@ -304,20 +304,20 @@ class CSESScraper(BaseScraper): "pass": password, }, headers=HEADERS, - timeout=TIMEOUT_S, + timeout=HTTP_TIMEOUT, ) if "Invalid username or password" in login_resp.text: return None api_resp = await client.post( - f"{API_URL}/login", headers=HEADERS, timeout=TIMEOUT_S + f"{API_URL}/login", headers=HEADERS, timeout=HTTP_TIMEOUT ) api_data = api_resp.json() token: str = api_data["X-Auth-Token"] auth_url: str = api_data["authentication_url"] - auth_page = await client.get(auth_url, headers=HEADERS, timeout=TIMEOUT_S) + auth_page = await client.get(auth_url, headers=HEADERS, timeout=HTTP_TIMEOUT) auth_csrf = re.search(r'name="csrf_token" value="([^"]+)"', auth_page.text) form_token = re.search(r'name="token" value="([^"]+)"', auth_page.text) if not auth_csrf or not form_token: @@ -330,13 +330,13 @@ class CSESScraper(BaseScraper): "token": form_token.group(1), }, headers=HEADERS, - timeout=TIMEOUT_S, + timeout=HTTP_TIMEOUT, ) check = await client.get( f"{API_URL}/login", headers={"X-Auth-Token": token, **HEADERS}, - timeout=TIMEOUT_S, + timeout=HTTP_TIMEOUT, ) if check.status_code != 200: return None @@ -349,7 +349,7 @@ class CSESScraper(BaseScraper): r = await client.get( f"{API_URL}/login", headers={"X-Auth-Token": token, **HEADERS}, - timeout=TIMEOUT_S, + timeout=HTTP_TIMEOUT, ) return r.status_code == 200 except Exception: @@ -415,7 +415,7 @@ class CSESScraper(BaseScraper): "Content-Type": "application/json", **HEADERS, }, - timeout=TIMEOUT_S, + timeout=HTTP_TIMEOUT, ) if r.status_code not in range(200, 300): @@ -438,7 +438,7 @@ class CSESScraper(BaseScraper): "X-Auth-Token": token, **HEADERS, }, - timeout=30.0, + timeout=SUBMIT_POLL_TIMEOUT, ) if r.status_code == 200: info = r.json() diff --git a/scrapers/kattis.py b/scrapers/kattis.py index d1675bf..2bfd2d6 100644 --- a/scrapers/kattis.py +++ b/scrapers/kattis.py @@ -10,6 +10,7 @@ from datetime import datetime import httpx from .base import BaseScraper +from .timeouts import HTTP_TIMEOUT from .models import ( ContestListResult, ContestSummary, @@ -23,7 +24,6 @@ BASE_URL = "https://open.kattis.com" HEADERS = { "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" } -TIMEOUT_S = 15.0 CONNECTIONS = 8 TIME_RE = re.compile( @@ -37,13 +37,13 @@ MEM_RE = re.compile( async def _fetch_text(client: httpx.AsyncClient, url: str) -> str: - r = await client.get(url, headers=HEADERS, timeout=TIMEOUT_S) + r = await client.get(url, headers=HEADERS, timeout=HTTP_TIMEOUT) r.raise_for_status() return r.text async def _fetch_bytes(client: httpx.AsyncClient, url: str) -> bytes: - r = await client.get(url, headers=HEADERS, timeout=TIMEOUT_S) + r = await client.get(url, headers=HEADERS, timeout=HTTP_TIMEOUT) r.raise_for_status() return r.content diff --git a/scrapers/timeouts.py b/scrapers/timeouts.py new file mode 100644 index 0000000..a21ad0d --- /dev/null +++ b/scrapers/timeouts.py @@ -0,0 +1,9 @@ +HTTP_TIMEOUT = 15.0 + +BROWSER_SESSION_TIMEOUT = 15000 +BROWSER_NAV_TIMEOUT = 10000 +BROWSER_TURNSTILE_POLL = 5000 +BROWSER_ELEMENT_WAIT = 10000 +BROWSER_SETTLE_DELAY = 500 + +SUBMIT_POLL_TIMEOUT = 30.0 diff --git a/scrapers/usaco.py b/scrapers/usaco.py index 565f1b5..b78f88e 100644 --- a/scrapers/usaco.py +++ b/scrapers/usaco.py @@ -8,6 +8,7 @@ from typing import Any, cast import httpx from .base import BaseScraper +from .timeouts import HTTP_TIMEOUT from .models import ( ContestListResult, ContestSummary, @@ -21,7 +22,6 @@ BASE_URL = "http://www.usaco.org" HEADERS = { "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" } -TIMEOUT_S = 15.0 CONNECTIONS = 4 MONTHS = [ @@ -58,7 +58,7 @@ RESULTS_PAGE_RE = re.compile( async def _fetch_text(client: httpx.AsyncClient, url: str) -> str: - r = await client.get(url, headers=HEADERS, timeout=TIMEOUT_S, follow_redirects=True) + r = await client.get(url, headers=HEADERS, timeout=HTTP_TIMEOUT, follow_redirects=True) r.raise_for_status() return r.text From 1afe41103fb1f28d868e423cfe41ed0c3acb1d5f Mon Sep 17 00:00:00 2001 From: Barrett Ruth Date: Thu, 5 Mar 2026 01:39:59 -0500 Subject: [PATCH 5/9] ci: format --- scrapers/atcoder.py | 8 ++++++-- scrapers/codeforces.py | 26 +++++++------------------- scrapers/cses.py | 4 +--- scrapers/usaco.py | 4 +++- 4 files changed, 17 insertions(+), 25 deletions(-) diff --git a/scrapers/atcoder.py b/scrapers/atcoder.py index 719135e..16eba40 100644 --- a/scrapers/atcoder.py +++ b/scrapers/atcoder.py @@ -338,7 +338,9 @@ def _submit_headless( page.fill('input[name="username"]', credentials.get("username", "")) page.fill('input[name="password"]', credentials.get("password", "")) page.click("#submit") - page.wait_for_url(lambda url: "/login" not in url, timeout=BROWSER_NAV_TIMEOUT) + page.wait_for_url( + lambda url: "/login" not in url, timeout=BROWSER_NAV_TIMEOUT + ) except Exception as e: login_error = str(e) @@ -365,7 +367,9 @@ def _submit_headless( finally: os.unlink(tmp_path) page.locator('button[type="submit"]').click() - page.wait_for_url(lambda url: "/submissions/me" in url, timeout=BROWSER_NAV_TIMEOUT) + page.wait_for_url( + lambda url: "/submissions/me" in url, timeout=BROWSER_NAV_TIMEOUT + ) except Exception as e: submit_error = str(e) diff --git a/scrapers/codeforces.py b/scrapers/codeforces.py index 05e4ba0..8eaa874 100644 --- a/scrapers/codeforces.py +++ b/scrapers/codeforces.py @@ -328,9 +328,7 @@ def _submit_headless( _ensure_browser() - cookie_cache = ( - Path.home() / ".cache" / "cp-nvim" / "codeforces-cookies.json" - ) + cookie_cache = Path.home() / ".cache" / "cp-nvim" / "codeforces-cookies.json" cookie_cache.parent.mkdir(parents=True, exist_ok=True) saved_cookies: list[dict[str, Any]] = [] if cookie_cache.exists(): @@ -372,9 +370,7 @@ def _submit_headless( 'input[name="password"]', credentials.get("password", ""), ) - page.locator( - '#enterForm input[type="submit"]' - ).click() + page.locator('#enterForm input[type="submit"]').click() page.wait_for_url( lambda url: "/enter" not in url, timeout=BROWSER_NAV_TIMEOUT ) @@ -394,24 +390,20 @@ def _submit_headless( 'select[name="submittedProblemIndex"]', problem_id.upper(), ) - page.select_option( - 'select[name="programTypeId"]', language_id - ) + page.select_option('select[name="programTypeId"]', language_id) with tempfile.NamedTemporaryFile( mode="w", suffix=".cpp", delete=False, prefix="cf_" ) as tf: tf.write(source_code) tmp_path = tf.name try: - page.set_input_files( - 'input[name="sourceFile"]', tmp_path - ) + page.set_input_files('input[name="sourceFile"]', tmp_path) page.wait_for_timeout(BROWSER_SETTLE_DELAY) except Exception: page.fill('textarea[name="source"]', source_code) finally: os.unlink(tmp_path) - page.locator('form.submit-form input.submit').click() + page.locator("form.submit-form input.submit").click() page.wait_for_url( lambda url: "/my" in url or "/status" in url, timeout=BROWSER_NAV_TIMEOUT, @@ -435,17 +427,13 @@ def _submit_headless( try: browser_cookies = session.context.cookies() - if any( - c["name"] == "JSESSIONID" for c in browser_cookies - ): + if any(c["name"] == "JSESSIONID" for c in browser_cookies): cookie_cache.write_text(json.dumps(browser_cookies)) except Exception: pass if login_error: - return SubmitResult( - success=False, error=f"Login failed: {login_error}" - ) + return SubmitResult(success=False, error=f"Login failed: {login_error}") if submit_error: return SubmitResult(success=False, error=submit_error) diff --git a/scrapers/cses.py b/scrapers/cses.py index fe819fc..7d9f4f0 100644 --- a/scrapers/cses.py +++ b/scrapers/cses.py @@ -342,9 +342,7 @@ class CSESScraper(BaseScraper): return None return token - async def _check_token( - self, client: httpx.AsyncClient, token: str - ) -> bool: + async def _check_token(self, client: httpx.AsyncClient, token: str) -> bool: try: r = await client.get( f"{API_URL}/login", diff --git a/scrapers/usaco.py b/scrapers/usaco.py index b78f88e..9e4d7da 100644 --- a/scrapers/usaco.py +++ b/scrapers/usaco.py @@ -58,7 +58,9 @@ RESULTS_PAGE_RE = re.compile( async def _fetch_text(client: httpx.AsyncClient, url: str) -> str: - r = await client.get(url, headers=HEADERS, timeout=HTTP_TIMEOUT, follow_redirects=True) + r = await client.get( + url, headers=HEADERS, timeout=HTTP_TIMEOUT, follow_redirects=True + ) r.raise_for_status() return r.text From 73687479460aff950606dafce1cc63e55eaf149b Mon Sep 17 00:00:00 2001 From: Barrett Ruth Date: Thu, 5 Mar 2026 10:35:27 -0500 Subject: [PATCH 6/9] perf(atcoder): bail out early from `_solve_turnstile` when no iframe present Problem: `_solve_turnstile` looped 6 times with ~20s per iteration (15s bounding_box timeout + 5s wait_for_function) when no Turnstile iframe existed on the page, causing a 120-second delay on pages that don't require Turnstile verification. Solution: check for existing token and iframe presence before entering the retry loop. `iframe_loc.count()` returns immediately when no matching elements exist, avoiding the expensive timeout cascade. --- scrapers/atcoder.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/scrapers/atcoder.py b/scrapers/atcoder.py index 16eba40..45a2195 100644 --- a/scrapers/atcoder.py +++ b/scrapers/atcoder.py @@ -246,14 +246,14 @@ _TURNSTILE_JS = "() => { const el = document.querySelector('[name=\"cf-turnstile def _solve_turnstile(page) -> None: + if page.evaluate(_TURNSTILE_JS): + return + iframe_loc = page.locator('iframe[src*="challenges.cloudflare.com"]') + if not iframe_loc.count(): + return for _ in range(6): - has_token = page.evaluate(_TURNSTILE_JS) - if has_token: - return try: - box = page.locator( - 'iframe[src*="challenges.cloudflare.com"]' - ).first.bounding_box() + box = iframe_loc.first.bounding_box() if box: page.mouse.click( box["x"] + box["width"] * 0.15, From 3ecd200da7c16fdd78a35921e1432a898dc75df1 Mon Sep 17 00:00:00 2001 From: Barrett Ruth Date: Thu, 5 Mar 2026 10:35:36 -0500 Subject: [PATCH 7/9] refactor(codeforces): use separate fetches for login and submit Problem: the single `do_login_and_submit` page action navigated between pages within one `session.fetch` call, which was fragile and couldn't leverage `solve_cloudflare` for the Turnstile gate on the submit page. The submit button click also blocked on navigation completion, causing timeouts when CF was slow to process. Solution: split into three separate `session.fetch` calls (homepage login check, `/enter` login, `/contest/{id}/submit`) with `solve_cloudflare=True` on login and submit. Use `no_wait_after=True` on the submit click with a doubled nav timeout. Extract `span.error` text on submit failure instead of a generic timeout message. --- scrapers/codeforces.py | 135 +++++++++++++++++++++++++---------------- 1 file changed, 84 insertions(+), 51 deletions(-) diff --git a/scrapers/codeforces.py b/scrapers/codeforces.py index 8eaa874..8002398 100644 --- a/scrapers/codeforces.py +++ b/scrapers/codeforces.py @@ -22,6 +22,7 @@ from .models import ( TestCase, ) from .timeouts import ( + BROWSER_ELEMENT_WAIT, BROWSER_NAV_TIMEOUT, BROWSER_SESSION_TIMEOUT, BROWSER_SETTLE_DELAY, @@ -307,6 +308,18 @@ class CodeforcesScraper(BaseScraper): ) +def _wait_for_gate_reload(page, wait_selector: str) -> None: + from .atcoder import _solve_turnstile + + if "Verification" not in page.title(): + return + _solve_turnstile(page) + page.wait_for_function( + f"() => !!document.querySelector('{wait_selector}')", + timeout=BROWSER_ELEMENT_WAIT, + ) + + def _submit_headless( contest_id: str, problem_id: str, @@ -337,54 +350,46 @@ def _submit_headless( except Exception: pass + logged_in = False login_error: str | None = None submit_error: str | None = None - def do_login_and_submit(page): - nonlocal login_error, submit_error - - has_submit_form = page.evaluate( - "() => !!document.querySelector('form.submit-form')" + def check_login(page): + nonlocal logged_in + logged_in = page.evaluate( + "() => Array.from(document.querySelectorAll('a'))" + ".some(a => a.textContent.includes('Logout'))" ) - if not has_submit_form: - if "/enter" not in page.url: - page.goto( - f"{BASE_URL}/enter", - wait_until="domcontentloaded", - timeout=BROWSER_NAV_TIMEOUT, - ) - - try: - _solve_turnstile(page) - except Exception: - pass - - print(json.dumps({"status": "logging_in"}), flush=True) - try: - page.fill( - 'input[name="handleOrEmail"]', - credentials.get("username", ""), - ) - page.fill( - 'input[name="password"]', - credentials.get("password", ""), - ) - page.locator('#enterForm input[type="submit"]').click() - page.wait_for_url( - lambda url: "/enter" not in url, timeout=BROWSER_NAV_TIMEOUT - ) - except Exception as e: - login_error = str(e) - return - - page.goto( - f"{BASE_URL}/contest/{contest_id}/submit", - wait_until="domcontentloaded", - timeout=BROWSER_NAV_TIMEOUT, + def login_action(page): + nonlocal login_error + try: + _wait_for_gate_reload(page, "#enterForm") + except Exception: + pass + try: + page.fill( + 'input[name="handleOrEmail"]', + credentials.get("username", ""), ) + page.fill( + 'input[name="password"]', + credentials.get("password", ""), + ) + page.locator('#enterForm input[type="submit"]').click() + page.wait_for_url( + lambda url: "/enter" not in url, timeout=BROWSER_NAV_TIMEOUT + ) + except Exception as e: + login_error = str(e) - print(json.dumps({"status": "submitting"}), flush=True) + def submit_action(page): + nonlocal submit_error + try: + _solve_turnstile(page) + except Exception: + pass + tmp_path: str | None = None try: page.select_option( 'select[name="submittedProblemIndex"]', @@ -401,15 +406,26 @@ def _submit_headless( page.wait_for_timeout(BROWSER_SETTLE_DELAY) except Exception: page.fill('textarea[name="source"]', source_code) - finally: - os.unlink(tmp_path) - page.locator("form.submit-form input.submit").click() - page.wait_for_url( - lambda url: "/my" in url or "/status" in url, - timeout=BROWSER_NAV_TIMEOUT, - ) + page.locator("form.submit-form input.submit").click(no_wait_after=True) + try: + page.wait_for_url( + lambda url: "/my" in url or "/status" in url, + timeout=BROWSER_NAV_TIMEOUT * 2, + ) + except Exception: + err_el = page.query_selector("span.error") + if err_el: + submit_error = err_el.inner_text().strip() + else: + submit_error = "Submit failed: page did not navigate" except Exception as e: submit_error = str(e) + finally: + if tmp_path: + try: + os.unlink(tmp_path) + except OSError: + pass try: with StealthySession( @@ -419,9 +435,28 @@ def _submit_headless( cookies=saved_cookies, ) as session: print(json.dumps({"status": "checking_login"}), flush=True) + session.fetch( + f"{BASE_URL}/", + page_action=check_login, + network_idle=True, + ) + + if not logged_in: + print(json.dumps({"status": "logging_in"}), flush=True) + session.fetch( + f"{BASE_URL}/enter", + page_action=login_action, + solve_cloudflare=True, + ) + if login_error: + return SubmitResult( + success=False, error=f"Login failed: {login_error}" + ) + + print(json.dumps({"status": "submitting"}), flush=True) session.fetch( f"{BASE_URL}/contest/{contest_id}/submit", - page_action=do_login_and_submit, + page_action=submit_action, solve_cloudflare=True, ) @@ -432,8 +467,6 @@ def _submit_headless( except Exception: pass - if login_error: - return SubmitResult(success=False, error=f"Login failed: {login_error}") if submit_error: return SubmitResult(success=False, error=submit_error) From 38cd0482f05993e06a69caa89a563a1106c3f985 Mon Sep 17 00:00:00 2001 From: Barrett Ruth Date: Thu, 5 Mar 2026 10:37:13 -0500 Subject: [PATCH 8/9] ci: remove unused var --- scrapers/codeforces.py | 1 - t/{1068.cc => a.cc} | 8 +++-- t/cf_cli_debug.py | 67 ++++++++++++++++++++++++++++++++++++++++ t/cf_cli_real.py | 30 ++++++++++++++++++ t/cf_exact.py | 13 ++++++++ t/cf_hang_debug.py | 70 ++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 186 insertions(+), 3 deletions(-) rename t/{1068.cc => a.cc} (90%) create mode 100644 t/cf_cli_debug.py create mode 100644 t/cf_cli_real.py create mode 100644 t/cf_exact.py create mode 100644 t/cf_hang_debug.py diff --git a/scrapers/codeforces.py b/scrapers/codeforces.py index 8002398..fd0c129 100644 --- a/scrapers/codeforces.py +++ b/scrapers/codeforces.py @@ -12,7 +12,6 @@ from bs4 import BeautifulSoup, Tag from curl_cffi import requests as curl_requests from .base import BaseScraper, extract_precision -from .language_ids import get_language_id from .models import ( ContestListResult, ContestSummary, diff --git a/t/1068.cc b/t/a.cc similarity index 90% rename from t/1068.cc rename to t/a.cc index 5d3fe37..b8f7123 100644 --- a/t/1068.cc +++ b/t/a.cc @@ -37,7 +37,7 @@ constexpr T MAX = std::numeric_limits::max(); // }}} void solve() { - cout << "hi\n"; + std::cout << "change\n"; } int main() { // {{{ @@ -49,6 +49,10 @@ int main() { // {{{ #else std::cin.tie(nullptr)->sync_with_stdio(false); #endif - solve(); + u32 tc = 1; + std::cin >> tc; + for (u32 t = 0; t < tc; ++t) { + solve(); + } return 0; } // }}} diff --git a/t/cf_cli_debug.py b/t/cf_cli_debug.py new file mode 100644 index 0000000..515df76 --- /dev/null +++ b/t/cf_cli_debug.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 +"""Reproduce CLI hang: go through asyncio.to_thread like the real code.""" +import asyncio +import json +import sys +from pathlib import Path + +sys.path.insert(0, ".") + +from scrapers.atcoder import _ensure_browser, _solve_turnstile +from scrapers.codeforces import BASE_URL, _wait_for_gate_reload +from scrapers.timeouts import BROWSER_SESSION_TIMEOUT + + +def _test_submit(): + from scrapling.fetchers import StealthySession + + _ensure_browser() + + cookie_cache = Path.home() / ".cache" / "cp-nvim" / "codeforces-cookies.json" + saved_cookies = [] + if cookie_cache.exists(): + try: + saved_cookies = json.loads(cookie_cache.read_text()) + except Exception: + pass + + logged_in = False + + def check_login(page): + nonlocal logged_in + logged_in = page.evaluate( + "() => Array.from(document.querySelectorAll('a'))" + ".some(a => a.textContent.includes('Logout'))" + ) + print(f"logged_in: {logged_in}", flush=True) + + def submit_action(page): + print(f"ENTERED submit_action: url={page.url}", flush=True) + + with StealthySession( + headless=True, + timeout=BROWSER_SESSION_TIMEOUT, + google_search=False, + cookies=saved_cookies, + ) as session: + print("fetch homepage...", flush=True) + session.fetch(f"{BASE_URL}/", page_action=check_login, network_idle=True) + + print("fetch submit page...", flush=True) + session.fetch( + f"{BASE_URL}/contest/1933/submit", + page_action=submit_action, + ) + print("DONE", flush=True) + + return "ok" + + +async def main(): + print("Running via asyncio.to_thread...", flush=True) + result = await asyncio.to_thread(_test_submit) + print(f"Result: {result}", flush=True) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/t/cf_cli_real.py b/t/cf_cli_real.py new file mode 100644 index 0000000..448dab1 --- /dev/null +++ b/t/cf_cli_real.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python3 +"""Simulate exactly what the CLI does.""" +import asyncio +import json +import os +import sys + +sys.path.insert(0, ".") + +SOURCE = '#include \nusing namespace std;\nint main() { cout << 42; }\n' + + +async def main(): + from scrapers.codeforces import CodeforcesScraper + from scrapers.language_ids import get_language_id + + scraper = CodeforcesScraper() + credentials = json.loads(os.environ.get("CP_CREDENTIALS", "{}")) + language_id = get_language_id("codeforces", "cpp") or "89" + + print(f"source length: {len(SOURCE)}", flush=True) + print(f"credentials keys: {list(credentials.keys())}", flush=True) + print(f"language_id: {language_id}", flush=True) + + result = await scraper.submit("1933", "a", SOURCE, language_id, credentials) + print(result.model_dump_json(indent=2), flush=True) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/t/cf_exact.py b/t/cf_exact.py new file mode 100644 index 0000000..e65bbdb --- /dev/null +++ b/t/cf_exact.py @@ -0,0 +1,13 @@ +#!/usr/bin/env python3 +"""Call _submit_headless directly, no asyncio.""" +import json +import os +import sys + +sys.path.insert(0, ".") + +from scrapers.codeforces import _submit_headless + +creds = json.loads(os.environ.get("CP_CREDENTIALS", "{}")) +result = _submit_headless("1933", "a", "int main(){}", "89", creds) +print(result.model_dump_json(indent=2)) diff --git a/t/cf_hang_debug.py b/t/cf_hang_debug.py new file mode 100644 index 0000000..0c3ba0e --- /dev/null +++ b/t/cf_hang_debug.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python3 +"""Pinpoint where session.fetch hangs on the submit page.""" +import json +import sys +import threading +from pathlib import Path + +sys.path.insert(0, ".") + +from scrapers.atcoder import _ensure_browser +from scrapers.codeforces import BASE_URL +from scrapers.timeouts import BROWSER_SESSION_TIMEOUT + + +def watchdog(label, timeout=20): + import time + time.sleep(timeout) + print(f"WATCHDOG: {label} timed out after {timeout}s", flush=True) + import os + os._exit(1) + + +def main(): + from scrapling.fetchers import StealthySession + + _ensure_browser() + + cookie_cache = Path.home() / ".cache" / "cp-nvim" / "codeforces-cookies.json" + saved_cookies = [] + if cookie_cache.exists(): + try: + saved_cookies = json.loads(cookie_cache.read_text()) + except Exception: + pass + + def check_login(page): + logged_in = page.evaluate( + "() => Array.from(document.querySelectorAll('a'))" + ".some(a => a.textContent.includes('Logout'))" + ) + print(f"logged_in: {logged_in}", flush=True) + + def submit_action(page): + print(f"submit_action ENTERED: url={page.url} title={page.title()}", flush=True) + + try: + with StealthySession( + headless=True, + timeout=BROWSER_SESSION_TIMEOUT, + google_search=False, + cookies=saved_cookies, + ) as session: + print("1. Homepage...", flush=True) + session.fetch(f"{BASE_URL}/", page_action=check_login, network_idle=True) + + print("2. Submit page (no network_idle, no solve_cloudflare)...", flush=True) + t = threading.Thread(target=watchdog, args=("session.fetch submit", 30), daemon=True) + t.start() + + session.fetch( + f"{BASE_URL}/contest/1933/submit", + page_action=submit_action, + ) + print("3. Done!", flush=True) + except Exception as e: + print(f"FATAL: {type(e).__name__}: {e}", flush=True) + + +if __name__ == "__main__": + main() From c95f7f4c536baa7de7cadfdafa17fc085deb6210 Mon Sep 17 00:00:00 2001 From: Barrett Ruth Date: Thu, 5 Mar 2026 10:37:26 -0500 Subject: [PATCH 9/9] chore: remove accidentally committed files --- t/a.cc | 58 -------------------------------------- t/cf_cli_debug.py | 67 -------------------------------------------- t/cf_cli_real.py | 30 -------------------- t/cf_exact.py | 13 --------- t/cf_hang_debug.py | 70 ---------------------------------------------- 5 files changed, 238 deletions(-) delete mode 100644 t/a.cc delete mode 100644 t/cf_cli_debug.py delete mode 100644 t/cf_cli_real.py delete mode 100644 t/cf_exact.py delete mode 100644 t/cf_hang_debug.py diff --git a/t/a.cc b/t/a.cc deleted file mode 100644 index b8f7123..0000000 --- a/t/a.cc +++ /dev/null @@ -1,58 +0,0 @@ -#include // {{{ - -#include -#ifdef __cpp_lib_ranges_enumerate -#include -namespace rv = std::views; -namespace rs = std::ranges; -#endif - -#pragma GCC optimize("O2,unroll-loops") -#pragma GCC target("avx2,bmi,bmi2,lzcnt,popcnt") - -using namespace std; - -using i32 = int32_t; -using u32 = uint32_t; -using i64 = int64_t; -using u64 = uint64_t; -using f64 = double; -using f128 = long double; - -#if __cplusplus >= 202002L -template -constexpr T MIN = std::numeric_limits::min(); - -template -constexpr T MAX = std::numeric_limits::max(); -#endif - -#ifdef LOCAL -#define db(...) std::print(__VA_ARGS__) -#define dbln(...) std::println(__VA_ARGS__) -#else -#define db(...) -#define dbln(...) -#endif -// }}} - -void solve() { - std::cout << "change\n"; -} - -int main() { // {{{ - std::cin.exceptions(std::cin.failbit); -#ifdef LOCAL - std::cerr.rdbuf(std::cout.rdbuf()); - std::cout.setf(std::ios::unitbuf); - std::cerr.setf(std::ios::unitbuf); -#else - std::cin.tie(nullptr)->sync_with_stdio(false); -#endif - u32 tc = 1; - std::cin >> tc; - for (u32 t = 0; t < tc; ++t) { - solve(); - } - return 0; -} // }}} diff --git a/t/cf_cli_debug.py b/t/cf_cli_debug.py deleted file mode 100644 index 515df76..0000000 --- a/t/cf_cli_debug.py +++ /dev/null @@ -1,67 +0,0 @@ -#!/usr/bin/env python3 -"""Reproduce CLI hang: go through asyncio.to_thread like the real code.""" -import asyncio -import json -import sys -from pathlib import Path - -sys.path.insert(0, ".") - -from scrapers.atcoder import _ensure_browser, _solve_turnstile -from scrapers.codeforces import BASE_URL, _wait_for_gate_reload -from scrapers.timeouts import BROWSER_SESSION_TIMEOUT - - -def _test_submit(): - from scrapling.fetchers import StealthySession - - _ensure_browser() - - cookie_cache = Path.home() / ".cache" / "cp-nvim" / "codeforces-cookies.json" - saved_cookies = [] - if cookie_cache.exists(): - try: - saved_cookies = json.loads(cookie_cache.read_text()) - except Exception: - pass - - logged_in = False - - def check_login(page): - nonlocal logged_in - logged_in = page.evaluate( - "() => Array.from(document.querySelectorAll('a'))" - ".some(a => a.textContent.includes('Logout'))" - ) - print(f"logged_in: {logged_in}", flush=True) - - def submit_action(page): - print(f"ENTERED submit_action: url={page.url}", flush=True) - - with StealthySession( - headless=True, - timeout=BROWSER_SESSION_TIMEOUT, - google_search=False, - cookies=saved_cookies, - ) as session: - print("fetch homepage...", flush=True) - session.fetch(f"{BASE_URL}/", page_action=check_login, network_idle=True) - - print("fetch submit page...", flush=True) - session.fetch( - f"{BASE_URL}/contest/1933/submit", - page_action=submit_action, - ) - print("DONE", flush=True) - - return "ok" - - -async def main(): - print("Running via asyncio.to_thread...", flush=True) - result = await asyncio.to_thread(_test_submit) - print(f"Result: {result}", flush=True) - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/t/cf_cli_real.py b/t/cf_cli_real.py deleted file mode 100644 index 448dab1..0000000 --- a/t/cf_cli_real.py +++ /dev/null @@ -1,30 +0,0 @@ -#!/usr/bin/env python3 -"""Simulate exactly what the CLI does.""" -import asyncio -import json -import os -import sys - -sys.path.insert(0, ".") - -SOURCE = '#include \nusing namespace std;\nint main() { cout << 42; }\n' - - -async def main(): - from scrapers.codeforces import CodeforcesScraper - from scrapers.language_ids import get_language_id - - scraper = CodeforcesScraper() - credentials = json.loads(os.environ.get("CP_CREDENTIALS", "{}")) - language_id = get_language_id("codeforces", "cpp") or "89" - - print(f"source length: {len(SOURCE)}", flush=True) - print(f"credentials keys: {list(credentials.keys())}", flush=True) - print(f"language_id: {language_id}", flush=True) - - result = await scraper.submit("1933", "a", SOURCE, language_id, credentials) - print(result.model_dump_json(indent=2), flush=True) - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/t/cf_exact.py b/t/cf_exact.py deleted file mode 100644 index e65bbdb..0000000 --- a/t/cf_exact.py +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env python3 -"""Call _submit_headless directly, no asyncio.""" -import json -import os -import sys - -sys.path.insert(0, ".") - -from scrapers.codeforces import _submit_headless - -creds = json.loads(os.environ.get("CP_CREDENTIALS", "{}")) -result = _submit_headless("1933", "a", "int main(){}", "89", creds) -print(result.model_dump_json(indent=2)) diff --git a/t/cf_hang_debug.py b/t/cf_hang_debug.py deleted file mode 100644 index 0c3ba0e..0000000 --- a/t/cf_hang_debug.py +++ /dev/null @@ -1,70 +0,0 @@ -#!/usr/bin/env python3 -"""Pinpoint where session.fetch hangs on the submit page.""" -import json -import sys -import threading -from pathlib import Path - -sys.path.insert(0, ".") - -from scrapers.atcoder import _ensure_browser -from scrapers.codeforces import BASE_URL -from scrapers.timeouts import BROWSER_SESSION_TIMEOUT - - -def watchdog(label, timeout=20): - import time - time.sleep(timeout) - print(f"WATCHDOG: {label} timed out after {timeout}s", flush=True) - import os - os._exit(1) - - -def main(): - from scrapling.fetchers import StealthySession - - _ensure_browser() - - cookie_cache = Path.home() / ".cache" / "cp-nvim" / "codeforces-cookies.json" - saved_cookies = [] - if cookie_cache.exists(): - try: - saved_cookies = json.loads(cookie_cache.read_text()) - except Exception: - pass - - def check_login(page): - logged_in = page.evaluate( - "() => Array.from(document.querySelectorAll('a'))" - ".some(a => a.textContent.includes('Logout'))" - ) - print(f"logged_in: {logged_in}", flush=True) - - def submit_action(page): - print(f"submit_action ENTERED: url={page.url} title={page.title()}", flush=True) - - try: - with StealthySession( - headless=True, - timeout=BROWSER_SESSION_TIMEOUT, - google_search=False, - cookies=saved_cookies, - ) as session: - print("1. Homepage...", flush=True) - session.fetch(f"{BASE_URL}/", page_action=check_login, network_idle=True) - - print("2. Submit page (no network_idle, no solve_cloudflare)...", flush=True) - t = threading.Thread(target=watchdog, args=("session.fetch submit", 30), daemon=True) - t.start() - - session.fetch( - f"{BASE_URL}/contest/1933/submit", - page_action=submit_action, - ) - print("3. Done!", flush=True) - except Exception as e: - print(f"FATAL: {type(e).__name__}: {e}", flush=True) - - -if __name__ == "__main__": - main()