From 2cdde85d36d1c60ab75c6af03edb4cfa344ae9e6 Mon Sep 17 00:00:00 2001 From: Barrett Ruth Date: Thu, 5 Mar 2026 01:35:40 -0500 Subject: [PATCH] refactor: centralize timeout constants in `scrapers/timeouts.py` Problem: each scraper defined its own timeout constants (`TIMEOUT_S`, `TIMEOUT_SECONDS`) with inconsistent values (15s vs 30s) and browser timeouts were scattered as magic numbers (60000, 15000, 10000, 5000, 500). Solution: introduce `scrapers/timeouts.py` with named constants for HTTP requests, browser session/navigation/element/turnstile/settle timeouts, and submission polling. All six scrapers now import from the shared module. Note: this normalizes values rather than preserving them — the HTTP timeout in atcoder and codeforces drops from 30s to 15s, and atcoder's browser navigation/session/element timeouts drop from 60s/60s/15s to 10s/15s/10s (the values codeforces already used). --- scrapers/atcoder.py | 25 ++++++++++++++++--------- scrapers/codechef.py | 6 +++--- scrapers/codeforces.py | 23 ++++++++++++++--------- scrapers/cses.py | 22 +++++++++++----------- scrapers/kattis.py | 6 +++--- scrapers/timeouts.py | 9 +++++++++ scrapers/usaco.py | 4 ++-- 7 files changed, 58 insertions(+), 37 deletions(-) create mode 100644 scrapers/timeouts.py diff --git a/scrapers/atcoder.py b/scrapers/atcoder.py index 8be75ff..719135e 100644 --- a/scrapers/atcoder.py +++ b/scrapers/atcoder.py @@ -29,11 +29,18 @@ from .models import ( TestCase, TestsResult, ) +from .timeouts import ( + BROWSER_ELEMENT_WAIT, + BROWSER_NAV_TIMEOUT, + BROWSER_SESSION_TIMEOUT, + BROWSER_SETTLE_DELAY, + BROWSER_TURNSTILE_POLL, + HTTP_TIMEOUT, +) MIB_TO_MB = 1.048576 BASE_URL = "https://atcoder.jp" ARCHIVE_URL = f"{BASE_URL}/contests/archive" -TIMEOUT_SECONDS = 30 HEADERS = { "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36" } @@ -76,7 +83,7 @@ def _retry_after_requests(details): on_backoff=_retry_after_requests, ) def _fetch(url: str) -> str: - r = _session.get(url, headers=HEADERS, timeout=TIMEOUT_SECONDS) + r = _session.get(url, headers=HEADERS, timeout=HTTP_TIMEOUT) if r.status_code in RETRY_STATUS: raise requests.HTTPError(response=r) r.raise_for_status() return r.text @@ -99,7 +106,7 @@ def 
_giveup_httpx(exc: Exception) -> bool: giveup=_giveup_httpx, ) async def _get_async(client: httpx.AsyncClient, url: str) -> str: - r = await client.get(url, headers=HEADERS, timeout=TIMEOUT_SECONDS) + r = await client.get(url, headers=HEADERS, timeout=HTTP_TIMEOUT) r.raise_for_status() return r.text @@ -255,7 +262,7 @@ def _solve_turnstile(page) -> None: except Exception: pass try: - page.wait_for_function(_TURNSTILE_JS, timeout=5000) + page.wait_for_function(_TURNSTILE_JS, timeout=BROWSER_TURNSTILE_POLL) return except Exception: pass @@ -331,7 +338,7 @@ def _submit_headless( page.fill('input[name="username"]', credentials.get("username", "")) page.fill('input[name="password"]', credentials.get("password", "")) page.click("#submit") - page.wait_for_url(lambda url: "/login" not in url, timeout=60000) + page.wait_for_url(lambda url: "/login" not in url, timeout=BROWSER_NAV_TIMEOUT) except Exception as e: login_error = str(e) @@ -345,7 +352,7 @@ def _submit_headless( ) page.locator( f'select[name="data.LanguageId"] option[value="{language_id}"]' - ).wait_for(state="attached", timeout=15000) + ).wait_for(state="attached", timeout=BROWSER_ELEMENT_WAIT) page.select_option('select[name="data.LanguageId"]', language_id) with tempfile.NamedTemporaryFile( mode="w", suffix=".cpp", delete=False, prefix="atcoder_" @@ -354,18 +361,18 @@ def _submit_headless( tmp_path = tf.name try: page.set_input_files("#input-open-file", tmp_path) - page.wait_for_timeout(500) + page.wait_for_timeout(BROWSER_SETTLE_DELAY) finally: os.unlink(tmp_path) page.locator('button[type="submit"]').click() - page.wait_for_url(lambda url: "/submissions/me" in url, timeout=60000) + page.wait_for_url(lambda url: "/submissions/me" in url, timeout=BROWSER_NAV_TIMEOUT) except Exception as e: submit_error = str(e) try: with StealthySession( headless=True, - timeout=60000, + timeout=BROWSER_SESSION_TIMEOUT, google_search=False, cookies=saved_cookies, ) as session: diff --git a/scrapers/codechef.py 
b/scrapers/codechef.py index 57ce33e..c4b9d37 100644 --- a/scrapers/codechef.py +++ b/scrapers/codechef.py @@ -9,6 +9,7 @@ import httpx from curl_cffi import requests as curl_requests from .base import BaseScraper, extract_precision +from .timeouts import HTTP_TIMEOUT from .models import ( ContestListResult, ContestSummary, @@ -26,7 +27,6 @@ PROBLEM_URL = "https://www.codechef.com/problems/{problem_id}" HEADERS = { "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" } -TIMEOUT_S = 15.0 CONNECTIONS = 8 MEMORY_LIMIT_RE = re.compile( r"Memory\s+[Ll]imit.*?([0-9.]+)\s*(MB|GB)", re.IGNORECASE | re.DOTALL @@ -34,7 +34,7 @@ MEMORY_LIMIT_RE = re.compile( async def fetch_json(client: httpx.AsyncClient, path: str) -> dict: - r = await client.get(BASE_URL + path, headers=HEADERS, timeout=TIMEOUT_S) + r = await client.get(BASE_URL + path, headers=HEADERS, timeout=HTTP_TIMEOUT) r.raise_for_status() return r.json() @@ -51,7 +51,7 @@ def _extract_memory_limit(html: str) -> float: def _fetch_html_sync(url: str) -> str: - response = curl_requests.get(url, impersonate="chrome", timeout=TIMEOUT_S) + response = curl_requests.get(url, impersonate="chrome", timeout=HTTP_TIMEOUT) response.raise_for_status() return response.text diff --git a/scrapers/codeforces.py b/scrapers/codeforces.py index 7fc5c1c..05e4ba0 100644 --- a/scrapers/codeforces.py +++ b/scrapers/codeforces.py @@ -21,10 +21,15 @@ from .models import ( SubmitResult, TestCase, ) +from .timeouts import ( + BROWSER_NAV_TIMEOUT, + BROWSER_SESSION_TIMEOUT, + BROWSER_SETTLE_DELAY, + HTTP_TIMEOUT, +) BASE_URL = "https://codeforces.com" API_CONTEST_LIST_URL = f"{BASE_URL}/api/contest.list" -TIMEOUT_SECONDS = 30 HEADERS = { "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36" } @@ -139,7 +144,7 @@ def _is_interactive(block: Tag) -> bool: def _fetch_problems_html(contest_id: str) -> str: url = 
f"{BASE_URL}/contest/{contest_id}/problems" - response = curl_requests.get(url, impersonate="chrome", timeout=TIMEOUT_SECONDS) + response = curl_requests.get(url, impersonate="chrome", timeout=HTTP_TIMEOUT) response.raise_for_status() return response.text @@ -226,7 +231,7 @@ class CodeforcesScraper(BaseScraper): async def scrape_contest_list(self) -> ContestListResult: try: - r = requests.get(API_CONTEST_LIST_URL, timeout=TIMEOUT_SECONDS) + r = requests.get(API_CONTEST_LIST_URL, timeout=HTTP_TIMEOUT) r.raise_for_status() data = r.json() if data.get("status") != "OK": @@ -349,7 +354,7 @@ def _submit_headless( page.goto( f"{BASE_URL}/enter", wait_until="domcontentloaded", - timeout=10000, + timeout=BROWSER_NAV_TIMEOUT, ) try: @@ -371,7 +376,7 @@ def _submit_headless( '#enterForm input[type="submit"]' ).click() page.wait_for_url( - lambda url: "/enter" not in url, timeout=10000 + lambda url: "/enter" not in url, timeout=BROWSER_NAV_TIMEOUT ) except Exception as e: login_error = str(e) @@ -380,7 +385,7 @@ def _submit_headless( page.goto( f"{BASE_URL}/contest/{contest_id}/submit", wait_until="domcontentloaded", - timeout=10000, + timeout=BROWSER_NAV_TIMEOUT, ) print(json.dumps({"status": "submitting"}), flush=True) @@ -401,7 +406,7 @@ def _submit_headless( page.set_input_files( 'input[name="sourceFile"]', tmp_path ) - page.wait_for_timeout(500) + page.wait_for_timeout(BROWSER_SETTLE_DELAY) except Exception: page.fill('textarea[name="source"]', source_code) finally: @@ -409,7 +414,7 @@ def _submit_headless( page.locator('form.submit-form input.submit').click() page.wait_for_url( lambda url: "/my" in url or "/status" in url, - timeout=10000, + timeout=BROWSER_NAV_TIMEOUT, ) except Exception as e: submit_error = str(e) @@ -417,7 +422,7 @@ def _submit_headless( try: with StealthySession( headless=True, - timeout=15000, + timeout=BROWSER_SESSION_TIMEOUT, google_search=False, cookies=saved_cookies, ) as session: diff --git a/scrapers/cses.py b/scrapers/cses.py index 
2c2c2ce..fe819fc 100644 --- a/scrapers/cses.py +++ b/scrapers/cses.py @@ -9,6 +9,7 @@ from typing import Any import httpx from .base import BaseScraper, extract_precision +from .timeouts import HTTP_TIMEOUT, SUBMIT_POLL_TIMEOUT from .models import ( ContestListResult, ContestSummary, @@ -26,7 +27,6 @@ TASK_PATH = "/problemset/task/{id}" HEADERS = { "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" } -TIMEOUT_S = 15.0 CONNECTIONS = 8 CSES_LANGUAGES: dict[str, dict[str, str]] = { @@ -78,7 +78,7 @@ def snake_to_title(name: str) -> str: async def fetch_text(client: httpx.AsyncClient, path: str) -> str: - r = await client.get(BASE_URL + path, headers=HEADERS, timeout=TIMEOUT_S) + r = await client.get(BASE_URL + path, headers=HEADERS, timeout=HTTP_TIMEOUT) r.raise_for_status() return r.text @@ -290,7 +290,7 @@ class CSESScraper(BaseScraper): password: str, ) -> str | None: login_page = await client.get( - f"{BASE_URL}/login", headers=HEADERS, timeout=TIMEOUT_S + f"{BASE_URL}/login", headers=HEADERS, timeout=HTTP_TIMEOUT ) csrf_match = re.search(r'name="csrf_token" value="([^"]+)"', login_page.text) if not csrf_match: @@ -304,20 +304,20 @@ class CSESScraper(BaseScraper): "pass": password, }, headers=HEADERS, - timeout=TIMEOUT_S, + timeout=HTTP_TIMEOUT, ) if "Invalid username or password" in login_resp.text: return None api_resp = await client.post( - f"{API_URL}/login", headers=HEADERS, timeout=TIMEOUT_S + f"{API_URL}/login", headers=HEADERS, timeout=HTTP_TIMEOUT ) api_data = api_resp.json() token: str = api_data["X-Auth-Token"] auth_url: str = api_data["authentication_url"] - auth_page = await client.get(auth_url, headers=HEADERS, timeout=TIMEOUT_S) + auth_page = await client.get(auth_url, headers=HEADERS, timeout=HTTP_TIMEOUT) auth_csrf = re.search(r'name="csrf_token" value="([^"]+)"', auth_page.text) form_token = re.search(r'name="token" value="([^"]+)"', auth_page.text) if not auth_csrf or not 
form_token: @@ -330,13 +330,13 @@ class CSESScraper(BaseScraper): "token": form_token.group(1), }, headers=HEADERS, - timeout=TIMEOUT_S, + timeout=HTTP_TIMEOUT, ) check = await client.get( f"{API_URL}/login", headers={"X-Auth-Token": token, **HEADERS}, - timeout=TIMEOUT_S, + timeout=HTTP_TIMEOUT, ) if check.status_code != 200: return None @@ -349,7 +349,7 @@ class CSESScraper(BaseScraper): r = await client.get( f"{API_URL}/login", headers={"X-Auth-Token": token, **HEADERS}, - timeout=TIMEOUT_S, + timeout=HTTP_TIMEOUT, ) return r.status_code == 200 except Exception: @@ -415,7 +415,7 @@ class CSESScraper(BaseScraper): "Content-Type": "application/json", **HEADERS, }, - timeout=TIMEOUT_S, + timeout=HTTP_TIMEOUT, ) if r.status_code not in range(200, 300): @@ -438,7 +438,7 @@ class CSESScraper(BaseScraper): "X-Auth-Token": token, **HEADERS, }, - timeout=30.0, + timeout=SUBMIT_POLL_TIMEOUT, ) if r.status_code == 200: info = r.json() diff --git a/scrapers/kattis.py b/scrapers/kattis.py index d1675bf..2bfd2d6 100644 --- a/scrapers/kattis.py +++ b/scrapers/kattis.py @@ -10,6 +10,7 @@ from datetime import datetime import httpx from .base import BaseScraper +from .timeouts import HTTP_TIMEOUT from .models import ( ContestListResult, ContestSummary, @@ -23,7 +24,6 @@ BASE_URL = "https://open.kattis.com" HEADERS = { "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" } -TIMEOUT_S = 15.0 CONNECTIONS = 8 TIME_RE = re.compile( @@ -37,13 +37,13 @@ MEM_RE = re.compile( async def _fetch_text(client: httpx.AsyncClient, url: str) -> str: - r = await client.get(url, headers=HEADERS, timeout=TIMEOUT_S) + r = await client.get(url, headers=HEADERS, timeout=HTTP_TIMEOUT) r.raise_for_status() return r.text async def _fetch_bytes(client: httpx.AsyncClient, url: str) -> bytes: - r = await client.get(url, headers=HEADERS, timeout=TIMEOUT_S) + r = await client.get(url, headers=HEADERS, timeout=HTTP_TIMEOUT) r.raise_for_status() 
return r.content diff --git a/scrapers/timeouts.py b/scrapers/timeouts.py new file mode 100644 index 0000000..a21ad0d --- /dev/null +++ b/scrapers/timeouts.py @@ -0,0 +1,9 @@ +HTTP_TIMEOUT = 15.0 + +BROWSER_SESSION_TIMEOUT = 15000 +BROWSER_NAV_TIMEOUT = 10000 +BROWSER_TURNSTILE_POLL = 5000 +BROWSER_ELEMENT_WAIT = 10000 +BROWSER_SETTLE_DELAY = 500 + +SUBMIT_POLL_TIMEOUT = 30.0 diff --git a/scrapers/usaco.py b/scrapers/usaco.py index 565f1b5..b78f88e 100644 --- a/scrapers/usaco.py +++ b/scrapers/usaco.py @@ -8,6 +8,7 @@ from typing import Any, cast import httpx from .base import BaseScraper +from .timeouts import HTTP_TIMEOUT from .models import ( ContestListResult, ContestSummary, @@ -21,7 +22,6 @@ BASE_URL = "http://www.usaco.org" HEADERS = { "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" } -TIMEOUT_S = 15.0 CONNECTIONS = 4 MONTHS = [ @@ -58,7 +58,7 @@ RESULTS_PAGE_RE = re.compile( async def _fetch_text(client: httpx.AsyncClient, url: str) -> str: - r = await client.get(url, headers=HEADERS, timeout=TIMEOUT_S, follow_redirects=True) + r = await client.get(url, headers=HEADERS, timeout=HTTP_TIMEOUT, follow_redirects=True) r.raise_for_status() return r.text