From 2cdde85d36d1c60ab75c6af03edb4cfa344ae9e6 Mon Sep 17 00:00:00 2001 From: Barrett Ruth Date: Thu, 5 Mar 2026 01:35:40 -0500 Subject: [PATCH] refactor: centralize timeout constants in `scrapers/timeouts.py` Problem: each scraper defined its own timeout constants (`TIMEOUT_S`, `TIMEOUT_SECONDS`) with inconsistent values (15s vs 30s) and browser timeouts were scattered as magic numbers (60000, 15000, 10000, 5000, 500). Solution: introduce `scrapers/timeouts.py` with named constants for HTTP requests, browser session/navigation/element/turnstile/settle timeouts, and submission polling. All six scrapers now import from the shared module. Note: this normalizes values rather than preserving them — the HTTP timeout in atcoder and codeforces drops from 30s to 15s, and atcoder's browser navigation/session/element timeouts drop from 60s/60s/15s to 10s/15s/10s (the values codeforces already used). --- scrapers/atcoder.py | 25 ++++++++++++++++--------- scrapers/codechef.py | 6 +++--- scrapers/codeforces.py | 23 ++++++++++++++--------- scrapers/cses.py | 22 +++++++++++----------- scrapers/kattis.py | 6 +++--- scrapers/timeouts.py | 9 +++++++++ scrapers/usaco.py | 4 ++-- 7 files changed, 58 insertions(+), 37 deletions(-) create mode 100644 scrapers/timeouts.py diff --git a/scrapers/atcoder.py b/scrapers/atcoder.py index 8be75ff..719135e 100644 --- a/scrapers/atcoder.py +++ b/scrapers/atcoder.py @@ -29,11 +29,18 @@ from .models import ( TestCase, TestsResult, ) +from .timeouts import ( + BROWSER_ELEMENT_WAIT, + BROWSER_NAV_TIMEOUT, + BROWSER_SESSION_TIMEOUT, + BROWSER_SETTLE_DELAY, + BROWSER_TURNSTILE_POLL, + HTTP_TIMEOUT, +) MIB_TO_MB = 1.048576 BASE_URL = "https://atcoder.jp" ARCHIVE_URL = f"{BASE_URL}/contests/archive" -TIMEOUT_SECONDS = 30 HEADERS = { "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36" } @@ -76,7 +83,7 @@ def _retry_after_requests(details): on_backoff=_retry_after_requests, ) def _fetch(url: str) -> str: - r = _session.get(url, headers=HEADERS, timeout=TIMEOUT_SECONDS) + r = _session.get(url, headers=HEADERS, timeout=HTTP_TIMEOUT) if r.status_code in RETRY_STATUS: raise requests.HTTPError(response=r) r.raise_for_status() return r.text @@ -99,7 +106,7 @@ def 
_giveup_httpx(exc: Exception) -> bool: giveup=_giveup_httpx, ) async def _get_async(client: httpx.AsyncClient, url: str) -> str: - r = await client.get(url, headers=HEADERS, timeout=TIMEOUT_SECONDS) + r = await client.get(url, headers=HEADERS, timeout=HTTP_TIMEOUT) r.raise_for_status() return r.text @@ -255,7 +262,7 @@ def _solve_turnstile(page) -> None: except Exception: pass try: - page.wait_for_function(_TURNSTILE_JS, timeout=5000) + page.wait_for_function(_TURNSTILE_JS, timeout=BROWSER_TURNSTILE_POLL) return except Exception: pass @@ -331,7 +338,7 @@ def _submit_headless( page.fill('input[name="username"]', credentials.get("username", "")) page.fill('input[name="password"]', credentials.get("password", "")) page.click("#submit") - page.wait_for_url(lambda url: "/login" not in url, timeout=60000) + page.wait_for_url(lambda url: "/login" not in url, timeout=BROWSER_NAV_TIMEOUT) except Exception as e: login_error = str(e) @@ -345,7 +352,7 @@ def _submit_headless( ) page.locator( f'select[name="data.LanguageId"] option[value="{language_id}"]' - ).wait_for(state="attached", timeout=15000) + ).wait_for(state="attached", timeout=BROWSER_ELEMENT_WAIT) page.select_option('select[name="data.LanguageId"]', language_id) with tempfile.NamedTemporaryFile( mode="w", suffix=".cpp", delete=False, prefix="atcoder_" @@ -354,18 +361,18 @@ def _submit_headless( tmp_path = tf.name try: page.set_input_files("#input-open-file", tmp_path) - page.wait_for_timeout(500) + page.wait_for_timeout(BROWSER_SETTLE_DELAY) finally: os.unlink(tmp_path) page.locator('button[type="submit"]').click() - page.wait_for_url(lambda url: "/submissions/me" in url, timeout=60000) + page.wait_for_url(lambda url: "/submissions/me" in url, timeout=BROWSER_NAV_TIMEOUT) except Exception as e: submit_error = str(e) try: with StealthySession( headless=True, - timeout=60000, + timeout=BROWSER_SESSION_TIMEOUT, google_search=False, cookies=saved_cookies, ) as session: diff --git a/scrapers/codechef.py 
b/scrapers/codechef.py index 57ce33e..c4b9d37 100644 --- a/scrapers/codechef.py +++ b/scrapers/codechef.py @@ -9,6 +9,7 @@ import httpx from curl_cffi import requests as curl_requests from .base import BaseScraper, extract_precision +from .timeouts import HTTP_TIMEOUT from .models import ( ContestListResult, ContestSummary, @@ -26,7 +27,6 @@ PROBLEM_URL = "https://www.codechef.com/problems/{problem_id}" HEADERS = { "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" } -TIMEOUT_S = 15.0 CONNECTIONS = 8 MEMORY_LIMIT_RE = re.compile( r"Memory\s+[Ll]imit.*?([0-9.]+)\s*(MB|GB)", re.IGNORECASE | re.DOTALL @@ -34,7 +34,7 @@ MEMORY_LIMIT_RE = re.compile( async def fetch_json(client: httpx.AsyncClient, path: str) -> dict: - r = await client.get(BASE_URL + path, headers=HEADERS, timeout=TIMEOUT_S) + r = await client.get(BASE_URL + path, headers=HEADERS, timeout=HTTP_TIMEOUT) r.raise_for_status() return r.json() @@ -51,7 +51,7 @@ def _extract_memory_limit(html: str) -> float: def _fetch_html_sync(url: str) -> str: - response = curl_requests.get(url, impersonate="chrome", timeout=TIMEOUT_S) + response = curl_requests.get(url, impersonate="chrome", timeout=HTTP_TIMEOUT) response.raise_for_status() return response.text diff --git a/scrapers/codeforces.py b/scrapers/codeforces.py index 7fc5c1c..05e4ba0 100644 --- a/scrapers/codeforces.py +++ b/scrapers/codeforces.py @@ -21,10 +21,15 @@ from .models import ( SubmitResult, TestCase, ) +from .timeouts import ( + BROWSER_NAV_TIMEOUT, + BROWSER_SESSION_TIMEOUT, + BROWSER_SETTLE_DELAY, + HTTP_TIMEOUT, +) BASE_URL = "https://codeforces.com" API_CONTEST_LIST_URL = f"{BASE_URL}/api/contest.list" -TIMEOUT_SECONDS = 30 HEADERS = { "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36" } @@ -139,7 +144,7 @@ def _is_interactive(block: Tag) -> bool: def _fetch_problems_html(contest_id: str) -> str: url = 
f"{BASE_URL}/contest/{contest_id}/problems" - response = curl_requests.get(url, impersonate="chrome", timeout=TIMEOUT_SECONDS) + response = curl_requests.get(url, impersonate="chrome", timeout=HTTP_TIMEOUT) response.raise_for_status() return response.text @@ -226,7 +231,7 @@ class CodeforcesScraper(BaseScraper): async def scrape_contest_list(self) -> ContestListResult: try: - r = requests.get(API_CONTEST_LIST_URL, timeout=TIMEOUT_SECONDS) + r = requests.get(API_CONTEST_LIST_URL, timeout=HTTP_TIMEOUT) r.raise_for_status() data = r.json() if data.get("status") != "OK": @@ -349,7 +354,7 @@ def _submit_headless( page.goto( f"{BASE_URL}/enter", wait_until="domcontentloaded", - timeout=10000, + timeout=BROWSER_NAV_TIMEOUT, ) try: @@ -371,7 +376,7 @@ def _submit_headless( '#enterForm input[type="submit"]' ).click() page.wait_for_url( - lambda url: "/enter" not in url, timeout=10000 + lambda url: "/enter" not in url, timeout=BROWSER_NAV_TIMEOUT ) except Exception as e: login_error = str(e) @@ -380,7 +385,7 @@ def _submit_headless( page.goto( f"{BASE_URL}/contest/{contest_id}/submit", wait_until="domcontentloaded", - timeout=10000, + timeout=BROWSER_NAV_TIMEOUT, ) print(json.dumps({"status": "submitting"}), flush=True) @@ -401,7 +406,7 @@ def _submit_headless( page.set_input_files( 'input[name="sourceFile"]', tmp_path ) - page.wait_for_timeout(500) + page.wait_for_timeout(BROWSER_SETTLE_DELAY) except Exception: page.fill('textarea[name="source"]', source_code) finally: @@ -409,7 +414,7 @@ def _submit_headless( page.locator('form.submit-form input.submit').click() page.wait_for_url( lambda url: "/my" in url or "/status" in url, - timeout=10000, + timeout=BROWSER_NAV_TIMEOUT, ) except Exception as e: submit_error = str(e) @@ -417,7 +422,7 @@ def _submit_headless( try: with StealthySession( headless=True, - timeout=15000, + timeout=BROWSER_SESSION_TIMEOUT, google_search=False, cookies=saved_cookies, ) as session: diff --git a/scrapers/cses.py b/scrapers/cses.py index 
2c2c2ce..fe819fc 100644 --- a/scrapers/cses.py +++ b/scrapers/cses.py @@ -9,6 +9,7 @@ from typing import Any import httpx from .base import BaseScraper, extract_precision +from .timeouts import HTTP_TIMEOUT, SUBMIT_POLL_TIMEOUT from .models import ( ContestListResult, ContestSummary, @@ -26,7 +27,6 @@ TASK_PATH = "/problemset/task/{id}" HEADERS = { "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" } -TIMEOUT_S = 15.0 CONNECTIONS = 8 CSES_LANGUAGES: dict[str, dict[str, str]] = { @@ -78,7 +78,7 @@ def snake_to_title(name: str) -> str: async def fetch_text(client: httpx.AsyncClient, path: str) -> str: - r = await client.get(BASE_URL + path, headers=HEADERS, timeout=TIMEOUT_S) + r = await client.get(BASE_URL + path, headers=HEADERS, timeout=HTTP_TIMEOUT) r.raise_for_status() return r.text @@ -290,7 +290,7 @@ class CSESScraper(BaseScraper): password: str, ) -> str | None: login_page = await client.get( - f"{BASE_URL}/login", headers=HEADERS, timeout=TIMEOUT_S + f"{BASE_URL}/login", headers=HEADERS, timeout=HTTP_TIMEOUT ) csrf_match = re.search(r'name="csrf_token" value="([^"]+)"', login_page.text) if not csrf_match: @@ -304,20 +304,20 @@ class CSESScraper(BaseScraper): "pass": password, }, headers=HEADERS, - timeout=TIMEOUT_S, + timeout=HTTP_TIMEOUT, ) if "Invalid username or password" in login_resp.text: return None api_resp = await client.post( - f"{API_URL}/login", headers=HEADERS, timeout=TIMEOUT_S + f"{API_URL}/login", headers=HEADERS, timeout=HTTP_TIMEOUT ) api_data = api_resp.json() token: str = api_data["X-Auth-Token"] auth_url: str = api_data["authentication_url"] - auth_page = await client.get(auth_url, headers=HEADERS, timeout=TIMEOUT_S) + auth_page = await client.get(auth_url, headers=HEADERS, timeout=HTTP_TIMEOUT) auth_csrf = re.search(r'name="csrf_token" value="([^"]+)"', auth_page.text) form_token = re.search(r'name="token" value="([^"]+)"', auth_page.text) if not auth_csrf or not 
form_token: @@ -330,13 +330,13 @@ class CSESScraper(BaseScraper): "token": form_token.group(1), }, headers=HEADERS, - timeout=TIMEOUT_S, + timeout=HTTP_TIMEOUT, ) check = await client.get( f"{API_URL}/login", headers={"X-Auth-Token": token, **HEADERS}, - timeout=TIMEOUT_S, + timeout=HTTP_TIMEOUT, ) if check.status_code != 200: return None @@ -349,7 +349,7 @@ class CSESScraper(BaseScraper): r = await client.get( f"{API_URL}/login", headers={"X-Auth-Token": token, **HEADERS}, - timeout=TIMEOUT_S, + timeout=HTTP_TIMEOUT, ) return r.status_code == 200 except Exception: @@ -415,7 +415,7 @@ class CSESScraper(BaseScraper): "Content-Type": "application/json", **HEADERS, }, - timeout=TIMEOUT_S, + timeout=HTTP_TIMEOUT, ) if r.status_code not in range(200, 300): @@ -438,7 +438,7 @@ class CSESScraper(BaseScraper): "X-Auth-Token": token, **HEADERS, }, - timeout=30.0, + timeout=SUBMIT_POLL_TIMEOUT, ) if r.status_code == 200: info = r.json() diff --git a/scrapers/kattis.py b/scrapers/kattis.py index d1675bf..2bfd2d6 100644 --- a/scrapers/kattis.py +++ b/scrapers/kattis.py @@ -10,6 +10,7 @@ from datetime import datetime import httpx from .base import BaseScraper +from .timeouts import HTTP_TIMEOUT from .models import ( ContestListResult, ContestSummary, @@ -23,7 +24,6 @@ BASE_URL = "https://open.kattis.com" HEADERS = { "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" } -TIMEOUT_S = 15.0 CONNECTIONS = 8 TIME_RE = re.compile( @@ -37,13 +37,13 @@ MEM_RE = re.compile( async def _fetch_text(client: httpx.AsyncClient, url: str) -> str: - r = await client.get(url, headers=HEADERS, timeout=TIMEOUT_S) + r = await client.get(url, headers=HEADERS, timeout=HTTP_TIMEOUT) r.raise_for_status() return r.text async def _fetch_bytes(client: httpx.AsyncClient, url: str) -> bytes: - r = await client.get(url, headers=HEADERS, timeout=TIMEOUT_S) + r = await client.get(url, headers=HEADERS, timeout=HTTP_TIMEOUT) r.raise_for_status() 
return r.content diff --git a/scrapers/timeouts.py b/scrapers/timeouts.py new file mode 100644 index 0000000..a21ad0d --- /dev/null +++ b/scrapers/timeouts.py @@ -0,0 +1,9 @@ +HTTP_TIMEOUT = 15.0 + +BROWSER_SESSION_TIMEOUT = 15000 +BROWSER_NAV_TIMEOUT = 10000 +BROWSER_TURNSTILE_POLL = 5000 +BROWSER_ELEMENT_WAIT = 10000 +BROWSER_SETTLE_DELAY = 500 + +SUBMIT_POLL_TIMEOUT = 30.0 diff --git a/scrapers/usaco.py b/scrapers/usaco.py index 565f1b5..b78f88e 100644 --- a/scrapers/usaco.py +++ b/scrapers/usaco.py @@ -8,6 +8,7 @@ from typing import Any, cast import httpx from .base import BaseScraper +from .timeouts import HTTP_TIMEOUT from .models import ( ContestListResult, ContestSummary, @@ -21,7 +22,6 @@ BASE_URL = "http://www.usaco.org" HEADERS = { "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" } -TIMEOUT_S = 15.0 CONNECTIONS = 4 MONTHS = [ @@ -58,7 +58,7 @@ RESULTS_PAGE_RE = re.compile( async def _fetch_text(client: httpx.AsyncClient, url: str) -> str: - r = await client.get(url, headers=HEADERS, timeout=TIMEOUT_S, follow_redirects=True) + r = await client.get(url, headers=HEADERS, timeout=HTTP_TIMEOUT, follow_redirects=True) r.raise_for_status() return r.text