refactor: centralize timeout constants in scrapers/timeouts.py

Problem: each scraper defined its own timeout constants
(`TIMEOUT_S`, `TIMEOUT_SECONDS`) with inconsistent values (15s vs 30s)
and browser timeouts were scattered as magic numbers (60000, 15000,
5000, 500).

Solution: introduce `scrapers/timeouts.py` with named constants for
HTTP requests, browser session/navigation/element/turnstile/settle
timeouts, and submission polling. All six scrapers now import from
the shared module.
This commit is contained in:
Barrett Ruth 2026-03-05 01:35:40 -05:00
parent f4055b071b
commit 2cdde85d36
7 changed files with 58 additions and 37 deletions

View file

@ -9,6 +9,7 @@ from typing import Any
import httpx
from .base import BaseScraper, extract_precision
from .timeouts import HTTP_TIMEOUT, SUBMIT_POLL_TIMEOUT
from .models import (
ContestListResult,
ContestSummary,
@ -26,7 +27,6 @@ TASK_PATH = "/problemset/task/{id}"
HEADERS = {
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}
TIMEOUT_S = 15.0
CONNECTIONS = 8
CSES_LANGUAGES: dict[str, dict[str, str]] = {
@ -78,7 +78,7 @@ def snake_to_title(name: str) -> str:
async def fetch_text(client: httpx.AsyncClient, path: str) -> str:
r = await client.get(BASE_URL + path, headers=HEADERS, timeout=TIMEOUT_S)
r = await client.get(BASE_URL + path, headers=HEADERS, timeout=HTTP_TIMEOUT)
r.raise_for_status()
return r.text
@ -290,7 +290,7 @@ class CSESScraper(BaseScraper):
password: str,
) -> str | None:
login_page = await client.get(
f"{BASE_URL}/login", headers=HEADERS, timeout=TIMEOUT_S
f"{BASE_URL}/login", headers=HEADERS, timeout=HTTP_TIMEOUT
)
csrf_match = re.search(r'name="csrf_token" value="([^"]+)"', login_page.text)
if not csrf_match:
@ -304,20 +304,20 @@ class CSESScraper(BaseScraper):
"pass": password,
},
headers=HEADERS,
timeout=TIMEOUT_S,
timeout=HTTP_TIMEOUT,
)
if "Invalid username or password" in login_resp.text:
return None
api_resp = await client.post(
f"{API_URL}/login", headers=HEADERS, timeout=TIMEOUT_S
f"{API_URL}/login", headers=HEADERS, timeout=HTTP_TIMEOUT
)
api_data = api_resp.json()
token: str = api_data["X-Auth-Token"]
auth_url: str = api_data["authentication_url"]
auth_page = await client.get(auth_url, headers=HEADERS, timeout=TIMEOUT_S)
auth_page = await client.get(auth_url, headers=HEADERS, timeout=HTTP_TIMEOUT)
auth_csrf = re.search(r'name="csrf_token" value="([^"]+)"', auth_page.text)
form_token = re.search(r'name="token" value="([^"]+)"', auth_page.text)
if not auth_csrf or not form_token:
@ -330,13 +330,13 @@ class CSESScraper(BaseScraper):
"token": form_token.group(1),
},
headers=HEADERS,
timeout=TIMEOUT_S,
timeout=HTTP_TIMEOUT,
)
check = await client.get(
f"{API_URL}/login",
headers={"X-Auth-Token": token, **HEADERS},
timeout=TIMEOUT_S,
timeout=HTTP_TIMEOUT,
)
if check.status_code != 200:
return None
@ -349,7 +349,7 @@ class CSESScraper(BaseScraper):
r = await client.get(
f"{API_URL}/login",
headers={"X-Auth-Token": token, **HEADERS},
timeout=TIMEOUT_S,
timeout=HTTP_TIMEOUT,
)
return r.status_code == 200
except Exception:
@ -415,7 +415,7 @@ class CSESScraper(BaseScraper):
"Content-Type": "application/json",
**HEADERS,
},
timeout=TIMEOUT_S,
timeout=HTTP_TIMEOUT,
)
if r.status_code not in range(200, 300):
@ -438,7 +438,7 @@ class CSESScraper(BaseScraper):
"X-Auth-Token": token,
**HEADERS,
},
timeout=30.0,
timeout=SUBMIT_POLL_TIMEOUT,
)
if r.status_code == 200:
info = r.json()