refactor: centralize timeout constants in scrapers/timeouts.py
Problem: each scraper defined its own timeout constants (`TIMEOUT_S`, `TIMEOUT_SECONDS`) with inconsistent values (15s vs 30s) and browser timeouts were scattered as magic numbers (60000, 15000, 5000, 500). Solution: introduce `scrapers/timeouts.py` with named constants for HTTP requests, browser session/navigation/element/turnstile/settle timeouts, and submission polling. All six scrapers now import from the shared module. Note: this normalizes values, not just names — AtCoder and Codeforces HTTP timeouts drop from 30s to the shared 15s, and AtCoder's browser timeouts are reduced (navigation 60s -> 10s, element wait 15s -> 10s, session 60s -> 15s), so slow responses that previously succeeded may now time out.
This commit is contained in:
parent
f4055b071b
commit
2cdde85d36
7 changed files with 58 additions and 37 deletions
|
|
@ -29,11 +29,18 @@ from .models import (
|
||||||
TestCase,
|
TestCase,
|
||||||
TestsResult,
|
TestsResult,
|
||||||
)
|
)
|
||||||
|
from .timeouts import (
|
||||||
|
BROWSER_ELEMENT_WAIT,
|
||||||
|
BROWSER_NAV_TIMEOUT,
|
||||||
|
BROWSER_SESSION_TIMEOUT,
|
||||||
|
BROWSER_SETTLE_DELAY,
|
||||||
|
BROWSER_TURNSTILE_POLL,
|
||||||
|
HTTP_TIMEOUT,
|
||||||
|
)
|
||||||
|
|
||||||
MIB_TO_MB = 1.048576
|
MIB_TO_MB = 1.048576
|
||||||
BASE_URL = "https://atcoder.jp"
|
BASE_URL = "https://atcoder.jp"
|
||||||
ARCHIVE_URL = f"{BASE_URL}/contests/archive"
|
ARCHIVE_URL = f"{BASE_URL}/contests/archive"
|
||||||
TIMEOUT_SECONDS = 30
|
|
||||||
HEADERS = {
|
HEADERS = {
|
||||||
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
|
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
|
||||||
}
|
}
|
||||||
|
|
@ -76,7 +83,7 @@ def _retry_after_requests(details):
|
||||||
on_backoff=_retry_after_requests,
|
on_backoff=_retry_after_requests,
|
||||||
)
|
)
|
||||||
def _fetch(url: str) -> str:
|
def _fetch(url: str) -> str:
|
||||||
r = _session.get(url, headers=HEADERS, timeout=TIMEOUT_SECONDS)
|
r = _session.get(url, headers=HEADERS, timeout=HTTP_TIMEOUT)
|
||||||
if r.status_code in RETRY_STATUS:
|
if r.status_code in RETRY_STATUS:
|
||||||
raise requests.HTTPError(response=r)
|
raise requests.HTTPError(response=r)
|
||||||
r.raise_for_status()
|
r.raise_for_status()
|
||||||
|
|
@ -99,7 +106,7 @@ def _giveup_httpx(exc: Exception) -> bool:
|
||||||
giveup=_giveup_httpx,
|
giveup=_giveup_httpx,
|
||||||
)
|
)
|
||||||
async def _get_async(client: httpx.AsyncClient, url: str) -> str:
|
async def _get_async(client: httpx.AsyncClient, url: str) -> str:
|
||||||
r = await client.get(url, headers=HEADERS, timeout=TIMEOUT_SECONDS)
|
r = await client.get(url, headers=HEADERS, timeout=HTTP_TIMEOUT)
|
||||||
r.raise_for_status()
|
r.raise_for_status()
|
||||||
return r.text
|
return r.text
|
||||||
|
|
||||||
|
|
@ -255,7 +262,7 @@ def _solve_turnstile(page) -> None:
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
try:
|
try:
|
||||||
page.wait_for_function(_TURNSTILE_JS, timeout=5000)
|
page.wait_for_function(_TURNSTILE_JS, timeout=BROWSER_TURNSTILE_POLL)
|
||||||
return
|
return
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
@ -331,7 +338,7 @@ def _submit_headless(
|
||||||
page.fill('input[name="username"]', credentials.get("username", ""))
|
page.fill('input[name="username"]', credentials.get("username", ""))
|
||||||
page.fill('input[name="password"]', credentials.get("password", ""))
|
page.fill('input[name="password"]', credentials.get("password", ""))
|
||||||
page.click("#submit")
|
page.click("#submit")
|
||||||
page.wait_for_url(lambda url: "/login" not in url, timeout=60000)
|
page.wait_for_url(lambda url: "/login" not in url, timeout=BROWSER_NAV_TIMEOUT)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
login_error = str(e)
|
login_error = str(e)
|
||||||
|
|
||||||
|
|
@ -345,7 +352,7 @@ def _submit_headless(
|
||||||
)
|
)
|
||||||
page.locator(
|
page.locator(
|
||||||
f'select[name="data.LanguageId"] option[value="{language_id}"]'
|
f'select[name="data.LanguageId"] option[value="{language_id}"]'
|
||||||
).wait_for(state="attached", timeout=15000)
|
).wait_for(state="attached", timeout=BROWSER_ELEMENT_WAIT)
|
||||||
page.select_option('select[name="data.LanguageId"]', language_id)
|
page.select_option('select[name="data.LanguageId"]', language_id)
|
||||||
with tempfile.NamedTemporaryFile(
|
with tempfile.NamedTemporaryFile(
|
||||||
mode="w", suffix=".cpp", delete=False, prefix="atcoder_"
|
mode="w", suffix=".cpp", delete=False, prefix="atcoder_"
|
||||||
|
|
@ -354,18 +361,18 @@ def _submit_headless(
|
||||||
tmp_path = tf.name
|
tmp_path = tf.name
|
||||||
try:
|
try:
|
||||||
page.set_input_files("#input-open-file", tmp_path)
|
page.set_input_files("#input-open-file", tmp_path)
|
||||||
page.wait_for_timeout(500)
|
page.wait_for_timeout(BROWSER_SETTLE_DELAY)
|
||||||
finally:
|
finally:
|
||||||
os.unlink(tmp_path)
|
os.unlink(tmp_path)
|
||||||
page.locator('button[type="submit"]').click()
|
page.locator('button[type="submit"]').click()
|
||||||
page.wait_for_url(lambda url: "/submissions/me" in url, timeout=60000)
|
page.wait_for_url(lambda url: "/submissions/me" in url, timeout=BROWSER_NAV_TIMEOUT)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
submit_error = str(e)
|
submit_error = str(e)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
with StealthySession(
|
with StealthySession(
|
||||||
headless=True,
|
headless=True,
|
||||||
timeout=60000,
|
timeout=BROWSER_SESSION_TIMEOUT,
|
||||||
google_search=False,
|
google_search=False,
|
||||||
cookies=saved_cookies,
|
cookies=saved_cookies,
|
||||||
) as session:
|
) as session:
|
||||||
|
|
|
||||||
|
|
@ -9,6 +9,7 @@ import httpx
|
||||||
from curl_cffi import requests as curl_requests
|
from curl_cffi import requests as curl_requests
|
||||||
|
|
||||||
from .base import BaseScraper, extract_precision
|
from .base import BaseScraper, extract_precision
|
||||||
|
from .timeouts import HTTP_TIMEOUT
|
||||||
from .models import (
|
from .models import (
|
||||||
ContestListResult,
|
ContestListResult,
|
||||||
ContestSummary,
|
ContestSummary,
|
||||||
|
|
@ -26,7 +27,6 @@ PROBLEM_URL = "https://www.codechef.com/problems/{problem_id}"
|
||||||
HEADERS = {
|
HEADERS = {
|
||||||
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
||||||
}
|
}
|
||||||
TIMEOUT_S = 15.0
|
|
||||||
CONNECTIONS = 8
|
CONNECTIONS = 8
|
||||||
MEMORY_LIMIT_RE = re.compile(
|
MEMORY_LIMIT_RE = re.compile(
|
||||||
r"Memory\s+[Ll]imit.*?([0-9.]+)\s*(MB|GB)", re.IGNORECASE | re.DOTALL
|
r"Memory\s+[Ll]imit.*?([0-9.]+)\s*(MB|GB)", re.IGNORECASE | re.DOTALL
|
||||||
|
|
@ -34,7 +34,7 @@ MEMORY_LIMIT_RE = re.compile(
|
||||||
|
|
||||||
|
|
||||||
async def fetch_json(client: httpx.AsyncClient, path: str) -> dict:
|
async def fetch_json(client: httpx.AsyncClient, path: str) -> dict:
|
||||||
r = await client.get(BASE_URL + path, headers=HEADERS, timeout=TIMEOUT_S)
|
r = await client.get(BASE_URL + path, headers=HEADERS, timeout=HTTP_TIMEOUT)
|
||||||
r.raise_for_status()
|
r.raise_for_status()
|
||||||
return r.json()
|
return r.json()
|
||||||
|
|
||||||
|
|
@ -51,7 +51,7 @@ def _extract_memory_limit(html: str) -> float:
|
||||||
|
|
||||||
|
|
||||||
def _fetch_html_sync(url: str) -> str:
|
def _fetch_html_sync(url: str) -> str:
|
||||||
response = curl_requests.get(url, impersonate="chrome", timeout=TIMEOUT_S)
|
response = curl_requests.get(url, impersonate="chrome", timeout=HTTP_TIMEOUT)
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
return response.text
|
return response.text
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -21,10 +21,15 @@ from .models import (
|
||||||
SubmitResult,
|
SubmitResult,
|
||||||
TestCase,
|
TestCase,
|
||||||
)
|
)
|
||||||
|
from .timeouts import (
|
||||||
|
BROWSER_NAV_TIMEOUT,
|
||||||
|
BROWSER_SESSION_TIMEOUT,
|
||||||
|
BROWSER_SETTLE_DELAY,
|
||||||
|
HTTP_TIMEOUT,
|
||||||
|
)
|
||||||
|
|
||||||
BASE_URL = "https://codeforces.com"
|
BASE_URL = "https://codeforces.com"
|
||||||
API_CONTEST_LIST_URL = f"{BASE_URL}/api/contest.list"
|
API_CONTEST_LIST_URL = f"{BASE_URL}/api/contest.list"
|
||||||
TIMEOUT_SECONDS = 30
|
|
||||||
HEADERS = {
|
HEADERS = {
|
||||||
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
|
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
|
||||||
}
|
}
|
||||||
|
|
@ -139,7 +144,7 @@ def _is_interactive(block: Tag) -> bool:
|
||||||
|
|
||||||
def _fetch_problems_html(contest_id: str) -> str:
|
def _fetch_problems_html(contest_id: str) -> str:
|
||||||
url = f"{BASE_URL}/contest/{contest_id}/problems"
|
url = f"{BASE_URL}/contest/{contest_id}/problems"
|
||||||
response = curl_requests.get(url, impersonate="chrome", timeout=TIMEOUT_SECONDS)
|
response = curl_requests.get(url, impersonate="chrome", timeout=HTTP_TIMEOUT)
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
return response.text
|
return response.text
|
||||||
|
|
||||||
|
|
@ -226,7 +231,7 @@ class CodeforcesScraper(BaseScraper):
|
||||||
|
|
||||||
async def scrape_contest_list(self) -> ContestListResult:
|
async def scrape_contest_list(self) -> ContestListResult:
|
||||||
try:
|
try:
|
||||||
r = requests.get(API_CONTEST_LIST_URL, timeout=TIMEOUT_SECONDS)
|
r = requests.get(API_CONTEST_LIST_URL, timeout=HTTP_TIMEOUT)
|
||||||
r.raise_for_status()
|
r.raise_for_status()
|
||||||
data = r.json()
|
data = r.json()
|
||||||
if data.get("status") != "OK":
|
if data.get("status") != "OK":
|
||||||
|
|
@ -349,7 +354,7 @@ def _submit_headless(
|
||||||
page.goto(
|
page.goto(
|
||||||
f"{BASE_URL}/enter",
|
f"{BASE_URL}/enter",
|
||||||
wait_until="domcontentloaded",
|
wait_until="domcontentloaded",
|
||||||
timeout=10000,
|
timeout=BROWSER_NAV_TIMEOUT,
|
||||||
)
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|
@ -371,7 +376,7 @@ def _submit_headless(
|
||||||
'#enterForm input[type="submit"]'
|
'#enterForm input[type="submit"]'
|
||||||
).click()
|
).click()
|
||||||
page.wait_for_url(
|
page.wait_for_url(
|
||||||
lambda url: "/enter" not in url, timeout=10000
|
lambda url: "/enter" not in url, timeout=BROWSER_NAV_TIMEOUT
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
login_error = str(e)
|
login_error = str(e)
|
||||||
|
|
@ -380,7 +385,7 @@ def _submit_headless(
|
||||||
page.goto(
|
page.goto(
|
||||||
f"{BASE_URL}/contest/{contest_id}/submit",
|
f"{BASE_URL}/contest/{contest_id}/submit",
|
||||||
wait_until="domcontentloaded",
|
wait_until="domcontentloaded",
|
||||||
timeout=10000,
|
timeout=BROWSER_NAV_TIMEOUT,
|
||||||
)
|
)
|
||||||
|
|
||||||
print(json.dumps({"status": "submitting"}), flush=True)
|
print(json.dumps({"status": "submitting"}), flush=True)
|
||||||
|
|
@ -401,7 +406,7 @@ def _submit_headless(
|
||||||
page.set_input_files(
|
page.set_input_files(
|
||||||
'input[name="sourceFile"]', tmp_path
|
'input[name="sourceFile"]', tmp_path
|
||||||
)
|
)
|
||||||
page.wait_for_timeout(500)
|
page.wait_for_timeout(BROWSER_SETTLE_DELAY)
|
||||||
except Exception:
|
except Exception:
|
||||||
page.fill('textarea[name="source"]', source_code)
|
page.fill('textarea[name="source"]', source_code)
|
||||||
finally:
|
finally:
|
||||||
|
|
@ -409,7 +414,7 @@ def _submit_headless(
|
||||||
page.locator('form.submit-form input.submit').click()
|
page.locator('form.submit-form input.submit').click()
|
||||||
page.wait_for_url(
|
page.wait_for_url(
|
||||||
lambda url: "/my" in url or "/status" in url,
|
lambda url: "/my" in url or "/status" in url,
|
||||||
timeout=10000,
|
timeout=BROWSER_NAV_TIMEOUT,
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
submit_error = str(e)
|
submit_error = str(e)
|
||||||
|
|
@ -417,7 +422,7 @@ def _submit_headless(
|
||||||
try:
|
try:
|
||||||
with StealthySession(
|
with StealthySession(
|
||||||
headless=True,
|
headless=True,
|
||||||
timeout=15000,
|
timeout=BROWSER_SESSION_TIMEOUT,
|
||||||
google_search=False,
|
google_search=False,
|
||||||
cookies=saved_cookies,
|
cookies=saved_cookies,
|
||||||
) as session:
|
) as session:
|
||||||
|
|
|
||||||
|
|
@ -9,6 +9,7 @@ from typing import Any
|
||||||
import httpx
|
import httpx
|
||||||
|
|
||||||
from .base import BaseScraper, extract_precision
|
from .base import BaseScraper, extract_precision
|
||||||
|
from .timeouts import HTTP_TIMEOUT, SUBMIT_POLL_TIMEOUT
|
||||||
from .models import (
|
from .models import (
|
||||||
ContestListResult,
|
ContestListResult,
|
||||||
ContestSummary,
|
ContestSummary,
|
||||||
|
|
@ -26,7 +27,6 @@ TASK_PATH = "/problemset/task/{id}"
|
||||||
HEADERS = {
|
HEADERS = {
|
||||||
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
||||||
}
|
}
|
||||||
TIMEOUT_S = 15.0
|
|
||||||
CONNECTIONS = 8
|
CONNECTIONS = 8
|
||||||
|
|
||||||
CSES_LANGUAGES: dict[str, dict[str, str]] = {
|
CSES_LANGUAGES: dict[str, dict[str, str]] = {
|
||||||
|
|
@ -78,7 +78,7 @@ def snake_to_title(name: str) -> str:
|
||||||
|
|
||||||
|
|
||||||
async def fetch_text(client: httpx.AsyncClient, path: str) -> str:
|
async def fetch_text(client: httpx.AsyncClient, path: str) -> str:
|
||||||
r = await client.get(BASE_URL + path, headers=HEADERS, timeout=TIMEOUT_S)
|
r = await client.get(BASE_URL + path, headers=HEADERS, timeout=HTTP_TIMEOUT)
|
||||||
r.raise_for_status()
|
r.raise_for_status()
|
||||||
return r.text
|
return r.text
|
||||||
|
|
||||||
|
|
@ -290,7 +290,7 @@ class CSESScraper(BaseScraper):
|
||||||
password: str,
|
password: str,
|
||||||
) -> str | None:
|
) -> str | None:
|
||||||
login_page = await client.get(
|
login_page = await client.get(
|
||||||
f"{BASE_URL}/login", headers=HEADERS, timeout=TIMEOUT_S
|
f"{BASE_URL}/login", headers=HEADERS, timeout=HTTP_TIMEOUT
|
||||||
)
|
)
|
||||||
csrf_match = re.search(r'name="csrf_token" value="([^"]+)"', login_page.text)
|
csrf_match = re.search(r'name="csrf_token" value="([^"]+)"', login_page.text)
|
||||||
if not csrf_match:
|
if not csrf_match:
|
||||||
|
|
@ -304,20 +304,20 @@ class CSESScraper(BaseScraper):
|
||||||
"pass": password,
|
"pass": password,
|
||||||
},
|
},
|
||||||
headers=HEADERS,
|
headers=HEADERS,
|
||||||
timeout=TIMEOUT_S,
|
timeout=HTTP_TIMEOUT,
|
||||||
)
|
)
|
||||||
|
|
||||||
if "Invalid username or password" in login_resp.text:
|
if "Invalid username or password" in login_resp.text:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
api_resp = await client.post(
|
api_resp = await client.post(
|
||||||
f"{API_URL}/login", headers=HEADERS, timeout=TIMEOUT_S
|
f"{API_URL}/login", headers=HEADERS, timeout=HTTP_TIMEOUT
|
||||||
)
|
)
|
||||||
api_data = api_resp.json()
|
api_data = api_resp.json()
|
||||||
token: str = api_data["X-Auth-Token"]
|
token: str = api_data["X-Auth-Token"]
|
||||||
auth_url: str = api_data["authentication_url"]
|
auth_url: str = api_data["authentication_url"]
|
||||||
|
|
||||||
auth_page = await client.get(auth_url, headers=HEADERS, timeout=TIMEOUT_S)
|
auth_page = await client.get(auth_url, headers=HEADERS, timeout=HTTP_TIMEOUT)
|
||||||
auth_csrf = re.search(r'name="csrf_token" value="([^"]+)"', auth_page.text)
|
auth_csrf = re.search(r'name="csrf_token" value="([^"]+)"', auth_page.text)
|
||||||
form_token = re.search(r'name="token" value="([^"]+)"', auth_page.text)
|
form_token = re.search(r'name="token" value="([^"]+)"', auth_page.text)
|
||||||
if not auth_csrf or not form_token:
|
if not auth_csrf or not form_token:
|
||||||
|
|
@ -330,13 +330,13 @@ class CSESScraper(BaseScraper):
|
||||||
"token": form_token.group(1),
|
"token": form_token.group(1),
|
||||||
},
|
},
|
||||||
headers=HEADERS,
|
headers=HEADERS,
|
||||||
timeout=TIMEOUT_S,
|
timeout=HTTP_TIMEOUT,
|
||||||
)
|
)
|
||||||
|
|
||||||
check = await client.get(
|
check = await client.get(
|
||||||
f"{API_URL}/login",
|
f"{API_URL}/login",
|
||||||
headers={"X-Auth-Token": token, **HEADERS},
|
headers={"X-Auth-Token": token, **HEADERS},
|
||||||
timeout=TIMEOUT_S,
|
timeout=HTTP_TIMEOUT,
|
||||||
)
|
)
|
||||||
if check.status_code != 200:
|
if check.status_code != 200:
|
||||||
return None
|
return None
|
||||||
|
|
@ -349,7 +349,7 @@ class CSESScraper(BaseScraper):
|
||||||
r = await client.get(
|
r = await client.get(
|
||||||
f"{API_URL}/login",
|
f"{API_URL}/login",
|
||||||
headers={"X-Auth-Token": token, **HEADERS},
|
headers={"X-Auth-Token": token, **HEADERS},
|
||||||
timeout=TIMEOUT_S,
|
timeout=HTTP_TIMEOUT,
|
||||||
)
|
)
|
||||||
return r.status_code == 200
|
return r.status_code == 200
|
||||||
except Exception:
|
except Exception:
|
||||||
|
|
@ -415,7 +415,7 @@ class CSESScraper(BaseScraper):
|
||||||
"Content-Type": "application/json",
|
"Content-Type": "application/json",
|
||||||
**HEADERS,
|
**HEADERS,
|
||||||
},
|
},
|
||||||
timeout=TIMEOUT_S,
|
timeout=HTTP_TIMEOUT,
|
||||||
)
|
)
|
||||||
|
|
||||||
if r.status_code not in range(200, 300):
|
if r.status_code not in range(200, 300):
|
||||||
|
|
@ -438,7 +438,7 @@ class CSESScraper(BaseScraper):
|
||||||
"X-Auth-Token": token,
|
"X-Auth-Token": token,
|
||||||
**HEADERS,
|
**HEADERS,
|
||||||
},
|
},
|
||||||
timeout=30.0,
|
timeout=SUBMIT_POLL_TIMEOUT,
|
||||||
)
|
)
|
||||||
if r.status_code == 200:
|
if r.status_code == 200:
|
||||||
info = r.json()
|
info = r.json()
|
||||||
|
|
|
||||||
|
|
@ -10,6 +10,7 @@ from datetime import datetime
|
||||||
import httpx
|
import httpx
|
||||||
|
|
||||||
from .base import BaseScraper
|
from .base import BaseScraper
|
||||||
|
from .timeouts import HTTP_TIMEOUT
|
||||||
from .models import (
|
from .models import (
|
||||||
ContestListResult,
|
ContestListResult,
|
||||||
ContestSummary,
|
ContestSummary,
|
||||||
|
|
@ -23,7 +24,6 @@ BASE_URL = "https://open.kattis.com"
|
||||||
HEADERS = {
|
HEADERS = {
|
||||||
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
||||||
}
|
}
|
||||||
TIMEOUT_S = 15.0
|
|
||||||
CONNECTIONS = 8
|
CONNECTIONS = 8
|
||||||
|
|
||||||
TIME_RE = re.compile(
|
TIME_RE = re.compile(
|
||||||
|
|
@ -37,13 +37,13 @@ MEM_RE = re.compile(
|
||||||
|
|
||||||
|
|
||||||
async def _fetch_text(client: httpx.AsyncClient, url: str) -> str:
|
async def _fetch_text(client: httpx.AsyncClient, url: str) -> str:
|
||||||
r = await client.get(url, headers=HEADERS, timeout=TIMEOUT_S)
|
r = await client.get(url, headers=HEADERS, timeout=HTTP_TIMEOUT)
|
||||||
r.raise_for_status()
|
r.raise_for_status()
|
||||||
return r.text
|
return r.text
|
||||||
|
|
||||||
|
|
||||||
async def _fetch_bytes(client: httpx.AsyncClient, url: str) -> bytes:
|
async def _fetch_bytes(client: httpx.AsyncClient, url: str) -> bytes:
|
||||||
r = await client.get(url, headers=HEADERS, timeout=TIMEOUT_S)
|
r = await client.get(url, headers=HEADERS, timeout=HTTP_TIMEOUT)
|
||||||
r.raise_for_status()
|
r.raise_for_status()
|
||||||
return r.content
|
return r.content
|
||||||
|
|
||||||
|
|
|
||||||
9
scrapers/timeouts.py
Normal file
9
scrapers/timeouts.py
Normal file
|
|
@ -0,0 +1,9 @@
|
||||||
|
HTTP_TIMEOUT = 15.0
|
||||||
|
|
||||||
|
BROWSER_SESSION_TIMEOUT = 15000
|
||||||
|
BROWSER_NAV_TIMEOUT = 10000
|
||||||
|
BROWSER_TURNSTILE_POLL = 5000
|
||||||
|
BROWSER_ELEMENT_WAIT = 10000
|
||||||
|
BROWSER_SETTLE_DELAY = 500
|
||||||
|
|
||||||
|
SUBMIT_POLL_TIMEOUT = 30.0
|
||||||
|
|
@ -8,6 +8,7 @@ from typing import Any, cast
|
||||||
import httpx
|
import httpx
|
||||||
|
|
||||||
from .base import BaseScraper
|
from .base import BaseScraper
|
||||||
|
from .timeouts import HTTP_TIMEOUT
|
||||||
from .models import (
|
from .models import (
|
||||||
ContestListResult,
|
ContestListResult,
|
||||||
ContestSummary,
|
ContestSummary,
|
||||||
|
|
@ -21,7 +22,6 @@ BASE_URL = "http://www.usaco.org"
|
||||||
HEADERS = {
|
HEADERS = {
|
||||||
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
||||||
}
|
}
|
||||||
TIMEOUT_S = 15.0
|
|
||||||
CONNECTIONS = 4
|
CONNECTIONS = 4
|
||||||
|
|
||||||
MONTHS = [
|
MONTHS = [
|
||||||
|
|
@ -58,7 +58,7 @@ RESULTS_PAGE_RE = re.compile(
|
||||||
|
|
||||||
|
|
||||||
async def _fetch_text(client: httpx.AsyncClient, url: str) -> str:
|
async def _fetch_text(client: httpx.AsyncClient, url: str) -> str:
|
||||||
r = await client.get(url, headers=HEADERS, timeout=TIMEOUT_S, follow_redirects=True)
|
r = await client.get(url, headers=HEADERS, timeout=HTTP_TIMEOUT, follow_redirects=True)
|
||||||
r.raise_for_status()
|
r.raise_for_status()
|
||||||
return r.text
|
return r.text
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue