fix(scrapers): make atcoder scraper resilient

This commit is contained in:
Barrett Ruth 2025-09-30 21:59:25 -04:00
parent aa1dd43e70
commit 3427bf9bbb

View file

@ -21,6 +21,44 @@ from .models import (
) )
def _make_request(url: str, timeout: int = 10) -> requests.Response:
    """GET *url* with a browser-like User-Agent, retrying on failure.

    Retries up to 5 times with exponential backoff + jitter on transport
    errors and HTTP error statuses, and separately on 429 rate limits.

    Args:
        url: Absolute URL to fetch.
        timeout: Per-request timeout in seconds.

    Returns:
        The successful ``requests.Response``.

    Raises:
        requests.exceptions.HTTPError: if the final response is still an
            HTTP error after retries are exhausted.
        requests.exceptions.RequestException: on persistent transport failure.
    """
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (X11; Linux x86_64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/120.0.0.0 Safari/537.36"
        )
    }

    @backoff.on_exception(
        backoff.expo,
        # HTTPError subclasses RequestException, so listing it separately
        # was redundant; RequestException alone covers both transport
        # failures and raise_for_status() errors.
        requests.exceptions.RequestException,
        max_tries=5,
        jitter=backoff.random_jitter,
        on_backoff=lambda details: print(
            f"Request error on {url} (attempt {details['tries']}), "
            f"retrying in {details['wait']:.1f}s: {details['exception']}",
            file=sys.stderr,
        ),
    )
    @backoff.on_predicate(
        backoff.expo,
        lambda resp: resp.status_code == 429,
        max_tries=5,
        jitter=backoff.random_jitter,
        on_backoff=lambda details: print(
            f"Rate limited on {url}, retrying in {details['wait']:.1f}s",
            file=sys.stderr,
        ),
    )
    def _req() -> requests.Response:
        resp = requests.get(url, headers=headers, timeout=timeout)
        # Fix: raise_for_status() must run *inside* the retried function —
        # previously it ran after _req() returned, so on_exception never saw
        # an HTTPError and 5xx responses were returned without any retry.
        # 429 is excluded here so on_predicate handles it with its own
        # rate-limit backoff instead of treating it as a hard failure.
        if resp.status_code != 429:
            resp.raise_for_status()
        return resp

    resp = _req()
    # If retries were exhausted while still rate-limited, surface the 429.
    resp.raise_for_status()
    return resp
def extract_problem_limits(soup: BeautifulSoup) -> tuple[int, float]: def extract_problem_limits(soup: BeautifulSoup) -> tuple[int, float]:
timeout_ms = None timeout_ms = None
memory_mb = None memory_mb = None
@ -82,12 +120,7 @@ def extract_problem_from_row(row, contest_id: str) -> ProblemSummary | None:
def scrape_contest_problems(contest_id: str) -> list[ProblemSummary]: def scrape_contest_problems(contest_id: str) -> list[ProblemSummary]:
try: try:
contest_url = f"https://atcoder.jp/contests/{contest_id}/tasks" contest_url = f"https://atcoder.jp/contests/{contest_id}/tasks"
headers = { response = _make_request(contest_url)
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}
response = requests.get(contest_url, headers=headers, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser") soup = BeautifulSoup(response.text, "html.parser")
task_table = soup.find("table", class_="table") task_table = soup.find("table", class_="table")
@ -138,12 +171,7 @@ def extract_test_case_from_headers(sample_headers, i: int) -> tuple[str, str] |
def scrape(url: str) -> list[TestCase]: def scrape(url: str) -> list[TestCase]:
try: try:
headers = { response = _make_request(url)
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser") soup = BeautifulSoup(response.text, "html.parser")
sample_headers = soup.find_all( sample_headers = soup.find_all(
@ -171,14 +199,7 @@ def scrape(url: str) -> list[TestCase]:
def scrape_contests() -> list[ContestSummary]: def scrape_contests() -> list[ContestSummary]:
def get_max_pages() -> int: def get_max_pages() -> int:
try: try:
headers = { response = _make_request("https://atcoder.jp/contests/archive")
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}
response = requests.get(
"https://atcoder.jp/contests/archive", headers=headers, timeout=10
)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser") soup = BeautifulSoup(response.text, "html.parser")
pagination = soup.find("ul", class_="pagination") pagination = soup.find("ul", class_="pagination")
if not pagination or not isinstance(pagination, Tag): if not pagination or not isinstance(pagination, Tag):
@ -196,37 +217,8 @@ def scrape_contests() -> list[ContestSummary]:
return 15 return 15
def scrape_page(page: int) -> list[ContestSummary]: def scrape_page(page: int) -> list[ContestSummary]:
@backoff.on_exception(
backoff.expo,
(requests.exceptions.RequestException, requests.exceptions.HTTPError),
max_tries=4,
jitter=backoff.random_jitter,
on_backoff=lambda details: print(
f"Request failed on page {page} (attempt {details['tries']}), retrying in {details['wait']:.1f}s: {details['exception']}",
file=sys.stderr,
),
)
@backoff.on_predicate(
backoff.expo,
lambda response: response.status_code == 429,
max_tries=4,
jitter=backoff.random_jitter,
on_backoff=lambda details: print(
f"Rate limited on page {page}, retrying in {details['wait']:.1f}s",
file=sys.stderr,
),
)
def make_request() -> requests.Response:
headers = {
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}
url = f"https://atcoder.jp/contests/archive?page={page}"
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
return response
try: try:
response = make_request() response = _make_request(f"https://atcoder.jp/contests/archive?page={page}")
except Exception: except Exception:
return [] return []
@ -354,15 +346,7 @@ class AtCoderScraper(BaseScraper):
url = parse_problem_url(contest_id, problem_letter) url = parse_problem_url(contest_id, problem_letter)
tests = scrape(url) tests = scrape(url)
response = requests.get( response = _make_request(url)
url,
headers={
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
},
timeout=10,
)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser") soup = BeautifulSoup(response.text, "html.parser")
timeout_ms, memory_mb = extract_problem_limits(soup) timeout_ms, memory_mb = extract_problem_limits(soup)