fix(scrapers): make atcoder scraper resilient
This commit is contained in:
parent
aa1dd43e70
commit
3427bf9bbb
1 changed files with 43 additions and 59 deletions
|
|
@ -21,6 +21,44 @@ from .models import (
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _make_request(url: str, timeout: int = 10) -> requests.Response:
    """GET *url* with retries and return the successful Response.

    Retries with exponential backoff + jitter on transport errors
    (``requests.exceptions.RequestException``), on HTTP 5xx server
    errors, and on HTTP 429 rate limiting.  A non-retryable error
    status (e.g. 404), or a 429 that survives all retries, is raised
    as ``requests.exceptions.HTTPError`` by the final
    ``raise_for_status()``.

    Args:
        url: Absolute URL to fetch.
        timeout: Per-attempt timeout in seconds.

    Returns:
        The successful ``requests.Response``.

    Raises:
        requests.exceptions.RequestException: when every retry attempt
            fails (HTTPError is a subclass and covers bad statuses).
    """
    headers = {
        # Browser-like UA — atcoder.jp may block/alter responses for the
        # default python-requests User-Agent.
        "User-Agent": (
            "Mozilla/5.0 (X11; Linux x86_64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/120.0.0.0 Safari/537.36"
        )
    }

    # BUGFIX: raise_for_status() for 5xx must happen *inside* _req so the
    # resulting HTTPError actually reaches the on_exception decorator.
    # Previously it was called after the decorated call, which made the
    # HTTPError entry in the retry tuple dead code — server errors were
    # never retried.
    @backoff.on_exception(
        backoff.expo,
        # HTTPError is a subclass of RequestException; it is listed
        # explicitly to document that 5xx responses are retried.
        (requests.exceptions.RequestException, requests.exceptions.HTTPError),
        max_tries=5,
        jitter=backoff.random_jitter,
        on_backoff=lambda details: print(
            f"Request error on {url} (attempt {details['tries']}), "
            f"retrying in {details['wait']:.1f}s: {details['exception']}",
            file=sys.stderr,
        ),
    )
    @backoff.on_predicate(
        backoff.expo,
        # 429 does not raise, so it is retried via the predicate path.
        lambda resp: resp.status_code == 429,
        max_tries=5,
        jitter=backoff.random_jitter,
        on_backoff=lambda details: print(
            f"Rate limited on {url}, retrying in {details['wait']:.1f}s",
            file=sys.stderr,
        ),
    )
    def _req() -> requests.Response:
        resp = requests.get(url, headers=headers, timeout=timeout)
        if resp.status_code >= 500:
            # Raise so on_exception retries transient server errors;
            # 4xx (including 429) is returned for the caller/predicate.
            resp.raise_for_status()
        return resp

    resp = _req()
    # Covers non-retryable 4xx and a 429 that exhausted its retries.
    resp.raise_for_status()
    return resp
|
||||||
|
|
||||||
|
|
||||||
def extract_problem_limits(soup: BeautifulSoup) -> tuple[int, float]:
|
def extract_problem_limits(soup: BeautifulSoup) -> tuple[int, float]:
|
||||||
timeout_ms = None
|
timeout_ms = None
|
||||||
memory_mb = None
|
memory_mb = None
|
||||||
|
|
@ -82,12 +120,7 @@ def extract_problem_from_row(row, contest_id: str) -> ProblemSummary | None:
|
||||||
def scrape_contest_problems(contest_id: str) -> list[ProblemSummary]:
|
def scrape_contest_problems(contest_id: str) -> list[ProblemSummary]:
|
||||||
try:
|
try:
|
||||||
contest_url = f"https://atcoder.jp/contests/{contest_id}/tasks"
|
contest_url = f"https://atcoder.jp/contests/{contest_id}/tasks"
|
||||||
headers = {
|
response = _make_request(contest_url)
|
||||||
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
|
||||||
}
|
|
||||||
|
|
||||||
response = requests.get(contest_url, headers=headers, timeout=10)
|
|
||||||
response.raise_for_status()
|
|
||||||
|
|
||||||
soup = BeautifulSoup(response.text, "html.parser")
|
soup = BeautifulSoup(response.text, "html.parser")
|
||||||
task_table = soup.find("table", class_="table")
|
task_table = soup.find("table", class_="table")
|
||||||
|
|
@ -138,12 +171,7 @@ def extract_test_case_from_headers(sample_headers, i: int) -> tuple[str, str] |
|
||||||
|
|
||||||
def scrape(url: str) -> list[TestCase]:
|
def scrape(url: str) -> list[TestCase]:
|
||||||
try:
|
try:
|
||||||
headers = {
|
response = _make_request(url)
|
||||||
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
|
||||||
}
|
|
||||||
|
|
||||||
response = requests.get(url, headers=headers, timeout=10)
|
|
||||||
response.raise_for_status()
|
|
||||||
|
|
||||||
soup = BeautifulSoup(response.text, "html.parser")
|
soup = BeautifulSoup(response.text, "html.parser")
|
||||||
sample_headers = soup.find_all(
|
sample_headers = soup.find_all(
|
||||||
|
|
@ -171,14 +199,7 @@ def scrape(url: str) -> list[TestCase]:
|
||||||
def scrape_contests() -> list[ContestSummary]:
|
def scrape_contests() -> list[ContestSummary]:
|
||||||
def get_max_pages() -> int:
|
def get_max_pages() -> int:
|
||||||
try:
|
try:
|
||||||
headers = {
|
response = _make_request("https://atcoder.jp/contests/archive")
|
||||||
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
|
||||||
}
|
|
||||||
response = requests.get(
|
|
||||||
"https://atcoder.jp/contests/archive", headers=headers, timeout=10
|
|
||||||
)
|
|
||||||
response.raise_for_status()
|
|
||||||
|
|
||||||
soup = BeautifulSoup(response.text, "html.parser")
|
soup = BeautifulSoup(response.text, "html.parser")
|
||||||
pagination = soup.find("ul", class_="pagination")
|
pagination = soup.find("ul", class_="pagination")
|
||||||
if not pagination or not isinstance(pagination, Tag):
|
if not pagination or not isinstance(pagination, Tag):
|
||||||
|
|
@ -196,37 +217,8 @@ def scrape_contests() -> list[ContestSummary]:
|
||||||
return 15
|
return 15
|
||||||
|
|
||||||
def scrape_page(page: int) -> list[ContestSummary]:
|
def scrape_page(page: int) -> list[ContestSummary]:
|
||||||
@backoff.on_exception(
|
|
||||||
backoff.expo,
|
|
||||||
(requests.exceptions.RequestException, requests.exceptions.HTTPError),
|
|
||||||
max_tries=4,
|
|
||||||
jitter=backoff.random_jitter,
|
|
||||||
on_backoff=lambda details: print(
|
|
||||||
f"Request failed on page {page} (attempt {details['tries']}), retrying in {details['wait']:.1f}s: {details['exception']}",
|
|
||||||
file=sys.stderr,
|
|
||||||
),
|
|
||||||
)
|
|
||||||
@backoff.on_predicate(
|
|
||||||
backoff.expo,
|
|
||||||
lambda response: response.status_code == 429,
|
|
||||||
max_tries=4,
|
|
||||||
jitter=backoff.random_jitter,
|
|
||||||
on_backoff=lambda details: print(
|
|
||||||
f"Rate limited on page {page}, retrying in {details['wait']:.1f}s",
|
|
||||||
file=sys.stderr,
|
|
||||||
),
|
|
||||||
)
|
|
||||||
def make_request() -> requests.Response:
|
|
||||||
headers = {
|
|
||||||
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
|
||||||
}
|
|
||||||
url = f"https://atcoder.jp/contests/archive?page={page}"
|
|
||||||
response = requests.get(url, headers=headers, timeout=10)
|
|
||||||
response.raise_for_status()
|
|
||||||
return response
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
response = make_request()
|
response = _make_request(f"https://atcoder.jp/contests/archive?page={page}")
|
||||||
except Exception:
|
except Exception:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
|
@ -354,15 +346,7 @@ class AtCoderScraper(BaseScraper):
|
||||||
url = parse_problem_url(contest_id, problem_letter)
|
url = parse_problem_url(contest_id, problem_letter)
|
||||||
tests = scrape(url)
|
tests = scrape(url)
|
||||||
|
|
||||||
response = requests.get(
|
response = _make_request(url)
|
||||||
url,
|
|
||||||
headers={
|
|
||||||
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
|
||||||
},
|
|
||||||
timeout=10,
|
|
||||||
)
|
|
||||||
response.raise_for_status()
|
|
||||||
|
|
||||||
soup = BeautifulSoup(response.text, "html.parser")
|
soup = BeautifulSoup(response.text, "html.parser")
|
||||||
timeout_ms, memory_mb = extract_problem_limits(soup)
|
timeout_ms, memory_mb = extract_problem_limits(soup)
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue