feat(scraper): use backoff
parent 58f9be5f9a
commit 46c615416f
6 changed files with 186 additions and 193 deletions
@@ -5,6 +5,7 @@ description = "Add your description here"
readme = "README.md"
requires-python = ">=3.11"
dependencies = [
    "backoff>=2.2.1",
    "beautifulsoup4>=4.13.5",
    "cloudscraper>=1.2.71",
    "requests>=2.32.5",
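Note: backoff is the only new runtime dependency. A minimal, generic sketch of the decorator API the scrapers now lean on (illustrative code, not part of this commit):

import backoff
import requests

@backoff.on_exception(backoff.expo, requests.exceptions.RequestException, max_tries=4)
def fetch(url: str) -> requests.Response:
    # retried up to three more times on any requests error, waiting roughly 1s, 2s, 4s
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    return response
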
@@ -3,9 +3,9 @@
import json
import re
import sys
import time
from dataclasses import asdict

import backoff
import requests
from bs4 import BeautifulSoup, Tag

@@ -169,7 +169,6 @@ def scrape(url: str) -> list[TestCase]:
def scrape_contests() -> list[ContestSummary]:
import concurrent.futures
import random

def get_max_pages() -> int:
try:
@@ -197,168 +196,161 @@ def scrape_contests() -> list[ContestSummary]:
except Exception:
return 15

def scrape_page_with_retry(page: int, max_retries: int = 3) -> list[ContestSummary]:
for attempt in range(max_retries):
def scrape_page(page: int) -> list[ContestSummary]:
@backoff.on_exception(
backoff.expo,
(requests.exceptions.RequestException, requests.exceptions.HTTPError),
max_tries=4,
jitter=backoff.random_jitter,
on_backoff=lambda details: print(
f"Request failed on page {page} (attempt {details['tries']}), retrying in {details['wait']:.1f}s: {details['exception']}",
file=sys.stderr,
),
)
@backoff.on_predicate(
backoff.expo,
lambda response: response.status_code == 429,
max_tries=4,
jitter=backoff.random_jitter,
on_backoff=lambda details: print(
f"Rate limited on page {page}, retrying in {details['wait']:.1f}s",
file=sys.stderr,
),
)
def make_request() -> requests.Response:
headers = {
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}
url = f"https://atcoder.jp/contests/archive?page={page}"
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
return response

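Note: for reference, a self-contained sketch of the retry pattern introduced above. backoff.on_exception retries when the request raises, backoff.on_predicate retries when the call returns an HTTP 429, and both use exponential waits (roughly 1 s, 2 s, 4 s with max_tries=4) plus up to one second of random jitter. The names fetch_page and BASE_URL are placeholders, not code from this repository:

import sys

import backoff
import requests

BASE_URL = "https://example.com/archive"  # placeholder endpoint

@backoff.on_exception(
    backoff.expo,
    requests.exceptions.RequestException,  # HTTPError is a subclass, so it is covered as well
    max_tries=4,
    jitter=backoff.random_jitter,
    on_backoff=lambda details: print(
        f"retry {details['tries']} in {details['wait']:.1f}s", file=sys.stderr
    ),
)
@backoff.on_predicate(
    backoff.expo,
    lambda response: response.status_code == 429,  # retry while still rate limited
    max_tries=4,
    jitter=backoff.random_jitter,
)
def fetch_page(page: int) -> requests.Response:
    response = requests.get(BASE_URL, params={"page": page}, timeout=10)
    response.raise_for_status()
    return response

Because make_request calls raise_for_status(), a 429 normally surfaces as an HTTPError and is retried by the on_exception decorator; the on_predicate decorator only comes into play if a 429 response is returned without raising.
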
try:
response = make_request()
except Exception:
return []

soup = BeautifulSoup(response.text, "html.parser")
table = soup.find("table", class_="table")
if not table:
return []

tbody = table.find("tbody")
if not tbody or not isinstance(tbody, Tag):
return []

rows = tbody.find_all("tr")
if not rows:
return []

contests = []
for row in rows:
cells = row.find_all("td")
if len(cells) < 2:
continue

contest_cell = cells[1]
link = contest_cell.find("a")
if not link or not link.get("href"):
continue

href = link.get("href")
contest_id = href.split("/")[-1]
name = link.get_text().strip()

try:
headers = {
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}
url = f"https://atcoder.jp/contests/archive?page={page}"
response = requests.get(url, headers=headers, timeout=10)
name = name.encode().decode("unicode_escape")
except (UnicodeDecodeError, UnicodeEncodeError):
pass

if response.status_code == 429:
backoff_time = (2**attempt) + random.uniform(0, 1)
print(
f"Rate limited on page {page}, retrying in {backoff_time:.1f}s",
file=sys.stderr,
)
time.sleep(backoff_time)
continue
name = (
name.replace("\uff08", "(")
.replace("\uff09", ")")
.replace("\u3000", " ")
)
name = re.sub(
r"[\uff01-\uff5e]", lambda m: chr(ord(m.group()) - 0xFEE0), name
)

response.raise_for_status()
def generate_display_name_from_id(contest_id: str) -> str:
parts = contest_id.replace("-", " ").replace("_", " ")

soup = BeautifulSoup(response.text, "html.parser")
table = soup.find("table", class_="table")
if not table:
return []

tbody = table.find("tbody")
if not tbody or not isinstance(tbody, Tag):
return []

rows = tbody.find_all("tr")
if not rows:
return []

contests = []
for row in rows:
cells = row.find_all("td")
if len(cells) < 2:
continue

contest_cell = cells[1]
link = contest_cell.find("a")
if not link or not link.get("href"):
continue

href = link.get("href")
contest_id = href.split("/")[-1]
name = link.get_text().strip()

try:
name = name.encode().decode("unicode_escape")
except (UnicodeDecodeError, UnicodeEncodeError):
pass

name = (
name.replace("\uff08", "(")
.replace("\uff09", ")")
.replace("\u3000", " ")
)
name = re.sub(
r"[\uff01-\uff5e]", lambda m: chr(ord(m.group()) - 0xFEE0), name
)

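Note: the name clean-up above folds full-width characters down to ASCII. Full-width parentheses and the ideographic space (U+3000) are replaced directly, and any character in the full-width ASCII block U+FF01 to U+FF5E is shifted down by 0xFEE0 to its ASCII counterpart. A standalone illustration (the sample string is made up):

import re

def normalize(name: str) -> str:
    # "（" -> "(", "）" -> ")", "　" (ideographic space) -> " "
    name = name.replace("\uff08", "(").replace("\uff09", ")").replace("\u3000", " ")
    # U+FF01..U+FF5E is the full-width copy of ASCII U+0021..U+007E, offset by 0xFEE0
    return re.sub(r"[\uff01-\uff5e]", lambda m: chr(ord(m.group()) - 0xFEE0), name)

print(normalize("ＡＢＣ　３２１（Ｂｅｔａ）"))  # prints "ABC 321(Beta)"
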
def generate_display_name_from_id(contest_id: str) -> str:
parts = contest_id.replace("-", " ").replace("_", " ")

parts = re.sub(
r"\b(jsc|JSC)\b",
"Japanese Student Championship",
parts,
flags=re.IGNORECASE,
)
parts = re.sub(
r"\b(wtf|WTF)\b",
"World Tour Finals",
parts,
flags=re.IGNORECASE,
)
parts = re.sub(
r"\b(ahc)(\d+)\b",
r"Heuristic Contest \2 (AHC)",
parts,
flags=re.IGNORECASE,
)
parts = re.sub(
r"\b(arc)(\d+)\b",
r"Regular Contest \2 (ARC)",
parts,
flags=re.IGNORECASE,
)
parts = re.sub(
r"\b(abc)(\d+)\b",
r"Beginner Contest \2 (ABC)",
parts,
flags=re.IGNORECASE,
)
parts = re.sub(
r"\b(agc)(\d+)\b",
r"Grand Contest \2 (AGC)",
parts,
flags=re.IGNORECASE,
)

return parts.title()

english_chars = sum(1 for c in name if c.isascii() and c.isalpha())
total_chars = len(re.sub(r"\s+", "", name))

if total_chars > 0 and english_chars / total_chars < 0.3:
display_name = generate_display_name_from_id(contest_id)
else:
display_name = name
if "AtCoder Beginner Contest" in name:
match = re.search(r"AtCoder Beginner Contest (\d+)", name)
if match:
display_name = (
f"Beginner Contest {match.group(1)} (ABC)"
)
elif "AtCoder Regular Contest" in name:
match = re.search(r"AtCoder Regular Contest (\d+)", name)
if match:
display_name = f"Regular Contest {match.group(1)} (ARC)"
elif "AtCoder Grand Contest" in name:
match = re.search(r"AtCoder Grand Contest (\d+)", name)
if match:
display_name = f"Grand Contest {match.group(1)} (AGC)"
elif "AtCoder Heuristic Contest" in name:
match = re.search(r"AtCoder Heuristic Contest (\d+)", name)
if match:
display_name = (
f"Heuristic Contest {match.group(1)} (AHC)"
)

contests.append(
ContestSummary(
id=contest_id, name=name, display_name=display_name
)
)

return contests

except requests.exceptions.RequestException as e:
if response.status_code == 429:
continue
print(
f"Failed to scrape page {page} (attempt {attempt + 1}): {e}",
file=sys.stderr,
parts = re.sub(
r"\b(jsc|JSC)\b",
"Japanese Student Championship",
parts,
flags=re.IGNORECASE,
)
parts = re.sub(
r"\b(wtf|WTF)\b",
"World Tour Finals",
parts,
flags=re.IGNORECASE,
)
parts = re.sub(
r"\b(ahc)(\d+)\b",
r"Heuristic Contest \2 (AHC)",
parts,
flags=re.IGNORECASE,
)
parts = re.sub(
r"\b(arc)(\d+)\b",
r"Regular Contest \2 (ARC)",
parts,
flags=re.IGNORECASE,
)
parts = re.sub(
r"\b(abc)(\d+)\b",
r"Beginner Contest \2 (ABC)",
parts,
flags=re.IGNORECASE,
)
parts = re.sub(
r"\b(agc)(\d+)\b",
r"Grand Contest \2 (AGC)",
parts,
flags=re.IGNORECASE,
)
if attempt == max_retries - 1:
return []
except Exception as e:
print(f"Unexpected error on page {page}: {e}", file=sys.stderr)
return []

return []
return parts.title()

english_chars = sum(1 for c in name if c.isascii() and c.isalpha())
total_chars = len(re.sub(r"\s+", "", name))

if total_chars > 0 and english_chars / total_chars < 0.3:
display_name = generate_display_name_from_id(contest_id)
else:
display_name = name
if "AtCoder Beginner Contest" in name:
match = re.search(r"AtCoder Beginner Contest (\d+)", name)
if match:
display_name = f"Beginner Contest {match.group(1)} (ABC)"
elif "AtCoder Regular Contest" in name:
match = re.search(r"AtCoder Regular Contest (\d+)", name)
if match:
display_name = f"Regular Contest {match.group(1)} (ARC)"
elif "AtCoder Grand Contest" in name:
match = re.search(r"AtCoder Grand Contest (\d+)", name)
if match:
display_name = f"Grand Contest {match.group(1)} (AGC)"
elif "AtCoder Heuristic Contest" in name:
match = re.search(r"AtCoder Heuristic Contest (\d+)", name)
if match:
display_name = f"Heuristic Contest {match.group(1)} (AHC)"

contests.append(
ContestSummary(id=contest_id, name=name, display_name=display_name)
)

return contests

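Note: the ratio check above is the heuristic that decides between the two naming paths. If fewer than 30% of the name's non-whitespace characters are ASCII letters, the scraped name is treated as mostly Japanese and the display name is derived from the contest id instead. A standalone sketch of that check (the sample strings are illustrative):

import re

def mostly_non_ascii(name: str) -> bool:
    english_chars = sum(1 for c in name if c.isascii() and c.isalpha())
    total_chars = len(re.sub(r"\s+", "", name))
    return total_chars > 0 and english_chars / total_chars < 0.3

print(mostly_non_ascii("AtCoder Beginner Contest 321"))  # False -> keep the scraped name
print(mostly_non_ascii("サンプルコンテスト２０２４"))  # True -> build a name from the contest id
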
max_pages = get_max_pages()
page_results = {}

with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
future_to_page = {
executor.submit(scrape_page_with_retry, page): page
for page in range(1, max_pages + 1)
executor.submit(scrape_page, page): page for page in range(1, max_pages + 1)
}

for future in concurrent.futures.as_completed(future_to_page):
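Note: the concurrent fan-out is unchanged apart from the callable that gets submitted, scrape_page_with_retry becoming the backoff-decorated scrape_page. A minimal sketch of the fan-out/collect pattern used here, with scrape_page as a stand-in:

import concurrent.futures

def scrape_page(page: int) -> list[str]:
    return [f"contest-from-page-{page}"]  # stand-in for the real scraper

max_pages = 3
page_results: dict[int, list[str]] = {}

with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    # map each future back to its page number so results can be reassembled in page order
    future_to_page = {
        executor.submit(scrape_page, page): page for page in range(1, max_pages + 1)
    }
    for future in concurrent.futures.as_completed(future_to_page):
        page_results[future_to_page[future]] = future.result()

contests = [c for page in sorted(page_results) for c in page_results[page]]
print(contests)
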
@@ -1,12 +1,11 @@
#!/usr/bin/env python3

import json
import random
import re
import sys
import time
from dataclasses import asdict

import backoff
import requests
from bs4 import BeautifulSoup, Tag

@@ -41,36 +40,29 @@ def denormalize_category_name(category_id: str) -> str:
return category_map.get(category_id, category_id.replace("_", " ").title())


def request_with_retry(
url: str, headers: dict, max_retries: int = 3
) -> requests.Response:
for attempt in range(max_retries):
try:
delay = 0.5 + random.uniform(0, 0.3)
time.sleep(delay)

response = requests.get(url, headers=headers, timeout=10)

if response.status_code == 429:
backoff = (2**attempt) + random.uniform(0, 1)
print(f"Rate limited, retrying in {backoff:.1f}s", file=sys.stderr)
time.sleep(backoff)
continue

response.raise_for_status()
return response

except requests.exceptions.RequestException as e:
if attempt == max_retries - 1:
raise
backoff = 2**attempt
print(
f"Request failed (attempt {attempt + 1}), retrying in {backoff}s: {e}",
file=sys.stderr,
)
time.sleep(backoff)

raise Exception("All retry attempts failed")
@backoff.on_exception(
backoff.expo,
(requests.exceptions.RequestException, requests.exceptions.HTTPError),
max_tries=4,
jitter=backoff.random_jitter,
on_backoff=lambda details: print(
f"Request failed (attempt {details['tries']}), retrying in {details['wait']:.1f}s: {details['exception']}",
file=sys.stderr,
),
)
@backoff.on_predicate(
backoff.expo,
lambda response: response.status_code == 429,
max_tries=4,
jitter=backoff.random_jitter,
on_backoff=lambda details: print(
f"Rate limited, retrying in {details['wait']:.1f}s", file=sys.stderr
),
)
def make_request(url: str, headers: dict) -> requests.Response:
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
return response


def scrape_category_problems(category_id: str) -> list[ProblemSummary]:
@@ -82,7 +74,7 @@ def scrape_category_problems(category_id: str) -> list[ProblemSummary]:
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

response = request_with_retry(problemset_url, headers)
response = make_request(problemset_url, headers)

soup = BeautifulSoup(response.text, "html.parser")
@@ -176,7 +168,7 @@ def scrape_categories() -> list[ContestSummary]:
headers = {
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}
response = request_with_retry("https://cses.fi/problemset/", headers)
response = make_request("https://cses.fi/problemset/", headers)

soup = BeautifulSoup(response.text, "html.parser")
categories = []
@@ -193,7 +185,7 @@ def scrape_categories() -> list[ContestSummary]:
if ul:
problem_count = len(ul.find_all("li", class_="task"))

display_name = f"{category_name} ({problem_count} problems)"
display_name = category_name

categories.append(
ContestSummary(
@@ -323,7 +315,7 @@ def scrape(url: str) -> list[TestCase]:
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

response = request_with_retry(url, headers)
response = make_request(url, headers)

soup = BeautifulSoup(response.text, "html.parser")

@@ -94,7 +94,6 @@ def test_scrape_contests_success(mocker):
return mock_response

mocker.patch("scrapers.atcoder.requests.get", side_effect=mock_get_side_effect)
mocker.patch("scrapers.atcoder.time.sleep")

result = scrape_contests()
@@ -116,7 +115,6 @@ def test_scrape_contests_no_table(mocker):
mock_response.text = "<html><body>No table found</body></html>"

mocker.patch("scrapers.atcoder.requests.get", return_value=mock_response)
mocker.patch("scrapers.atcoder.time.sleep")

result = scrape_contests()
@@ -127,7 +125,6 @@ def test_scrape_contests_network_error(mocker):
mocker.patch(
"scrapers.atcoder.requests.get", side_effect=Exception("Network error")
)
mocker.patch("scrapers.atcoder.time.sleep")

result = scrape_contests()
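Note: with backoff handling the waits, the mocker.patch("scrapers.atcoder.time.sleep") lines are dropped from these tests. One caveat: when a test actually drives the retry path, backoff's exponential waits are real, so it can help to stub time.sleep globally. A hypothetical pytest-mock sketch, not part of this commit:

import backoff
import requests

def test_backoff_retries_then_succeeds(mocker):
    mocker.patch("time.sleep")  # keep the exponential waits from slowing the test down
    calls = {"count": 0}

    @backoff.on_exception(backoff.expo, requests.exceptions.RequestException, max_tries=4)
    def flaky() -> str:
        calls["count"] += 1
        if calls["count"] < 3:
            raise requests.exceptions.RequestException("transient failure")
        return "ok"

    assert flaky() == "ok"
    assert calls["count"] == 3
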
@@ -168,12 +168,12 @@ def test_scrape_categories_success(mocker):
assert result[0] == ContestSummary(
id="introductory_problems",
name="Introductory Problems",
display_name="Introductory Problems (2 problems)",
display_name="Introductory Problems",
)
assert result[1] == ContestSummary(
id="sorting_and_searching",
name="Sorting and Searching",
display_name="Sorting and Searching (3 problems)",
display_name="Sorting and Searching",
)

uv.lock (generated): 11 lines changed
@@ -2,6 +2,15 @@ version = 1
revision = 3
requires-python = ">=3.11"

[[package]]
name = "backoff"
version = "2.2.1"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/47/d7/5bbeb12c44d7c4f2fb5b56abce497eb5ed9f34d85701de869acedd602619/backoff-2.2.1.tar.gz", hash = "sha256:03f829f5bb1923180821643f8753b0502c3b682293992485b0eef2807afa5cba", size = 17001, upload-time = "2022-10-05T19:19:32.061Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/df/73/b6e24bd22e6720ca8ee9a85a0c4a2971af8497d8f3193fa05390cbd46e09/backoff-2.2.1-py3-none-any.whl", hash = "sha256:63579f9a0628e06278f7e47b7d7d5b6ce20dc65c5e96a6f3ca99a6adca0396e8", size = 15148, upload-time = "2022-10-05T19:19:30.546Z" },
]

[[package]]
name = "beautifulsoup4"
version = "4.13.5"
@@ -375,6 +384,7 @@ name = "scrapers"
version = "0.1.0"
source = { virtual = "." }
dependencies = [
{ name = "backoff" },
{ name = "beautifulsoup4" },
{ name = "cloudscraper" },
{ name = "requests" },
@@ -392,6 +402,7 @@ dev = [

[package.metadata]
requires-dist = [
{ name = "backoff", specifier = ">=2.2.1" },
{ name = "beautifulsoup4", specifier = ">=4.13.5" },
{ name = "cloudscraper", specifier = ">=1.2.71" },
{ name = "requests", specifier = ">=2.32.5" },