feat(scraper): use backoff

Barrett Ruth 2025-09-21 11:26:54 -04:00
parent 58f9be5f9a
commit 46c615416f
6 changed files with 186 additions and 193 deletions
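The commit swaps hand-rolled retry loops (manual `time.sleep`, `2**attempt` delays, `random` jitter) for the `backoff` library's decorators. The sketch below shows the pattern the diff adopts, reduced to its essentials; the `fetch` function and example URL are illustrative, not code from this repository.

import sys

import backoff
import requests


@backoff.on_exception(
    backoff.expo,  # exponential waits (roughly 1s, 2s, 4s) between tries
    requests.exceptions.RequestException,  # retry when the request raises
    max_tries=4,  # four attempts total, then the exception is re-raised
    jitter=backoff.random_jitter,  # randomize waits so clients don't retry in lockstep
    on_backoff=lambda details: print(
        f"attempt {details['tries']} failed, retrying in {details['wait']:.1f}s",
        file=sys.stderr,
    ),
)
@backoff.on_predicate(
    backoff.expo,
    lambda response: response.status_code == 429,  # retry while rate limited
    max_tries=4,
    jitter=backoff.random_jitter,
)
def fetch(url: str) -> requests.Response:
    return requests.get(url, timeout=10)


if __name__ == "__main__":
    print(fetch("https://example.com").status_code)

The decorators stack: `on_predicate` retries while the predicate holds and returns the last response if it never stops holding, while `on_exception` around it retries raised errors. In the diff's `make_request` functions, `raise_for_status()` turns a 429 into an `HTTPError`, so most rate-limit retries actually flow through the exception path; the predicate only fires when a 429 response is returned without raising.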

pyproject.toml

@@ -5,6 +5,7 @@ description = "Add your description here"
 readme = "README.md"
 requires-python = ">=3.11"
 dependencies = [
+    "backoff>=2.2.1",
     "beautifulsoup4>=4.13.5",
     "cloudscraper>=1.2.71",
     "requests>=2.32.5",

scrapers/atcoder.py

@@ -3,9 +3,9 @@
 import json
 import re
 import sys
-import time
 from dataclasses import asdict

+import backoff
 import requests
 from bs4 import BeautifulSoup, Tag
@@ -169,7 +169,6 @@ def scrape(url: str) -> list[TestCase]:
 def scrape_contests() -> list[ContestSummary]:
     import concurrent.futures
-    import random

     def get_max_pages() -> int:
         try:
@@ -197,168 +196,161 @@ def scrape_contests() -> list[ContestSummary]:
         except Exception:
             return 15

-    def scrape_page_with_retry(page: int, max_retries: int = 3) -> list[ContestSummary]:
-        for attempt in range(max_retries):
-            try:
-                headers = {
-                    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
-                }
-                url = f"https://atcoder.jp/contests/archive?page={page}"
-                response = requests.get(url, headers=headers, timeout=10)
-                if response.status_code == 429:
-                    backoff_time = (2**attempt) + random.uniform(0, 1)
-                    print(
-                        f"Rate limited on page {page}, retrying in {backoff_time:.1f}s",
-                        file=sys.stderr,
-                    )
-                    time.sleep(backoff_time)
-                    continue
-                response.raise_for_status()
-
-                soup = BeautifulSoup(response.text, "html.parser")
-                table = soup.find("table", class_="table")
-                if not table:
-                    return []
-
-                tbody = table.find("tbody")
-                if not tbody or not isinstance(tbody, Tag):
-                    return []
-
-                rows = tbody.find_all("tr")
-                if not rows:
-                    return []
-
-                contests = []
-                for row in rows:
-                    cells = row.find_all("td")
-                    if len(cells) < 2:
-                        continue
-
-                    contest_cell = cells[1]
-                    link = contest_cell.find("a")
-                    if not link or not link.get("href"):
-                        continue
-
-                    href = link.get("href")
-                    contest_id = href.split("/")[-1]
-                    name = link.get_text().strip()
-
-                    try:
-                        name = name.encode().decode("unicode_escape")
-                    except (UnicodeDecodeError, UnicodeEncodeError):
-                        pass
-
-                    name = (
-                        name.replace("\uff08", "(")
-                        .replace("\uff09", ")")
-                        .replace("\u3000", " ")
-                    )
-                    name = re.sub(
-                        r"[\uff01-\uff5e]", lambda m: chr(ord(m.group()) - 0xFEE0), name
-                    )
-
-                    def generate_display_name_from_id(contest_id: str) -> str:
-                        parts = contest_id.replace("-", " ").replace("_", " ")
-                        parts = re.sub(
-                            r"\b(jsc|JSC)\b",
-                            "Japanese Student Championship",
-                            parts,
-                            flags=re.IGNORECASE,
-                        )
-                        parts = re.sub(
-                            r"\b(wtf|WTF)\b",
-                            "World Tour Finals",
-                            parts,
-                            flags=re.IGNORECASE,
-                        )
-                        parts = re.sub(
-                            r"\b(ahc)(\d+)\b",
-                            r"Heuristic Contest \2 (AHC)",
-                            parts,
-                            flags=re.IGNORECASE,
-                        )
-                        parts = re.sub(
-                            r"\b(arc)(\d+)\b",
-                            r"Regular Contest \2 (ARC)",
-                            parts,
-                            flags=re.IGNORECASE,
-                        )
-                        parts = re.sub(
-                            r"\b(abc)(\d+)\b",
-                            r"Beginner Contest \2 (ABC)",
-                            parts,
-                            flags=re.IGNORECASE,
-                        )
-                        parts = re.sub(
-                            r"\b(agc)(\d+)\b",
-                            r"Grand Contest \2 (AGC)",
-                            parts,
-                            flags=re.IGNORECASE,
-                        )
-                        return parts.title()
-
-                    english_chars = sum(1 for c in name if c.isascii() and c.isalpha())
-                    total_chars = len(re.sub(r"\s+", "", name))
-                    if total_chars > 0 and english_chars / total_chars < 0.3:
-                        display_name = generate_display_name_from_id(contest_id)
-                    else:
-                        display_name = name
-                        if "AtCoder Beginner Contest" in name:
-                            match = re.search(r"AtCoder Beginner Contest (\d+)", name)
-                            if match:
-                                display_name = (
-                                    f"Beginner Contest {match.group(1)} (ABC)"
-                                )
-                        elif "AtCoder Regular Contest" in name:
-                            match = re.search(r"AtCoder Regular Contest (\d+)", name)
-                            if match:
-                                display_name = f"Regular Contest {match.group(1)} (ARC)"
-                        elif "AtCoder Grand Contest" in name:
-                            match = re.search(r"AtCoder Grand Contest (\d+)", name)
-                            if match:
-                                display_name = f"Grand Contest {match.group(1)} (AGC)"
-                        elif "AtCoder Heuristic Contest" in name:
-                            match = re.search(r"AtCoder Heuristic Contest (\d+)", name)
-                            if match:
-                                display_name = (
-                                    f"Heuristic Contest {match.group(1)} (AHC)"
-                                )
-
-                    contests.append(
-                        ContestSummary(
-                            id=contest_id, name=name, display_name=display_name
-                        )
-                    )
-
-                return contests
-
-            except requests.exceptions.RequestException as e:
-                if response.status_code == 429:
-                    continue
-                print(
-                    f"Failed to scrape page {page} (attempt {attempt + 1}): {e}",
-                    file=sys.stderr,
-                )
-                if attempt == max_retries - 1:
-                    return []
-            except Exception as e:
-                print(f"Unexpected error on page {page}: {e}", file=sys.stderr)
-                return []
-
-        return []
+    def scrape_page(page: int) -> list[ContestSummary]:
+        @backoff.on_exception(
+            backoff.expo,
+            (requests.exceptions.RequestException, requests.exceptions.HTTPError),
+            max_tries=4,
+            jitter=backoff.random_jitter,
+            on_backoff=lambda details: print(
+                f"Request failed on page {page} (attempt {details['tries']}), retrying in {details['wait']:.1f}s: {details['exception']}",
+                file=sys.stderr,
+            ),
+        )
+        @backoff.on_predicate(
+            backoff.expo,
+            lambda response: response.status_code == 429,
+            max_tries=4,
+            jitter=backoff.random_jitter,
+            on_backoff=lambda details: print(
+                f"Rate limited on page {page}, retrying in {details['wait']:.1f}s",
+                file=sys.stderr,
+            ),
+        )
+        def make_request() -> requests.Response:
+            headers = {
+                "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
+            }
+            url = f"https://atcoder.jp/contests/archive?page={page}"
+            response = requests.get(url, headers=headers, timeout=10)
+            response.raise_for_status()
+            return response
+
+        try:
+            response = make_request()
+        except Exception:
+            return []
+
+        soup = BeautifulSoup(response.text, "html.parser")
+        table = soup.find("table", class_="table")
+        if not table:
+            return []
+
+        tbody = table.find("tbody")
+        if not tbody or not isinstance(tbody, Tag):
+            return []
+
+        rows = tbody.find_all("tr")
+        if not rows:
+            return []
+
+        contests = []
+        for row in rows:
+            cells = row.find_all("td")
+            if len(cells) < 2:
+                continue
+
+            contest_cell = cells[1]
+            link = contest_cell.find("a")
+            if not link or not link.get("href"):
+                continue
+
+            href = link.get("href")
+            contest_id = href.split("/")[-1]
+            name = link.get_text().strip()
+
+            try:
+                name = name.encode().decode("unicode_escape")
+            except (UnicodeDecodeError, UnicodeEncodeError):
+                pass
+
+            name = (
+                name.replace("\uff08", "(")
+                .replace("\uff09", ")")
+                .replace("\u3000", " ")
+            )
+            name = re.sub(
+                r"[\uff01-\uff5e]", lambda m: chr(ord(m.group()) - 0xFEE0), name
+            )
+
+            def generate_display_name_from_id(contest_id: str) -> str:
+                parts = contest_id.replace("-", " ").replace("_", " ")
+                parts = re.sub(
+                    r"\b(jsc|JSC)\b",
+                    "Japanese Student Championship",
+                    parts,
+                    flags=re.IGNORECASE,
+                )
+                parts = re.sub(
+                    r"\b(wtf|WTF)\b",
+                    "World Tour Finals",
+                    parts,
+                    flags=re.IGNORECASE,
+                )
+                parts = re.sub(
+                    r"\b(ahc)(\d+)\b",
+                    r"Heuristic Contest \2 (AHC)",
+                    parts,
+                    flags=re.IGNORECASE,
+                )
+                parts = re.sub(
+                    r"\b(arc)(\d+)\b",
+                    r"Regular Contest \2 (ARC)",
+                    parts,
+                    flags=re.IGNORECASE,
+                )
+                parts = re.sub(
+                    r"\b(abc)(\d+)\b",
+                    r"Beginner Contest \2 (ABC)",
+                    parts,
+                    flags=re.IGNORECASE,
+                )
+                parts = re.sub(
+                    r"\b(agc)(\d+)\b",
+                    r"Grand Contest \2 (AGC)",
+                    parts,
+                    flags=re.IGNORECASE,
+                )
+                return parts.title()
+
+            english_chars = sum(1 for c in name if c.isascii() and c.isalpha())
+            total_chars = len(re.sub(r"\s+", "", name))
+            if total_chars > 0 and english_chars / total_chars < 0.3:
+                display_name = generate_display_name_from_id(contest_id)
+            else:
+                display_name = name
+                if "AtCoder Beginner Contest" in name:
+                    match = re.search(r"AtCoder Beginner Contest (\d+)", name)
+                    if match:
+                        display_name = f"Beginner Contest {match.group(1)} (ABC)"
+                elif "AtCoder Regular Contest" in name:
+                    match = re.search(r"AtCoder Regular Contest (\d+)", name)
+                    if match:
+                        display_name = f"Regular Contest {match.group(1)} (ARC)"
+                elif "AtCoder Grand Contest" in name:
+                    match = re.search(r"AtCoder Grand Contest (\d+)", name)
+                    if match:
+                        display_name = f"Grand Contest {match.group(1)} (AGC)"
+                elif "AtCoder Heuristic Contest" in name:
+                    match = re.search(r"AtCoder Heuristic Contest (\d+)", name)
+                    if match:
+                        display_name = f"Heuristic Contest {match.group(1)} (AHC)"

+            contests.append(
+                ContestSummary(id=contest_id, name=name, display_name=display_name)
+            )
+
+        return contests

     max_pages = get_max_pages()
     page_results = {}
     with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
         future_to_page = {
-            executor.submit(scrape_page_with_retry, page): page
-            for page in range(1, max_pages + 1)
+            executor.submit(scrape_page, page): page for page in range(1, max_pages + 1)
         }
         for future in concurrent.futures.as_completed(future_to_page):
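Two details worth noting in the rewrite above: `make_request` is defined inside `scrape_page` so the decorators' logging lambdas can close over `page`, and the surrounding `try/except` preserves the old give-up behavior by returning an empty list for a page once `backoff` exhausts its four tries.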

(CSES scraper module)

@@ -1,12 +1,11 @@
 #!/usr/bin/env python3

 import json
-import random
 import re
 import sys
-import time
 from dataclasses import asdict

+import backoff
 import requests
 from bs4 import BeautifulSoup, Tag
@@ -41,36 +40,29 @@ def denormalize_category_name(category_id: str) -> str:
     return category_map.get(category_id, category_id.replace("_", " ").title())


-def request_with_retry(
-    url: str, headers: dict, max_retries: int = 3
-) -> requests.Response:
-    for attempt in range(max_retries):
-        try:
-            delay = 0.5 + random.uniform(0, 0.3)
-            time.sleep(delay)
-
-            response = requests.get(url, headers=headers, timeout=10)
-
-            if response.status_code == 429:
-                backoff = (2**attempt) + random.uniform(0, 1)
-                print(f"Rate limited, retrying in {backoff:.1f}s", file=sys.stderr)
-                time.sleep(backoff)
-                continue
-
-            response.raise_for_status()
-            return response
-
-        except requests.exceptions.RequestException as e:
-            if attempt == max_retries - 1:
-                raise
-            backoff = 2**attempt
-            print(
-                f"Request failed (attempt {attempt + 1}), retrying in {backoff}s: {e}",
-                file=sys.stderr,
-            )
-            time.sleep(backoff)
-
-    raise Exception("All retry attempts failed")
+@backoff.on_exception(
+    backoff.expo,
+    (requests.exceptions.RequestException, requests.exceptions.HTTPError),
+    max_tries=4,
+    jitter=backoff.random_jitter,
+    on_backoff=lambda details: print(
+        f"Request failed (attempt {details['tries']}), retrying in {details['wait']:.1f}s: {details['exception']}",
+        file=sys.stderr,
+    ),
+)
+@backoff.on_predicate(
+    backoff.expo,
+    lambda response: response.status_code == 429,
+    max_tries=4,
+    jitter=backoff.random_jitter,
+    on_backoff=lambda details: print(
+        f"Rate limited, retrying in {details['wait']:.1f}s", file=sys.stderr
+    ),
+)
+def make_request(url: str, headers: dict) -> requests.Response:
+    response = requests.get(url, headers=headers, timeout=10)
+    response.raise_for_status()
+    return response


 def scrape_category_problems(category_id: str) -> list[ProblemSummary]:
@@ -82,7 +74,7 @@ def scrape_category_problems(category_id: str) -> list[ProblemSummary]:
         "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
     }

-    response = request_with_retry(problemset_url, headers)
+    response = make_request(problemset_url, headers)

     soup = BeautifulSoup(response.text, "html.parser")
@@ -176,7 +168,7 @@ def scrape_categories() -> list[ContestSummary]:
     headers = {
         "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
     }
-    response = request_with_retry("https://cses.fi/problemset/", headers)
+    response = make_request("https://cses.fi/problemset/", headers)

     soup = BeautifulSoup(response.text, "html.parser")
     categories = []
@@ -193,7 +185,7 @@ def scrape_categories() -> list[ContestSummary]:
             if ul:
                 problem_count = len(ul.find_all("li", class_="task"))

-            display_name = f"{category_name} ({problem_count} problems)"
+            display_name = category_name

             categories.append(
                 ContestSummary(
@@ -323,7 +315,7 @@ def scrape(url: str) -> list[TestCase]:
         "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
     }

-    response = request_with_retry(url, headers)
+    response = make_request(url, headers)

     soup = BeautifulSoup(response.text, "html.parser")

(AtCoder scraper tests)

@@ -94,7 +94,6 @@ def test_scrape_contests_success(mocker):
         return mock_response

     mocker.patch("scrapers.atcoder.requests.get", side_effect=mock_get_side_effect)
-    mocker.patch("scrapers.atcoder.time.sleep")

     result = scrape_contests()
@@ -116,7 +115,6 @@ def test_scrape_contests_no_table(mocker):
     mock_response.text = "<html><body>No table found</body></html>"

     mocker.patch("scrapers.atcoder.requests.get", return_value=mock_response)
-    mocker.patch("scrapers.atcoder.time.sleep")

     result = scrape_contests()
@@ -127,7 +125,6 @@ def test_scrape_contests_network_error(mocker):
     mocker.patch(
         "scrapers.atcoder.requests.get", side_effect=Exception("Network error")
     )
-    mocker.patch("scrapers.atcoder.time.sleep")

     result = scrape_contests()
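The `mocker.patch("scrapers.atcoder.time.sleep")` lines are dropped because the module no longer imports `time`; any waiting now happens inside `backoff`. None of the tests above reach a retry, but one that did would actually sleep. A hypothetical way to keep such a test fast, assuming backoff 2.x runs its synchronous retry loop in `backoff._sync` (verify against the installed version):

import backoff


def test_retry_does_not_wait(mocker):  # mocker comes from pytest-mock
    # Stub the sleep inside backoff's synchronous retry loop so retries are
    # instantaneous. The patch target is an assumption about backoff 2.x
    # internals, not a public API.
    sleep = mocker.patch("backoff._sync.time.sleep")

    attempts = {"n": 0}

    @backoff.on_predicate(backoff.expo, lambda ok: not ok, max_tries=3)
    def flaky() -> bool:
        attempts["n"] += 1
        return attempts["n"] >= 3  # succeeds on the third try

    assert flaky() is True
    assert sleep.call_count == 2  # two waits preceded the third attempt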

(CSES scraper tests)

@@ -168,12 +168,12 @@ def test_scrape_categories_success(mocker):
     assert result[0] == ContestSummary(
         id="introductory_problems",
         name="Introductory Problems",
-        display_name="Introductory Problems (2 problems)",
+        display_name="Introductory Problems",
     )
     assert result[1] == ContestSummary(
         id="sorting_and_searching",
         name="Sorting and Searching",
-        display_name="Sorting and Searching (3 problems)",
+        display_name="Sorting and Searching",
     )

uv.lock (generated)

@@ -2,6 +2,15 @@ version = 1
 revision = 3
 requires-python = ">=3.11"

+[[package]]
+name = "backoff"
+version = "2.2.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/47/d7/5bbeb12c44d7c4f2fb5b56abce497eb5ed9f34d85701de869acedd602619/backoff-2.2.1.tar.gz", hash = "sha256:03f829f5bb1923180821643f8753b0502c3b682293992485b0eef2807afa5cba", size = 17001, upload-time = "2022-10-05T19:19:32.061Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/df/73/b6e24bd22e6720ca8ee9a85a0c4a2971af8497d8f3193fa05390cbd46e09/backoff-2.2.1-py3-none-any.whl", hash = "sha256:63579f9a0628e06278f7e47b7d7d5b6ce20dc65c5e96a6f3ca99a6adca0396e8", size = 15148, upload-time = "2022-10-05T19:19:30.546Z" },
+]
+
 [[package]]
 name = "beautifulsoup4"
 version = "4.13.5"
@@ -375,6 +384,7 @@ name = "scrapers"
 version = "0.1.0"
 source = { virtual = "." }
 dependencies = [
+    { name = "backoff" },
     { name = "beautifulsoup4" },
     { name = "cloudscraper" },
     { name = "requests" },
@@ -392,6 +402,7 @@ dev = [

 [package.metadata]
 requires-dist = [
+    { name = "backoff", specifier = ">=2.2.1" },
     { name = "beautifulsoup4", specifier = ">=4.13.5" },
     { name = "cloudscraper", specifier = ">=1.2.71" },
     { name = "requests", specifier = ">=2.32.5" },