feat(scraper): use backoff

This commit is contained in:
Barrett Ruth 2025-09-21 11:26:54 -04:00
parent 58f9be5f9a
commit 46c615416f
6 changed files with 186 additions and 193 deletions

View file

@ -5,6 +5,7 @@ description = "Add your description here"
readme = "README.md" readme = "README.md"
requires-python = ">=3.11" requires-python = ">=3.11"
dependencies = [ dependencies = [
"backoff>=2.2.1",
"beautifulsoup4>=4.13.5", "beautifulsoup4>=4.13.5",
"cloudscraper>=1.2.71", "cloudscraper>=1.2.71",
"requests>=2.32.5", "requests>=2.32.5",

View file

@ -3,9 +3,9 @@
import json import json
import re import re
import sys import sys
import time
from dataclasses import asdict from dataclasses import asdict
import backoff
import requests import requests
from bs4 import BeautifulSoup, Tag from bs4 import BeautifulSoup, Tag
@ -169,7 +169,6 @@ def scrape(url: str) -> list[TestCase]:
def scrape_contests() -> list[ContestSummary]: def scrape_contests() -> list[ContestSummary]:
import concurrent.futures import concurrent.futures
import random
def get_max_pages() -> int: def get_max_pages() -> int:
try: try:
@ -197,25 +196,40 @@ def scrape_contests() -> list[ContestSummary]:
except Exception: except Exception:
return 15 return 15
def scrape_page_with_retry(page: int, max_retries: int = 3) -> list[ContestSummary]: def scrape_page(page: int) -> list[ContestSummary]:
for attempt in range(max_retries): @backoff.on_exception(
try: backoff.expo,
(requests.exceptions.RequestException, requests.exceptions.HTTPError),
max_tries=4,
jitter=backoff.random_jitter,
on_backoff=lambda details: print(
f"Request failed on page {page} (attempt {details['tries']}), retrying in {details['wait']:.1f}s: {details['exception']}",
file=sys.stderr,
),
)
@backoff.on_predicate(
backoff.expo,
lambda response: response.status_code == 429,
max_tries=4,
jitter=backoff.random_jitter,
on_backoff=lambda details: print(
f"Rate limited on page {page}, retrying in {details['wait']:.1f}s",
file=sys.stderr,
),
)
def make_request() -> requests.Response:
headers = { headers = {
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
} }
url = f"https://atcoder.jp/contests/archive?page={page}" url = f"https://atcoder.jp/contests/archive?page={page}"
response = requests.get(url, headers=headers, timeout=10) response = requests.get(url, headers=headers, timeout=10)
if response.status_code == 429:
backoff_time = (2**attempt) + random.uniform(0, 1)
print(
f"Rate limited on page {page}, retrying in {backoff_time:.1f}s",
file=sys.stderr,
)
time.sleep(backoff_time)
continue
response.raise_for_status() response.raise_for_status()
return response
try:
response = make_request()
except Exception:
return []
soup = BeautifulSoup(response.text, "html.parser") soup = BeautifulSoup(response.text, "html.parser")
table = soup.find("table", class_="table") table = soup.find("table", class_="table")
@ -311,9 +325,7 @@ def scrape_contests() -> list[ContestSummary]:
if "AtCoder Beginner Contest" in name: if "AtCoder Beginner Contest" in name:
match = re.search(r"AtCoder Beginner Contest (\d+)", name) match = re.search(r"AtCoder Beginner Contest (\d+)", name)
if match: if match:
display_name = ( display_name = f"Beginner Contest {match.group(1)} (ABC)"
f"Beginner Contest {match.group(1)} (ABC)"
)
elif "AtCoder Regular Contest" in name: elif "AtCoder Regular Contest" in name:
match = re.search(r"AtCoder Regular Contest (\d+)", name) match = re.search(r"AtCoder Regular Contest (\d+)", name)
if match: if match:
@ -325,40 +337,20 @@ def scrape_contests() -> list[ContestSummary]:
elif "AtCoder Heuristic Contest" in name: elif "AtCoder Heuristic Contest" in name:
match = re.search(r"AtCoder Heuristic Contest (\d+)", name) match = re.search(r"AtCoder Heuristic Contest (\d+)", name)
if match: if match:
display_name = ( display_name = f"Heuristic Contest {match.group(1)} (AHC)"
f"Heuristic Contest {match.group(1)} (AHC)"
)
contests.append( contests.append(
ContestSummary( ContestSummary(id=contest_id, name=name, display_name=display_name)
id=contest_id, name=name, display_name=display_name
)
) )
return contests return contests
except requests.exceptions.RequestException as e:
if response.status_code == 429:
continue
print(
f"Failed to scrape page {page} (attempt {attempt + 1}): {e}",
file=sys.stderr,
)
if attempt == max_retries - 1:
return []
except Exception as e:
print(f"Unexpected error on page {page}: {e}", file=sys.stderr)
return []
return []
max_pages = get_max_pages() max_pages = get_max_pages()
page_results = {} page_results = {}
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor: with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
future_to_page = { future_to_page = {
executor.submit(scrape_page_with_retry, page): page executor.submit(scrape_page, page): page for page in range(1, max_pages + 1)
for page in range(1, max_pages + 1)
} }
for future in concurrent.futures.as_completed(future_to_page): for future in concurrent.futures.as_completed(future_to_page):

View file

@ -1,12 +1,11 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import json import json
import random
import re import re
import sys import sys
import time
from dataclasses import asdict from dataclasses import asdict
import backoff
import requests import requests
from bs4 import BeautifulSoup, Tag from bs4 import BeautifulSoup, Tag
@ -41,37 +40,30 @@ def denormalize_category_name(category_id: str) -> str:
return category_map.get(category_id, category_id.replace("_", " ").title()) return category_map.get(category_id, category_id.replace("_", " ").title())
def _log_request_failure(details: dict) -> None:
    """Report a failed attempt that ``backoff.on_exception`` is about to retry."""
    print(
        f"Request failed (attempt {details['tries']}), retrying in {details['wait']:.1f}s: {details['exception']}",
        file=sys.stderr,
    )


def _log_rate_limit(details: dict) -> None:
    """Report a 429 response that ``backoff.on_predicate`` is about to retry."""
    print(f"Rate limited, retrying in {details['wait']:.1f}s", file=sys.stderr)


# HTTPError is a subclass of RequestException, so listing only the base class
# covers both.
@backoff.on_exception(
    backoff.expo,
    requests.exceptions.RequestException,
    max_tries=4,
    jitter=backoff.random_jitter,
    on_backoff=_log_request_failure,
)
@backoff.on_predicate(
    backoff.expo,
    lambda response: response.status_code == 429,
    max_tries=4,
    jitter=backoff.random_jitter,
    on_backoff=_log_rate_limit,
)
def _fetch(url: str, headers: dict) -> requests.Response:
    """GET *url* once; the decorators add the retry behaviour.

    A 429 response is returned rather than raised so that the
    ``on_predicate`` decorator can actually observe it. Calling
    ``raise_for_status()`` unconditionally would turn every 429 into an
    ``HTTPError`` before the predicate is evaluated, leaving the rate-limit
    retry path unreachable.
    """
    response = requests.get(url, headers=headers, timeout=10)
    if response.status_code != 429:
        response.raise_for_status()
    return response


def make_request(url: str, headers: dict) -> requests.Response:
    """Fetch *url* with exponential backoff on errors and rate limiting.

    Args:
        url: Address to request.
        headers: HTTP headers sent with the request.

    Returns:
        The successful response.

    Raises:
        requests.exceptions.RequestException: If the request keeps failing
            after all retries, including a response that is still 429 once
            the rate-limit retries are exhausted.
    """
    response = _fetch(url, headers)
    # on_predicate hands back the last response when it gives up; surface a
    # lingering 429 as an error instead of letting callers parse its body.
    response.raise_for_status()
    return response
def scrape_category_problems(category_id: str) -> list[ProblemSummary]: def scrape_category_problems(category_id: str) -> list[ProblemSummary]:
category_name = denormalize_category_name(category_id) category_name = denormalize_category_name(category_id)
@ -82,7 +74,7 @@ def scrape_category_problems(category_id: str) -> list[ProblemSummary]:
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
} }
response = request_with_retry(problemset_url, headers) response = make_request(problemset_url, headers)
soup = BeautifulSoup(response.text, "html.parser") soup = BeautifulSoup(response.text, "html.parser")
@ -176,7 +168,7 @@ def scrape_categories() -> list[ContestSummary]:
headers = { headers = {
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
} }
response = request_with_retry("https://cses.fi/problemset/", headers) response = make_request("https://cses.fi/problemset/", headers)
soup = BeautifulSoup(response.text, "html.parser") soup = BeautifulSoup(response.text, "html.parser")
categories = [] categories = []
@ -193,7 +185,7 @@ def scrape_categories() -> list[ContestSummary]:
if ul: if ul:
problem_count = len(ul.find_all("li", class_="task")) problem_count = len(ul.find_all("li", class_="task"))
display_name = f"{category_name} ({problem_count} problems)" display_name = category_name
categories.append( categories.append(
ContestSummary( ContestSummary(
@ -323,7 +315,7 @@ def scrape(url: str) -> list[TestCase]:
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
} }
response = request_with_retry(url, headers) response = make_request(url, headers)
soup = BeautifulSoup(response.text, "html.parser") soup = BeautifulSoup(response.text, "html.parser")

View file

@ -94,7 +94,6 @@ def test_scrape_contests_success(mocker):
return mock_response return mock_response
mocker.patch("scrapers.atcoder.requests.get", side_effect=mock_get_side_effect) mocker.patch("scrapers.atcoder.requests.get", side_effect=mock_get_side_effect)
mocker.patch("scrapers.atcoder.time.sleep")
result = scrape_contests() result = scrape_contests()
@ -116,7 +115,6 @@ def test_scrape_contests_no_table(mocker):
mock_response.text = "<html><body>No table found</body></html>" mock_response.text = "<html><body>No table found</body></html>"
mocker.patch("scrapers.atcoder.requests.get", return_value=mock_response) mocker.patch("scrapers.atcoder.requests.get", return_value=mock_response)
mocker.patch("scrapers.atcoder.time.sleep")
result = scrape_contests() result = scrape_contests()
@ -127,7 +125,6 @@ def test_scrape_contests_network_error(mocker):
mocker.patch( mocker.patch(
"scrapers.atcoder.requests.get", side_effect=Exception("Network error") "scrapers.atcoder.requests.get", side_effect=Exception("Network error")
) )
mocker.patch("scrapers.atcoder.time.sleep")
result = scrape_contests() result = scrape_contests()

View file

@ -168,12 +168,12 @@ def test_scrape_categories_success(mocker):
assert result[0] == ContestSummary( assert result[0] == ContestSummary(
id="introductory_problems", id="introductory_problems",
name="Introductory Problems", name="Introductory Problems",
display_name="Introductory Problems (2 problems)", display_name="Introductory Problems",
) )
assert result[1] == ContestSummary( assert result[1] == ContestSummary(
id="sorting_and_searching", id="sorting_and_searching",
name="Sorting and Searching", name="Sorting and Searching",
display_name="Sorting and Searching (3 problems)", display_name="Sorting and Searching",
) )

11
uv.lock generated
View file

@ -2,6 +2,15 @@ version = 1
revision = 3 revision = 3
requires-python = ">=3.11" requires-python = ">=3.11"
[[package]]
name = "backoff"
version = "2.2.1"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/47/d7/5bbeb12c44d7c4f2fb5b56abce497eb5ed9f34d85701de869acedd602619/backoff-2.2.1.tar.gz", hash = "sha256:03f829f5bb1923180821643f8753b0502c3b682293992485b0eef2807afa5cba", size = 17001, upload-time = "2022-10-05T19:19:32.061Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/df/73/b6e24bd22e6720ca8ee9a85a0c4a2971af8497d8f3193fa05390cbd46e09/backoff-2.2.1-py3-none-any.whl", hash = "sha256:63579f9a0628e06278f7e47b7d7d5b6ce20dc65c5e96a6f3ca99a6adca0396e8", size = 15148, upload-time = "2022-10-05T19:19:30.546Z" },
]
[[package]] [[package]]
name = "beautifulsoup4" name = "beautifulsoup4"
version = "4.13.5" version = "4.13.5"
@ -375,6 +384,7 @@ name = "scrapers"
version = "0.1.0" version = "0.1.0"
source = { virtual = "." } source = { virtual = "." }
dependencies = [ dependencies = [
{ name = "backoff" },
{ name = "beautifulsoup4" }, { name = "beautifulsoup4" },
{ name = "cloudscraper" }, { name = "cloudscraper" },
{ name = "requests" }, { name = "requests" },
@ -392,6 +402,7 @@ dev = [
[package.metadata] [package.metadata]
requires-dist = [ requires-dist = [
{ name = "backoff", specifier = ">=2.2.1" },
{ name = "beautifulsoup4", specifier = ">=4.13.5" }, { name = "beautifulsoup4", specifier = ">=4.13.5" },
{ name = "cloudscraper", specifier = ">=1.2.71" }, { name = "cloudscraper", specifier = ">=1.2.71" },
{ name = "requests", specifier = ">=2.32.5" }, { name = "requests", specifier = ">=2.32.5" },