From 46c615416f559237a555f4c61363ea5846fc719c Mon Sep 17 00:00:00 2001
From: Barrett Ruth
Date: Sun, 21 Sep 2025 11:26:54 -0400
Subject: [PATCH] feat(scraper): use backoff

---
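Notes: both scrapers previously hand-rolled the same retry loop (sleep,
exponential backoff, special-casing HTTP 429). This patch replaces those
loops with declarative `backoff` decorators: `on_exception` retries when
the wrapped call raises, and `on_predicate` retries while the predicate
on the return value stays truthy (here, a 429 status). A minimal sketch
of the pattern follows; the `fetch` name and the 429 guard are
illustrative, not lines from this diff. One caveat worth knowing:
`raise_for_status()` raises `HTTPError` on a 429, so without such a
guard the `on_exception` decorator handles rate limits before
`on_predicate` ever sees the response.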
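    import backoff
    import requests

    @backoff.on_exception(
        backoff.expo,
        requests.exceptions.RequestException,  # HTTPError is a subclass
        max_tries=4,
        jitter=backoff.random_jitter,
    )
    @backoff.on_predicate(
        backoff.expo,
        lambda response: response.status_code == 429,  # retry while rate limited
        max_tries=4,
        jitter=backoff.random_jitter,
    )
    def fetch(url: str) -> requests.Response:  # illustrative name, not in the diff
        response = requests.get(url, timeout=10)
        if response.status_code != 429:
            # let on_predicate see the 429 instead of raising HTTPError here
            response.raise_for_status()
        return response
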
 pyproject.toml                 |   1 +
 scrapers/atcoder.py            | 296 ++++++++++++++++-----------------
 scrapers/cses.py               |  64 ++++---
 tests/scrapers/test_atcoder.py |   3 -
 tests/scrapers/test_cses.py    |   4 +-
 uv.lock                        |  11 ++
 6 files changed, 186 insertions(+), 193 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 5c731b5..92c1cbe 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,6 +5,7 @@ description = "Add your description here"
 readme = "README.md"
 requires-python = ">=3.11"
 dependencies = [
+    "backoff>=2.2.1",
     "beautifulsoup4>=4.13.5",
     "cloudscraper>=1.2.71",
     "requests>=2.32.5",
diff --git a/scrapers/atcoder.py b/scrapers/atcoder.py
index 3dc1d16..eef91ff 100644
--- a/scrapers/atcoder.py
+++ b/scrapers/atcoder.py
@@ -3,9 +3,9 @@
 import json
 import re
 import sys
-import time
 
 from dataclasses import asdict
 
+import backoff
 import requests
 from bs4 import BeautifulSoup, Tag
@@ -169,7 +169,6 @@ def scrape(url: str) -> list[TestCase]:
 
 def scrape_contests() -> list[ContestSummary]:
     import concurrent.futures
-    import random
 
     def get_max_pages() -> int:
         try:
@@ -197,168 +196,161 @@ def scrape_contests() -> list[ContestSummary]:
         except Exception:
             return 15
 
-    def scrape_page_with_retry(page: int, max_retries: int = 3) -> list[ContestSummary]:
-        for attempt in range(max_retries):
+    def scrape_page(page: int) -> list[ContestSummary]:
+        @backoff.on_exception(
+            backoff.expo,
+            (requests.exceptions.RequestException, requests.exceptions.HTTPError),
+            max_tries=4,
+            jitter=backoff.random_jitter,
+            on_backoff=lambda details: print(
+                f"Request failed on page {page} (attempt {details['tries']}), retrying in {details['wait']:.1f}s: {details['exception']}",
+                file=sys.stderr,
+            ),
+        )
+        @backoff.on_predicate(
+            backoff.expo,
+            lambda response: response.status_code == 429,
+            max_tries=4,
+            jitter=backoff.random_jitter,
+            on_backoff=lambda details: print(
+                f"Rate limited on page {page}, retrying in {details['wait']:.1f}s",
+                file=sys.stderr,
+            ),
+        )
+        def make_request() -> requests.Response:
+            headers = {
+                "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
+            }
+            url = f"https://atcoder.jp/contests/archive?page={page}"
+            response = requests.get(url, headers=headers, timeout=10)
+            response.raise_for_status()
+            return response
+
+        try:
+            response = make_request()
+        except Exception:
+            return []
+
+        soup = BeautifulSoup(response.text, "html.parser")
+        table = soup.find("table", class_="table")
+        if not table:
+            return []
+
+        tbody = table.find("tbody")
+        if not tbody or not isinstance(tbody, Tag):
+            return []
+
+        rows = tbody.find_all("tr")
+        if not rows:
+            return []
+
+        contests = []
+        for row in rows:
+            cells = row.find_all("td")
+            if len(cells) < 2:
+                continue
+
+            contest_cell = cells[1]
+            link = contest_cell.find("a")
+            if not link or not link.get("href"):
+                continue
+
+            href = link.get("href")
+            contest_id = href.split("/")[-1]
+            name = link.get_text().strip()
+
             try:
-                headers = {
-                    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
-                }
-                url = f"https://atcoder.jp/contests/archive?page={page}"
-                response = requests.get(url, headers=headers, timeout=10)
+                name = name.encode().decode("unicode_escape")
+            except (UnicodeDecodeError, UnicodeEncodeError):
+                pass
 
-                if response.status_code == 429:
-                    backoff_time = (2**attempt) + random.uniform(0, 1)
-                    print(
-                        f"Rate limited on page {page}, retrying in {backoff_time:.1f}s",
-                        file=sys.stderr,
-                    )
-                    time.sleep(backoff_time)
-                    continue
+            name = (
+                name.replace("\uff08", "(")
+                .replace("\uff09", ")")
+                .replace("\u3000", " ")
+            )
+            name = re.sub(
+                r"[\uff01-\uff5e]", lambda m: chr(ord(m.group()) - 0xFEE0), name
+            )
 
-                response.raise_for_status()
+            def generate_display_name_from_id(contest_id: str) -> str:
+                parts = contest_id.replace("-", " ").replace("_", " ")
 
-                soup = BeautifulSoup(response.text, "html.parser")
-                table = soup.find("table", class_="table")
-                if not table:
-                    return []
-
-                tbody = table.find("tbody")
-                if not tbody or not isinstance(tbody, Tag):
-                    return []
-
-                rows = tbody.find_all("tr")
-                if not rows:
-                    return []
-
-                contests = []
-                for row in rows:
-                    cells = row.find_all("td")
-                    if len(cells) < 2:
-                        continue
-
-                    contest_cell = cells[1]
-                    link = contest_cell.find("a")
-                    if not link or not link.get("href"):
-                        continue
-
-                    href = link.get("href")
-                    contest_id = href.split("/")[-1]
-                    name = link.get_text().strip()
-
-                    try:
-                        name = name.encode().decode("unicode_escape")
-                    except (UnicodeDecodeError, UnicodeEncodeError):
-                        pass
-
-                    name = (
-                        name.replace("\uff08", "(")
-                        .replace("\uff09", ")")
-                        .replace("\u3000", " ")
-                    )
-                    name = re.sub(
-                        r"[\uff01-\uff5e]", lambda m: chr(ord(m.group()) - 0xFEE0), name
-                    )
-
-                    def generate_display_name_from_id(contest_id: str) -> str:
-                        parts = contest_id.replace("-", " ").replace("_", " ")
-
-                        parts = re.sub(
-                            r"\b(jsc|JSC)\b",
-                            "Japanese Student Championship",
-                            parts,
-                            flags=re.IGNORECASE,
-                        )
-                        parts = re.sub(
-                            r"\b(wtf|WTF)\b",
-                            "World Tour Finals",
-                            parts,
-                            flags=re.IGNORECASE,
-                        )
-                        parts = re.sub(
-                            r"\b(ahc)(\d+)\b",
-                            r"Heuristic Contest \2 (AHC)",
-                            parts,
-                            flags=re.IGNORECASE,
-                        )
-                        parts = re.sub(
-                            r"\b(arc)(\d+)\b",
-                            r"Regular Contest \2 (ARC)",
-                            parts,
-                            flags=re.IGNORECASE,
-                        )
-                        parts = re.sub(
-                            r"\b(abc)(\d+)\b",
-                            r"Beginner Contest \2 (ABC)",
-                            parts,
-                            flags=re.IGNORECASE,
-                        )
-                        parts = re.sub(
-                            r"\b(agc)(\d+)\b",
-                            r"Grand Contest \2 (AGC)",
-                            parts,
-                            flags=re.IGNORECASE,
-                        )
-
-                        return parts.title()
-
-                    english_chars = sum(1 for c in name if c.isascii() and c.isalpha())
-                    total_chars = len(re.sub(r"\s+", "", name))
-
-                    if total_chars > 0 and english_chars / total_chars < 0.3:
-                        display_name = generate_display_name_from_id(contest_id)
-                    else:
-                        display_name = name
-                        if "AtCoder Beginner Contest" in name:
-                            match = re.search(r"AtCoder Beginner Contest (\d+)", name)
-                            if match:
-                                display_name = (
-                                    f"Beginner Contest {match.group(1)} (ABC)"
-                                )
-                        elif "AtCoder Regular Contest" in name:
-                            match = re.search(r"AtCoder Regular Contest (\d+)", name)
-                            if match:
-                                display_name = f"Regular Contest {match.group(1)} (ARC)"
-                        elif "AtCoder Grand Contest" in name:
-                            match = re.search(r"AtCoder Grand Contest (\d+)", name)
-                            if match:
-                                display_name = f"Grand Contest {match.group(1)} (AGC)"
-                        elif "AtCoder Heuristic Contest" in name:
-                            match = re.search(r"AtCoder Heuristic Contest (\d+)", name)
-                            if match:
-                                display_name = (
-                                    f"Heuristic Contest {match.group(1)} (AHC)"
-                                )
-
-                    contests.append(
-                        ContestSummary(
-                            id=contest_id, name=name, display_name=display_name
-                        )
-                    )
-
-                return contests
-
-            except requests.exceptions.RequestException as e:
-                if response.status_code == 429:
-                    continue
-                print(
-                    f"Failed to scrape page {page} (attempt {attempt + 1}): {e}",
-                    file=sys.stderr,
+                parts = re.sub(
+                    r"\b(jsc|JSC)\b",
+                    "Japanese Student Championship",
+                    parts,
+                    flags=re.IGNORECASE,
+                )
+                parts = re.sub(
+                    r"\b(wtf|WTF)\b",
+                    "World Tour Finals",
+                    parts,
+                    flags=re.IGNORECASE,
+                )
+                parts = re.sub(
+                    r"\b(ahc)(\d+)\b",
+                    r"Heuristic Contest \2 (AHC)",
+                    parts,
+                    flags=re.IGNORECASE,
+                )
+                parts = re.sub(
+                    r"\b(arc)(\d+)\b",
+                    r"Regular Contest \2 (ARC)",
+                    parts,
+                    flags=re.IGNORECASE,
+                )
+                parts = re.sub(
+                    r"\b(abc)(\d+)\b",
+                    r"Beginner Contest \2 (ABC)",
+                    parts,
+                    flags=re.IGNORECASE,
+                )
+                parts = re.sub(
+                    r"\b(agc)(\d+)\b",
+                    r"Grand Contest \2 (AGC)",
+                    parts,
+                    flags=re.IGNORECASE,
+                )
-                )
-                if attempt == max_retries - 1:
-                    return []
-            except Exception as e:
-                print(f"Unexpected error on page {page}: {e}", file=sys.stderr)
-                return []
-        return []
+
+                return parts.title()
+
+            english_chars = sum(1 for c in name if c.isascii() and c.isalpha())
+            total_chars = len(re.sub(r"\s+", "", name))
+
+            if total_chars > 0 and english_chars / total_chars < 0.3:
+                display_name = generate_display_name_from_id(contest_id)
+            else:
+                display_name = name
+                if "AtCoder Beginner Contest" in name:
+                    match = re.search(r"AtCoder Beginner Contest (\d+)", name)
+                    if match:
+                        display_name = f"Beginner Contest {match.group(1)} (ABC)"
+                elif "AtCoder Regular Contest" in name:
+                    match = re.search(r"AtCoder Regular Contest (\d+)", name)
+                    if match:
+                        display_name = f"Regular Contest {match.group(1)} (ARC)"
+                elif "AtCoder Grand Contest" in name:
+                    match = re.search(r"AtCoder Grand Contest (\d+)", name)
+                    if match:
+                        display_name = f"Grand Contest {match.group(1)} (AGC)"
+                elif "AtCoder Heuristic Contest" in name:
+                    match = re.search(r"AtCoder Heuristic Contest (\d+)", name)
+                    if match:
+                        display_name = f"Heuristic Contest {match.group(1)} (AHC)"
+
+            contests.append(
+                ContestSummary(id=contest_id, name=name, display_name=display_name)
+            )
+
+        return contests
 
     max_pages = get_max_pages()
     page_results = {}
 
     with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
         future_to_page = {
-            executor.submit(scrape_page_with_retry, page): page
-            for page in range(1, max_pages + 1)
+            executor.submit(scrape_page, page): page for page in range(1, max_pages + 1)
         }
 
         for future in concurrent.futures.as_completed(future_to_page):
diff --git a/scrapers/cses.py b/scrapers/cses.py
index b2f1733..5393e85 100755
--- a/scrapers/cses.py
+++ b/scrapers/cses.py
@@ -1,12 +1,11 @@
 #!/usr/bin/env python3
 
 import json
-import random
 import re
 import sys
-import time
 
 from dataclasses import asdict
 
+import backoff
 import requests
 from bs4 import BeautifulSoup, Tag
@@ -41,36 +40,29 @@ def denormalize_category_name(category_id: str) -> str:
     return category_map.get(category_id, category_id.replace("_", " ").title())
 
 
-def request_with_retry(
-    url: str, headers: dict, max_retries: int = 3
-) -> requests.Response:
-    for attempt in range(max_retries):
-        try:
-            delay = 0.5 + random.uniform(0, 0.3)
-            time.sleep(delay)
-
-            response = requests.get(url, headers=headers, timeout=10)
-
-            if response.status_code == 429:
-                backoff = (2**attempt) + random.uniform(0, 1)
-                print(f"Rate limited, retrying in {backoff:.1f}s", file=sys.stderr)
-                time.sleep(backoff)
-                continue
-
-            response.raise_for_status()
-            return response
-
-        except requests.exceptions.RequestException as e:
-            if attempt == max_retries - 1:
-                raise
-            backoff = 2**attempt
-            print(
-                f"Request failed (attempt {attempt + 1}), retrying in {backoff}s: {e}",
-                file=sys.stderr,
-            )
-            time.sleep(backoff)
-
-    raise Exception("All retry attempts failed")
+@backoff.on_exception(
+    backoff.expo,
+    (requests.exceptions.RequestException, requests.exceptions.HTTPError),
+    max_tries=4,
+    jitter=backoff.random_jitter,
+    on_backoff=lambda details: print(
+        f"Request failed (attempt {details['tries']}), retrying in {details['wait']:.1f}s: {details['exception']}",
+        file=sys.stderr,
+    ),
+)
+@backoff.on_predicate(
+    backoff.expo,
+    lambda response: response.status_code == 429,
+    max_tries=4,
+    jitter=backoff.random_jitter,
+    on_backoff=lambda details: print(
+        f"Rate limited, retrying in {details['wait']:.1f}s", file=sys.stderr
+    ),
+)
+def make_request(url: str, headers: dict) -> requests.Response:
+    response = requests.get(url, headers=headers, timeout=10)
+    response.raise_for_status()
+    return response
 
 
 def scrape_category_problems(category_id: str) -> list[ProblemSummary]:
@@ -82,7 +74,7 @@ def scrape_category_problems(category_id: str) -> list[ProblemSummary]:
         "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
     }
 
-    response = request_with_retry(problemset_url, headers)
+    response = make_request(problemset_url, headers)
 
     soup = BeautifulSoup(response.text, "html.parser")
 
@@ -176,7 +168,7 @@ def scrape_categories() -> list[ContestSummary]:
     headers = {
         "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
     }
-    response = request_with_retry("https://cses.fi/problemset/", headers)
+    response = make_request("https://cses.fi/problemset/", headers)
 
     soup = BeautifulSoup(response.text, "html.parser")
     categories = []
@@ -193,7 +185,7 @@
         if ul:
             problem_count = len(ul.find_all("li", class_="task"))
 
-        display_name = f"{category_name} ({problem_count} problems)"
+        display_name = category_name
 
         categories.append(
             ContestSummary(
@@ -323,7 +315,7 @@ def scrape(url: str) -> list[TestCase]:
     headers = {
         "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
     }
 
-    response = request_with_retry(url, headers)
+    response = make_request(url, headers)
 
     soup = BeautifulSoup(response.text, "html.parser")
diff --git a/tests/scrapers/test_atcoder.py b/tests/scrapers/test_atcoder.py
index 5ff91d9..a2a88e5 100644
--- a/tests/scrapers/test_atcoder.py
+++ b/tests/scrapers/test_atcoder.py
@@ -94,7 +94,6 @@ def test_scrape_contests_success(mocker):
         return mock_response
 
     mocker.patch("scrapers.atcoder.requests.get", side_effect=mock_get_side_effect)
-    mocker.patch("scrapers.atcoder.time.sleep")
 
     result = scrape_contests()
 
@@ -116,7 +115,6 @@ def test_scrape_contests_no_table(mocker):
     mock_response.text = "No table found"
 
     mocker.patch("scrapers.atcoder.requests.get", return_value=mock_response)
-    mocker.patch("scrapers.atcoder.time.sleep")
 
     result = scrape_contests()
 
@@ -127,7 +125,6 @@ def test_scrape_contests_network_error(mocker):
     mocker.patch(
         "scrapers.atcoder.requests.get", side_effect=Exception("Network error")
    )
-    mocker.patch("scrapers.atcoder.time.sleep")
 
     result = scrape_contests()
 
diff --git a/tests/scrapers/test_cses.py b/tests/scrapers/test_cses.py
index a1e84a2..545176d 100644
--- a/tests/scrapers/test_cses.py
+++ b/tests/scrapers/test_cses.py
@@ -168,12 +168,12 @@ def test_scrape_categories_success(mocker):
 
     assert result[0] == ContestSummary(
         id="introductory_problems",
         name="Introductory Problems",
-        display_name="Introductory Problems (2 problems)",
+        display_name="Introductory Problems",
     )
     assert result[1] == ContestSummary(
         id="sorting_and_searching",
         name="Sorting and Searching",
problems)", + display_name="Sorting and Searching", ) diff --git a/uv.lock b/uv.lock index 744b4ae..aa9248d 100644 --- a/uv.lock +++ b/uv.lock @@ -2,6 +2,15 @@ version = 1 revision = 3 requires-python = ">=3.11" +[[package]] +name = "backoff" +version = "2.2.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/47/d7/5bbeb12c44d7c4f2fb5b56abce497eb5ed9f34d85701de869acedd602619/backoff-2.2.1.tar.gz", hash = "sha256:03f829f5bb1923180821643f8753b0502c3b682293992485b0eef2807afa5cba", size = 17001, upload-time = "2022-10-05T19:19:32.061Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/df/73/b6e24bd22e6720ca8ee9a85a0c4a2971af8497d8f3193fa05390cbd46e09/backoff-2.2.1-py3-none-any.whl", hash = "sha256:63579f9a0628e06278f7e47b7d7d5b6ce20dc65c5e96a6f3ca99a6adca0396e8", size = 15148, upload-time = "2022-10-05T19:19:30.546Z" }, +] + [[package]] name = "beautifulsoup4" version = "4.13.5" @@ -375,6 +384,7 @@ name = "scrapers" version = "0.1.0" source = { virtual = "." } dependencies = [ + { name = "backoff" }, { name = "beautifulsoup4" }, { name = "cloudscraper" }, { name = "requests" }, @@ -392,6 +402,7 @@ dev = [ [package.metadata] requires-dist = [ + { name = "backoff", specifier = ">=2.2.1" }, { name = "beautifulsoup4", specifier = ">=4.13.5" }, { name = "cloudscraper", specifier = ">=1.2.71" }, { name = "requests", specifier = ">=2.32.5" },