feat(scraper): use backoff

2025-09-21 11:26:54 -04:00 · 2025-09-21 11:26:54 -04:00 · 46c615416f
commit 46c615416f
parent 58f9be5f9a
6 changed files with 186 additions and 193 deletions
--- a/scrapers/atcoder.py
+++ b/scrapers/atcoder.py
@ -3,9 +3,9 @@
 import json
 import re
 import sys
-import time
 from dataclasses import asdict

+import backoff
 import requests
 from bs4 import BeautifulSoup, Tag

@ -169,7 +169,6 @@ def scrape(url: str) -> list[TestCase]:

 def scrape_contests() -> list[ContestSummary]:
    import concurrent.futures
-    import random

    def get_max_pages() -> int:
        try:
@ -197,168 +196,161 @@ def scrape_contests() -> list[ContestSummary]:
        except Exception:
            return 15

-    def scrape_page_with_retry(page: int, max_retries: int = 3) -> list[ContestSummary]:
-        for attempt in range(max_retries):
+    def scrape_page(page: int) -> list[ContestSummary]:
+        @backoff.on_exception(
+            backoff.expo,
+            (requests.exceptions.RequestException, requests.exceptions.HTTPError),
+            max_tries=4,
+            jitter=backoff.random_jitter,
+            on_backoff=lambda details: print(
+                f"Request failed on page {page} (attempt {details['tries']}), retrying in {details['wait']:.1f}s: {details['exception']}",
+                file=sys.stderr,
+            ),
+        )
+        @backoff.on_predicate(
+            backoff.expo,
+            lambda response: response.status_code == 429,
+            max_tries=4,
+            jitter=backoff.random_jitter,
+            on_backoff=lambda details: print(
+                f"Rate limited on page {page}, retrying in {details['wait']:.1f}s",
+                file=sys.stderr,
+            ),
+        )
+        def make_request() -> requests.Response:
+            headers = {
+                "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
+            }
+            url = f"https://atcoder.jp/contests/archive?page={page}"
+            response = requests.get(url, headers=headers, timeout=10)
+            response.raise_for_status()
+            return response
+
+        try:
+            response = make_request()
+        except Exception:
+            return []
+
+        soup = BeautifulSoup(response.text, "html.parser")
+        table = soup.find("table", class_="table")
+        if not table:
+            return []
+
+        tbody = table.find("tbody")
+        if not tbody or not isinstance(tbody, Tag):
+            return []
+
+        rows = tbody.find_all("tr")
+        if not rows:
+            return []
+
+        contests = []
+        for row in rows:
+            cells = row.find_all("td")
+            if len(cells) < 2:
+                continue
+
+            contest_cell = cells[1]
+            link = contest_cell.find("a")
+            if not link or not link.get("href"):
+                continue
+
+            href = link.get("href")
+            contest_id = href.split("/")[-1]
+            name = link.get_text().strip()
+
            try:
-                headers = {
-                    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
-                }
-                url = f"https://atcoder.jp/contests/archive?page={page}"
-                response = requests.get(url, headers=headers, timeout=10)
+                name = name.encode().decode("unicode_escape")
+            except (UnicodeDecodeError, UnicodeEncodeError):
+                pass

-                if response.status_code == 429:
-                    backoff_time = (2**attempt) + random.uniform(0, 1)
-                    print(
-                        f"Rate limited on page {page}, retrying in {backoff_time:.1f}s",
-                        file=sys.stderr,
-                    )
-                    time.sleep(backoff_time)
-                    continue
+            name = (
+                name.replace("\uff08", "(")
+                .replace("\uff09", ")")
+                .replace("\u3000", " ")
+            )
+            name = re.sub(
+                r"[\uff01-\uff5e]", lambda m: chr(ord(m.group()) - 0xFEE0), name
+            )

-                response.raise_for_status()
+            def generate_display_name_from_id(contest_id: str) -> str:
+                parts = contest_id.replace("-", " ").replace("_", " ")

-                soup = BeautifulSoup(response.text, "html.parser")
-                table = soup.find("table", class_="table")
-                if not table:
-                    return []
-
-                tbody = table.find("tbody")
-                if not tbody or not isinstance(tbody, Tag):
-                    return []
-
-                rows = tbody.find_all("tr")
-                if not rows:
-                    return []
-
-                contests = []
-                for row in rows:
-                    cells = row.find_all("td")
-                    if len(cells) < 2:
-                        continue
-
-                    contest_cell = cells[1]
-                    link = contest_cell.find("a")
-                    if not link or not link.get("href"):
-                        continue
-
-                    href = link.get("href")
-                    contest_id = href.split("/")[-1]
-                    name = link.get_text().strip()
-
-                    try:
-                        name = name.encode().decode("unicode_escape")
-                    except (UnicodeDecodeError, UnicodeEncodeError):
-                        pass
-
-                    name = (
-                        name.replace("\uff08", "(")
-                        .replace("\uff09", ")")
-                        .replace("\u3000", " ")
-                    )
-                    name = re.sub(
-                        r"[\uff01-\uff5e]", lambda m: chr(ord(m.group()) - 0xFEE0), name
-                    )
-
-                    def generate_display_name_from_id(contest_id: str) -> str:
-                        parts = contest_id.replace("-", " ").replace("_", " ")
-
-                        parts = re.sub(
-                            r"\b(jsc|JSC)\b",
-                            "Japanese Student Championship",
-                            parts,
-                            flags=re.IGNORECASE,
-                        )
-                        parts = re.sub(
-                            r"\b(wtf|WTF)\b",
-                            "World Tour Finals",
-                            parts,
-                            flags=re.IGNORECASE,
-                        )
-                        parts = re.sub(
-                            r"\b(ahc)(\d+)\b",
-                            r"Heuristic Contest \2 (AHC)",
-                            parts,
-                            flags=re.IGNORECASE,
-                        )
-                        parts = re.sub(
-                            r"\b(arc)(\d+)\b",
-                            r"Regular Contest \2 (ARC)",
-                            parts,
-                            flags=re.IGNORECASE,
-                        )
-                        parts = re.sub(
-                            r"\b(abc)(\d+)\b",
-                            r"Beginner Contest \2 (ABC)",
-                            parts,
-                            flags=re.IGNORECASE,
-                        )
-                        parts = re.sub(
-                            r"\b(agc)(\d+)\b",
-                            r"Grand Contest \2 (AGC)",
-                            parts,
-                            flags=re.IGNORECASE,
-                        )
-
-                        return parts.title()
-
-                    english_chars = sum(1 for c in name if c.isascii() and c.isalpha())
-                    total_chars = len(re.sub(r"\s+", "", name))
-
-                    if total_chars > 0 and english_chars / total_chars < 0.3:
-                        display_name = generate_display_name_from_id(contest_id)
-                    else:
-                        display_name = name
-                        if "AtCoder Beginner Contest" in name:
-                            match = re.search(r"AtCoder Beginner Contest (\d+)", name)
-                            if match:
-                                display_name = (
-                                    f"Beginner Contest {match.group(1)} (ABC)"
-                                )
-                        elif "AtCoder Regular Contest" in name:
-                            match = re.search(r"AtCoder Regular Contest (\d+)", name)
-                            if match:
-                                display_name = f"Regular Contest {match.group(1)} (ARC)"
-                        elif "AtCoder Grand Contest" in name:
-                            match = re.search(r"AtCoder Grand Contest (\d+)", name)
-                            if match:
-                                display_name = f"Grand Contest {match.group(1)} (AGC)"
-                        elif "AtCoder Heuristic Contest" in name:
-                            match = re.search(r"AtCoder Heuristic Contest (\d+)", name)
-                            if match:
-                                display_name = (
-                                    f"Heuristic Contest {match.group(1)} (AHC)"
-                                )
-
-                    contests.append(
-                        ContestSummary(
-                            id=contest_id, name=name, display_name=display_name
-                        )
-                    )
-
-                return contests
-
-            except requests.exceptions.RequestException as e:
-                if response.status_code == 429:
-                    continue
-                print(
-                    f"Failed to scrape page {page} (attempt {attempt + 1}): {e}",
-                    file=sys.stderr,
+                parts = re.sub(
+                    r"\b(jsc|JSC)\b",
+                    "Japanese Student Championship",
+                    parts,
+                    flags=re.IGNORECASE,
+                )
+                parts = re.sub(
+                    r"\b(wtf|WTF)\b",
+                    "World Tour Finals",
+                    parts,
+                    flags=re.IGNORECASE,
+                )
+                parts = re.sub(
+                    r"\b(ahc)(\d+)\b",
+                    r"Heuristic Contest \2 (AHC)",
+                    parts,
+                    flags=re.IGNORECASE,
+                )
+                parts = re.sub(
+                    r"\b(arc)(\d+)\b",
+                    r"Regular Contest \2 (ARC)",
+                    parts,
+                    flags=re.IGNORECASE,
+                )
+                parts = re.sub(
+                    r"\b(abc)(\d+)\b",
+                    r"Beginner Contest \2 (ABC)",
+                    parts,
+                    flags=re.IGNORECASE,
+                )
+                parts = re.sub(
+                    r"\b(agc)(\d+)\b",
+                    r"Grand Contest \2 (AGC)",
+                    parts,
+                    flags=re.IGNORECASE,
                )
-                if attempt == max_retries - 1:
-                    return []
-            except Exception as e:
-                print(f"Unexpected error on page {page}: {e}", file=sys.stderr)
-                return []

-        return []
+                return parts.title()
+
+            english_chars = sum(1 for c in name if c.isascii() and c.isalpha())
+            total_chars = len(re.sub(r"\s+", "", name))
+
+            if total_chars > 0 and english_chars / total_chars < 0.3:
+                display_name = generate_display_name_from_id(contest_id)
+            else:
+                display_name = name
+                if "AtCoder Beginner Contest" in name:
+                    match = re.search(r"AtCoder Beginner Contest (\d+)", name)
+                    if match:
+                        display_name = f"Beginner Contest {match.group(1)} (ABC)"
+                elif "AtCoder Regular Contest" in name:
+                    match = re.search(r"AtCoder Regular Contest (\d+)", name)
+                    if match:
+                        display_name = f"Regular Contest {match.group(1)} (ARC)"
+                elif "AtCoder Grand Contest" in name:
+                    match = re.search(r"AtCoder Grand Contest (\d+)", name)
+                    if match:
+                        display_name = f"Grand Contest {match.group(1)} (AGC)"
+                elif "AtCoder Heuristic Contest" in name:
+                    match = re.search(r"AtCoder Heuristic Contest (\d+)", name)
+                    if match:
+                        display_name = f"Heuristic Contest {match.group(1)} (AHC)"
+
+            contests.append(
+                ContestSummary(id=contest_id, name=name, display_name=display_name)
+            )
+
+        return contests

    max_pages = get_max_pages()
    page_results = {}

    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        future_to_page = {
-            executor.submit(scrape_page_with_retry, page): page
-            for page in range(1, max_pages + 1)
+            executor.submit(scrape_page, page): page for page in range(1, max_pages + 1)
        }

        for future in concurrent.futures.as_completed(future_to_page):