fix(ci): move imports

Barrett Ruth 2025-09-20 23:52:32 -04:00
parent 847307bd1f
commit 7b8aae7921
5 changed files with 475 additions and 95 deletions

@@ -168,70 +168,210 @@ def scrape(url: str) -> list[TestCase]:
def scrape_contests() -> list[ContestSummary]:
    import concurrent.futures
    import random

    def get_max_pages() -> int:
        try:
            headers = {
                "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
            }
            response = requests.get(
                "https://atcoder.jp/contests/archive", headers=headers, timeout=10
            )
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
            pagination = soup.find("ul", class_="pagination")
            if not pagination or not isinstance(pagination, Tag):
                return 15
            lis = pagination.find_all("li")
            if lis and isinstance(lis[-1], Tag):
                last_li_text = lis[-1].get_text().strip()
                try:
                    return int(last_li_text)
                except ValueError:
                    return 15
            return 15
        except Exception:
            return 15

    def scrape_page_with_retry(page: int, max_retries: int = 3) -> list[ContestSummary]:
        for attempt in range(max_retries):
            try:
                headers = {
                    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
                }
                url = f"https://atcoder.jp/contests/archive?page={page}"
                response = requests.get(url, headers=headers, timeout=10)
                if response.status_code == 429:
                    backoff_time = (2**attempt) + random.uniform(0, 1)
                    print(
                        f"Rate limited on page {page}, retrying in {backoff_time:.1f}s",
                        file=sys.stderr,
                    )
                    time.sleep(backoff_time)
                    continue
                response.raise_for_status()
                soup = BeautifulSoup(response.text, "html.parser")
                table = soup.find("table", class_="table")
                if not table:
                    return []
                tbody = table.find("tbody")
                if not tbody or not isinstance(tbody, Tag):
                    return []
                rows = tbody.find_all("tr")
                if not rows:
                    return []
                contests = []
                for row in rows:
                    cells = row.find_all("td")
                    if len(cells) < 2:
                        continue
                    contest_cell = cells[1]
                    link = contest_cell.find("a")
                    if not link or not link.get("href"):
                        continue
                    href = link.get("href")
                    contest_id = href.split("/")[-1]
                    name = link.get_text().strip()
                    try:
                        name = name.encode().decode("unicode_escape")
                    except Exception:
                        pass
                    name = (
                        name.replace("\uff08", "(")
                        .replace("\uff09", ")")
                        .replace("\u3000", " ")
                    )
                    name = re.sub(
                        r"[\uff01-\uff5e]", lambda m: chr(ord(m.group()) - 0xFEE0), name
                    )

                    def generate_display_name_from_id(contest_id: str) -> str:
                        parts = contest_id.replace("-", " ").replace("_", " ")
                        parts = re.sub(
                            r"\b(jsc|JSC)\b",
                            "Japanese Student Championship",
                            parts,
                            flags=re.IGNORECASE,
                        )
                        parts = re.sub(
                            r"\b(wtf|WTF)\b",
                            "World Tour Finals",
                            parts,
                            flags=re.IGNORECASE,
                        )
                        parts = re.sub(
                            r"\b(ahc)(\d+)\b",
                            r"Heuristic Contest \2 (AHC)",
                            parts,
                            flags=re.IGNORECASE,
                        )
                        parts = re.sub(
                            r"\b(arc)(\d+)\b",
                            r"Regular Contest \2 (ARC)",
                            parts,
                            flags=re.IGNORECASE,
                        )
                        parts = re.sub(
                            r"\b(abc)(\d+)\b",
                            r"Beginner Contest \2 (ABC)",
                            parts,
                            flags=re.IGNORECASE,
                        )
                        parts = re.sub(
                            r"\b(agc)(\d+)\b",
                            r"Grand Contest \2 (AGC)",
                            parts,
                            flags=re.IGNORECASE,
                        )
                        return parts.title()

                    english_chars = sum(1 for c in name if c.isascii() and c.isalpha())
                    total_chars = len(re.sub(r"\s+", "", name))
                    if total_chars > 0 and english_chars / total_chars < 0.3:
                        display_name = generate_display_name_from_id(contest_id)
                    else:
                        display_name = name
                        if "AtCoder Beginner Contest" in name:
                            match = re.search(r"AtCoder Beginner Contest (\d+)", name)
                            if match:
                                display_name = (
                                    f"Beginner Contest {match.group(1)} (ABC)"
                                )
                        elif "AtCoder Regular Contest" in name:
                            match = re.search(r"AtCoder Regular Contest (\d+)", name)
                            if match:
                                display_name = f"Regular Contest {match.group(1)} (ARC)"
                        elif "AtCoder Grand Contest" in name:
                            match = re.search(r"AtCoder Grand Contest (\d+)", name)
                            if match:
                                display_name = f"Grand Contest {match.group(1)} (AGC)"
                        elif "AtCoder Heuristic Contest" in name:
                            match = re.search(r"AtCoder Heuristic Contest (\d+)", name)
                            if match:
                                display_name = (
                                    f"Heuristic Contest {match.group(1)} (AHC)"
                                )
                    contests.append(
                        ContestSummary(
                            id=contest_id, name=name, display_name=display_name
                        )
                    )
                return contests
            except requests.exceptions.RequestException as e:
                if response.status_code == 429:
                    continue
                print(
                    f"Failed to scrape page {page} (attempt {attempt + 1}): {e}",
                    file=sys.stderr,
                )
                if attempt == max_retries - 1:
                    return []
            except Exception as e:
                print(f"Unexpected error on page {page}: {e}", file=sys.stderr)
                return []
            time.sleep(0.5)
        return []

    max_pages = get_max_pages()
    page_results = {}
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        future_to_page = {
            executor.submit(scrape_page_with_retry, page): page
            for page in range(1, max_pages + 1)
        }
        for future in concurrent.futures.as_completed(future_to_page):
            page = future_to_page[future]
            page_contests = future.result()
            page_results[page] = page_contests
    # Sort by page number to maintain order
    all_contests = []
    for page in sorted(page_results.keys()):
        all_contests.extend(page_results[page])
    return all_contests
def main() -> None:
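
A note on the concurrency pattern above: concurrent.futures.as_completed yields futures in completion order, not submission order, so the code keys results by page number and re-sorts at the end. A minimal, self-contained sketch of that pattern, with fetch_page as a hypothetical stand-in for scrape_page_with_retry:

import concurrent.futures

def fetch_page(page: int) -> list[str]:
    # Hypothetical stand-in for scrape_page_with_retry, for the demo only.
    return [f"page-{page}-row-{i}" for i in range(2)]

page_results: dict[int, list[str]] = {}
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    future_to_page = {executor.submit(fetch_page, p): p for p in range(1, 4)}
    for future in concurrent.futures.as_completed(future_to_page):
        # Completion order is arbitrary; key by page so order can be restored.
        page_results[future_to_page[future]] = future.result()

all_rows = [row for page in sorted(page_results) for row in page_results[page]]
print(all_rows)  # rows come out in page order regardless of completion order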

@@ -1,6 +1,7 @@
#!/usr/bin/env python3
import json
import re
import sys
from dataclasses import asdict
@@ -148,8 +149,6 @@ def parse_problem_url(contest_id: str, problem_letter: str) -> str:
def extract_problem_limits(soup: BeautifulSoup) -> tuple[int, float]:
    timeout_ms = None
    memory_mb = None
@@ -240,22 +239,43 @@ def scrape_contests() -> list[ContestSummary]:
        contest_id = str(contest["id"])
        name = contest["name"]
        # Clean up contest names for display
        display_name = name
        if "Educational Codeforces Round" in name:
            match = re.search(r"Educational Codeforces Round (\d+)", name)
            if match:
                display_name = f"Educational Round {match.group(1)}"
        elif "Codeforces Global Round" in name:
            match = re.search(r"Codeforces Global Round (\d+)", name)
            if match:
                display_name = f"Global Round {match.group(1)}"
        elif "Codeforces Round" in name:
            # Handle various Div patterns
            div_match = re.search(r"Codeforces Round (\d+) \(Div\. (\d+)\)", name)
            if div_match:
                display_name = (
                    f"Round {div_match.group(1)} (Div. {div_match.group(2)})"
                )
            else:
                # Handle combined divs like "Div. 1 + Div. 2"
                combined_match = re.search(
                    r"Codeforces Round (\d+) \(Div\. 1 \+ Div\. 2\)", name
                )
                if combined_match:
                    display_name = (
                        f"Round {combined_match.group(1)} (Div. 1 + Div. 2)"
                    )
                else:
                    # Handle single div like "Div. 1"
                    single_div_match = re.search(
                        r"Codeforces Round (\d+) \(Div\. 1\)", name
                    )
                    if single_div_match:
                        display_name = f"Round {single_div_match.group(1)} (Div. 1)"
                    else:
                        # Fallback: extract just the round number
                        round_match = re.search(r"Codeforces Round (\d+)", name)
                        if round_match:
                            display_name = f"Round {round_match.group(1)}"
        contests.append(
            ContestSummary(id=contest_id, name=name, display_name=display_name)
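
The nested Div branches above could equally be written as an ordered pattern table tried first-match-wins; this is only a sketch of that alternative, not the committed code (display_name_for and _PATTERNS are hypothetical names):

import re

# Ordered so specific patterns win before the bare "Round N" fallback.
_PATTERNS = [
    (r"Educational Codeforces Round (\d+)", "Educational Round {0}"),
    (r"Codeforces Global Round (\d+)", "Global Round {0}"),
    (r"Codeforces Round (\d+) \(Div\. (\d+)\)", "Round {0} (Div. {1})"),
    (r"Codeforces Round (\d+) \(Div\. 1 \+ Div\. 2\)", "Round {0} (Div. 1 + Div. 2)"),
    (r"Codeforces Round (\d+)", "Round {0}"),
]

def display_name_for(name: str) -> str:
    for pattern, template in _PATTERNS:
        match = re.search(pattern, name)
        if match:
            return template.format(*match.groups())
    return name  # fall back to the raw contest name

print(display_name_for("Educational Codeforces Round 170 (Rated for Div. 2)"))
# -> Educational Round 170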

@@ -1,8 +1,10 @@
#!/usr/bin/env python3
import json
import random
import re
import sys
import time
from dataclasses import asdict
import requests
@@ -39,6 +41,38 @@ def denormalize_category_name(category_id: str) -> str:
    return category_map.get(category_id, category_id.replace("_", " ").title())

def request_with_retry(
url: str, headers: dict, max_retries: int = 3
) -> requests.Response:
for attempt in range(max_retries):
try:
delay = 0.5 + random.uniform(0, 0.3)
time.sleep(delay)
response = requests.get(url, headers=headers, timeout=10)
if response.status_code == 429:
backoff = (2**attempt) + random.uniform(0, 1)
print(f"Rate limited, retrying in {backoff:.1f}s", file=sys.stderr)
time.sleep(backoff)
continue
response.raise_for_status()
return response
except requests.exceptions.RequestException as e:
if attempt == max_retries - 1:
raise
backoff = 2**attempt
print(
f"Request failed (attempt {attempt + 1}), retrying in {backoff}s: {e}",
file=sys.stderr,
)
time.sleep(backoff)
raise Exception("All retry attempts failed")

def scrape_category_problems(category_id: str) -> list[ProblemSummary]:
    category_name = denormalize_category_name(category_id)
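
Since request_with_retry re-raises the last RequestException after max_retries (and raises a plain Exception if every attempt was rate-limited), callers that should not crash need their own handler. A usage sketch, assuming the function above and the module-level requests/sys imports shown in this diff:

headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) ..."}
try:
    response = request_with_retry("https://cses.fi/problemset/", headers)
except requests.exceptions.RequestException as e:
    # Raised once all retries are exhausted; the 429-only path raises
    # a bare Exception instead, which this handler would not catch.
    print(f"giving up after retries: {e}", file=sys.stderr)
else:
    print(response.status_code)
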
@@ -48,8 +82,7 @@ def scrape_category_problems(category_id: str) -> list[ProblemSummary]:
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    }
    response = request_with_retry(problemset_url, headers)
    soup = BeautifulSoup(response.text, "html.parser")
@@ -143,10 +176,7 @@ def scrape_categories() -> list[ContestSummary]:
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    }
    response = request_with_retry("https://cses.fi/problemset/", headers)
    soup = BeautifulSoup(response.text, "html.parser")

    categories = []
@@ -293,8 +323,7 @@ def scrape(url: str) -> list[TestCase]:
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    }
    response = request_with_retry(url, headers)
    soup = BeautifulSoup(response.text, "html.parser")
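
One way to sanity-check the 429 backoff path without network access is to stub requests.get and time.sleep with unittest.mock; a test sketch under the assumption that request_with_retry lives in a module importing requests and time at top level (FakeResponse is invented for the demo):

import unittest.mock as mock

import requests

class FakeResponse:
    def __init__(self, status_code: int):
        self.status_code = status_code
        self.text = ""

    def raise_for_status(self) -> None:
        if self.status_code >= 400:
            raise requests.exceptions.HTTPError(str(self.status_code))

# First attempt is rate-limited, second succeeds; sleeps are no-ops so the test is fast.
with mock.patch("requests.get", side_effect=[FakeResponse(429), FakeResponse(200)]), \
     mock.patch("time.sleep"):
    response = request_with_retry("https://cses.fi/problemset/", headers={})
assert response.status_code == 200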