feat(kattis): rewrite scraper to support real contests
Problem: scrape_contest_list paginated the entire Kattis problem database
(3000+ problems) treating each as a "contest". scrape_contest_metadata
only handled single-problem access. stream_tests_for_category_async could
not fetch tests for multiple problems in a real contest.
Solution: replace the paginated problem loop with a single GET to
/contests that returns ~150 real timed contests. Add contest-aware path
to scrape_contest_metadata that fetches /contests/{id}/problems and
returns all problem slugs; fall back to single-problem path when the ID
is not a contest. Add _stream_single_problem helper and update
stream_tests_for_category_async to fan out concurrently over all contest
problem slugs before falling back to the single-problem path.
This commit is contained in:
parent
e79f992e0b
commit
a3d8e39ee1
1 changed file with 146 additions and 99 deletions
|
|
@ -5,6 +5,7 @@ import io
|
||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
import zipfile
|
import zipfile
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
|
|
||||||
|
|
@ -33,7 +34,6 @@ MEM_RE = re.compile(
|
||||||
r"Memory limit</span>\s*<span[^>]*>\s*(\d+)\s*MB\s*</span>",
|
r"Memory limit</span>\s*<span[^>]*>\s*(\d+)\s*MB\s*</span>",
|
||||||
re.DOTALL,
|
re.DOTALL,
|
||||||
)
|
)
|
||||||
LAST_PAGE_RE = re.compile(r"\bpage=(\d+)")
|
|
||||||
|
|
||||||
|
|
||||||
async def _fetch_text(client: httpx.AsyncClient, url: str) -> str:
|
async def _fetch_text(client: httpx.AsyncClient, url: str) -> str:
|
||||||
|
|
@ -94,24 +94,110 @@ def _is_interactive(html: str) -> bool:
|
||||||
return "This is an interactive problem" in html
|
return "This is an interactive problem" in html
|
||||||
|
|
||||||
|
|
||||||
def _parse_problem_rows(html: str) -> list[tuple[str, str]]:
|
def _parse_contests_page(html: str) -> list[ContestSummary]:
|
||||||
|
results: list[ContestSummary] = []
|
||||||
seen: set[str] = set()
|
seen: set[str] = set()
|
||||||
out: list[tuple[str, str]] = []
|
for row_m in re.finditer(r"<tr[^>]*>(.*?)</tr>", html, re.DOTALL):
|
||||||
for m in re.finditer(
|
row = row_m.group(1)
|
||||||
r'<td\s+class="[^"]*">\s*<a\s+href="/problems/([a-z0-9]+)"\s*>([^<]+)</a>',
|
link_m = re.search(r'href="/contests/([a-z0-9]+)"[^>]*>([^<]+)</a>', row)
|
||||||
html,
|
if not link_m:
|
||||||
):
|
continue
|
||||||
pid = m.group(1)
|
cid = link_m.group(1)
|
||||||
name = m.group(2).strip()
|
name = link_m.group(2).strip()
|
||||||
if pid not in seen:
|
if cid in seen:
|
||||||
seen.add(pid)
|
continue
|
||||||
out.append((pid, name))
|
seen.add(cid)
|
||||||
return out
|
start_time: int | None = None
|
||||||
|
ts_m = re.search(r'data-timestamp="(\d+)"', row)
|
||||||
|
if ts_m:
|
||||||
|
start_time = int(ts_m.group(1))
|
||||||
|
else:
|
||||||
|
time_m = re.search(r'<time[^>]+datetime="([^"]+)"', row)
|
||||||
|
if time_m:
|
||||||
|
try:
|
||||||
|
dt = datetime.fromisoformat(time_m.group(1).replace("Z", "+00:00"))
|
||||||
|
start_time = int(dt.timestamp())
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
results.append(ContestSummary(id=cid, name=name, start_time=start_time))
|
||||||
|
return results
|
||||||
|
|
||||||
|
|
||||||
def _parse_last_page(html: str) -> int:
|
def _parse_contest_problem_list(html: str) -> list[tuple[str, str]]:
|
||||||
nums = [int(m.group(1)) for m in LAST_PAGE_RE.finditer(html)]
|
if "The problems will become available when the contest starts" in html:
|
||||||
return max(nums) if nums else 0
|
return []
|
||||||
|
results: list[tuple[str, str]] = []
|
||||||
|
seen: set[str] = set()
|
||||||
|
for row_m in re.finditer(r"<tr[^>]*>(.*?)</tr>", html, re.DOTALL):
|
||||||
|
row = row_m.group(1)
|
||||||
|
link_m = re.search(
|
||||||
|
r'href="/contests/[^/]+/problems/([^"]+)"[^>]*>([^<]+)</a>', row
|
||||||
|
)
|
||||||
|
if not link_m:
|
||||||
|
continue
|
||||||
|
slug = link_m.group(1)
|
||||||
|
name = link_m.group(2).strip()
|
||||||
|
if slug in seen:
|
||||||
|
continue
|
||||||
|
seen.add(slug)
|
||||||
|
label_m = re.search(r"<td[^>]*>\s*([A-Z])\s*</td>", row)
|
||||||
|
label = label_m.group(1) if label_m else ""
|
||||||
|
display = f"{label} - {name}" if label else name
|
||||||
|
results.append((slug, display))
|
||||||
|
return results
|
||||||
|
|
||||||
|
|
||||||
|
async def _fetch_contest_slugs(
|
||||||
|
client: httpx.AsyncClient, contest_id: str
|
||||||
|
) -> list[tuple[str, str]]:
|
||||||
|
try:
|
||||||
|
html = await _fetch_text(client, f"{BASE_URL}/contests/{contest_id}/problems")
|
||||||
|
return _parse_contest_problem_list(html)
|
||||||
|
except httpx.HTTPStatusError:
|
||||||
|
return []
|
||||||
|
except Exception:
|
||||||
|
return []
|
||||||
|
|
||||||
|
|
||||||
|
async def _stream_single_problem(client: httpx.AsyncClient, slug: str) -> None:
|
||||||
|
try:
|
||||||
|
html = await _fetch_text(client, f"{BASE_URL}/problems/{slug}")
|
||||||
|
except Exception:
|
||||||
|
return
|
||||||
|
|
||||||
|
timeout_ms, memory_mb = _parse_limits(html)
|
||||||
|
interactive = _is_interactive(html)
|
||||||
|
|
||||||
|
tests: list[TestCase] = []
|
||||||
|
try:
|
||||||
|
zip_data = await _fetch_bytes(
|
||||||
|
client,
|
||||||
|
f"{BASE_URL}/problems/{slug}/file/statement/samples.zip",
|
||||||
|
)
|
||||||
|
tests = _parse_samples_zip(zip_data)
|
||||||
|
except Exception:
|
||||||
|
tests = _parse_samples_html(html)
|
||||||
|
|
||||||
|
combined_input = "\n".join(t.input for t in tests) if tests else ""
|
||||||
|
combined_expected = "\n".join(t.expected for t in tests) if tests else ""
|
||||||
|
|
||||||
|
print(
|
||||||
|
json.dumps(
|
||||||
|
{
|
||||||
|
"problem_id": slug,
|
||||||
|
"combined": {
|
||||||
|
"input": combined_input,
|
||||||
|
"expected": combined_expected,
|
||||||
|
},
|
||||||
|
"tests": [{"input": t.input, "expected": t.expected} for t in tests],
|
||||||
|
"timeout_ms": timeout_ms,
|
||||||
|
"memory_mb": memory_mb,
|
||||||
|
"interactive": interactive,
|
||||||
|
"multi_test": False,
|
||||||
|
}
|
||||||
|
),
|
||||||
|
flush=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class KattisScraper(BaseScraper):
|
class KattisScraper(BaseScraper):
|
||||||
|
|
@ -122,57 +208,46 @@ class KattisScraper(BaseScraper):
|
||||||
async def scrape_contest_metadata(self, contest_id: str) -> MetadataResult:
|
async def scrape_contest_metadata(self, contest_id: str) -> MetadataResult:
|
||||||
try:
|
try:
|
||||||
async with httpx.AsyncClient() as client:
|
async with httpx.AsyncClient() as client:
|
||||||
html = await _fetch_text(client, f"{BASE_URL}/problems/{contest_id}")
|
slugs = await _fetch_contest_slugs(client, contest_id)
|
||||||
timeout_ms, memory_mb = _parse_limits(html)
|
if slugs:
|
||||||
title_m = re.search(r"<title>([^<]+)</title>", html)
|
return MetadataResult(
|
||||||
name = (
|
success=True,
|
||||||
title_m.group(1).split("\u2013")[0].strip() if title_m else contest_id
|
error="",
|
||||||
)
|
contest_id=contest_id,
|
||||||
return MetadataResult(
|
problems=[
|
||||||
success=True,
|
ProblemSummary(id=slug, name=name) for slug, name in slugs
|
||||||
error="",
|
],
|
||||||
contest_id=contest_id,
|
url=f"{BASE_URL}/problems/%s",
|
||||||
problems=[ProblemSummary(id=contest_id, name=name)],
|
)
|
||||||
url=f"{BASE_URL}/problems/%s",
|
try:
|
||||||
)
|
html = await _fetch_text(
|
||||||
|
client, f"{BASE_URL}/problems/{contest_id}"
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
return self._metadata_error(str(e))
|
||||||
|
title_m = re.search(r"<title>([^<]+)</title>", html)
|
||||||
|
name = (
|
||||||
|
title_m.group(1).split("\u2013")[0].strip()
|
||||||
|
if title_m
|
||||||
|
else contest_id
|
||||||
|
)
|
||||||
|
return MetadataResult(
|
||||||
|
success=True,
|
||||||
|
error="",
|
||||||
|
contest_id=contest_id,
|
||||||
|
problems=[ProblemSummary(id=contest_id, name=name)],
|
||||||
|
url=f"{BASE_URL}/problems/%s",
|
||||||
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return self._metadata_error(str(e))
|
return self._metadata_error(str(e))
|
||||||
|
|
||||||
async def scrape_contest_list(self) -> ContestListResult:
|
async def scrape_contest_list(self) -> ContestListResult:
|
||||||
try:
|
try:
|
||||||
async with httpx.AsyncClient(
|
async with httpx.AsyncClient() as client:
|
||||||
limits=httpx.Limits(max_connections=CONNECTIONS)
|
html = await _fetch_text(client, f"{BASE_URL}/contests")
|
||||||
) as client:
|
contests = _parse_contests_page(html)
|
||||||
first_html = await _fetch_text(
|
|
||||||
client, f"{BASE_URL}/problems?page=0&order=problem_difficulty"
|
|
||||||
)
|
|
||||||
last = _parse_last_page(first_html)
|
|
||||||
rows = _parse_problem_rows(first_html)
|
|
||||||
|
|
||||||
sem = asyncio.Semaphore(CONNECTIONS)
|
|
||||||
|
|
||||||
async def fetch_page(page: int) -> list[tuple[str, str]]:
|
|
||||||
async with sem:
|
|
||||||
html = await _fetch_text(
|
|
||||||
client,
|
|
||||||
f"{BASE_URL}/problems?page={page}&order=problem_difficulty",
|
|
||||||
)
|
|
||||||
return _parse_problem_rows(html)
|
|
||||||
|
|
||||||
tasks = [fetch_page(p) for p in range(1, last + 1)]
|
|
||||||
for coro in asyncio.as_completed(tasks):
|
|
||||||
rows.extend(await coro)
|
|
||||||
|
|
||||||
seen: set[str] = set()
|
|
||||||
contests: list[ContestSummary] = []
|
|
||||||
for pid, name in rows:
|
|
||||||
if pid not in seen:
|
|
||||||
seen.add(pid)
|
|
||||||
contests.append(
|
|
||||||
ContestSummary(id=pid, name=name, display_name=name)
|
|
||||||
)
|
|
||||||
if not contests:
|
if not contests:
|
||||||
return self._contests_error("No problems found")
|
return self._contests_error("No contests found")
|
||||||
return ContestListResult(success=True, error="", contests=contests)
|
return ContestListResult(success=True, error="", contests=contests)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return self._contests_error(str(e))
|
return self._contests_error(str(e))
|
||||||
|
|
@ -181,46 +256,18 @@ class KattisScraper(BaseScraper):
|
||||||
async with httpx.AsyncClient(
|
async with httpx.AsyncClient(
|
||||||
limits=httpx.Limits(max_connections=CONNECTIONS)
|
limits=httpx.Limits(max_connections=CONNECTIONS)
|
||||||
) as client:
|
) as client:
|
||||||
try:
|
slugs = await _fetch_contest_slugs(client, category_id)
|
||||||
html = await _fetch_text(client, f"{BASE_URL}/problems/{category_id}")
|
if slugs:
|
||||||
except Exception:
|
sem = asyncio.Semaphore(CONNECTIONS)
|
||||||
|
|
||||||
|
async def emit_one(slug: str, _name: str) -> None:
|
||||||
|
async with sem:
|
||||||
|
await _stream_single_problem(client, slug)
|
||||||
|
|
||||||
|
await asyncio.gather(*(emit_one(s, n) for s, n in slugs))
|
||||||
return
|
return
|
||||||
|
|
||||||
timeout_ms, memory_mb = _parse_limits(html)
|
await _stream_single_problem(client, category_id)
|
||||||
interactive = _is_interactive(html)
|
|
||||||
|
|
||||||
tests: list[TestCase] = []
|
|
||||||
try:
|
|
||||||
zip_data = await _fetch_bytes(
|
|
||||||
client,
|
|
||||||
f"{BASE_URL}/problems/{category_id}/file/statement/samples.zip",
|
|
||||||
)
|
|
||||||
tests = _parse_samples_zip(zip_data)
|
|
||||||
except Exception:
|
|
||||||
tests = _parse_samples_html(html)
|
|
||||||
|
|
||||||
combined_input = "\n".join(t.input for t in tests) if tests else ""
|
|
||||||
combined_expected = "\n".join(t.expected for t in tests) if tests else ""
|
|
||||||
|
|
||||||
print(
|
|
||||||
json.dumps(
|
|
||||||
{
|
|
||||||
"problem_id": category_id,
|
|
||||||
"combined": {
|
|
||||||
"input": combined_input,
|
|
||||||
"expected": combined_expected,
|
|
||||||
},
|
|
||||||
"tests": [
|
|
||||||
{"input": t.input, "expected": t.expected} for t in tests
|
|
||||||
],
|
|
||||||
"timeout_ms": timeout_ms,
|
|
||||||
"memory_mb": memory_mb,
|
|
||||||
"interactive": interactive,
|
|
||||||
"multi_test": False,
|
|
||||||
}
|
|
||||||
),
|
|
||||||
flush=True,
|
|
||||||
)
|
|
||||||
|
|
||||||
async def submit(
|
async def submit(
|
||||||
self,
|
self,
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue