feat(scrapers): make scrapers softer

2025-10-05 13:40:56 -04:00 · 2025-10-05 13:40:56 -04:00 · ee88450b3b
commit ee88450b3b
parent 1945999099
2 changed files with 31 additions and 63 deletions
--- a/scrapers/atcoder.py
+++ b/scrapers/atcoder.py
@@ -169,7 +169,7 @@ def _parse_tasks_list(html: str) -> list[dict[str, str]]:
    return rows


-def _extract_limits(html: str) -> tuple[int, float]:
+def _extract_problem_info(html: str) -> tuple[int, float, bool]:
    soup = BeautifulSoup(html, "html.parser")
    txt = soup.get_text(" ", strip=True)
    timeout_ms = 0
@@ -180,7 +180,10 @@ def _extract_limits(html: str) -> tuple[int, float]:
    ms = re.search(r"Memory\s*Limit:\s*(\d+)\s*MiB", txt, flags=re.I)
    if ms:
        memory_mb = float(ms.group(1)) * MIB_TO_MB
-    return timeout_ms, memory_mb
+    div = soup.select_one("#problem-statement")
+    txt = div.get_text(" ", strip=True) if div else soup.get_text(" ", strip=True)
+    interactive = "This is an interactive" in txt
+    return timeout_ms, memory_mb, interactive


 def _extract_samples(html: str) -> list[TestCase]:
@@ -213,13 +216,16 @@ def _scrape_tasks_sync(contest_id: str) -> list[dict[str, str]]:

 def _scrape_problem_page_sync(contest_id: str, slug: str) -> dict[str, Any]:
    html = _fetch(f"{BASE_URL}/contests/{contest_id}/tasks/{slug}")
-    tests = _extract_samples(html)
-    timeout_ms, memory_mb = _extract_limits(html)
+    try:
+        tests = _extract_samples(html)
+    except Exception:
+        tests = []
+    timeout_ms, memory_mb, interactive = _extract_problem_info(html)
    return {
        "tests": tests,
        "timeout_ms": timeout_ms,
        "memory_mb": memory_mb,
-        "interactive": False,
+        "interactive": interactive,
    }


@@ -309,47 +315,22 @@ class AtcoderScraper(BaseScraper):
            slug = row.get("slug") or ""
            if not letter or not slug:
                return
-            try:
-                data = await asyncio.to_thread(
-                    _scrape_problem_page_sync, category_id, slug
-                )
-                tests: list[TestCase] = data["tests"]
-                if not tests:
-                    print(
-                        json.dumps(
-                            {
-                                "problem_id": letter,
-                                "error": f"{self.platform_name}: no tests found",
-                            }
-                        ),
-                        flush=True,
-                    )
-                    return
-                print(
-                    json.dumps(
-                        {
-                            "problem_id": letter,
-                            "tests": [
-                                {"input": t.input, "expected": t.expected}
-                                for t in tests
-                            ],
-                            "timeout_ms": data["timeout_ms"],
-                            "memory_mb": data["memory_mb"],
-                            "interactive": bool(data["interactive"]),
-                        }
-                    ),
-                    flush=True,
-                )
-            except Exception as e:
-                print(
-                    json.dumps(
-                        {
-                            "problem_id": letter,
-                            "error": f"{self.platform_name}: {str(e)}",
-                        }
-                    ),
-                    flush=True,
-                )
+            data = await asyncio.to_thread(_scrape_problem_page_sync, category_id, slug)
+            tests: list[TestCase] = data.get("tests", [])
+            print(
+                json.dumps(
+                    {
+                        "problem_id": letter,
+                        "tests": [
+                            {"input": t.input, "expected": t.expected} for t in tests
+                        ],
+                        "timeout_ms": data.get("timeout_ms", 0),
+                        "memory_mb": data.get("memory_mb", 0),
+                        "interactive": bool(data.get("interactive")),
+                    }
+                ),
+                flush=True,
+            )

        await asyncio.gather(*(emit(r) for r in rows))

--- a/scrapers/codeforces.py
+++ b/scrapers/codeforces.py
@@ -244,20 +244,7 @@ class CodeforcesScraper(BaseScraper):

        for b in blocks:
            pid = b["letter"].lower()
-            tests: list[TestCase] = b["tests"]
-
-            if not tests:
-                print(
-                    json.dumps(
-                        {
-                            "problem_id": pid,
-                            "error": f"{self.platform_name}: no tests found",
-                        }
-                    ),
-                    flush=True,
-                )
-                continue
-
+            tests: list[TestCase] = b.get("tests", [])
            print(
                json.dumps(
                    {
@@ -265,9 +252,9 @@ class CodeforcesScraper(BaseScraper):
                        "tests": [
                            {"input": t.input, "expected": t.expected} for t in tests
                        ],
-                        "timeout_ms": b["timeout_ms"],
-                        "memory_mb": b["memory_mb"],
-                        "interactive": bool(b["interactive"]),
+                        "timeout_ms": b.get("timeout_ms", 0),
+                        "memory_mb": b.get("memory_mb", 0),
+                        "interactive": bool(b.get("interactive")),
                    }
                ),
                flush=True,