feat(scrapers): make scrapers softer

This commit is contained in:
Barrett Ruth 2025-10-05 13:40:56 -04:00
parent 1945999099
commit ee88450b3b
2 changed files with 31 additions and 63 deletions

View file

@@ -169,7 +169,7 @@ def _parse_tasks_list(html: str) -> list[dict[str, str]]:
return rows return rows
def _extract_limits(html: str) -> tuple[int, float]: def _extract_problem_info(html: str) -> tuple[int, float, bool]:
soup = BeautifulSoup(html, "html.parser") soup = BeautifulSoup(html, "html.parser")
txt = soup.get_text(" ", strip=True) txt = soup.get_text(" ", strip=True)
timeout_ms = 0 timeout_ms = 0
@@ -180,7 +180,10 @@ def _extract_limits(html: str) -> tuple[int, float]:
ms = re.search(r"Memory\s*Limit:\s*(\d+)\s*MiB", txt, flags=re.I) ms = re.search(r"Memory\s*Limit:\s*(\d+)\s*MiB", txt, flags=re.I)
if ms: if ms:
memory_mb = float(ms.group(1)) * MIB_TO_MB memory_mb = float(ms.group(1)) * MIB_TO_MB
return timeout_ms, memory_mb div = soup.select_one("#problem-statement")
txt = div.get_text(" ", strip=True) if div else soup.get_text(" ", strip=True)
interactive = "This is an interactive" in txt
return timeout_ms, memory_mb, interactive
def _extract_samples(html: str) -> list[TestCase]: def _extract_samples(html: str) -> list[TestCase]:
@@ -213,13 +216,16 @@ def _scrape_tasks_sync(contest_id: str) -> list[dict[str, str]]:
def _scrape_problem_page_sync(contest_id: str, slug: str) -> dict[str, Any]: def _scrape_problem_page_sync(contest_id: str, slug: str) -> dict[str, Any]:
html = _fetch(f"{BASE_URL}/contests/{contest_id}/tasks/{slug}") html = _fetch(f"{BASE_URL}/contests/{contest_id}/tasks/{slug}")
try:
tests = _extract_samples(html) tests = _extract_samples(html)
timeout_ms, memory_mb = _extract_limits(html) except Exception:
tests = []
timeout_ms, memory_mb, interactive = _extract_problem_info(html)
return { return {
"tests": tests, "tests": tests,
"timeout_ms": timeout_ms, "timeout_ms": timeout_ms,
"memory_mb": memory_mb, "memory_mb": memory_mb,
"interactive": False, "interactive": interactive,
} }
@@ -309,43 +315,18 @@ class AtcoderScraper(BaseScraper):
slug = row.get("slug") or "" slug = row.get("slug") or ""
if not letter or not slug: if not letter or not slug:
return return
try: data = await asyncio.to_thread(_scrape_problem_page_sync, category_id, slug)
data = await asyncio.to_thread( tests: list[TestCase] = data.get("tests", [])
_scrape_problem_page_sync, category_id, slug
)
tests: list[TestCase] = data["tests"]
if not tests:
print(
json.dumps(
{
"problem_id": letter,
"error": f"{self.platform_name}: no tests found",
}
),
flush=True,
)
return
print( print(
json.dumps( json.dumps(
{ {
"problem_id": letter, "problem_id": letter,
"tests": [ "tests": [
{"input": t.input, "expected": t.expected} {"input": t.input, "expected": t.expected} for t in tests
for t in tests
], ],
"timeout_ms": data["timeout_ms"], "timeout_ms": data.get("timeout_ms", 0),
"memory_mb": data["memory_mb"], "memory_mb": data.get("memory_mb", 0),
"interactive": bool(data["interactive"]), "interactive": bool(data.get("interactive")),
}
),
flush=True,
)
except Exception as e:
print(
json.dumps(
{
"problem_id": letter,
"error": f"{self.platform_name}: {str(e)}",
} }
), ),
flush=True, flush=True,

View file

@@ -244,20 +244,7 @@ class CodeforcesScraper(BaseScraper):
for b in blocks: for b in blocks:
pid = b["letter"].lower() pid = b["letter"].lower()
tests: list[TestCase] = b["tests"] tests: list[TestCase] = b.get("tests", [])
if not tests:
print(
json.dumps(
{
"problem_id": pid,
"error": f"{self.platform_name}: no tests found",
}
),
flush=True,
)
continue
print( print(
json.dumps( json.dumps(
{ {
@@ -265,9 +252,9 @@ class CodeforcesScraper(BaseScraper):
"tests": [ "tests": [
{"input": t.input, "expected": t.expected} for t in tests {"input": t.input, "expected": t.expected} for t in tests
], ],
"timeout_ms": b["timeout_ms"], "timeout_ms": b.get("timeout_ms", 0),
"memory_mb": b["memory_mb"], "memory_mb": b.get("memory_mb", 0),
"interactive": bool(b["interactive"]), "interactive": bool(b.get("interactive")),
} }
), ),
flush=True, flush=True,