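feat(scrapers): make scrapers more lenient (tolerate missing samples and limits instead of reporting errors; detect interactive problems)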
This commit is contained in:
parent
1945999099
commit
ee88450b3b
2 changed files with 31 additions and 63 deletions
|
|
@ -169,7 +169,7 @@ def _parse_tasks_list(html: str) -> list[dict[str, str]]:
|
||||||
return rows
|
return rows
|
||||||
|
|
||||||
|
|
||||||
def _extract_limits(html: str) -> tuple[int, float]:
|
def _extract_problem_info(html: str) -> tuple[int, float, bool]:
|
||||||
soup = BeautifulSoup(html, "html.parser")
|
soup = BeautifulSoup(html, "html.parser")
|
||||||
txt = soup.get_text(" ", strip=True)
|
txt = soup.get_text(" ", strip=True)
|
||||||
timeout_ms = 0
|
timeout_ms = 0
|
||||||
|
|
@ -180,7 +180,10 @@ def _extract_limits(html: str) -> tuple[int, float]:
|
||||||
ms = re.search(r"Memory\s*Limit:\s*(\d+)\s*MiB", txt, flags=re.I)
|
ms = re.search(r"Memory\s*Limit:\s*(\d+)\s*MiB", txt, flags=re.I)
|
||||||
if ms:
|
if ms:
|
||||||
memory_mb = float(ms.group(1)) * MIB_TO_MB
|
memory_mb = float(ms.group(1)) * MIB_TO_MB
|
||||||
return timeout_ms, memory_mb
|
div = soup.select_one("#problem-statement")
|
||||||
|
txt = div.get_text(" ", strip=True) if div else soup.get_text(" ", strip=True)
|
||||||
|
interactive = "This is an interactive" in txt
|
||||||
|
return timeout_ms, memory_mb, interactive
|
||||||
|
|
||||||
|
|
||||||
def _extract_samples(html: str) -> list[TestCase]:
|
def _extract_samples(html: str) -> list[TestCase]:
|
||||||
|
|
@ -213,13 +216,16 @@ def _scrape_tasks_sync(contest_id: str) -> list[dict[str, str]]:
|
||||||
|
|
||||||
def _scrape_problem_page_sync(contest_id: str, slug: str) -> dict[str, Any]:
|
def _scrape_problem_page_sync(contest_id: str, slug: str) -> dict[str, Any]:
|
||||||
html = _fetch(f"{BASE_URL}/contests/{contest_id}/tasks/{slug}")
|
html = _fetch(f"{BASE_URL}/contests/{contest_id}/tasks/{slug}")
|
||||||
|
try:
|
||||||
tests = _extract_samples(html)
|
tests = _extract_samples(html)
|
||||||
timeout_ms, memory_mb = _extract_limits(html)
|
except Exception:
|
||||||
|
tests = []
|
||||||
|
timeout_ms, memory_mb, interactive = _extract_problem_info(html)
|
||||||
return {
|
return {
|
||||||
"tests": tests,
|
"tests": tests,
|
||||||
"timeout_ms": timeout_ms,
|
"timeout_ms": timeout_ms,
|
||||||
"memory_mb": memory_mb,
|
"memory_mb": memory_mb,
|
||||||
"interactive": False,
|
"interactive": interactive,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -309,43 +315,18 @@ class AtcoderScraper(BaseScraper):
|
||||||
slug = row.get("slug") or ""
|
slug = row.get("slug") or ""
|
||||||
if not letter or not slug:
|
if not letter or not slug:
|
||||||
return
|
return
|
||||||
try:
|
data = await asyncio.to_thread(_scrape_problem_page_sync, category_id, slug)
|
||||||
data = await asyncio.to_thread(
|
tests: list[TestCase] = data.get("tests", [])
|
||||||
_scrape_problem_page_sync, category_id, slug
|
|
||||||
)
|
|
||||||
tests: list[TestCase] = data["tests"]
|
|
||||||
if not tests:
|
|
||||||
print(
|
|
||||||
json.dumps(
|
|
||||||
{
|
|
||||||
"problem_id": letter,
|
|
||||||
"error": f"{self.platform_name}: no tests found",
|
|
||||||
}
|
|
||||||
),
|
|
||||||
flush=True,
|
|
||||||
)
|
|
||||||
return
|
|
||||||
print(
|
print(
|
||||||
json.dumps(
|
json.dumps(
|
||||||
{
|
{
|
||||||
"problem_id": letter,
|
"problem_id": letter,
|
||||||
"tests": [
|
"tests": [
|
||||||
{"input": t.input, "expected": t.expected}
|
{"input": t.input, "expected": t.expected} for t in tests
|
||||||
for t in tests
|
|
||||||
],
|
],
|
||||||
"timeout_ms": data["timeout_ms"],
|
"timeout_ms": data.get("timeout_ms", 0),
|
||||||
"memory_mb": data["memory_mb"],
|
"memory_mb": data.get("memory_mb", 0),
|
||||||
"interactive": bool(data["interactive"]),
|
"interactive": bool(data.get("interactive")),
|
||||||
}
|
|
||||||
),
|
|
||||||
flush=True,
|
|
||||||
)
|
|
||||||
except Exception as e:
|
|
||||||
print(
|
|
||||||
json.dumps(
|
|
||||||
{
|
|
||||||
"problem_id": letter,
|
|
||||||
"error": f"{self.platform_name}: {str(e)}",
|
|
||||||
}
|
}
|
||||||
),
|
),
|
||||||
flush=True,
|
flush=True,
|
||||||
|
|
|
||||||
|
|
@ -244,20 +244,7 @@ class CodeforcesScraper(BaseScraper):
|
||||||
|
|
||||||
for b in blocks:
|
for b in blocks:
|
||||||
pid = b["letter"].lower()
|
pid = b["letter"].lower()
|
||||||
tests: list[TestCase] = b["tests"]
|
tests: list[TestCase] = b.get("tests", [])
|
||||||
|
|
||||||
if not tests:
|
|
||||||
print(
|
|
||||||
json.dumps(
|
|
||||||
{
|
|
||||||
"problem_id": pid,
|
|
||||||
"error": f"{self.platform_name}: no tests found",
|
|
||||||
}
|
|
||||||
),
|
|
||||||
flush=True,
|
|
||||||
)
|
|
||||||
continue
|
|
||||||
|
|
||||||
print(
|
print(
|
||||||
json.dumps(
|
json.dumps(
|
||||||
{
|
{
|
||||||
|
|
@ -265,9 +252,9 @@ class CodeforcesScraper(BaseScraper):
|
||||||
"tests": [
|
"tests": [
|
||||||
{"input": t.input, "expected": t.expected} for t in tests
|
{"input": t.input, "expected": t.expected} for t in tests
|
||||||
],
|
],
|
||||||
"timeout_ms": b["timeout_ms"],
|
"timeout_ms": b.get("timeout_ms", 0),
|
||||||
"memory_mb": b["memory_mb"],
|
"memory_mb": b.get("memory_mb", 0),
|
||||||
"interactive": bool(b["interactive"]),
|
"interactive": bool(b.get("interactive")),
|
||||||
}
|
}
|
||||||
),
|
),
|
||||||
flush=True,
|
flush=True,
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue