feat(scrapers): update all scrapers to provide time & memory limit

This commit is contained in:
Barrett Ruth 2025-09-19 20:28:20 -04:00
parent e8157a5491
commit aedbccffb4
4 changed files with 327 additions and 183 deletions

View file

@@ -1,10 +1,14 @@
#!/usr/bin/env python3
import json
import re
import sys
from dataclasses import asdict
import requests
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, Tag
from .models import MetadataResult, ProblemSummary, TestCase, TestsResult
def parse_problem_url(problem_input: str) -> str | None:
@@ -15,10 +19,43 @@ def parse_problem_url(problem_input: str) -> str | None:
return None
def extract_problem_limits(soup: BeautifulSoup) -> tuple[int, int]:
    """Extract the time and memory limits from a CSES problem page.

    Args:
        soup: Parsed HTML of a CSES problem page
            (https://cses.fi/problemset/task/...).

    Returns:
        A ``(timeout_ms, memory_mb)`` tuple: the time limit in
        milliseconds and the memory limit in megabytes.

    Raises:
        ValueError: If the ``task-constraints`` list is missing, or if
            either limit cannot be parsed out of it.
    """
    timeout_ms: int | None = None
    memory_mb: int | None = None

    constraints_ul = soup.find("ul", class_="task-constraints")
    # isinstance() already rejects None, so a separate truthiness check
    # on the find() result is redundant.
    if not isinstance(constraints_ul, Tag):
        raise ValueError("Could not find task-constraints section")

    for li in constraints_ul.find_all("li"):
        text = li.get_text()
        if "Time limit:" in text:
            # CSES renders e.g. "Time limit: 1.00 s" — accept integer or
            # decimal seconds and convert to whole milliseconds.
            match = re.search(r"Time limit:\s*(\d+(?:\.\d+)?)\s*s", text)
            if match:
                timeout_ms = int(float(match.group(1)) * 1000)
        if "Memory limit:" in text:
            # CSES renders e.g. "Memory limit: 512 MB".
            match = re.search(r"Memory limit:\s*(\d+)\s*MB", text)
            if match:
                memory_mb = int(match.group(1))

    if timeout_ms is None:
        raise ValueError("Could not find valid timeout in task-constraints section")
    if memory_mb is None:
        raise ValueError(
            "Could not find valid memory limit in task-constraints section"
        )
    return timeout_ms, memory_mb
def process_problem_element(
element,
current_category: str | None,
all_categories: dict[str, list[dict[str, str]]],
all_categories: dict[str, list[ProblemSummary]],
) -> str | None:
if element.name == "h1":
category_name = element.get_text().strip()
@@ -39,11 +76,12 @@ def process_problem_element(
if not (problem_id.isdigit() and problem_name and current_category):
return current_category
all_categories[current_category].append({"id": problem_id, "name": problem_name})
problem = ProblemSummary(id=problem_id, name=problem_name)
all_categories[current_category].append(problem)
return current_category
def scrape_all_problems() -> dict[str, list[dict[str, str]]]:
def scrape_all_problems() -> dict[str, list[ProblemSummary]]:
try:
problemset_url = "https://cses.fi/problemset/"
headers = {
@@ -54,7 +92,7 @@ def scrape_all_problems() -> dict[str, list[dict[str, str]]]:
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
all_categories: dict[str, list[dict[str, str]]] = {}
all_categories: dict[str, list[ProblemSummary]] = {}
problem_links = soup.find_all(
"a", href=lambda x: x and "/problemset/task/" in x
@@ -68,7 +106,7 @@ def scrape_all_problems() -> dict[str, list[dict[str, str]]]:
)
for category in all_categories:
all_categories[category].sort(key=lambda x: int(x["id"]))
all_categories[category].sort(key=lambda x: int(x.id))
print(f"Found {len(all_categories)} categories", file=sys.stderr)
return all_categories
@@ -129,79 +167,65 @@ def scrape(url: str) -> list[tuple[str, str]]:
def main() -> None:
if len(sys.argv) < 2:
print(
json.dumps(
{
"success": False,
"error": "Usage: cses.py metadata OR cses.py tests <problem_id_or_url>",
}
)
result = MetadataResult(
success=False,
error="Usage: cses.py metadata OR cses.py tests <problem_id_or_url>",
)
print(json.dumps(asdict(result)))
sys.exit(1)
mode: str = sys.argv[1]
if mode == "metadata":
if len(sys.argv) != 2:
print(
json.dumps(
{
"success": False,
"error": "Usage: cses.py metadata",
}
)
result = MetadataResult(
success=False,
error="Usage: cses.py metadata",
)
print(json.dumps(asdict(result)))
sys.exit(1)
all_categories: dict[str, list[dict[str, str]]] = scrape_all_problems()
all_categories: dict[str, list[ProblemSummary]] = scrape_all_problems()
if not all_categories:
print(
json.dumps(
{
"success": False,
"error": "Failed to scrape CSES problem categories",
}
)
result = MetadataResult(
success=False,
error="Failed to scrape CSES problem categories",
)
print(json.dumps(asdict(result)))
sys.exit(1)
print(
json.dumps(
{
"success": True,
"categories": all_categories,
}
)
)
result = MetadataResult(success=True, error="", categories=all_categories)
print(json.dumps(asdict(result)))
elif mode == "tests":
if len(sys.argv) != 3:
print(
json.dumps(
{
"success": False,
"error": "Usage: cses.py tests <problem_id_or_url>",
}
)
tests_result = TestsResult(
success=False,
error="Usage: cses.py tests <problem_id_or_url>",
problem_id="",
url="",
tests=[],
timeout_ms=0,
memory_mb=0,
)
print(json.dumps(asdict(tests_result)))
sys.exit(1)
problem_input: str = sys.argv[2]
url: str | None = parse_problem_url(problem_input)
if not url:
print(
json.dumps(
{
"success": False,
"error": f"Invalid problem input: {problem_input}. Use either problem ID (e.g., 1068) or full URL",
"problem_id": problem_input
if problem_input.isdigit()
else None,
}
)
tests_result = TestsResult(
success=False,
error=f"Invalid problem input: {problem_input}. Use either problem ID (e.g., 1068) or full URL",
problem_id=problem_input if problem_input.isdigit() else "",
url="",
tests=[],
timeout_ms=0,
memory_mb=0,
)
print(json.dumps(asdict(tests_result)))
sys.exit(1)
tests: list[tuple[str, str]] = scrape(url)
@@ -210,43 +234,58 @@ def main() -> None:
problem_input if problem_input.isdigit() else problem_input.split("/")[-1]
)
if not tests:
print(
json.dumps(
{
"success": False,
"error": f"No tests found for {problem_input}",
"problem_id": problem_id,
"url": url,
}
)
try:
headers = {
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
timeout_ms, memory_mb = extract_problem_limits(soup)
except Exception as e:
tests_result = TestsResult(
success=False,
error=f"Failed to extract constraints: {e}",
problem_id=problem_id,
url=url,
tests=[],
timeout_ms=0,
memory_mb=0,
)
print(json.dumps(asdict(tests_result)))
sys.exit(1)
test_list: list[dict[str, str]] = [
{"input": i, "expected": o} for i, o in tests
]
print(
json.dumps(
{
"success": True,
"problem_id": problem_id,
"url": url,
"tests": test_list,
}
if not tests:
tests_result = TestsResult(
success=False,
error=f"No tests found for {problem_input}",
problem_id=problem_id,
url=url,
tests=[],
timeout_ms=timeout_ms,
memory_mb=memory_mb,
)
print(json.dumps(asdict(tests_result)))
sys.exit(1)
test_cases = [TestCase(input=i, expected=o) for i, o in tests]
tests_result = TestsResult(
success=True,
error="",
problem_id=problem_id,
url=url,
tests=test_cases,
timeout_ms=timeout_ms,
memory_mb=memory_mb,
)
print(json.dumps(asdict(tests_result)))
else:
print(
json.dumps(
{
"success": False,
"error": f"Unknown mode: {mode}. Use 'metadata' or 'tests'",
}
)
result = MetadataResult(
success=False,
error=f"Unknown mode: {mode}. Use 'metadata' or 'tests'",
)
print(json.dumps(asdict(result)))
sys.exit(1)