feat(scrapers): update all scrapers to provide time & memory limit

This commit is contained in:
Barrett Ruth 2025-09-19 20:28:20 -04:00
parent e8157a5491
commit aedbccffb4
4 changed files with 327 additions and 183 deletions

View file

@ -1,18 +1,49 @@
#!/usr/bin/env python3
import json
import re
import sys
from dataclasses import asdict
import requests
from bs4 import BeautifulSoup, Tag
from .models import MetadataResult, ProblemSummary, TestCase, TestsResult
def extract_problem_limits(soup: "BeautifulSoup") -> tuple[int, int]:
    """Extract the time and memory limits from an AtCoder problem page.

    Scans the page's <p> tags for the single paragraph that contains both
    "Time Limit:" and "Memory Limit:" (AtCoder renders them together).

    Args:
        soup: parsed problem page.

    Returns:
        (timeout_ms, memory_mb) tuple.

    Raises:
        ValueError: if either limit cannot be found.
    """
    timeout_ms: int | None = None
    memory_mb: int | None = None

    for p in soup.find_all("p"):
        text = p.get_text()
        if "Time Limit:" not in text or "Memory Limit:" not in text:
            continue
        # Some AtCoder problems use fractional limits (e.g. "2.5 sec"),
        # so accept an optional decimal part rather than integers only.
        time_match = re.search(r"Time Limit:\s*(\d+(?:\.\d+)?)\s*sec", text)
        if time_match:
            timeout_ms = int(float(time_match.group(1)) * 1000)
        memory_match = re.search(r"Memory Limit:\s*(\d+)\s*MiB", text)
        if memory_match:
            memory_mb = int(memory_match.group(1))
        break

    if timeout_ms is None:
        raise ValueError("Could not find valid timeout in problem constraints")
    if memory_mb is None:
        raise ValueError("Could not find valid memory limit in problem constraints")
    return timeout_ms, memory_mb
def parse_problem_url(contest_id: str, problem_letter: str) -> str:
    """Build the AtCoder task URL for a contest/problem pair."""
    return (
        f"https://atcoder.jp/contests/{contest_id}"
        f"/tasks/{contest_id}_{problem_letter}"
    )
def extract_problem_from_row(row, contest_id: str) -> dict[str, str] | None:
def extract_problem_from_row(row, contest_id: str) -> ProblemSummary | None:
cells = row.find_all("td")
if len(cells) < 2:
return None
@ -34,10 +65,10 @@ def extract_problem_from_row(row, contest_id: str) -> dict[str, str] | None:
if not problem_letter or not task_name:
return None
return {"id": problem_letter.lower(), "name": task_name}
return ProblemSummary(id=problem_letter.lower(), name=task_name)
def scrape_contest_problems(contest_id: str) -> list[dict[str, str]]:
def scrape_contest_problems(contest_id: str) -> list[ProblemSummary]:
try:
contest_url = f"https://atcoder.jp/contests/{contest_id}/tasks"
headers = {
@ -53,13 +84,13 @@ def scrape_contest_problems(contest_id: str) -> list[dict[str, str]]:
return []
rows = task_table.find_all("tr")[1:]
problems: list[dict[str, str]] = []
problems: list[ProblemSummary] = []
for row in rows:
problem = extract_problem_from_row(row, contest_id)
if problem:
problems.append(problem)
problems.sort(key=lambda x: x["id"])
problems.sort(key=lambda x: x.id)
return problems
except Exception as e:
@ -95,7 +126,7 @@ def extract_test_case_from_headers(sample_headers, i: int) -> tuple[str, str] |
return (input_text, output_text)
def scrape(url: str) -> list[tuple[str, str]]:
def scrape(url: str) -> list[TestCase]:
try:
headers = {
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
@ -109,12 +140,13 @@ def scrape(url: str) -> list[tuple[str, str]]:
"h3", string=lambda x: x and "sample" in x.lower() if x else False
)
tests: list[tuple[str, str]] = []
tests: list[TestCase] = []
i = 0
while i < len(sample_headers):
test_case = extract_test_case_from_headers(sample_headers, i)
if test_case:
tests.append(test_case)
input_text, output_text = test_case
tests.append(TestCase(input=input_text, expected=output_text))
i += 2
else:
i += 1
@ -128,64 +160,55 @@ def scrape(url: str) -> list[tuple[str, str]]:
def main() -> None:
if len(sys.argv) < 2:
print(
json.dumps(
{
"success": False,
"error": "Usage: atcoder.py metadata <contest_id> OR atcoder.py tests <contest_id> <problem_letter>",
}
)
result = MetadataResult(
success=False,
error="Usage: atcoder.py metadata <contest_id> OR atcoder.py tests <contest_id> <problem_letter>",
)
print(json.dumps(asdict(result)))
sys.exit(1)
mode: str = sys.argv[1]
if mode == "metadata":
if len(sys.argv) != 3:
print(
json.dumps(
{
"success": False,
"error": "Usage: atcoder.py metadata <contest_id>",
}
)
result = MetadataResult(
success=False,
error="Usage: atcoder.py metadata <contest_id>",
)
print(json.dumps(asdict(result)))
sys.exit(1)
contest_id: str = sys.argv[2]
problems: list[dict[str, str]] = scrape_contest_problems(contest_id)
problems: list[ProblemSummary] = scrape_contest_problems(contest_id)
if not problems:
print(
json.dumps(
{
"success": False,
"error": f"No problems found for contest {contest_id}",
}
)
result = MetadataResult(
success=False,
error=f"No problems found for contest {contest_id}",
)
print(json.dumps(asdict(result)))
sys.exit(1)
print(
json.dumps(
{
"success": True,
"contest_id": contest_id,
"problems": problems,
}
)
result = MetadataResult(
success=True,
error="",
contest_id=contest_id,
problems=problems,
)
print(json.dumps(asdict(result)))
elif mode == "tests":
if len(sys.argv) != 4:
print(
json.dumps(
{
"success": False,
"error": "Usage: atcoder.py tests <contest_id> <problem_letter>",
}
)
tests_result = TestsResult(
success=False,
error="Usage: atcoder.py tests <contest_id> <problem_letter>",
problem_id="",
url="",
tests=[],
timeout_ms=0,
memory_mb=0,
)
print(json.dumps(asdict(tests_result)))
sys.exit(1)
test_contest_id: str = sys.argv[2]
@ -193,46 +216,59 @@ def main() -> None:
problem_id: str = f"{test_contest_id}_{problem_letter.lower()}"
url: str = parse_problem_url(test_contest_id, problem_letter)
print(f"Scraping: {url}", file=sys.stderr)
tests: list[TestCase] = scrape(url)
tests: list[tuple[str, str]] = scrape(url)
if not tests:
print(
json.dumps(
{
"success": False,
"error": f"No tests found for {test_contest_id} {problem_letter}",
"problem_id": problem_id,
"url": url,
}
)
try:
headers = {
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
timeout_ms, memory_mb = extract_problem_limits(soup)
except Exception as e:
tests_result = TestsResult(
success=False,
error=f"Failed to extract constraints: {e}",
problem_id=problem_id,
url=url,
tests=[],
timeout_ms=0,
memory_mb=0,
)
print(json.dumps(asdict(tests_result)))
sys.exit(1)
test_list: list[dict[str, str]] = [
{"input": i, "expected": o} for i, o in tests
]
print(
json.dumps(
{
"success": True,
"problem_id": problem_id,
"url": url,
"tests": test_list,
}
if not tests:
tests_result = TestsResult(
success=False,
error=f"No tests found for {test_contest_id} {problem_letter}",
problem_id=problem_id,
url=url,
tests=[],
timeout_ms=timeout_ms,
memory_mb=memory_mb,
)
print(json.dumps(asdict(tests_result)))
sys.exit(1)
tests_result = TestsResult(
success=True,
error="",
problem_id=problem_id,
url=url,
tests=tests,
timeout_ms=timeout_ms,
memory_mb=memory_mb,
)
print(json.dumps(asdict(tests_result)))
else:
print(
json.dumps(
{
"success": False,
"error": f"Unknown mode: {mode}. Use 'metadata' or 'tests'",
}
)
result = MetadataResult(
success=False,
error=f"Unknown mode: {mode}. Use 'metadata' or 'tests'",
)
print(json.dumps(asdict(result)))
sys.exit(1)

View file

@ -7,7 +7,7 @@ from dataclasses import asdict
import cloudscraper
from bs4 import BeautifulSoup, Tag
from .models import MetadataResult, Problem, TestCase, TestsResult
from .models import MetadataResult, ProblemSummary, TestCase, TestsResult
def scrape(url: str) -> list[TestCase]:
@ -140,7 +140,37 @@ def parse_problem_url(contest_id: str, problem_letter: str) -> str:
)
def scrape_contest_problems(contest_id: str) -> list[Problem]:
def extract_problem_limits(soup: "BeautifulSoup") -> tuple[int, int]:
    """Extract the time and memory limits from a Codeforces problem page.

    Reads the "time-limit" and "memory-limit" header divs of the statement.

    Args:
        soup: parsed problem page.

    Returns:
        (timeout_ms, memory_mb) tuple.

    Raises:
        ValueError: if either limit cannot be parsed.
    """
    import re

    timeout_ms = None
    memory_mb = None

    time_limit_div = soup.find("div", class_="time-limit")
    if time_limit_div:
        text = time_limit_div.get_text().strip()
        # Codeforces uses fractional limits (e.g. "1.5 seconds"). An
        # integer-only pattern would match the digits AFTER the decimal
        # point ("5 seconds" -> 5000 ms), so allow a decimal part.
        match = re.search(r"(\d+(?:\.\d+)?) seconds?", text)
        if match:
            timeout_ms = int(float(match.group(1)) * 1000)
    if timeout_ms is None:
        raise ValueError("Could not find valid timeout in time-limit section")

    memory_limit_div = soup.find("div", class_="memory-limit")
    if memory_limit_div:
        text = memory_limit_div.get_text().strip()
        match = re.search(r"(\d+) megabytes", text)
        if match:
            memory_mb = int(match.group(1))
    if memory_mb is None:
        raise ValueError("Could not find valid memory limit in memory-limit section")

    return timeout_ms, memory_mb
def scrape_contest_problems(contest_id: str) -> list[ProblemSummary]:
try:
contest_url: str = f"https://codeforces.com/contest/{contest_id}"
scraper = cloudscraper.create_scraper()
@ -148,7 +178,7 @@ def scrape_contest_problems(contest_id: str) -> list[Problem]:
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
problems: list[Problem] = []
problems: list[ProblemSummary] = []
problem_links = soup.find_all(
"a", href=lambda x: x and f"/contest/{contest_id}/problem/" in x
@ -163,12 +193,14 @@ def scrape_contest_problems(contest_id: str) -> list[Problem]:
problem_name: str = link.get_text(strip=True)
if problem_letter and problem_name:
problems.append(Problem(id=problem_letter, name=problem_name))
problems.append(
ProblemSummary(id=problem_letter, name=problem_name)
)
problems.sort(key=lambda x: x.id)
seen: set[str] = set()
unique_problems: list[Problem] = []
unique_problems: list[ProblemSummary] = []
for p in problems:
if p.id not in seen:
seen.add(p.id)
@ -206,7 +238,7 @@ def main() -> None:
sys.exit(1)
contest_id: str = sys.argv[2]
problems: list[Problem] = scrape_contest_problems(contest_id)
problems: list[ProblemSummary] = scrape_contest_problems(contest_id)
if not problems:
result = MetadataResult(
@ -215,7 +247,9 @@ def main() -> None:
print(json.dumps(asdict(result)))
sys.exit(1)
result = MetadataResult(success=True, contest_id=contest_id, problems=problems)
result = MetadataResult(
success=True, error="", contest_id=contest_id, problems=problems
)
print(json.dumps(asdict(result)))
elif mode == "tests":
@ -223,6 +257,11 @@ def main() -> None:
tests_result = TestsResult(
success=False,
error="Usage: codeforces.py tests <contest_id> <problem_letter>",
problem_id="",
url="",
tests=[],
timeout_ms=0,
memory_mb=0,
)
print(json.dumps(asdict(tests_result)))
sys.exit(1)
@ -234,18 +273,46 @@ def main() -> None:
url: str = parse_problem_url(tests_contest_id, problem_letter)
tests: list[TestCase] = scrape_sample_tests(url)
try:
scraper = cloudscraper.create_scraper()
response = scraper.get(url, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
timeout_ms, memory_mb = extract_problem_limits(soup)
except Exception as e:
tests_result = TestsResult(
success=False,
error=f"Failed to extract constraints: {e}",
problem_id=problem_id,
url=url,
tests=[],
timeout_ms=0,
memory_mb=0,
)
print(json.dumps(asdict(tests_result)))
sys.exit(1)
if not tests:
tests_result = TestsResult(
success=False,
error=f"No tests found for {tests_contest_id} {problem_letter}",
problem_id=problem_id,
url=url,
tests=[],
timeout_ms=timeout_ms,
memory_mb=memory_mb,
)
print(json.dumps(asdict(tests_result)))
sys.exit(1)
tests_result = TestsResult(
success=True, problem_id=problem_id, url=url, tests=tests
success=True,
error="",
problem_id=problem_id,
url=url,
tests=tests,
timeout_ms=timeout_ms,
memory_mb=memory_mb,
)
print(json.dumps(asdict(tests_result)))

View file

@ -1,10 +1,14 @@
#!/usr/bin/env python3
import json
import re
import sys
from dataclasses import asdict
import requests
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, Tag
from .models import MetadataResult, ProblemSummary, TestCase, TestsResult
def parse_problem_url(problem_input: str) -> str | None:
@ -15,10 +19,43 @@ def parse_problem_url(problem_input: str) -> str | None:
return None
def extract_problem_limits(soup: BeautifulSoup) -> tuple[int, int]:
    """Extract the time and memory limits from a CSES problem page.

    Parses the "task-constraints" <ul>, whose items read e.g.
    "Time limit: 1.00 s" and "Memory limit: 512 MB".

    Returns:
        (timeout_ms, memory_mb) tuple.

    Raises:
        ValueError: if the constraints list or either limit is missing.
    """
    constraints_ul = soup.find("ul", class_="task-constraints")
    if not constraints_ul or not isinstance(constraints_ul, Tag):
        raise ValueError("Could not find task-constraints section")

    timeout_ms: int | None = None
    memory_mb: int | None = None
    for item in constraints_ul.find_all("li"):
        item_text = item.get_text()
        if "Time limit:" in item_text:
            # CSES lists fractional seconds, so keep the decimal part.
            time_match = re.search(r"Time limit:\s*(\d+(?:\.\d+)?)\s*s", item_text)
            if time_match:
                timeout_ms = int(float(time_match.group(1)) * 1000)
        if "Memory limit:" in item_text:
            mem_match = re.search(r"Memory limit:\s*(\d+)\s*MB", item_text)
            if mem_match:
                memory_mb = int(mem_match.group(1))

    if timeout_ms is None:
        raise ValueError("Could not find valid timeout in task-constraints section")
    if memory_mb is None:
        raise ValueError(
            "Could not find valid memory limit in task-constraints section"
        )
    return timeout_ms, memory_mb
def process_problem_element(
element,
current_category: str | None,
all_categories: dict[str, list[dict[str, str]]],
all_categories: dict[str, list[ProblemSummary]],
) -> str | None:
if element.name == "h1":
category_name = element.get_text().strip()
@ -39,11 +76,12 @@ def process_problem_element(
if not (problem_id.isdigit() and problem_name and current_category):
return current_category
all_categories[current_category].append({"id": problem_id, "name": problem_name})
problem = ProblemSummary(id=problem_id, name=problem_name)
all_categories[current_category].append(problem)
return current_category
def scrape_all_problems() -> dict[str, list[dict[str, str]]]:
def scrape_all_problems() -> dict[str, list[ProblemSummary]]:
try:
problemset_url = "https://cses.fi/problemset/"
headers = {
@ -54,7 +92,7 @@ def scrape_all_problems() -> dict[str, list[dict[str, str]]]:
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
all_categories: dict[str, list[dict[str, str]]] = {}
all_categories: dict[str, list[ProblemSummary]] = {}
problem_links = soup.find_all(
"a", href=lambda x: x and "/problemset/task/" in x
@ -68,7 +106,7 @@ def scrape_all_problems() -> dict[str, list[dict[str, str]]]:
)
for category in all_categories:
all_categories[category].sort(key=lambda x: int(x["id"]))
all_categories[category].sort(key=lambda x: int(x.id))
print(f"Found {len(all_categories)} categories", file=sys.stderr)
return all_categories
@ -129,79 +167,65 @@ def scrape(url: str) -> list[tuple[str, str]]:
def main() -> None:
if len(sys.argv) < 2:
print(
json.dumps(
{
"success": False,
"error": "Usage: cses.py metadata OR cses.py tests <problem_id_or_url>",
}
)
result = MetadataResult(
success=False,
error="Usage: cses.py metadata OR cses.py tests <problem_id_or_url>",
)
print(json.dumps(asdict(result)))
sys.exit(1)
mode: str = sys.argv[1]
if mode == "metadata":
if len(sys.argv) != 2:
print(
json.dumps(
{
"success": False,
"error": "Usage: cses.py metadata",
}
)
result = MetadataResult(
success=False,
error="Usage: cses.py metadata",
)
print(json.dumps(asdict(result)))
sys.exit(1)
all_categories: dict[str, list[dict[str, str]]] = scrape_all_problems()
all_categories: dict[str, list[ProblemSummary]] = scrape_all_problems()
if not all_categories:
print(
json.dumps(
{
"success": False,
"error": "Failed to scrape CSES problem categories",
}
)
result = MetadataResult(
success=False,
error="Failed to scrape CSES problem categories",
)
print(json.dumps(asdict(result)))
sys.exit(1)
print(
json.dumps(
{
"success": True,
"categories": all_categories,
}
)
)
result = MetadataResult(success=True, error="", categories=all_categories)
print(json.dumps(asdict(result)))
elif mode == "tests":
if len(sys.argv) != 3:
print(
json.dumps(
{
"success": False,
"error": "Usage: cses.py tests <problem_id_or_url>",
}
)
tests_result = TestsResult(
success=False,
error="Usage: cses.py tests <problem_id_or_url>",
problem_id="",
url="",
tests=[],
timeout_ms=0,
memory_mb=0,
)
print(json.dumps(asdict(tests_result)))
sys.exit(1)
problem_input: str = sys.argv[2]
url: str | None = parse_problem_url(problem_input)
if not url:
print(
json.dumps(
{
"success": False,
"error": f"Invalid problem input: {problem_input}. Use either problem ID (e.g., 1068) or full URL",
"problem_id": problem_input
if problem_input.isdigit()
else None,
}
)
tests_result = TestsResult(
success=False,
error=f"Invalid problem input: {problem_input}. Use either problem ID (e.g., 1068) or full URL",
problem_id=problem_input if problem_input.isdigit() else "",
url="",
tests=[],
timeout_ms=0,
memory_mb=0,
)
print(json.dumps(asdict(tests_result)))
sys.exit(1)
tests: list[tuple[str, str]] = scrape(url)
@ -210,43 +234,58 @@ def main() -> None:
problem_input if problem_input.isdigit() else problem_input.split("/")[-1]
)
if not tests:
print(
json.dumps(
{
"success": False,
"error": f"No tests found for {problem_input}",
"problem_id": problem_id,
"url": url,
}
)
try:
headers = {
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
timeout_ms, memory_mb = extract_problem_limits(soup)
except Exception as e:
tests_result = TestsResult(
success=False,
error=f"Failed to extract constraints: {e}",
problem_id=problem_id,
url=url,
tests=[],
timeout_ms=0,
memory_mb=0,
)
print(json.dumps(asdict(tests_result)))
sys.exit(1)
test_list: list[dict[str, str]] = [
{"input": i, "expected": o} for i, o in tests
]
print(
json.dumps(
{
"success": True,
"problem_id": problem_id,
"url": url,
"tests": test_list,
}
if not tests:
tests_result = TestsResult(
success=False,
error=f"No tests found for {problem_input}",
problem_id=problem_id,
url=url,
tests=[],
timeout_ms=timeout_ms,
memory_mb=memory_mb,
)
print(json.dumps(asdict(tests_result)))
sys.exit(1)
test_cases = [TestCase(input=i, expected=o) for i, o in tests]
tests_result = TestsResult(
success=True,
error="",
problem_id=problem_id,
url=url,
tests=test_cases,
timeout_ms=timeout_ms,
memory_mb=memory_mb,
)
print(json.dumps(asdict(tests_result)))
else:
print(
json.dumps(
{
"success": False,
"error": f"Unknown mode: {mode}. Use 'metadata' or 'tests'",
}
)
result = MetadataResult(
success=False,
error=f"Unknown mode: {mode}. Use 'metadata' or 'tests'",
)
print(json.dumps(asdict(result)))
sys.exit(1)

View file

@ -1,4 +1,4 @@
from dataclasses import dataclass
from dataclasses import dataclass, field
@dataclass
@ -8,34 +8,36 @@ class TestCase:
@dataclass
class Problem:
class ProblemSummary:
id: str
name: str
@dataclass
class Problem:
id: str
name: str
timeout_ms: int
memory_mb: int
@dataclass
class ScrapingResult:
success: bool
error: str | None = None
error: str
@dataclass
class MetadataResult(ScrapingResult):
contest_id: str | None = None
problems: list[Problem] | None = None
categories: dict[str, list[Problem]] | None = None
def __post_init__(self):
if self.problems is None:
self.problems = []
contest_id: str = ""
problems: list[ProblemSummary] = field(default_factory=list)
categories: dict[str, list[ProblemSummary]] = field(default_factory=dict)
@dataclass
class TestsResult(ScrapingResult):
problem_id: str = ""
url: str = ""
tests: list[TestCase] | None = None
def __post_init__(self):
if self.tests is None:
self.tests = []
problem_id: str
url: str
tests: list[TestCase]
timeout_ms: int
memory_mb: int