feat(scrapers): update all scrapers to provide time & memory limit

This commit is contained in:
Barrett Ruth 2025-09-19 20:28:20 -04:00
parent e8157a5491
commit aedbccffb4
4 changed files with 327 additions and 183 deletions

View file

@ -7,7 +7,7 @@ from dataclasses import asdict
import cloudscraper
from bs4 import BeautifulSoup, Tag
from .models import MetadataResult, Problem, TestCase, TestsResult
from .models import MetadataResult, ProblemSummary, TestCase, TestsResult
def scrape(url: str) -> list[TestCase]:
@ -140,7 +140,37 @@ def parse_problem_url(contest_id: str, problem_letter: str) -> str:
)
def scrape_contest_problems(contest_id: str) -> list[Problem]:
def extract_problem_limits(soup: BeautifulSoup) -> tuple[int, int]:
    """Extract the time and memory limits from a parsed problem page.

    Looks up the ``div.time-limit`` and ``div.memory-limit`` elements of
    *soup* and parses the numeric limit out of each element's text.

    Args:
        soup: Parsed HTML of a single problem statement page.

    Returns:
        A ``(timeout_ms, memory_mb)`` tuple: the time limit in whole
        milliseconds and the memory limit in megabytes.

    Raises:
        ValueError: If either limit is missing or its text cannot be parsed.
    """
    import re

    timeout_ms = None
    time_limit_div = soup.find("div", class_="time-limit")
    if time_limit_div:
        text = time_limit_div.get_text().strip()
        # The label and the value are concatenated without a separator
        # (e.g. "time limit per test1.5 seconds"), so the fractional part
        # must be captured explicitly; a bare ``\d+`` would match only the
        # digits after the decimal point and report 5 s for "1.5 seconds".
        match = re.search(r"(\d+(?:\.\d+)?)\s*seconds?", text)
        if match:
            # round() guards against float artefacts such as 100.00000000000001.
            timeout_ms = round(float(match.group(1)) * 1000)
    if timeout_ms is None:
        raise ValueError("Could not find valid timeout in time-limit section")

    memory_mb = None
    memory_limit_div = soup.find("div", class_="memory-limit")
    if memory_limit_div:
        text = memory_limit_div.get_text().strip()
        match = re.search(r"(\d+)\s*megabytes?", text)
        if match:
            memory_mb = int(match.group(1))
    if memory_mb is None:
        raise ValueError("Could not find valid memory limit in memory-limit section")

    return timeout_ms, memory_mb
def scrape_contest_problems(contest_id: str) -> list[ProblemSummary]:
try:
contest_url: str = f"https://codeforces.com/contest/{contest_id}"
scraper = cloudscraper.create_scraper()
@ -148,7 +178,7 @@ def scrape_contest_problems(contest_id: str) -> list[Problem]:
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
problems: list[Problem] = []
problems: list[ProblemSummary] = []
problem_links = soup.find_all(
"a", href=lambda x: x and f"/contest/{contest_id}/problem/" in x
@ -163,12 +193,14 @@ def scrape_contest_problems(contest_id: str) -> list[Problem]:
problem_name: str = link.get_text(strip=True)
if problem_letter and problem_name:
problems.append(Problem(id=problem_letter, name=problem_name))
problems.append(
ProblemSummary(id=problem_letter, name=problem_name)
)
problems.sort(key=lambda x: x.id)
seen: set[str] = set()
unique_problems: list[Problem] = []
unique_problems: list[ProblemSummary] = []
for p in problems:
if p.id not in seen:
seen.add(p.id)
@ -206,7 +238,7 @@ def main() -> None:
sys.exit(1)
contest_id: str = sys.argv[2]
problems: list[Problem] = scrape_contest_problems(contest_id)
problems: list[ProblemSummary] = scrape_contest_problems(contest_id)
if not problems:
result = MetadataResult(
@ -215,7 +247,9 @@ def main() -> None:
print(json.dumps(asdict(result)))
sys.exit(1)
result = MetadataResult(success=True, contest_id=contest_id, problems=problems)
result = MetadataResult(
success=True, error="", contest_id=contest_id, problems=problems
)
print(json.dumps(asdict(result)))
elif mode == "tests":
@ -223,6 +257,11 @@ def main() -> None:
tests_result = TestsResult(
success=False,
error="Usage: codeforces.py tests <contest_id> <problem_letter>",
problem_id="",
url="",
tests=[],
timeout_ms=0,
memory_mb=0,
)
print(json.dumps(asdict(tests_result)))
sys.exit(1)
@ -234,18 +273,46 @@ def main() -> None:
url: str = parse_problem_url(tests_contest_id, problem_letter)
tests: list[TestCase] = scrape_sample_tests(url)
try:
scraper = cloudscraper.create_scraper()
response = scraper.get(url, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
timeout_ms, memory_mb = extract_problem_limits(soup)
except Exception as e:
tests_result = TestsResult(
success=False,
error=f"Failed to extract constraints: {e}",
problem_id=problem_id,
url=url,
tests=[],
timeout_ms=0,
memory_mb=0,
)
print(json.dumps(asdict(tests_result)))
sys.exit(1)
if not tests:
tests_result = TestsResult(
success=False,
error=f"No tests found for {tests_contest_id} {problem_letter}",
problem_id=problem_id,
url=url,
tests=[],
timeout_ms=timeout_ms,
memory_mb=memory_mb,
)
print(json.dumps(asdict(tests_result)))
sys.exit(1)
tests_result = TestsResult(
success=True, problem_id=problem_id, url=url, tests=tests
success=True,
error="",
problem_id=problem_id,
url=url,
tests=tests,
timeout_ms=timeout_ms,
memory_mb=memory_mb,
)
print(json.dumps(asdict(tests_result)))