diff --git a/scrapers/atcoder.py b/scrapers/atcoder.py index b9b39ea..d9bf3c5 100644 --- a/scrapers/atcoder.py +++ b/scrapers/atcoder.py @@ -1,18 +1,49 @@ #!/usr/bin/env python3 import json +import re import sys +from dataclasses import asdict import requests from bs4 import BeautifulSoup, Tag +from .models import MetadataResult, ProblemSummary, TestCase, TestsResult + + +def extract_problem_limits(soup: BeautifulSoup) -> tuple[int, int]: + timeout_ms = None + memory_mb = None + + paragraphs = soup.find_all("p") + for p in paragraphs: + text = p.get_text() + if "Time Limit:" in text and "Memory Limit:" in text: + time_match = re.search(r"Time Limit:\s*(\d+)\s*sec", text) + if time_match: + seconds = int(time_match.group(1)) + timeout_ms = seconds * 1000 + + memory_match = re.search(r"Memory Limit:\s*(\d+)\s*MiB", text) + if memory_match: + memory_mb = int(memory_match.group(1)) + break + + if timeout_ms is None: + raise ValueError("Could not find valid timeout in problem constraints") + + if memory_mb is None: + raise ValueError("Could not find valid memory limit in problem constraints") + + return timeout_ms, memory_mb + def parse_problem_url(contest_id: str, problem_letter: str) -> str: task_id: str = f"{contest_id}_{problem_letter}" return f"https://atcoder.jp/contests/{contest_id}/tasks/{task_id}" -def extract_problem_from_row(row, contest_id: str) -> dict[str, str] | None: +def extract_problem_from_row(row, contest_id: str) -> ProblemSummary | None: cells = row.find_all("td") if len(cells) < 2: return None @@ -34,10 +65,10 @@ def extract_problem_from_row(row, contest_id: str) -> dict[str, str] | None: if not problem_letter or not task_name: return None - return {"id": problem_letter.lower(), "name": task_name} + return ProblemSummary(id=problem_letter.lower(), name=task_name) -def scrape_contest_problems(contest_id: str) -> list[dict[str, str]]: +def scrape_contest_problems(contest_id: str) -> list[ProblemSummary]: try: contest_url = f"https://atcoder.jp/contests/{contest_id}/tasks" headers = { @@ -53,13 +84,13 @@ def scrape_contest_problems(contest_id: str) -> list[dict[str, str]]: return [] rows = task_table.find_all("tr")[1:] - problems: list[dict[str, str]] = [] + problems: list[ProblemSummary] = [] for row in rows: problem = extract_problem_from_row(row, contest_id) if problem: problems.append(problem) - problems.sort(key=lambda x: x["id"]) + problems.sort(key=lambda x: x.id) return problems except Exception as e: @@ -95,7 +126,7 @@ def extract_test_case_from_headers(sample_headers, i: int) -> tuple[str, str] | return (input_text, output_text) -def scrape(url: str) -> list[tuple[str, str]]: +def scrape(url: str) -> list[TestCase]: try: headers = { "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" @@ -109,12 +140,13 @@ def scrape(url: str) -> list[tuple[str, str]]: "h3", string=lambda x: x and "sample" in x.lower() if x else False ) - tests: list[tuple[str, str]] = [] + tests: list[TestCase] = [] i = 0 while i < len(sample_headers): test_case = extract_test_case_from_headers(sample_headers, i) if test_case: - tests.append(test_case) + input_text, output_text = test_case + tests.append(TestCase(input=input_text, expected=output_text)) i += 2 else: i += 1 @@ -128,64 +160,55 @@ def scrape(url: str) -> list[tuple[str, str]]: def main() -> None: if len(sys.argv) < 2: - print( - json.dumps( - { - "success": False, - "error": "Usage: atcoder.py metadata OR atcoder.py tests ", - } - ) + result = MetadataResult( + success=False, + error="Usage: atcoder.py metadata OR atcoder.py tests ", ) + print(json.dumps(asdict(result))) sys.exit(1) mode: str = sys.argv[1] if mode == "metadata": if len(sys.argv) != 3: - print( - json.dumps( - { - "success": False, - "error": "Usage: atcoder.py metadata ", - } - ) + result = MetadataResult( + success=False, + error="Usage: atcoder.py metadata ", ) + print(json.dumps(asdict(result))) sys.exit(1) contest_id: str = sys.argv[2] - problems: list[dict[str, str]] = scrape_contest_problems(contest_id) + problems: list[ProblemSummary] = scrape_contest_problems(contest_id) if not problems: - print( - json.dumps( - { - "success": False, - "error": f"No problems found for contest {contest_id}", - } - ) + result = MetadataResult( + success=False, + error=f"No problems found for contest {contest_id}", ) + print(json.dumps(asdict(result))) sys.exit(1) - print( - json.dumps( - { - "success": True, - "contest_id": contest_id, - "problems": problems, - } - ) + result = MetadataResult( + success=True, + error="", + contest_id=contest_id, + problems=problems, ) + print(json.dumps(asdict(result))) elif mode == "tests": if len(sys.argv) != 4: - print( - json.dumps( - { - "success": False, - "error": "Usage: atcoder.py tests ", - } - ) + tests_result = TestsResult( + success=False, + error="Usage: atcoder.py tests ", + problem_id="", + url="", + tests=[], + timeout_ms=0, + memory_mb=0, ) + print(json.dumps(asdict(tests_result))) sys.exit(1) test_contest_id: str = sys.argv[2] @@ -193,46 +216,59 @@ def main() -> None: problem_id: str = f"{test_contest_id}_{problem_letter.lower()}" url: str = parse_problem_url(test_contest_id, problem_letter) - print(f"Scraping: {url}", file=sys.stderr) + tests: list[TestCase] = scrape(url) - tests: list[tuple[str, str]] = scrape(url) - if not tests: - print( - json.dumps( - { - "success": False, - "error": f"No tests found for {test_contest_id} {problem_letter}", - "problem_id": problem_id, - "url": url, - } - ) + try: + headers = { + "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" + } + response = requests.get(url, headers=headers, timeout=10) + response.raise_for_status() + soup = BeautifulSoup(response.text, "html.parser") + timeout_ms, memory_mb = extract_problem_limits(soup) + except Exception as e: + tests_result = TestsResult( + success=False, + error=f"Failed to extract constraints: {e}", + problem_id=problem_id, + url=url, + tests=[], + timeout_ms=0, + memory_mb=0, ) + print(json.dumps(asdict(tests_result))) sys.exit(1) - test_list: list[dict[str, str]] = [ - {"input": i, "expected": o} for i, o in tests - ] - - print( - json.dumps( - { - "success": True, - "problem_id": problem_id, - "url": url, - "tests": test_list, - } + if not tests: + tests_result = TestsResult( + success=False, + error=f"No tests found for {test_contest_id} {problem_letter}", + problem_id=problem_id, + url=url, + tests=[], + timeout_ms=timeout_ms, + memory_mb=memory_mb, ) + print(json.dumps(asdict(tests_result))) + sys.exit(1) + + tests_result = TestsResult( + success=True, + error="", + problem_id=problem_id, + url=url, + tests=tests, + timeout_ms=timeout_ms, + memory_mb=memory_mb, ) + print(json.dumps(asdict(tests_result))) else: - print( - json.dumps( - { - "success": False, - "error": f"Unknown mode: {mode}. Use 'metadata' or 'tests'", - } - ) + result = MetadataResult( + success=False, + error=f"Unknown mode: {mode}. Use 'metadata' or 'tests'", ) + print(json.dumps(asdict(result))) sys.exit(1) diff --git a/scrapers/codeforces.py b/scrapers/codeforces.py index c193a21..54a51a1 100644 --- a/scrapers/codeforces.py +++ b/scrapers/codeforces.py @@ -7,7 +7,7 @@ from dataclasses import asdict import cloudscraper from bs4 import BeautifulSoup, Tag -from .models import MetadataResult, Problem, TestCase, TestsResult +from .models import MetadataResult, ProblemSummary, TestCase, TestsResult def scrape(url: str) -> list[TestCase]: @@ -140,7 +140,37 @@ def parse_problem_url(contest_id: str, problem_letter: str) -> str: ) -def scrape_contest_problems(contest_id: str) -> list[Problem]: +def extract_problem_limits(soup: BeautifulSoup) -> tuple[int, int]: + import re + + timeout_ms = None + memory_mb = None + + time_limit_div = soup.find("div", class_="time-limit") + if time_limit_div: + text = time_limit_div.get_text().strip() + match = re.search(r"(\d+) seconds?", text) + if match: + seconds = int(match.group(1)) + timeout_ms = seconds * 1000 + + if timeout_ms is None: + raise ValueError("Could not find valid timeout in time-limit section") + + memory_limit_div = soup.find("div", class_="memory-limit") + if memory_limit_div: + text = memory_limit_div.get_text().strip() + match = re.search(r"(\d+) megabytes", text) + if match: + memory_mb = int(match.group(1)) + + if memory_mb is None: + raise ValueError("Could not find valid memory limit in memory-limit section") + + return timeout_ms, memory_mb + + +def scrape_contest_problems(contest_id: str) -> list[ProblemSummary]: try: contest_url: str = f"https://codeforces.com/contest/{contest_id}" scraper = cloudscraper.create_scraper() @@ -148,7 +178,7 @@ def scrape_contest_problems(contest_id: str) -> list[Problem]: response.raise_for_status() soup = BeautifulSoup(response.text, "html.parser") - problems: list[Problem] = [] + problems: list[ProblemSummary] = [] problem_links = soup.find_all( "a", href=lambda x: x and f"/contest/{contest_id}/problem/" in x @@ -163,12 +193,14 @@ def scrape_contest_problems(contest_id: str) -> list[Problem]: problem_name: str = link.get_text(strip=True) if problem_letter and problem_name: - problems.append(Problem(id=problem_letter, name=problem_name)) + problems.append( + ProblemSummary(id=problem_letter, name=problem_name) + ) problems.sort(key=lambda x: x.id) seen: set[str] = set() - unique_problems: list[Problem] = [] + unique_problems: list[ProblemSummary] = [] for p in problems: if p.id not in seen: seen.add(p.id) @@ -206,7 +238,7 @@ def main() -> None: sys.exit(1) contest_id: str = sys.argv[2] - problems: list[Problem] = scrape_contest_problems(contest_id) + problems: list[ProblemSummary] = scrape_contest_problems(contest_id) if not problems: result = MetadataResult( @@ -215,7 +247,9 @@ def main() -> None: print(json.dumps(asdict(result))) sys.exit(1) - result = MetadataResult(success=True, contest_id=contest_id, problems=problems) + result = MetadataResult( + success=True, error="", contest_id=contest_id, problems=problems + ) print(json.dumps(asdict(result))) elif mode == "tests": @@ -223,6 +257,11 @@ def main() -> None: tests_result = TestsResult( success=False, error="Usage: codeforces.py tests ", + problem_id="", + url="", + tests=[], + timeout_ms=0, + memory_mb=0, ) print(json.dumps(asdict(tests_result))) sys.exit(1) @@ -234,18 +273,46 @@ def main() -> None: url: str = parse_problem_url(tests_contest_id, problem_letter) tests: list[TestCase] = scrape_sample_tests(url) + try: + scraper = cloudscraper.create_scraper() + response = scraper.get(url, timeout=10) + response.raise_for_status() + soup = BeautifulSoup(response.text, "html.parser") + timeout_ms, memory_mb = extract_problem_limits(soup) + except Exception as e: + tests_result = TestsResult( + success=False, + error=f"Failed to extract constraints: {e}", + problem_id=problem_id, + url=url, + tests=[], + timeout_ms=0, + memory_mb=0, + ) + print(json.dumps(asdict(tests_result))) + sys.exit(1) + if not tests: tests_result = TestsResult( success=False, error=f"No tests found for {tests_contest_id} {problem_letter}", problem_id=problem_id, url=url, + tests=[], + timeout_ms=timeout_ms, + memory_mb=memory_mb, ) print(json.dumps(asdict(tests_result))) sys.exit(1) tests_result = TestsResult( - success=True, problem_id=problem_id, url=url, tests=tests + success=True, + error="", + problem_id=problem_id, + url=url, + tests=tests, + timeout_ms=timeout_ms, + memory_mb=memory_mb, ) print(json.dumps(asdict(tests_result))) diff --git a/scrapers/cses.py b/scrapers/cses.py index 16a9c18..27959e1 100755 --- a/scrapers/cses.py +++ b/scrapers/cses.py @@ -1,10 +1,14 @@ #!/usr/bin/env python3 import json +import re import sys +from dataclasses import asdict import requests -from bs4 import BeautifulSoup +from bs4 import BeautifulSoup, Tag + +from .models import MetadataResult, ProblemSummary, TestCase, TestsResult def parse_problem_url(problem_input: str) -> str | None: @@ -15,10 +19,43 @@ def parse_problem_url(problem_input: str) -> str | None: return None +def extract_problem_limits(soup: BeautifulSoup) -> tuple[int, int]: + timeout_ms = None + memory_mb = None + + constraints_ul = soup.find("ul", class_="task-constraints") + if not constraints_ul or not isinstance(constraints_ul, Tag): + raise ValueError("Could not find task-constraints section") + + for li in constraints_ul.find_all("li"): + text = li.get_text() + + if "Time limit:" in text: + match = re.search(r"Time limit:\s*(\d+(?:\.\d+)?)\s*s", text) + if match: + seconds = float(match.group(1)) + timeout_ms = int(seconds * 1000) + + if "Memory limit:" in text: + match = re.search(r"Memory limit:\s*(\d+)\s*MB", text) + if match: + memory_mb = int(match.group(1)) + + if timeout_ms is None: + raise ValueError("Could not find valid timeout in task-constraints section") + + if memory_mb is None: + raise ValueError( + "Could not find valid memory limit in task-constraints section" + ) + + return timeout_ms, memory_mb + + def process_problem_element( element, current_category: str | None, - all_categories: dict[str, list[dict[str, str]]], + all_categories: dict[str, list[ProblemSummary]], ) -> str | None: if element.name == "h1": category_name = element.get_text().strip() @@ -39,11 +76,12 @@ def process_problem_element( if not (problem_id.isdigit() and problem_name and current_category): return current_category - all_categories[current_category].append({"id": problem_id, "name": problem_name}) + problem = ProblemSummary(id=problem_id, name=problem_name) + all_categories[current_category].append(problem) return current_category -def scrape_all_problems() -> dict[str, list[dict[str, str]]]: +def scrape_all_problems() -> dict[str, list[ProblemSummary]]: try: problemset_url = "https://cses.fi/problemset/" headers = { @@ -54,7 +92,7 @@ def scrape_all_problems() -> dict[str, list[dict[str, str]]]: response.raise_for_status() soup = BeautifulSoup(response.text, "html.parser") - all_categories: dict[str, list[dict[str, str]]] = {} + all_categories: dict[str, list[ProblemSummary]] = {} problem_links = soup.find_all( "a", href=lambda x: x and "/problemset/task/" in x @@ -68,7 +106,7 @@ def scrape_all_problems() -> dict[str, list[dict[str, str]]]: ) for category in all_categories: - all_categories[category].sort(key=lambda x: int(x["id"])) + all_categories[category].sort(key=lambda x: int(x.id)) print(f"Found {len(all_categories)} categories", file=sys.stderr) return all_categories @@ -129,79 +167,65 @@ def scrape(url: str) -> list[tuple[str, str]]: def main() -> None: if len(sys.argv) < 2: - print( - json.dumps( - { - "success": False, - "error": "Usage: cses.py metadata OR cses.py tests ", - } - ) + result = MetadataResult( + success=False, + error="Usage: cses.py metadata OR cses.py tests ", ) + print(json.dumps(asdict(result))) sys.exit(1) mode: str = sys.argv[1] if mode == "metadata": if len(sys.argv) != 2: - print( - json.dumps( - { - "success": False, - "error": "Usage: cses.py metadata", - } - ) + result = MetadataResult( + success=False, + error="Usage: cses.py metadata", ) + print(json.dumps(asdict(result))) sys.exit(1) - all_categories: dict[str, list[dict[str, str]]] = scrape_all_problems() + all_categories: dict[str, list[ProblemSummary]] = scrape_all_problems() if not all_categories: - print( - json.dumps( - { - "success": False, - "error": "Failed to scrape CSES problem categories", - } - ) + result = MetadataResult( + success=False, + error="Failed to scrape CSES problem categories", ) + print(json.dumps(asdict(result))) sys.exit(1) - print( - json.dumps( - { - "success": True, - "categories": all_categories, - } - ) - ) + result = MetadataResult(success=True, error="", categories=all_categories) + print(json.dumps(asdict(result))) elif mode == "tests": if len(sys.argv) != 3: - print( - json.dumps( - { - "success": False, - "error": "Usage: cses.py tests ", - } - ) + tests_result = TestsResult( + success=False, + error="Usage: cses.py tests ", + problem_id="", + url="", + tests=[], + timeout_ms=0, + memory_mb=0, ) + print(json.dumps(asdict(tests_result))) sys.exit(1) problem_input: str = sys.argv[2] url: str | None = parse_problem_url(problem_input) if not url: - print( - json.dumps( - { - "success": False, - "error": f"Invalid problem input: {problem_input}. Use either problem ID (e.g., 1068) or full URL", - "problem_id": problem_input - if problem_input.isdigit() - else None, - } - ) + tests_result = TestsResult( + success=False, + error=f"Invalid problem input: {problem_input}. Use either problem ID (e.g., 1068) or full URL", + problem_id=problem_input if problem_input.isdigit() else "", + url="", + tests=[], + timeout_ms=0, + memory_mb=0, ) + print(json.dumps(asdict(tests_result))) sys.exit(1) tests: list[tuple[str, str]] = scrape(url) @@ -210,43 +234,58 @@ def main() -> None: problem_input if problem_input.isdigit() else problem_input.split("/")[-1] ) - if not tests: - print( - json.dumps( - { - "success": False, - "error": f"No tests found for {problem_input}", - "problem_id": problem_id, - "url": url, - } - ) + try: + headers = { + "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" + } + response = requests.get(url, headers=headers, timeout=10) + response.raise_for_status() + soup = BeautifulSoup(response.text, "html.parser") + timeout_ms, memory_mb = extract_problem_limits(soup) + except Exception as e: + tests_result = TestsResult( + success=False, + error=f"Failed to extract constraints: {e}", + problem_id=problem_id, + url=url, + tests=[], + timeout_ms=0, + memory_mb=0, ) + print(json.dumps(asdict(tests_result))) sys.exit(1) - test_list: list[dict[str, str]] = [ - {"input": i, "expected": o} for i, o in tests - ] - - print( - json.dumps( - { - "success": True, - "problem_id": problem_id, - "url": url, - "tests": test_list, - } + if not tests: + tests_result = TestsResult( + success=False, + error=f"No tests found for {problem_input}", + problem_id=problem_id, + url=url, + tests=[], + timeout_ms=timeout_ms, + memory_mb=memory_mb, ) + print(json.dumps(asdict(tests_result))) + sys.exit(1) + + test_cases = [TestCase(input=i, expected=o) for i, o in tests] + tests_result = TestsResult( + success=True, + error="", + problem_id=problem_id, + url=url, + tests=test_cases, + timeout_ms=timeout_ms, + memory_mb=memory_mb, ) + print(json.dumps(asdict(tests_result))) else: - print( - json.dumps( - { - "success": False, - "error": f"Unknown mode: {mode}. Use 'metadata' or 'tests'", - } - ) + result = MetadataResult( + success=False, + error=f"Unknown mode: {mode}. Use 'metadata' or 'tests'", ) + print(json.dumps(asdict(result))) sys.exit(1) diff --git a/scrapers/models.py b/scrapers/models.py index ea0e03e..a37d186 100644 --- a/scrapers/models.py +++ b/scrapers/models.py @@ -1,4 +1,4 @@ -from dataclasses import dataclass +from dataclasses import dataclass, field @dataclass @@ -8,34 +8,36 @@ class TestCase: @dataclass -class Problem: +class ProblemSummary: id: str name: str +@dataclass +class Problem: + id: str + name: str + timeout_ms: int + memory_mb: int + + @dataclass class ScrapingResult: success: bool - error: str | None = None + error: str @dataclass class MetadataResult(ScrapingResult): - contest_id: str | None = None - problems: list[Problem] | None = None - categories: dict[str, list[Problem]] | None = None - - def __post_init__(self): - if self.problems is None: - self.problems = [] + contest_id: str = "" + problems: list[ProblemSummary] = field(default_factory=list) + categories: dict[str, list[ProblemSummary]] = field(default_factory=dict) @dataclass class TestsResult(ScrapingResult): - problem_id: str = "" - url: str = "" - tests: list[TestCase] | None = None - - def __post_init__(self): - if self.tests is None: - self.tests = [] + problem_id: str + url: str + tests: list[TestCase] + timeout_ms: int + memory_mb: int