From ffaec3b9479ded5dd92bf498f9496cc88bb8646d Mon Sep 17 00:00:00 2001 From: Barrett Ruth Date: Thu, 18 Sep 2025 22:14:13 -0400 Subject: [PATCH] fix(ci): type scrapers --- scrapers/atcoder.py | 4 +- scrapers/codeforces.py | 123 +++++++++++------------------- scrapers/models.py | 41 ++++++++++ tests/scrapers/test_codeforces.py | 9 ++- 4 files changed, 92 insertions(+), 85 deletions(-) create mode 100644 scrapers/models.py diff --git a/scrapers/atcoder.py b/scrapers/atcoder.py index 0de9f2b..b9b39ea 100644 --- a/scrapers/atcoder.py +++ b/scrapers/atcoder.py @@ -52,7 +52,7 @@ def scrape_contest_problems(contest_id: str) -> list[dict[str, str]]: if not task_table or not isinstance(task_table, Tag): return [] - rows = task_table.find_all("tr")[1:] # skip header + rows = task_table.find_all("tr")[1:] problems: list[dict[str, str]] = [] for row in rows: problem = extract_problem_from_row(row, contest_id) @@ -115,7 +115,7 @@ def scrape(url: str) -> list[tuple[str, str]]: test_case = extract_test_case_from_headers(sample_headers, i) if test_case: tests.append(test_case) - i += 2 # move from "Sample Input n" to after "Sample Output n" + i += 2 else: i += 1 diff --git a/scrapers/codeforces.py b/scrapers/codeforces.py index 730800a..c193a21 100644 --- a/scrapers/codeforces.py +++ b/scrapers/codeforces.py @@ -2,12 +2,15 @@ import json import sys +from dataclasses import asdict import cloudscraper from bs4 import BeautifulSoup, Tag +from .models import MetadataResult, Problem, TestCase, TestsResult -def scrape(url: str) -> list[tuple[str, str]]: + +def scrape(url: str) -> list[TestCase]: try: scraper = cloudscraper.create_scraper() response = scraper.get(url, timeout=10) @@ -88,7 +91,7 @@ def scrape(url: str) -> list[tuple[str, str]]: input_text = "\n".join(individual_inputs[test_num]) output_text = "\n".join(individual_outputs[test_num]) prefixed_input = "1\n" + input_text - tests.append((prefixed_input, output_text)) + tests.append(TestCase(input=prefixed_input, expected=output_text)) return tests all_inputs = [] all_outputs = [] @@ -124,7 +127,7 @@ def scrape(url: str) -> list[tuple[str, str]]: combined_input = "\n".join(all_inputs) combined_output = "\n".join(all_outputs) - return [(combined_input, combined_output)] + return [TestCase(input=combined_input, expected=combined_output)] except Exception as e: print(f"CloudScraper failed: {e}", file=sys.stderr) @@ -137,7 +140,7 @@ def parse_problem_url(contest_id: str, problem_letter: str) -> str: ) -def scrape_contest_problems(contest_id: str) -> list[dict[str, str]]: +def scrape_contest_problems(contest_id: str) -> list[Problem]: try: contest_url: str = f"https://codeforces.com/contest/{contest_id}" scraper = cloudscraper.create_scraper() @@ -145,7 +148,7 @@ def scrape_contest_problems(contest_id: str) -> list[dict[str, str]]: response.raise_for_status() soup = BeautifulSoup(response.text, "html.parser") - problems: list[dict[str, str]] = [] + problems: list[Problem] = [] problem_links = soup.find_all( "a", href=lambda x: x and f"/contest/{contest_id}/problem/" in x @@ -160,15 +163,15 @@ def scrape_contest_problems(contest_id: str) -> list[dict[str, str]]: problem_name: str = link.get_text(strip=True) if problem_letter and problem_name: - problems.append({"id": problem_letter, "name": problem_name}) + problems.append(Problem(id=problem_letter, name=problem_name)) - problems.sort(key=lambda x: x["id"]) + problems.sort(key=lambda x: x.id) seen: set[str] = set() - unique_problems: list[dict[str, str]] = [] + unique_problems: list[Problem] = [] for p in problems: - if p["id"] not in seen: - seen.add(p["id"]) + if p.id not in seen: + seen.add(p.id) unique_problems.append(p) return unique_problems @@ -178,71 +181,50 @@ def scrape_contest_problems(contest_id: str) -> list[dict[str, str]]: return [] -def scrape_sample_tests(url: str) -> list[tuple[str, str]]: +def scrape_sample_tests(url: str) -> list[TestCase]: print(f"Scraping: {url}", file=sys.stderr) return scrape(url) def main() -> None: if len(sys.argv) < 2: - print( - json.dumps( - { - "success": False, - "error": "Usage: codeforces.py metadata OR codeforces.py tests ", - } - ) + result = MetadataResult( + success=False, + error="Usage: codeforces.py metadata OR codeforces.py tests ", ) + print(json.dumps(asdict(result))) sys.exit(1) mode: str = sys.argv[1] if mode == "metadata": if len(sys.argv) != 3: - print( - json.dumps( - { - "success": False, - "error": "Usage: codeforces.py metadata ", - } - ) + result = MetadataResult( + success=False, error="Usage: codeforces.py metadata " ) + print(json.dumps(asdict(result))) sys.exit(1) contest_id: str = sys.argv[2] - problems: list[dict[str, str]] = scrape_contest_problems(contest_id) + problems: list[Problem] = scrape_contest_problems(contest_id) if not problems: - print( - json.dumps( - { - "success": False, - "error": f"No problems found for contest {contest_id}", - } - ) + result = MetadataResult( + success=False, error=f"No problems found for contest {contest_id}" ) + print(json.dumps(asdict(result))) sys.exit(1) - print( - json.dumps( - { - "success": True, - "contest_id": contest_id, - "problems": problems, - } - ) - ) + result = MetadataResult(success=True, contest_id=contest_id, problems=problems) + print(json.dumps(asdict(result))) elif mode == "tests": if len(sys.argv) != 4: - print( - json.dumps( - { - "success": False, - "error": "Usage: codeforces.py tests ", - } - ) + tests_result = TestsResult( + success=False, + error="Usage: codeforces.py tests ", ) + print(json.dumps(asdict(tests_result))) sys.exit(1) tests_contest_id: str = sys.argv[2] @@ -250,45 +232,28 @@ def main() -> None: problem_id: str = tests_contest_id + problem_letter.lower() url: str = parse_problem_url(tests_contest_id, problem_letter) - tests: list[tuple[str, str]] = scrape_sample_tests(url) + tests: list[TestCase] = scrape_sample_tests(url) if not tests: - print( - json.dumps( - { - "success": False, - "error": f"No tests found for {tests_contest_id} {problem_letter}", - "problem_id": problem_id, - "url": url, - } - ) + tests_result = TestsResult( + success=False, + error=f"No tests found for {tests_contest_id} {problem_letter}", + problem_id=problem_id, + url=url, ) + print(json.dumps(asdict(tests_result))) sys.exit(1) - test_list: list[dict[str, str]] = [] - for input_data, output_data in tests: - test_list.append({"input": input_data, "expected": output_data}) - - print( - json.dumps( - { - "success": True, - "problem_id": problem_id, - "url": url, - "tests": test_list, - } - ) + tests_result = TestsResult( + success=True, problem_id=problem_id, url=url, tests=tests ) + print(json.dumps(asdict(tests_result))) else: - print( - json.dumps( - { - "success": False, - "error": f"Unknown mode: {mode}. Use 'metadata' or 'tests'", - } - ) + result = MetadataResult( + success=False, error=f"Unknown mode: {mode}. Use 'metadata' or 'tests'" ) + print(json.dumps(asdict(result))) sys.exit(1) diff --git a/scrapers/models.py b/scrapers/models.py new file mode 100644 index 0000000..ea0e03e --- /dev/null +++ b/scrapers/models.py @@ -0,0 +1,41 @@ +from dataclasses import dataclass + + +@dataclass +class TestCase: + input: str + expected: str + + +@dataclass +class Problem: + id: str + name: str + + +@dataclass +class ScrapingResult: + success: bool + error: str | None = None + + +@dataclass +class MetadataResult(ScrapingResult): + contest_id: str | None = None + problems: list[Problem] | None = None + categories: dict[str, list[Problem]] | None = None + + def __post_init__(self): + if self.problems is None: + self.problems = [] + + +@dataclass +class TestsResult(ScrapingResult): + problem_id: str = "" + url: str = "" + tests: list[TestCase] | None = None + + def __post_init__(self): + if self.tests is None: + self.tests = [] diff --git a/tests/scrapers/test_codeforces.py b/tests/scrapers/test_codeforces.py index eacc801..e9c9429 100644 --- a/tests/scrapers/test_codeforces.py +++ b/tests/scrapers/test_codeforces.py @@ -1,5 +1,6 @@ from unittest.mock import Mock from scrapers.codeforces import scrape, scrape_contest_problems +from scrapers.models import Problem, TestCase def test_scrape_success(mocker, mock_codeforces_html): @@ -15,8 +16,8 @@ def test_scrape_success(mocker, mock_codeforces_html): result = scrape("https://codeforces.com/contest/1900/problem/A") assert len(result) == 1 - assert result[0][0] == "1\n3\n1 2 3" - assert result[0][1] == "6" + assert result[0].input == "1\n3\n1 2 3" + assert result[0].expected == "6" def test_scrape_contest_problems(mocker): @@ -35,8 +36,8 @@ def test_scrape_contest_problems(mocker): result = scrape_contest_problems("1900") assert len(result) == 2 - assert result[0] == {"id": "a", "name": "A. Problem A"} - assert result[1] == {"id": "b", "name": "B. Problem B"} + assert result[0] == Problem(id="a", name="A. Problem A") + assert result[1] == Problem(id="b", name="B. Problem B") def test_scrape_network_error(mocker):