From 5293515acaba5cf28b172d18b9a016da49190c1e Mon Sep 17 00:00:00 2001 From: Barrett Ruth Date: Tue, 27 Jan 2026 14:44:08 -0500 Subject: [PATCH 1/4] feat(scrapers): refactor --- scrapers/base.py | 101 +++++++++++++++++--------------- scrapers/codeforces.py | 130 +++++++++-------------------------------- scrapers/cses.py | 73 +---------------------- 3 files changed, 83 insertions(+), 221 deletions(-) diff --git a/scrapers/base.py b/scrapers/base.py index 6409c9a..4b685d0 100644 --- a/scrapers/base.py +++ b/scrapers/base.py @@ -1,9 +1,8 @@ +import asyncio +import sys from abc import ABC, abstractmethod -from typing import Any, Awaitable, Callable, ParamSpec, cast -from .models import ContestListResult, MetadataResult, TestsResult - -P = ParamSpec("P") +from .models import CombinedTest, ContestListResult, MetadataResult, TestsResult class BaseScraper(ABC): @@ -20,57 +19,65 @@ class BaseScraper(ABC): @abstractmethod async def stream_tests_for_category_async(self, category_id: str) -> None: ... - def _create_metadata_error( - self, error_msg: str, contest_id: str = "" - ) -> MetadataResult: - return MetadataResult( - success=False, - error=f"{self.platform_name}: {error_msg}", - contest_id=contest_id, - problems=[], - url="", - ) + def _usage(self) -> str: + name = self.platform_name + return f"Usage: {name}.py metadata | tests | contests" - def _create_tests_error( - self, error_msg: str, problem_id: str = "", url: str = "" - ) -> TestsResult: - from .models import CombinedTest + def _metadata_error(self, msg: str) -> MetadataResult: + return MetadataResult(success=False, error=msg, url="") + def _tests_error(self, msg: str) -> TestsResult: return TestsResult( success=False, - error=f"{self.platform_name}: {error_msg}", - problem_id=problem_id, + error=msg, + problem_id="", combined=CombinedTest(input="", expected=""), tests=[], timeout_ms=0, memory_mb=0, - interactive=False, ) - def _create_contests_error(self, error_msg: str) -> ContestListResult: - return ContestListResult( - success=False, - error=f"{self.platform_name}: {error_msg}", - contests=[], - ) + def _contests_error(self, msg: str) -> ContestListResult: + return ContestListResult(success=False, error=msg) - async def _safe_execute( - self, - operation: str, - func: Callable[P, Awaitable[Any]], - *args: P.args, - **kwargs: P.kwargs, - ): - try: - return await func(*args, **kwargs) - except Exception as e: - if operation == "metadata": - contest_id = cast(str, args[0]) if args else "" - return self._create_metadata_error(str(e), contest_id) - elif operation == "tests": - problem_id = cast(str, args[1]) if len(args) > 1 else "" - return self._create_tests_error(str(e), problem_id) - elif operation == "contests": - return self._create_contests_error(str(e)) - else: - raise + async def _run_cli_async(self, args: list[str]) -> int: + if len(args) < 2: + print(self._metadata_error(self._usage()).model_dump_json()) + return 1 + + mode = args[1] + + match mode: + case "metadata": + if len(args) != 3: + print(self._metadata_error(self._usage()).model_dump_json()) + return 1 + result = await self.scrape_contest_metadata(args[2]) + print(result.model_dump_json()) + return 0 if result.success else 1 + + case "tests": + if len(args) != 3: + print(self._tests_error(self._usage()).model_dump_json()) + return 1 + await self.stream_tests_for_category_async(args[2]) + return 0 + + case "contests": + if len(args) != 2: + print(self._contests_error(self._usage()).model_dump_json()) + return 1 + result = await self.scrape_contest_list() + print(result.model_dump_json()) + return 0 if result.success else 1 + + case _: + print( + self._metadata_error( + f"Unknown mode: {mode}. {self._usage()}" + ).model_dump_json() + ) + return 1 + + def run_cli(self) -> None: + sys.exit(asyncio.run(self._run_cli_async(sys.argv))) diff --git a/scrapers/codeforces.py b/scrapers/codeforces.py index 24f55f6..840616f 100644 --- a/scrapers/codeforces.py +++ b/scrapers/codeforces.py @@ -4,7 +4,6 @@ import asyncio import json import logging import re -import sys from typing import Any import requests @@ -13,13 +12,11 @@ from scrapling.fetchers import Fetcher from .base import BaseScraper from .models import ( - CombinedTest, ContestListResult, ContestSummary, MetadataResult, ProblemSummary, TestCase, - TestsResult, ) # suppress scrapling logging - https://github.com/D4Vinci/Scrapling/issues/31) @@ -209,49 +206,46 @@ class CodeforcesScraper(BaseScraper): return "codeforces" async def scrape_contest_metadata(self, contest_id: str) -> MetadataResult: - async def impl(cid: str) -> MetadataResult: - problems = await asyncio.to_thread(_scrape_contest_problems_sync, cid) + try: + problems = await asyncio.to_thread( + _scrape_contest_problems_sync, contest_id + ) if not problems: - return self._create_metadata_error( - f"No problems found for contest {cid}", cid + return self._metadata_error( + f"No problems found for contest {contest_id}" ) return MetadataResult( success=True, error="", - contest_id=cid, + contest_id=contest_id, problems=problems, url=f"https://codeforces.com/contest/{contest_id}/problem/%s", ) - - return await self._safe_execute("metadata", impl, contest_id) + except Exception as e: + return self._metadata_error(str(e)) async def scrape_contest_list(self) -> ContestListResult: - async def impl() -> ContestListResult: - try: - r = requests.get(API_CONTEST_LIST_URL, timeout=TIMEOUT_SECONDS) - r.raise_for_status() - data = r.json() - if data.get("status") != "OK": - return self._create_contests_error("Invalid API response") + try: + r = requests.get(API_CONTEST_LIST_URL, timeout=TIMEOUT_SECONDS) + r.raise_for_status() + data = r.json() + if data.get("status") != "OK": + return self._contests_error("Invalid API response") - contests: list[ContestSummary] = [] - for c in data["result"]: - if c.get("phase") != "FINISHED": - continue - cid = str(c["id"]) - name = c["name"] - contests.append( - ContestSummary(id=cid, name=name, display_name=name) - ) + contests: list[ContestSummary] = [] + for c in data["result"]: + if c.get("phase") != "FINISHED": + continue + cid = str(c["id"]) + name = c["name"] + contests.append(ContestSummary(id=cid, name=name, display_name=name)) - if not contests: - return self._create_contests_error("No contests found") + if not contests: + return self._contests_error("No contests found") - return ContestListResult(success=True, error="", contests=contests) - except Exception as e: - return self._create_contests_error(str(e)) - - return await self._safe_execute("contests", impl) + return ContestListResult(success=True, error="", contests=contests) + except Exception as e: + return self._contests_error(str(e)) async def stream_tests_for_category_async(self, category_id: str) -> None: html = await asyncio.to_thread(_fetch_problems_html, category_id) @@ -281,73 +275,5 @@ class CodeforcesScraper(BaseScraper): ) -async def main_async() -> int: - if len(sys.argv) < 2: - result = MetadataResult( - success=False, - error="Usage: codeforces.py metadata OR codeforces.py tests OR codeforces.py contests", - url="", - ) - print(result.model_dump_json()) - return 1 - - mode: str = sys.argv[1] - scraper = CodeforcesScraper() - - if mode == "metadata": - if len(sys.argv) != 3: - result = MetadataResult( - success=False, - error="Usage: codeforces.py metadata ", - url="", - ) - print(result.model_dump_json()) - return 1 - contest_id = sys.argv[2] - result = await scraper.scrape_contest_metadata(contest_id) - print(result.model_dump_json()) - return 0 if result.success else 1 - - if mode == "tests": - if len(sys.argv) != 3: - tests_result = TestsResult( - success=False, - error="Usage: codeforces.py tests ", - problem_id="", - combined=CombinedTest(input="", expected=""), - tests=[], - timeout_ms=0, - memory_mb=0, - ) - print(tests_result.model_dump_json()) - return 1 - contest_id = sys.argv[2] - await scraper.stream_tests_for_category_async(contest_id) - return 0 - - if mode == "contests": - if len(sys.argv) != 2: - contest_result = ContestListResult( - success=False, error="Usage: codeforces.py contests" - ) - print(contest_result.model_dump_json()) - return 1 - contest_result = await scraper.scrape_contest_list() - print(contest_result.model_dump_json()) - return 0 if contest_result.success else 1 - - result = MetadataResult( - success=False, - error="Unknown mode. Use 'metadata ', 'tests ', or 'contests'", - url="", - ) - print(result.model_dump_json()) - return 1 - - -def main() -> None: - sys.exit(asyncio.run(main_async())) - - if __name__ == "__main__": - main() + CodeforcesScraper().run_cli() diff --git a/scrapers/cses.py b/scrapers/cses.py index 620cb7f..5440b34 100644 --- a/scrapers/cses.py +++ b/scrapers/cses.py @@ -3,20 +3,17 @@ import asyncio import json import re -import sys from typing import Any import httpx from .base import BaseScraper from .models import ( - CombinedTest, ContestListResult, ContestSummary, MetadataResult, ProblemSummary, TestCase, - TestsResult, ) BASE_URL = "https://cses.fi" @@ -261,73 +258,5 @@ class CSESScraper(BaseScraper): print(json.dumps(payload), flush=True) -async def main_async() -> int: - if len(sys.argv) < 2: - result = MetadataResult( - success=False, - error="Usage: cses.py metadata OR cses.py tests OR cses.py contests", - url="", - ) - print(result.model_dump_json()) - return 1 - - mode: str = sys.argv[1] - scraper = CSESScraper() - - if mode == "metadata": - if len(sys.argv) != 3: - result = MetadataResult( - success=False, - error="Usage: cses.py metadata ", - url="", - ) - print(result.model_dump_json()) - return 1 - category_id = sys.argv[2] - result = await scraper.scrape_contest_metadata(category_id) - print(result.model_dump_json()) - return 0 if result.success else 1 - - if mode == "tests": - if len(sys.argv) != 3: - tests_result = TestsResult( - success=False, - error="Usage: cses.py tests ", - problem_id="", - combined=CombinedTest(input="", expected=""), - tests=[], - timeout_ms=0, - memory_mb=0, - ) - print(tests_result.model_dump_json()) - return 1 - category = sys.argv[2] - await scraper.stream_tests_for_category_async(category) - return 0 - - if mode == "contests": - if len(sys.argv) != 2: - contest_result = ContestListResult( - success=False, error="Usage: cses.py contests" - ) - print(contest_result.model_dump_json()) - return 1 - contest_result = await scraper.scrape_contest_list() - print(contest_result.model_dump_json()) - return 0 if contest_result.success else 1 - - result = MetadataResult( - success=False, - error=f"Unknown mode: {mode}. Use 'metadata ', 'tests ', or 'contests'", - url="", - ) - print(result.model_dump_json()) - return 1 - - -def main() -> None: - sys.exit(asyncio.run(main_async())) - - if __name__ == "__main__": - main() + CSESScraper().run_cli() From d5c6783124eaa24e9cb1ae67382fe4c94032f3af Mon Sep 17 00:00:00 2001 From: Barrett Ruth Date: Tue, 27 Jan 2026 15:43:40 -0500 Subject: [PATCH 2/4] feat(scrapers): refactor --- scrapers/codechef.py | 117 +++++++++---------------------------------- tests/conftest.py | 18 ++++--- 2 files changed, 35 insertions(+), 100 deletions(-) diff --git a/scrapers/codechef.py b/scrapers/codechef.py index 1680e83..c9e402c 100644 --- a/scrapers/codechef.py +++ b/scrapers/codechef.py @@ -10,13 +10,11 @@ from scrapling.fetchers import Fetcher from .base import BaseScraper from .models import ( - CombinedTest, ContestListResult, ContestSummary, MetadataResult, ProblemSummary, TestCase, - TestsResult, ) BASE_URL = "https://www.codechef.com" @@ -62,42 +60,40 @@ class CodeChefScraper(BaseScraper): return "codechef" async def scrape_contest_metadata(self, contest_id: str) -> MetadataResult: - async with httpx.AsyncClient() as client: - try: + try: + async with httpx.AsyncClient() as client: data = await fetch_json( client, API_CONTEST.format(contest_id=contest_id) ) - except httpx.HTTPStatusError as e: - return self._create_metadata_error( - f"Failed to fetch contest {contest_id}: {e}", contest_id + if not data.get("problems"): + return self._metadata_error( + f"No problems found for contest {contest_id}" ) - if not data.get("problems"): - return self._create_metadata_error( - f"No problems found for contest {contest_id}", contest_id - ) - problems = [] - for problem_code, problem_data in data["problems"].items(): - if problem_data.get("category_name") == "main": - problems.append( - ProblemSummary( - id=problem_code, - name=problem_data.get("name", problem_code), + problems = [] + for problem_code, problem_data in data["problems"].items(): + if problem_data.get("category_name") == "main": + problems.append( + ProblemSummary( + id=problem_code, + name=problem_data.get("name", problem_code), + ) ) - ) - return MetadataResult( - success=True, - error="", - contest_id=contest_id, - problems=problems, - url=f"{BASE_URL}/{contest_id}", - ) + return MetadataResult( + success=True, + error="", + contest_id=contest_id, + problems=problems, + url=f"{BASE_URL}/{contest_id}", + ) + except Exception as e: + return self._metadata_error(f"Failed to fetch contest {contest_id}: {e}") async def scrape_contest_list(self) -> ContestListResult: async with httpx.AsyncClient() as client: try: data = await fetch_json(client, API_CONTESTS_ALL) except httpx.HTTPStatusError as e: - return self._create_contests_error(f"Failed to fetch contests: {e}") + return self._contests_error(f"Failed to fetch contests: {e}") all_contests = data.get("future_contests", []) + data.get( "past_contests", [] ) @@ -110,7 +106,7 @@ class CodeChefScraper(BaseScraper): num = int(match.group(1)) max_num = max(max_num, num) if max_num == 0: - return self._create_contests_error("No Starters contests found") + return self._contests_error("No Starters contests found") contests = [] sem = asyncio.Semaphore(CONNECTIONS) @@ -252,68 +248,5 @@ class CodeChefScraper(BaseScraper): print(json.dumps(payload), flush=True) -async def main_async() -> int: - if len(sys.argv) < 2: - result = MetadataResult( - success=False, - error="Usage: codechef.py metadata OR codechef.py tests OR codechef.py contests", - url="", - ) - print(result.model_dump_json()) - return 1 - mode: str = sys.argv[1] - scraper = CodeChefScraper() - if mode == "metadata": - if len(sys.argv) != 3: - result = MetadataResult( - success=False, - error="Usage: codechef.py metadata ", - url="", - ) - print(result.model_dump_json()) - return 1 - contest_id = sys.argv[2] - result = await scraper.scrape_contest_metadata(contest_id) - print(result.model_dump_json()) - return 0 if result.success else 1 - if mode == "tests": - if len(sys.argv) != 3: - tests_result = TestsResult( - success=False, - error="Usage: codechef.py tests ", - problem_id="", - combined=CombinedTest(input="", expected=""), - tests=[], - timeout_ms=0, - memory_mb=0, - ) - print(tests_result.model_dump_json()) - return 1 - contest_id = sys.argv[2] - await scraper.stream_tests_for_category_async(contest_id) - return 0 - if mode == "contests": - if len(sys.argv) != 2: - contest_result = ContestListResult( - success=False, error="Usage: codechef.py contests" - ) - print(contest_result.model_dump_json()) - return 1 - contest_result = await scraper.scrape_contest_list() - print(contest_result.model_dump_json()) - return 0 if contest_result.success else 1 - result = MetadataResult( - success=False, - error=f"Unknown mode: {mode}. Use 'metadata ', 'tests ', or 'contests'", - url="", - ) - print(result.model_dump_json()) - return 1 - - -def main() -> None: - sys.exit(asyncio.run(main_async())) - - if __name__ == "__main__": - main() + CodeChefScraper().run_cli() diff --git a/tests/conftest.py b/tests/conftest.py index 63e6108..bd84941 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -232,6 +232,13 @@ def run_scraper_offline(fixture_text): case _: raise AssertionError(f"Unknown scraper: {scraper_name}") + scraper_classes = { + "cses": "CSESScraper", + "atcoder": "AtcoderScraper", + "codeforces": "CodeforcesScraper", + "codechef": "CodeChefScraper", + } + def _run(scraper_name: str, mode: str, *args: str): mod_path = ROOT / "scrapers" / f"{scraper_name}.py" ns = _load_scraper_module(mod_path, scraper_name) @@ -249,16 +256,11 @@ def run_scraper_offline(fixture_text): httpx.AsyncClient.get = offline_fetches["__offline_get_async"] # type: ignore[assignment] fetchers.Fetcher.get = offline_fetches["Fetcher.get"] # type: ignore[assignment] - main_async = getattr(ns, "main_async") - assert callable(main_async), f"main_async not found in {scraper_name}" + scraper_class = getattr(ns, scraper_classes[scraper_name]) + scraper = scraper_class() argv = [str(mod_path), mode, *args] - old_argv = sys.argv - sys.argv = argv - try: - rc, out = _capture_stdout(main_async()) - finally: - sys.argv = old_argv + rc, out = _capture_stdout(scraper._run_cli_async(argv)) json_lines: list[Any] = [] for line in (_line for _line in out.splitlines() if _line.strip()): From 83514c453eb9b7e773b18f2ac2645b3a6f56a9ae Mon Sep 17 00:00:00 2001 From: Barrett Ruth Date: Tue, 27 Jan 2026 15:48:26 -0500 Subject: [PATCH 3/4] fix(ci): remove unused import --- scrapers/codechef.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapers/codechef.py b/scrapers/codechef.py index c9e402c..0687c1e 100644 --- a/scrapers/codechef.py +++ b/scrapers/codechef.py @@ -1,8 +1,8 @@ #!/usr/bin/env python3 + import asyncio import json import re -import sys from typing import Any import httpx From 89c1a3c683fb5b5cb73fee6afc27c82bb8df22cb Mon Sep 17 00:00:00 2001 From: Barrett Ruth Date: Tue, 27 Jan 2026 15:56:34 -0500 Subject: [PATCH 4/4] fix(ci): more fixes --- scrapers/atcoder.py | 36 ++++++++++++------------------------ scrapers/codeforces.py | 10 +++++----- tests/conftest.py | 8 ++++---- 3 files changed, 21 insertions(+), 33 deletions(-) diff --git a/scrapers/atcoder.py b/scrapers/atcoder.py index 66b95aa..1b946dd 100644 --- a/scrapers/atcoder.py +++ b/scrapers/atcoder.py @@ -266,43 +266,31 @@ class AtcoderScraper(BaseScraper): return "atcoder" async def scrape_contest_metadata(self, contest_id: str) -> MetadataResult: - async def impl(cid: str) -> MetadataResult: - try: - rows = await asyncio.to_thread(_scrape_tasks_sync, cid) - except requests.HTTPError as e: - if e.response is not None and e.response.status_code == 404: - return self._create_metadata_error( - f"No problems found for contest {cid}", cid - ) - raise - + try: + rows = await asyncio.to_thread(_scrape_tasks_sync, contest_id) problems = _to_problem_summaries(rows) if not problems: - return self._create_metadata_error( - f"No problems found for contest {cid}", cid + return self._metadata_error( + f"No problems found for contest {contest_id}" ) - return MetadataResult( success=True, error="", - contest_id=cid, + contest_id=contest_id, problems=problems, url=f"https://atcoder.jp/contests/{contest_id}/tasks/{contest_id}_%s", ) - - return await self._safe_execute("metadata", impl, contest_id) + except Exception as e: + return self._metadata_error(str(e)) async def scrape_contest_list(self) -> ContestListResult: - async def impl() -> ContestListResult: - try: - contests = await _fetch_all_contests_async() - except Exception as e: - return self._create_contests_error(str(e)) + try: + contests = await _fetch_all_contests_async() if not contests: - return self._create_contests_error("No contests found") + return self._contests_error("No contests found") return ContestListResult(success=True, error="", contests=contests) - - return await self._safe_execute("contests", impl) + except Exception as e: + return self._contests_error(str(e)) async def stream_tests_for_category_async(self, category_id: str) -> None: rows = await asyncio.to_thread(_scrape_tasks_sync, category_id) diff --git a/scrapers/codeforces.py b/scrapers/codeforces.py index 840616f..cf172b8 100644 --- a/scrapers/codeforces.py +++ b/scrapers/codeforces.py @@ -86,14 +86,14 @@ def _extract_samples(block: Tag) -> tuple[list[TestCase], bool]: if not st: return [], False - input_pres: list[Tag] = [ # type: ignore[misc] - inp.find("pre") # type: ignore[misc] - for inp in st.find_all("div", class_="input") # type: ignore[union-attr] + input_pres: list[Tag] = [ + inp.find("pre") + for inp in st.find_all("div", class_="input") if isinstance(inp, Tag) and inp.find("pre") ] output_pres: list[Tag] = [ - out.find("pre") # type: ignore[misc] - for out in st.find_all("div", class_="output") # type: ignore[union-attr] + out.find("pre") + for out in st.find_all("div", class_="output") if isinstance(out, Tag) and out.find("pre") ] input_pres = [p for p in input_pres if isinstance(p, Tag)] diff --git a/tests/conftest.py b/tests/conftest.py index bd84941..aaefec8 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -245,16 +245,16 @@ def run_scraper_offline(fixture_text): offline_fetches = _make_offline_fetches(scraper_name) if scraper_name == "codeforces": - fetchers.Fetcher.get = offline_fetches["Fetcher.get"] # type: ignore[assignment] + fetchers.Fetcher.get = offline_fetches["Fetcher.get"] requests.get = offline_fetches["requests.get"] elif scraper_name == "atcoder": ns._fetch = offline_fetches["_fetch"] ns._get_async = offline_fetches["_get_async"] elif scraper_name == "cses": - httpx.AsyncClient.get = offline_fetches["__offline_fetch_text"] # type: ignore[assignment] + httpx.AsyncClient.get = offline_fetches["__offline_fetch_text"] elif scraper_name == "codechef": - httpx.AsyncClient.get = offline_fetches["__offline_get_async"] # type: ignore[assignment] - fetchers.Fetcher.get = offline_fetches["Fetcher.get"] # type: ignore[assignment] + httpx.AsyncClient.get = offline_fetches["__offline_get_async"] + fetchers.Fetcher.get = offline_fetches["Fetcher.get"] scraper_class = getattr(ns, scraper_classes[scraper_name]) scraper = scraper_class()