feat(scrapers): refactor

commit d5c6783124
parent 5293515aca

2 changed files with 35 additions and 100 deletions
@@ -10,13 +10,11 @@ from scrapling.fetchers import Fetcher
 
 from .base import BaseScraper
 from .models import (
-    CombinedTest,
     ContestListResult,
     ContestSummary,
     MetadataResult,
     ProblemSummary,
     TestCase,
-    TestsResult,
 )
 
 BASE_URL = "https://www.codechef.com"
@@ -62,42 +60,40 @@ class CodeChefScraper(BaseScraper):
         return "codechef"
 
     async def scrape_contest_metadata(self, contest_id: str) -> MetadataResult:
-        async with httpx.AsyncClient() as client:
-            try:
+        try:
+            async with httpx.AsyncClient() as client:
                 data = await fetch_json(
                     client, API_CONTEST.format(contest_id=contest_id)
                 )
-            except httpx.HTTPStatusError as e:
-                return self._create_metadata_error(
-                    f"Failed to fetch contest {contest_id}: {e}", contest_id
-                )
             if not data.get("problems"):
-                return self._create_metadata_error(
-                    f"No problems found for contest {contest_id}", contest_id
+                return self._metadata_error(
+                    f"No problems found for contest {contest_id}"
                 )
             problems = []
             for problem_code, problem_data in data["problems"].items():
                 if problem_data.get("category_name") == "main":
                     problems.append(
                         ProblemSummary(
                             id=problem_code,
                             name=problem_data.get("name", problem_code),
                         )
                     )
             return MetadataResult(
                 success=True,
                 error="",
                 contest_id=contest_id,
                 problems=problems,
                 url=f"{BASE_URL}/{contest_id}",
             )
+        except Exception as e:
+            return self._metadata_error(f"Failed to fetch contest {contest_id}: {e}")
 
     async def scrape_contest_list(self) -> ContestListResult:
         async with httpx.AsyncClient() as client:
             try:
                 data = await fetch_json(client, API_CONTESTS_ALL)
             except httpx.HTTPStatusError as e:
-                return self._create_contests_error(f"Failed to fetch contests: {e}")
+                return self._contests_error(f"Failed to fetch contests: {e}")
             all_contests = data.get("future_contests", []) + data.get(
                 "past_contests", []
             )
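Note: _metadata_error and _contests_error are the renamed successors of _create_metadata_error / _create_contests_error, and they drop the explicit contest_id argument. They live in base.py, which this commit does not touch. A minimal sketch of what they plausibly look like, assuming the result models accept the same fields the old inline constructions used:

# Hypothetical sketch of the renamed error helpers on BaseScraper.
# base.py is outside this diff; the field defaults here are assumptions
# inferred from how the deleted main_async built failure results.
from .models import ContestListResult, MetadataResult

class BaseScraper:
    def _metadata_error(self, error: str) -> MetadataResult:
        # The contest_id parameter of _create_metadata_error is gone;
        # callers now pass only the message.
        return MetadataResult(success=False, error=error, url="")

    def _contests_error(self, error: str) -> ContestListResult:
        return ContestListResult(success=False, error=error)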
@@ -110,7 +106,7 @@ class CodeChefScraper(BaseScraper):
                 num = int(match.group(1))
                 max_num = max(max_num, num)
         if max_num == 0:
-            return self._create_contests_error("No Starters contests found")
+            return self._contests_error("No Starters contests found")
         contests = []
         sem = asyncio.Semaphore(CONNECTIONS)
 
@@ -252,68 +248,5 @@ class CodeChefScraper(BaseScraper):
             print(json.dumps(payload), flush=True)
 
 
-async def main_async() -> int:
-    if len(sys.argv) < 2:
-        result = MetadataResult(
-            success=False,
-            error="Usage: codechef.py metadata <contest_id> OR codechef.py tests <contest_id> OR codechef.py contests",
-            url="",
-        )
-        print(result.model_dump_json())
-        return 1
-    mode: str = sys.argv[1]
-    scraper = CodeChefScraper()
-    if mode == "metadata":
-        if len(sys.argv) != 3:
-            result = MetadataResult(
-                success=False,
-                error="Usage: codechef.py metadata <contest_id>",
-                url="",
-            )
-            print(result.model_dump_json())
-            return 1
-        contest_id = sys.argv[2]
-        result = await scraper.scrape_contest_metadata(contest_id)
-        print(result.model_dump_json())
-        return 0 if result.success else 1
-    if mode == "tests":
-        if len(sys.argv) != 3:
-            tests_result = TestsResult(
-                success=False,
-                error="Usage: codechef.py tests <contest_id>",
-                problem_id="",
-                combined=CombinedTest(input="", expected=""),
-                tests=[],
-                timeout_ms=0,
-                memory_mb=0,
-            )
-            print(tests_result.model_dump_json())
-            return 1
-        contest_id = sys.argv[2]
-        await scraper.stream_tests_for_category_async(contest_id)
-        return 0
-    if mode == "contests":
-        if len(sys.argv) != 2:
-            contest_result = ContestListResult(
-                success=False, error="Usage: codechef.py contests"
-            )
-            print(contest_result.model_dump_json())
-            return 1
-        contest_result = await scraper.scrape_contest_list()
-        print(contest_result.model_dump_json())
-        return 0 if contest_result.success else 1
-    result = MetadataResult(
-        success=False,
-        error=f"Unknown mode: {mode}. Use 'metadata <contest_id>', 'tests <contest_id>', or 'contests'",
-        url="",
-    )
-    print(result.model_dump_json())
-    return 1
-
-
-def main() -> None:
-    sys.exit(asyncio.run(main_async()))
-
-
 if __name__ == "__main__":
-    main()
+    CodeChefScraper().run_cli()
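The per-scraper main()/main_async() boilerplate deleted above is replaced by a shared CLI entry point on BaseScraper. run_cli() and _run_cli_async() are not shown in this commit; a plausible sketch, assuming they preserve the modes and exit codes of the deleted main_async:

# Hypothetical sketch of the shared entry points on BaseScraper (base.py is
# not part of this diff). Modes and exit codes mirror the deleted main_async;
# the scrape_* methods are implemented by each scraper subclass.
import asyncio
import sys

from .models import MetadataResult

class BaseScraper:
    async def _run_cli_async(self, argv: list[str]) -> int:
        # argv mirrors sys.argv: [script, mode, *args]. Each branch prints a
        # result model as JSON and returns a process exit code.
        if len(argv) == 3 and argv[1] == "metadata":
            result = await self.scrape_contest_metadata(argv[2])
            print(result.model_dump_json())
            return 0 if result.success else 1
        if len(argv) == 3 and argv[1] == "tests":
            await self.stream_tests_for_category_async(argv[2])
            return 0
        if len(argv) == 2 and argv[1] == "contests":
            contests = await self.scrape_contest_list()
            print(contests.model_dump_json())
            return 0 if contests.success else 1
        usage = MetadataResult(success=False, error="usage: metadata|tests|contests", url="")
        print(usage.model_dump_json())
        return 1

    def run_cli(self) -> None:
        # Synchronous wrapper used by each scraper's __main__ block.
        sys.exit(asyncio.run(self._run_cli_async(sys.argv)))

The second changed file is the offline test harness around run_scraper_offline, updated below to drive this entry point directly instead of patching sys.argv.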
@@ -232,6 +232,13 @@ def run_scraper_offline(fixture_text):
         case _:
             raise AssertionError(f"Unknown scraper: {scraper_name}")
 
+    scraper_classes = {
+        "cses": "CSESScraper",
+        "atcoder": "AtcoderScraper",
+        "codeforces": "CodeforcesScraper",
+        "codechef": "CodeChefScraper",
+    }
+
     def _run(scraper_name: str, mode: str, *args: str):
         mod_path = ROOT / "scrapers" / f"{scraper_name}.py"
         ns = _load_scraper_module(mod_path, scraper_name)
@@ -249,16 +256,11 @@ def run_scraper_offline(fixture_text):
         httpx.AsyncClient.get = offline_fetches["__offline_get_async"]  # type: ignore[assignment]
         fetchers.Fetcher.get = offline_fetches["Fetcher.get"]  # type: ignore[assignment]
 
-        main_async = getattr(ns, "main_async")
-        assert callable(main_async), f"main_async not found in {scraper_name}"
+        scraper_class = getattr(ns, scraper_classes[scraper_name])
+        scraper = scraper_class()
 
         argv = [str(mod_path), mode, *args]
-        old_argv = sys.argv
-        sys.argv = argv
-        try:
-            rc, out = _capture_stdout(main_async())
-        finally:
-            sys.argv = old_argv
+        rc, out = _capture_stdout(scraper._run_cli_async(argv))
 
         json_lines: list[Any] = []
         for line in (_line for _line in out.splitlines() if _line.strip()):
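Passing argv straight into scraper._run_cli_async(argv) removes the old save/patch/restore dance around sys.argv, which is why the try/finally block disappears. _capture_stdout itself is pre-existing test machinery not shown in this diff; from its call site it runs the awaitable and returns the exit code plus captured output. A minimal sketch under that assumption:

# Hypothetical sketch of the _capture_stdout helper, inferred from its call
# site above: drive an awaitable to completion while buffering stdout.
import asyncio
import contextlib
import io

def _capture_stdout(coro):
    buf = io.StringIO()
    with contextlib.redirect_stdout(buf):
        rc = asyncio.run(coro)
    return rc, buf.getvalue()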