From 89440e5d1491142490a16d9a9a1afe57a9f05312 Mon Sep 17 00:00:00 2001 From: Barrett Ruth Date: Mon, 22 Sep 2025 22:44:08 -0400 Subject: [PATCH 1/3] feat(scrapers): simplify structure --- scrapers/__init__.py | 44 +--------------- tests/scrapers/test_interface_compliance.py | 21 ++++---- tests/scrapers/test_registry.py | 58 --------------------- 3 files changed, 14 insertions(+), 109 deletions(-) delete mode 100644 tests/scrapers/test_registry.py diff --git a/scrapers/__init__.py b/scrapers/__init__.py index 8de8c42..f0cfd45 100644 --- a/scrapers/__init__.py +++ b/scrapers/__init__.py @@ -11,20 +11,11 @@ from .models import ( TestsResult, ) -ALL_SCRAPERS: dict[str, type[BaseScraper]] = { - "atcoder": AtCoderScraper, - "codeforces": CodeforcesScraper, - "cses": CSESScraper, -} - -_SCRAPER_CLASSES = [ +__all__ = [ "AtCoderScraper", + "BaseScraper", "CodeforcesScraper", "CSESScraper", -] - -_BASE_EXPORTS = [ - "BaseScraper", "ScraperConfig", "ContestListResult", "ContestSummary", @@ -33,34 +24,3 @@ _BASE_EXPORTS = [ "TestCase", "TestsResult", ] - -_REGISTRY_FUNCTIONS = [ - "get_scraper", - "list_platforms", - "ALL_SCRAPERS", -] - -__all__ = _BASE_EXPORTS + _SCRAPER_CLASSES + _REGISTRY_FUNCTIONS - -_exported_types = ( - ScraperConfig, - ContestListResult, - ContestSummary, - MetadataResult, - ProblemSummary, - TestCase, - TestsResult, -) - - -def get_scraper(platform: str) -> type[BaseScraper]: - if platform not in ALL_SCRAPERS: - available = ", ".join(ALL_SCRAPERS.keys()) - raise KeyError( - f"Unknown platform '{platform}'. 
Available platforms: {available}" - ) - return ALL_SCRAPERS[platform] - - -def list_platforms() -> list[str]: - return list(ALL_SCRAPERS.keys()) diff --git a/tests/scrapers/test_interface_compliance.py b/tests/scrapers/test_interface_compliance.py index 753e0de..8bfb185 100644 --- a/tests/scrapers/test_interface_compliance.py +++ b/tests/scrapers/test_interface_compliance.py @@ -2,14 +2,17 @@ from unittest.mock import Mock import pytest -from scrapers import ALL_SCRAPERS, BaseScraper +from scrapers.atcoder import AtCoderScraper +from scrapers.base import BaseScraper +from scrapers.codeforces import CodeforcesScraper +from scrapers.cses import CSESScraper from scrapers.models import ContestListResult, MetadataResult, TestsResult -ALL_SCRAPER_CLASSES = list(ALL_SCRAPERS.values()) +SCRAPERS = [AtCoderScraper, CodeforcesScraper, CSESScraper] class TestScraperInterfaceCompliance: - @pytest.mark.parametrize("scraper_class", ALL_SCRAPER_CLASSES) + @pytest.mark.parametrize("scraper_class", SCRAPERS) def test_implements_base_interface(self, scraper_class): scraper = scraper_class() @@ -19,7 +22,7 @@ class TestScraperInterfaceCompliance: assert hasattr(scraper, "scrape_problem_tests") assert hasattr(scraper, "scrape_contest_list") - @pytest.mark.parametrize("scraper_class", ALL_SCRAPER_CLASSES) + @pytest.mark.parametrize("scraper_class", SCRAPERS) def test_platform_name_is_string(self, scraper_class): scraper = scraper_class() platform_name = scraper.platform_name @@ -28,7 +31,7 @@ class TestScraperInterfaceCompliance: assert len(platform_name) > 0 assert platform_name.islower() # Convention: lowercase platform names - @pytest.mark.parametrize("scraper_class", ALL_SCRAPER_CLASSES) + @pytest.mark.parametrize("scraper_class", SCRAPERS) def test_metadata_method_signature(self, scraper_class, mocker): scraper = scraper_class() @@ -53,7 +56,7 @@ class TestScraperInterfaceCompliance: assert isinstance(result.success, bool) assert isinstance(result.error, str) - 
@pytest.mark.parametrize("scraper_class", ALL_SCRAPER_CLASSES) + @pytest.mark.parametrize("scraper_class", SCRAPERS) def test_problem_tests_method_signature(self, scraper_class, mocker): scraper = scraper_class() @@ -85,7 +88,7 @@ class TestScraperInterfaceCompliance: assert isinstance(result.success, bool) assert isinstance(result.error, str) - @pytest.mark.parametrize("scraper_class", ALL_SCRAPER_CLASSES) + @pytest.mark.parametrize("scraper_class", SCRAPERS) def test_contest_list_method_signature(self, scraper_class, mocker): scraper = scraper_class() @@ -111,7 +114,7 @@ class TestScraperInterfaceCompliance: assert isinstance(result.success, bool) assert isinstance(result.error, str) - @pytest.mark.parametrize("scraper_class", ALL_SCRAPER_CLASSES) + @pytest.mark.parametrize("scraper_class", SCRAPERS) def test_error_message_format(self, scraper_class, mocker): scraper = scraper_class() platform_name = scraper.platform_name @@ -148,7 +151,7 @@ class TestScraperInterfaceCompliance: assert not result.success assert result.error.startswith(f"{platform_name}: ") - @pytest.mark.parametrize("scraper_class", ALL_SCRAPER_CLASSES) + @pytest.mark.parametrize("scraper_class", SCRAPERS) def test_scraper_instantiation(self, scraper_class): scraper1 = scraper_class() assert isinstance(scraper1, BaseScraper) diff --git a/tests/scrapers/test_registry.py b/tests/scrapers/test_registry.py deleted file mode 100644 index a656d1e..0000000 --- a/tests/scrapers/test_registry.py +++ /dev/null @@ -1,58 +0,0 @@ -import pytest - -from scrapers import ALL_SCRAPERS, get_scraper, list_platforms -from scrapers.base import BaseScraper -from scrapers.codeforces import CodeforcesScraper - - -class TestScraperRegistry: - def test_get_scraper_valid_platform(self): - scraper_class = get_scraper("codeforces") - assert scraper_class == CodeforcesScraper - assert issubclass(scraper_class, BaseScraper) - - scraper = scraper_class() - assert isinstance(scraper, BaseScraper) - assert scraper.platform_name 
== "codeforces" - - def test_get_scraper_invalid_platform(self): - with pytest.raises(KeyError) as exc_info: - get_scraper("nonexistent") - - error_msg = str(exc_info.value) - assert "nonexistent" in error_msg - assert "Available platforms" in error_msg - - def test_list_platforms(self): - platforms = list_platforms() - - assert isinstance(platforms, list) - assert len(platforms) > 0 - assert "codeforces" in platforms - - assert set(platforms) == set(ALL_SCRAPERS.keys()) - - def test_all_scrapers_registry(self): - assert isinstance(ALL_SCRAPERS, dict) - assert len(ALL_SCRAPERS) > 0 - - for platform_name, scraper_class in ALL_SCRAPERS.items(): - assert isinstance(platform_name, str) - assert platform_name.islower() - - assert issubclass(scraper_class, BaseScraper) - - scraper = scraper_class() - assert scraper.platform_name == platform_name - - def test_registry_import_consistency(self): - from scrapers.codeforces import CodeforcesScraper as DirectImport - - registry_class = get_scraper("codeforces") - assert registry_class == DirectImport - - def test_all_scrapers_can_be_instantiated(self): - for platform_name, scraper_class in ALL_SCRAPERS.items(): - scraper = scraper_class() - assert isinstance(scraper, BaseScraper) - assert scraper.platform_name == platform_name From 0a8dc50c76accf08d357dda37cf72279380f9ef6 Mon Sep 17 00:00:00 2001 From: Barrett Ruth Date: Mon, 22 Sep 2025 22:46:36 -0400 Subject: [PATCH 2/3] fix(test): systematically gather scrapers --- tests/scrapers/test_interface_compliance.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/tests/scrapers/test_interface_compliance.py b/tests/scrapers/test_interface_compliance.py index 8bfb185..e81375b 100644 --- a/tests/scrapers/test_interface_compliance.py +++ b/tests/scrapers/test_interface_compliance.py @@ -1,14 +1,17 @@ +import inspect from unittest.mock import Mock import pytest -from scrapers.atcoder import AtCoderScraper +import scrapers from scrapers.base import 
BaseScraper -from scrapers.codeforces import CodeforcesScraper -from scrapers.cses import CSESScraper from scrapers.models import ContestListResult, MetadataResult, TestsResult -SCRAPERS = [AtCoderScraper, CodeforcesScraper, CSESScraper] +SCRAPERS = [ + cls + for name, cls in inspect.getmembers(scrapers, inspect.isclass) + if issubclass(cls, BaseScraper) and cls != BaseScraper +] class TestScraperInterfaceCompliance: From 53562eb6a874ab86d0be02c9ca62695ac385f836 Mon Sep 17 00:00:00 2001 From: Barrett Ruth Date: Mon, 22 Sep 2025 22:48:24 -0400 Subject: [PATCH 3/3] fix(scrapers): reorg codeforces scraper --- scrapers/codeforces.py | 126 ++++++++++++++++++++--------------------- 1 file changed, 63 insertions(+), 63 deletions(-) diff --git a/scrapers/codeforces.py b/scrapers/codeforces.py index 0ec1958..e7e1e4b 100644 --- a/scrapers/codeforces.py +++ b/scrapers/codeforces.py @@ -19,69 +19,6 @@ from .models import ( ) -class CodeforcesScraper(BaseScraper): - @property - def platform_name(self) -> str: - return "codeforces" - - def scrape_contest_metadata(self, contest_id: str) -> MetadataResult: - return self._safe_execute( - "metadata", self._scrape_contest_metadata_impl, contest_id - ) - - def scrape_problem_tests(self, contest_id: str, problem_id: str) -> TestsResult: - return self._safe_execute( - "tests", self._scrape_problem_tests_impl, contest_id, problem_id - ) - - def scrape_contest_list(self) -> ContestListResult: - return self._safe_execute("contests", self._scrape_contest_list_impl) - - def _scrape_contest_metadata_impl(self, contest_id: str) -> MetadataResult: - problems = scrape_contest_problems(contest_id) - if not problems: - return self._create_metadata_error( - f"No problems found for contest {contest_id}", contest_id - ) - return MetadataResult( - success=True, error="", contest_id=contest_id, problems=problems - ) - - def _scrape_problem_tests_impl( - self, contest_id: str, problem_letter: str - ) -> TestsResult: - problem_id = contest_id + 
problem_letter.lower() - url = parse_problem_url(contest_id, problem_letter) - tests = scrape_sample_tests(url) - - scraper = cloudscraper.create_scraper() - response = scraper.get(url, timeout=self.config.timeout_seconds) - response.raise_for_status() - soup = BeautifulSoup(response.text, "html.parser") - timeout_ms, memory_mb = extract_problem_limits(soup) - - if not tests: - return self._create_tests_error( - f"No tests found for {contest_id} {problem_letter}", problem_id, url - ) - - return TestsResult( - success=True, - error="", - problem_id=problem_id, - url=url, - tests=tests, - timeout_ms=timeout_ms, - memory_mb=memory_mb, - ) - - def _scrape_contest_list_impl(self) -> ContestListResult: - contests = scrape_contests() - if not contests: - return self._create_contests_error("No contests found") - return ContestListResult(success=True, error="", contests=contests) - - def scrape(url: str) -> list[TestCase]: try: scraper = cloudscraper.create_scraper() @@ -305,6 +242,69 @@ def scrape_contests() -> list[ContestSummary]: return contests +class CodeforcesScraper(BaseScraper): + @property + def platform_name(self) -> str: + return "codeforces" + + def scrape_contest_metadata(self, contest_id: str) -> MetadataResult: + return self._safe_execute( + "metadata", self._scrape_contest_metadata_impl, contest_id + ) + + def scrape_problem_tests(self, contest_id: str, problem_id: str) -> TestsResult: + return self._safe_execute( + "tests", self._scrape_problem_tests_impl, contest_id, problem_id + ) + + def scrape_contest_list(self) -> ContestListResult: + return self._safe_execute("contests", self._scrape_contest_list_impl) + + def _scrape_contest_metadata_impl(self, contest_id: str) -> MetadataResult: + problems = scrape_contest_problems(contest_id) + if not problems: + return self._create_metadata_error( + f"No problems found for contest {contest_id}", contest_id + ) + return MetadataResult( + success=True, error="", contest_id=contest_id, problems=problems + ) + + def 
_scrape_problem_tests_impl( + self, contest_id: str, problem_letter: str + ) -> TestsResult: + problem_id = contest_id + problem_letter.lower() + url = parse_problem_url(contest_id, problem_letter) + tests = scrape_sample_tests(url) + + scraper = cloudscraper.create_scraper() + response = scraper.get(url, timeout=self.config.timeout_seconds) + response.raise_for_status() + soup = BeautifulSoup(response.text, "html.parser") + timeout_ms, memory_mb = extract_problem_limits(soup) + + if not tests: + return self._create_tests_error( + f"No tests found for {contest_id} {problem_letter}", problem_id, url + ) + + return TestsResult( + success=True, + error="", + problem_id=problem_id, + url=url, + tests=tests, + timeout_ms=timeout_ms, + memory_mb=memory_mb, + ) + + def _scrape_contest_list_impl(self) -> ContestListResult: + contests = scrape_contests() + if not contests: + return self._create_contests_error("No contests found") + return ContestListResult(success=True, error="", contests=contests) + + def main() -> None: if len(sys.argv) < 2: result = MetadataResult(