From db391da52c245ef536a4c1e5c54ef25d770c1053 Mon Sep 17 00:00:00 2001 From: Barrett Ruth Date: Mon, 22 Sep 2025 22:00:20 -0400 Subject: [PATCH] feat(scrapers): total refactor --- scrapers/__init__.py | 56 ++++++ scrapers/atcoder.py | 180 ++++++++++------- scrapers/base.py | 23 --- scrapers/clients.py | 82 -------- scrapers/codeforces.py | 43 +++-- scrapers/cses.py | 202 ++++++++++++-------- tests/scrapers/test_codeforces.py | 60 +++--- tests/scrapers/test_interface_compliance.py | 162 ++++++++++++++++ tests/scrapers/test_registry.py | 58 ++++++ 9 files changed, 559 insertions(+), 307 deletions(-) delete mode 100644 scrapers/clients.py create mode 100644 tests/scrapers/test_interface_compliance.py create mode 100644 tests/scrapers/test_registry.py diff --git a/scrapers/__init__.py b/scrapers/__init__.py index e69de29..391f349 100644 --- a/scrapers/__init__.py +++ b/scrapers/__init__.py @@ -0,0 +1,56 @@ +from .atcoder import AtCoderScraper +from .base import BaseScraper, ScraperConfig +from .codeforces import CodeforcesScraper +from .cses import CSESScraper +from .models import ( + ContestListResult, + ContestSummary, + MetadataResult, + ProblemSummary, + TestCase, + TestsResult, +) + +ALL_SCRAPERS: dict[str, type[BaseScraper]] = { + "atcoder": AtCoderScraper, + "codeforces": CodeforcesScraper, + "cses": CSESScraper, +} + +_SCRAPER_CLASSES = [ + "AtCoderScraper", + "CodeforcesScraper", + "CSESScraper", +] + +_BASE_EXPORTS = [ + "BaseScraper", + "ScraperConfig", + "ContestListResult", + "ContestSummary", + "MetadataResult", + "ProblemSummary", + "TestCase", + "TestsResult", +] + +_REGISTRY_FUNCTIONS = [ + "get_scraper", + "list_platforms", + "ALL_SCRAPERS", +] + +__all__ = _BASE_EXPORTS + _SCRAPER_CLASSES + _REGISTRY_FUNCTIONS + + +def get_scraper(platform: str) -> type[BaseScraper]: + if platform not in ALL_SCRAPERS: + available = ", ".join(ALL_SCRAPERS.keys()) + raise KeyError( + f"Unknown platform '{platform}'. Available platforms: {available}" + ) + return ALL_SCRAPERS[platform] + + +def list_platforms() -> list[str]: + return list(ALL_SCRAPERS.keys()) diff --git a/scrapers/atcoder.py b/scrapers/atcoder.py index 1935c6e..20cc3d3 100644 --- a/scrapers/atcoder.py +++ b/scrapers/atcoder.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 +import concurrent.futures import json import re import sys @@ -9,6 +10,7 @@ import backoff import requests from bs4 import BeautifulSoup, Tag +from .base import BaseScraper from .models import ( ContestListResult, ContestSummary, @@ -167,8 +169,6 @@ def scrape(url: str) -> list[TestCase]: def scrape_contests() -> list[ContestSummary]: - import concurrent.futures - def get_max_pages() -> int: try: headers = { @@ -296,6 +296,101 @@ def scrape_contests() -> list[ContestSummary]: return all_contests +class AtCoderScraper(BaseScraper): + @property + def platform_name(self) -> str: + return "atcoder" + + def scrape_contest_metadata(self, contest_id: str) -> MetadataResult: + return self._safe_execute("metadata", self._scrape_metadata_impl, contest_id) + + def scrape_problem_tests(self, contest_id: str, problem_id: str) -> TestsResult: + return self._safe_execute( + "tests", self._scrape_tests_impl, contest_id, problem_id + ) + + def scrape_contest_list(self) -> ContestListResult: + return self._safe_execute("contests", self._scrape_contests_impl) + + def _safe_execute(self, operation: str, func, *args): + try: + return func(*args) + except Exception as e: + error_msg = f"{self.platform_name}: {str(e)}" + + if operation == "metadata": + return MetadataResult(success=False, error=error_msg) + elif operation == "tests": + return TestsResult( + success=False, + error=error_msg, + problem_id="", + url="", + tests=[], + timeout_ms=0, + memory_mb=0, + ) + elif operation == "contests": + return ContestListResult(success=False, error=error_msg) + + def _scrape_metadata_impl(self, contest_id: str) -> MetadataResult: + problems = scrape_contest_problems(contest_id) + if not problems: + return MetadataResult( + success=False, + error=f"{self.platform_name}: No problems found for contest {contest_id}", + ) + return MetadataResult( + success=True, error="", contest_id=contest_id, problems=problems + ) + + def _scrape_tests_impl(self, contest_id: str, problem_id: str) -> TestsResult: + problem_letter = problem_id.upper() + url = parse_problem_url(contest_id, problem_letter) + tests = scrape(url) + + response = requests.get( + url, + headers={ + "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" + }, + timeout=10, + ) + response.raise_for_status() + + soup = BeautifulSoup(response.text, "html.parser") + timeout_ms, memory_mb = extract_problem_limits(soup) + + if not tests: + return TestsResult( + success=False, + error=f"{self.platform_name}: No tests found for {contest_id} {problem_letter}", + problem_id=f"{contest_id}_{problem_id.lower()}", + url=url, + tests=[], + timeout_ms=timeout_ms, + memory_mb=memory_mb, + ) + + return TestsResult( + success=True, + error="", + problem_id=f"{contest_id}_{problem_id.lower()}", + url=url, + tests=tests, + timeout_ms=timeout_ms, + memory_mb=memory_mb, + ) + + def _scrape_contests_impl(self) -> ContestListResult: + contests = scrape_contests() + if not contests: + return ContestListResult( + success=False, error=f"{self.platform_name}: No contests found" + ) + return ContestListResult(success=True, error="", contests=contests) + + def main() -> None: if len(sys.argv) < 2: result = MetadataResult( @@ -306,6 +401,7 @@ def main() -> None: sys.exit(1) mode: str = sys.argv[1] + scraper = AtCoderScraper() if mode == "metadata": if len(sys.argv) != 3: @@ -317,23 +413,10 @@ def main() -> None: sys.exit(1) contest_id: str = sys.argv[2] - problems: list[ProblemSummary] = scrape_contest_problems(contest_id) - - if not problems: - result = MetadataResult( - success=False, - error=f"No problems found for contest {contest_id}", - ) - print(json.dumps(asdict(result))) - sys.exit(1) - - result = MetadataResult( - success=True, - error="", - contest_id=contest_id, - problems=problems, - ) + result = scraper.scrape_contest_metadata(contest_id) print(json.dumps(asdict(result))) + if not result.success: + sys.exit(1) elif mode == "tests": if len(sys.argv) != 4: @@ -351,55 +434,10 @@ def main() -> None: test_contest_id: str = sys.argv[2] problem_letter: str = sys.argv[3] - problem_id: str = f"{test_contest_id}_{problem_letter.lower()}" - - url: str = parse_problem_url(test_contest_id, problem_letter) - tests: list[TestCase] = scrape(url) - - try: - headers = { - "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" - } - response = requests.get(url, headers=headers, timeout=10) - response.raise_for_status() - soup = BeautifulSoup(response.text, "html.parser") - timeout_ms, memory_mb = extract_problem_limits(soup) - except Exception as e: - tests_result = TestsResult( - success=False, - error=f"Failed to extract constraints: {e}", - problem_id=problem_id, - url=url, - tests=[], - timeout_ms=0, - memory_mb=0, - ) - print(json.dumps(asdict(tests_result))) - sys.exit(1) - - if not tests: - tests_result = TestsResult( - success=False, - error=f"No tests found for {test_contest_id} {problem_letter}", - problem_id=problem_id, - url=url, - tests=[], - timeout_ms=timeout_ms, - memory_mb=memory_mb, - ) - print(json.dumps(asdict(tests_result))) - sys.exit(1) - - tests_result = TestsResult( - success=True, - error="", - problem_id=problem_id, - url=url, - tests=tests, - timeout_ms=timeout_ms, - memory_mb=memory_mb, - ) + tests_result = scraper.scrape_problem_tests(test_contest_id, problem_letter) print(json.dumps(asdict(tests_result))) + if not tests_result.success: + sys.exit(1) elif mode == "contests": if len(sys.argv) != 2: @@ -409,14 +447,10 @@ def main() -> None: print(json.dumps(asdict(contest_result))) sys.exit(1) - contests = scrape_contests() - if not contests: - contest_result = ContestListResult(success=False, error="No contests found") - print(json.dumps(asdict(contest_result))) - sys.exit(1) - - contest_result = ContestListResult(success=True, error="", contests=contests) + contest_result = scraper.scrape_contest_list() print(json.dumps(asdict(contest_result))) + if not contest_result.success: + sys.exit(1) else: result = MetadataResult( diff --git a/scrapers/base.py b/scrapers/base.py index bf96241..c8336a8 100644 --- a/scrapers/base.py +++ b/scrapers/base.py @@ -1,8 +1,5 @@ from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import Protocol - -import requests from .models import ContestListResult, MetadataResult, TestsResult @@ -15,23 +12,14 @@ class ScraperConfig: rate_limit_delay: float = 1.0 -class HttpClient(Protocol): - def get(self, url: str, **kwargs) -> requests.Response: ... - def close(self) -> None: ... - - class BaseScraper(ABC): def __init__(self, config: ScraperConfig | None = None): self.config = config or ScraperConfig() - self._client: HttpClient | None = None @property @abstractmethod def platform_name(self) -> str: ... - @abstractmethod - def _create_client(self) -> HttpClient: ... - @abstractmethod def scrape_contest_metadata(self, contest_id: str) -> MetadataResult: ... @@ -41,17 +29,6 @@ class BaseScraper(ABC): @abstractmethod def scrape_contest_list(self) -> ContestListResult: ... - @property - def client(self) -> HttpClient: - if self._client is None: - self._client = self._create_client() - return self._client - - def close(self) -> None: - if self._client is not None: - self._client.close() - self._client = None - def _create_metadata_error( self, error_msg: str, contest_id: str = "" ) -> MetadataResult: diff --git a/scrapers/clients.py b/scrapers/clients.py deleted file mode 100644 index d5bd232..0000000 --- a/scrapers/clients.py +++ /dev/null @@ -1,82 +0,0 @@ -import time - -import backoff -import requests - -from .base import HttpClient, ScraperConfig - - -class RequestsClient: - def __init__(self, config: ScraperConfig, headers: dict[str, str] | None = None): - self.config = config - self.session = requests.Session() - - default_headers = { - "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" - } - if headers: - default_headers.update(headers) - - self.session.headers.update(default_headers) - - @backoff.on_exception( - backoff.expo, - (requests.RequestException, requests.HTTPError), - max_tries=3, - base=2.0, - jitter=backoff.random_jitter, - ) - @backoff.on_predicate( - backoff.expo, - lambda response: response.status_code == 429, - max_tries=3, - base=2.0, - jitter=backoff.random_jitter, - ) - def get(self, url: str, **kwargs) -> requests.Response: - timeout = kwargs.get("timeout", self.config.timeout_seconds) - response = self.session.get(url, timeout=timeout, **kwargs) - response.raise_for_status() - - if ( - hasattr(self.config, "rate_limit_delay") - and self.config.rate_limit_delay > 0 - ): - time.sleep(self.config.rate_limit_delay) - - return response - - def close(self) -> None: - self.session.close() - - -class CloudScraperClient: - def __init__(self, config: ScraperConfig): - import cloudscraper - - self.config = config - self.scraper = cloudscraper.create_scraper() - - @backoff.on_exception( - backoff.expo, - (requests.RequestException, requests.HTTPError), - max_tries=3, - base=2.0, - jitter=backoff.random_jitter, - ) - def get(self, url: str, **kwargs) -> requests.Response: - timeout = kwargs.get("timeout", self.config.timeout_seconds) - response = self.scraper.get(url, timeout=timeout, **kwargs) - response.raise_for_status() - - if ( - hasattr(self.config, "rate_limit_delay") - and self.config.rate_limit_delay > 0 - ): - time.sleep(self.config.rate_limit_delay) - - return response - - def close(self) -> None: - if hasattr(self.scraper, "close"): - self.scraper.close() diff --git a/scrapers/codeforces.py b/scrapers/codeforces.py index 3bacaf5..0ec1958 100644 --- a/scrapers/codeforces.py +++ b/scrapers/codeforces.py @@ -5,10 +5,10 @@ import re import sys from dataclasses import asdict +import cloudscraper from bs4 import BeautifulSoup, Tag -from .base import BaseScraper, HttpClient -from .clients import CloudScraperClient +from .base import BaseScraper from .models import ( ContestListResult, ContestSummary, @@ -24,9 +24,6 @@ class CodeforcesScraper(BaseScraper): def platform_name(self) -> str: return "codeforces" - def _create_client(self) -> HttpClient: - return CloudScraperClient(self.config) - def scrape_contest_metadata(self, contest_id: str) -> MetadataResult: return self._safe_execute( "metadata", self._scrape_contest_metadata_impl, contest_id @@ -41,7 +38,7 @@ class CodeforcesScraper(BaseScraper): return self._safe_execute("contests", self._scrape_contest_list_impl) def _scrape_contest_metadata_impl(self, contest_id: str) -> MetadataResult: - problems = scrape_contest_problems(contest_id, self.client) + problems = scrape_contest_problems(contest_id) if not problems: return self._create_metadata_error( f"No problems found for contest {contest_id}", contest_id @@ -55,9 +52,11 @@ class CodeforcesScraper(BaseScraper): ) -> TestsResult: problem_id = contest_id + problem_letter.lower() url = parse_problem_url(contest_id, problem_letter) - tests = scrape_sample_tests(url, self.client) + tests = scrape_sample_tests(url) - response = self.client.get(url) + scraper = cloudscraper.create_scraper() + response = scraper.get(url, timeout=self.config.timeout_seconds) + response.raise_for_status() soup = BeautifulSoup(response.text, "html.parser") timeout_ms, memory_mb = extract_problem_limits(soup) @@ -77,15 +76,17 @@ class CodeforcesScraper(BaseScraper): ) def _scrape_contest_list_impl(self) -> ContestListResult: - contests = scrape_contests(self.client) + contests = scrape_contests() if not contests: return self._create_contests_error("No contests found") return ContestListResult(success=True, error="", contests=contests) -def scrape(url: str, client: HttpClient) -> list[TestCase]: +def scrape(url: str) -> list[TestCase]: try: - response = client.get(url) + scraper = cloudscraper.create_scraper() + response = scraper.get(url, timeout=10) + response.raise_for_status() soup = BeautifulSoup(response.text, "html.parser") input_sections = soup.find_all("div", class_="input") @@ -239,12 +240,12 @@ def extract_problem_limits(soup: BeautifulSoup) -> tuple[int, float]: return timeout_ms, memory_mb -def scrape_contest_problems( - contest_id: str, client: HttpClient -) -> list[ProblemSummary]: +def scrape_contest_problems(contest_id: str) -> list[ProblemSummary]: try: contest_url: str = f"https://codeforces.com/contest/{contest_id}" - response = client.get(contest_url) + scraper = cloudscraper.create_scraper() + response = scraper.get(contest_url, timeout=10) + response.raise_for_status() soup = BeautifulSoup(response.text, "html.parser") problems: list[ProblemSummary] = [] @@ -280,13 +281,15 @@ def scrape_contest_problems( return [] -def scrape_sample_tests(url: str, client: HttpClient) -> list[TestCase]: +def scrape_sample_tests(url: str) -> list[TestCase]: print(f"Scraping: {url}", file=sys.stderr) - return scrape(url, client) + return scrape(url) -def scrape_contests(client: HttpClient) -> list[ContestSummary]: - response = client.get("https://codeforces.com/api/contest.list") +def scrape_contests() -> list[ContestSummary]: + scraper = cloudscraper.create_scraper() + response = scraper.get("https://codeforces.com/api/contest.list", timeout=10) + response.raise_for_status() data = response.json() if data["status"] != "OK": @@ -364,8 +367,6 @@ def main() -> None: print(json.dumps(asdict(result))) sys.exit(1) - scraper.close() - if __name__ == "__main__": main() diff --git a/scrapers/cses.py b/scrapers/cses.py index 3c5db7a..c9144c6 100755 --- a/scrapers/cses.py +++ b/scrapers/cses.py @@ -9,6 +9,7 @@ import backoff import requests from bs4 import BeautifulSoup, Tag +from .base import BaseScraper from .models import ( ContestListResult, ContestSummary, @@ -322,6 +323,111 @@ def scrape(url: str) -> list[TestCase]: return [] +class CSESScraper(BaseScraper): + @property + def platform_name(self) -> str: + return "cses" + + def scrape_contest_metadata(self, contest_id: str) -> MetadataResult: + return self._safe_execute("metadata", self._scrape_metadata_impl, contest_id) + + def scrape_problem_tests(self, contest_id: str, problem_id: str) -> TestsResult: + return self._safe_execute( + "tests", self._scrape_tests_impl, contest_id, problem_id + ) + + def scrape_contest_list(self) -> ContestListResult: + return self._safe_execute("contests", self._scrape_contests_impl) + + def _safe_execute(self, operation: str, func, *args): + try: + return func(*args) + except Exception as e: + error_msg = f"{self.platform_name}: {str(e)}" + + if operation == "metadata": + return MetadataResult(success=False, error=error_msg) + elif operation == "tests": + return TestsResult( + success=False, + error=error_msg, + problem_id="", + url="", + tests=[], + timeout_ms=0, + memory_mb=0, + ) + elif operation == "contests": + return ContestListResult(success=False, error=error_msg) + + def _scrape_metadata_impl(self, category_id: str) -> MetadataResult: + problems = scrape_category_problems(category_id) + if not problems: + return MetadataResult( + success=False, + error=f"{self.platform_name}: No problems found for category: {category_id}", + ) + return MetadataResult( + success=True, error="", contest_id=category_id, problems=problems + ) + + def _scrape_tests_impl(self, category: str, problem_id: str) -> TestsResult: + url = parse_problem_url(problem_id) + if not url: + return TestsResult( + success=False, + error=f"{self.platform_name}: Invalid problem input: {problem_id}. Use either problem ID (e.g., 1068) or full URL", + problem_id=problem_id if problem_id.isdigit() else "", + url="", + tests=[], + timeout_ms=0, + memory_mb=0, + ) + + tests = scrape(url) + actual_problem_id = ( + problem_id if problem_id.isdigit() else problem_id.split("/")[-1] + ) + + headers = { + "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" + } + response = requests.get(url, headers=headers, timeout=10) + response.raise_for_status() + + soup = BeautifulSoup(response.text, "html.parser") + timeout_ms, memory_mb = extract_problem_limits(soup) + + if not tests: + return TestsResult( + success=False, + error=f"{self.platform_name}: No tests found for {problem_id}", + problem_id=actual_problem_id, + url=url, + tests=[], + timeout_ms=timeout_ms, + memory_mb=memory_mb, + ) + + return TestsResult( + success=True, + error="", + problem_id=actual_problem_id, + url=url, + tests=tests, + timeout_ms=timeout_ms, + memory_mb=memory_mb, + ) + + def _scrape_contests_impl(self) -> ContestListResult: + categories = scrape_categories() + if not categories: + return ContestListResult( + success=False, error=f"{self.platform_name}: No contests found" + ) + return ContestListResult(success=True, error="", contests=categories) + + def main() -> None: if len(sys.argv) < 2: result = MetadataResult( @@ -332,6 +438,7 @@ def main() -> None: sys.exit(1) mode: str = sys.argv[1] + scraper = CSESScraper() if mode == "metadata": if len(sys.argv) != 3: @@ -343,18 +450,10 @@ def main() -> None: sys.exit(1) category_id = sys.argv[2] - problems = scrape_category_problems(category_id) - - if not problems: - result = MetadataResult( - success=False, - error=f"No problems found for category: {category_id}", - ) - print(json.dumps(asdict(result))) - return - - result = MetadataResult(success=True, error="", problems=problems) + result = scraper.scrape_contest_metadata(category_id) print(json.dumps(asdict(result))) + if not result.success: + sys.exit(1) elif mode == "tests": if len(sys.argv) != 4: @@ -370,73 +469,12 @@ def main() -> None: print(json.dumps(asdict(tests_result))) sys.exit(1) - problem_input: str = sys.argv[3] - url: str | None = parse_problem_url(problem_input) - - if not url: - tests_result = TestsResult( - success=False, - error=f"Invalid problem input: {problem_input}. Use either problem ID (e.g., 1068) or full URL", - problem_id=problem_input if problem_input.isdigit() else "", - url="", - tests=[], - timeout_ms=0, - memory_mb=0, - ) - print(json.dumps(asdict(tests_result))) - sys.exit(1) - - tests: list[TestCase] = scrape(url) - - problem_id: str = ( - problem_input if problem_input.isdigit() else problem_input.split("/")[-1] - ) - - try: - headers = { - "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" - } - response = requests.get(url, headers=headers, timeout=10) - response.raise_for_status() - soup = BeautifulSoup(response.text, "html.parser") - timeout_ms, memory_mb = extract_problem_limits(soup) - except Exception as e: - tests_result = TestsResult( - success=False, - error=f"Failed to extract constraints: {e}", - problem_id=problem_id, - url=url, - tests=[], - timeout_ms=0, - memory_mb=0, - ) - print(json.dumps(asdict(tests_result))) - sys.exit(1) - - if not tests: - tests_result = TestsResult( - success=False, - error=f"No tests found for {problem_input}", - problem_id=problem_id, - url=url, - tests=[], - timeout_ms=timeout_ms, - memory_mb=memory_mb, - ) - print(json.dumps(asdict(tests_result))) - sys.exit(1) - - test_cases = tests - tests_result = TestsResult( - success=True, - error="", - problem_id=problem_id, - url=url, - tests=test_cases, - timeout_ms=timeout_ms, - memory_mb=memory_mb, - ) + category = sys.argv[2] + problem_id = sys.argv[3] + tests_result = scraper.scrape_problem_tests(category, problem_id) print(json.dumps(asdict(tests_result))) + if not tests_result.success: + sys.exit(1) elif mode == "contests": if len(sys.argv) != 2: @@ -446,14 +484,10 @@ def main() -> None: print(json.dumps(asdict(contest_result))) sys.exit(1) - categories = scrape_categories() - if not categories: - contest_result = ContestListResult(success=False, error="No contests found") - print(json.dumps(asdict(contest_result))) - sys.exit(1) - - contest_result = ContestListResult(success=True, error="", contests=categories) + contest_result = scraper.scrape_contest_list() print(json.dumps(asdict(contest_result))) + if not contest_result.success: + sys.exit(1) else: result = MetadataResult( diff --git a/tests/scrapers/test_codeforces.py b/tests/scrapers/test_codeforces.py index fd98b1b..8c436a3 100644 --- a/tests/scrapers/test_codeforces.py +++ b/tests/scrapers/test_codeforces.py @@ -5,14 +5,16 @@ from scrapers.models import ContestSummary, ProblemSummary def test_scrape_success(mocker, mock_codeforces_html): - mock_client = Mock() + mock_scraper = Mock() mock_response = Mock() mock_response.text = mock_codeforces_html - mock_client.get.return_value = mock_response + mock_scraper.get.return_value = mock_response + + mocker.patch( + "scrapers.codeforces.cloudscraper.create_scraper", return_value=mock_scraper + ) scraper = CodeforcesScraper() - mocker.patch.object(scraper, "_create_client", return_value=mock_client) - result = scraper.scrape_problem_tests("1900", "A") assert result.success == True @@ -22,17 +24,19 @@ def test_scrape_success(mocker, mock_codeforces_html): def test_scrape_contest_problems(mocker): - mock_client = Mock() + mock_scraper = Mock() mock_response = Mock() mock_response.text = """ A. Problem A B. Problem B """ - mock_client.get.return_value = mock_response + mock_scraper.get.return_value = mock_response + + mocker.patch( + "scrapers.codeforces.cloudscraper.create_scraper", return_value=mock_scraper + ) scraper = CodeforcesScraper() - mocker.patch.object(scraper, "_create_client", return_value=mock_client) - result = scraper.scrape_contest_metadata("1900") assert result.success == True @@ -42,12 +46,14 @@ def test_scrape_contest_problems(mocker): def test_scrape_network_error(mocker): - mock_client = Mock() - mock_client.get.side_effect = Exception("Network error") + mock_scraper = Mock() + mock_scraper.get.side_effect = Exception("Network error") + + mocker.patch( + "scrapers.codeforces.cloudscraper.create_scraper", return_value=mock_scraper + ) scraper = CodeforcesScraper() - mocker.patch.object(scraper, "_create_client", return_value=mock_client) - result = scraper.scrape_problem_tests("1900", "A") assert result.success == False @@ -55,7 +61,7 @@ def test_scrape_network_error(mocker): def test_scrape_contests_success(mocker): - mock_client = Mock() + mock_scraper = Mock() mock_response = Mock() mock_response.json.return_value = { "status": "OK", @@ -65,11 +71,13 @@ def test_scrape_contests_success(mocker): {"id": 1949, "name": "Codeforces Global Round 26"}, ], } - mock_client.get.return_value = mock_response + mock_scraper.get.return_value = mock_response + + mocker.patch( + "scrapers.codeforces.cloudscraper.create_scraper", return_value=mock_scraper + ) scraper = CodeforcesScraper() - mocker.patch.object(scraper, "_create_client", return_value=mock_client) - result = scraper.scrape_contest_list() assert result.success == True @@ -92,14 +100,16 @@ def test_scrape_contests_success(mocker): def test_scrape_contests_api_error(mocker): - mock_client = Mock() + mock_scraper = Mock() mock_response = Mock() mock_response.json.return_value = {"status": "FAILED", "result": []} - mock_client.get.return_value = mock_response + mock_scraper.get.return_value = mock_response + + mocker.patch( + "scrapers.codeforces.cloudscraper.create_scraper", return_value=mock_scraper + ) scraper = CodeforcesScraper() - mocker.patch.object(scraper, "_create_client", return_value=mock_client) - result = scraper.scrape_contest_list() assert result.success == False @@ -107,12 +117,14 @@ def test_scrape_contests_api_error(mocker): def test_scrape_contests_network_error(mocker): - mock_client = Mock() - mock_client.get.side_effect = Exception("Network error") + mock_scraper = Mock() + mock_scraper.get.side_effect = Exception("Network error") + + mocker.patch( + "scrapers.codeforces.cloudscraper.create_scraper", return_value=mock_scraper + ) scraper = CodeforcesScraper() - mocker.patch.object(scraper, "_create_client", return_value=mock_client) - result = scraper.scrape_contest_list() assert result.success == False diff --git a/tests/scrapers/test_interface_compliance.py b/tests/scrapers/test_interface_compliance.py new file mode 100644 index 0000000..da931c1 --- /dev/null +++ b/tests/scrapers/test_interface_compliance.py @@ -0,0 +1,162 @@ +from unittest.mock import Mock + +import pytest + +from scrapers import ALL_SCRAPERS, BaseScraper +from scrapers.models import ContestListResult, MetadataResult, TestsResult + +ALL_SCRAPER_CLASSES = list(ALL_SCRAPERS.values()) + + +class TestScraperInterfaceCompliance: + @pytest.mark.parametrize("scraper_class", ALL_SCRAPER_CLASSES) + def test_implements_base_interface(self, scraper_class): + scraper = scraper_class() + + assert isinstance(scraper, BaseScraper) + assert hasattr(scraper, "platform_name") + assert hasattr(scraper, "scrape_contest_metadata") + assert hasattr(scraper, "scrape_problem_tests") + assert hasattr(scraper, "scrape_contest_list") + + @pytest.mark.parametrize("scraper_class", ALL_SCRAPER_CLASSES) + def test_platform_name_is_string(self, scraper_class): + scraper = scraper_class() + platform_name = scraper.platform_name + + assert isinstance(platform_name, str) + assert len(platform_name) > 0 + assert platform_name.islower() # Convention: lowercase platform names + + @pytest.mark.parametrize("scraper_class", ALL_SCRAPER_CLASSES) + def test_metadata_method_signature(self, scraper_class, mocker): + scraper = scraper_class() + + # Mock the underlying HTTP calls to avoid network requests + if scraper.platform_name == "codeforces": + mock_scraper = Mock() + mock_response = Mock() + mock_response.text = "A. Test" + mock_scraper.get.return_value = mock_response + mocker.patch( + "scrapers.codeforces.cloudscraper.create_scraper", + return_value=mock_scraper, + ) + + result = scraper.scrape_contest_metadata("test_contest") + + assert isinstance(result, MetadataResult) + assert hasattr(result, "success") + assert hasattr(result, "error") + assert hasattr(result, "problems") + assert hasattr(result, "contest_id") + assert isinstance(result.success, bool) + assert isinstance(result.error, str) + + @pytest.mark.parametrize("scraper_class", ALL_SCRAPER_CLASSES) + def test_problem_tests_method_signature(self, scraper_class, mocker): + scraper = scraper_class() + + if scraper.platform_name == "codeforces": + mock_scraper = Mock() + mock_response = Mock() + mock_response.text = """ +
Time limit: 1 seconds
+
Memory limit: 256 megabytes
+
3
+
6
+ """ + mock_scraper.get.return_value = mock_response + mocker.patch( + "scrapers.codeforces.cloudscraper.create_scraper", + return_value=mock_scraper, + ) + + result = scraper.scrape_problem_tests("test_contest", "A") + + assert isinstance(result, TestsResult) + assert hasattr(result, "success") + assert hasattr(result, "error") + assert hasattr(result, "tests") + assert hasattr(result, "problem_id") + assert hasattr(result, "url") + assert hasattr(result, "timeout_ms") + assert hasattr(result, "memory_mb") + assert isinstance(result.success, bool) + assert isinstance(result.error, str) + + @pytest.mark.parametrize("scraper_class", ALL_SCRAPER_CLASSES) + def test_contest_list_method_signature(self, scraper_class, mocker): + scraper = scraper_class() + + if scraper.platform_name == "codeforces": + mock_scraper = Mock() + mock_response = Mock() + mock_response.json.return_value = { + "status": "OK", + "result": [{"id": 1900, "name": "Test Contest"}], + } + mock_scraper.get.return_value = mock_response + mocker.patch( + "scrapers.codeforces.cloudscraper.create_scraper", + return_value=mock_scraper, + ) + + result = scraper.scrape_contest_list() + + assert isinstance(result, ContestListResult) + assert hasattr(result, "success") + assert hasattr(result, "error") + assert hasattr(result, "contests") + assert isinstance(result.success, bool) + assert isinstance(result.error, str) + + @pytest.mark.parametrize("scraper_class", ALL_SCRAPER_CLASSES) + def test_error_message_format(self, scraper_class, mocker): + scraper = scraper_class() + platform_name = scraper.platform_name + + # Force an error by mocking HTTP failure + if scraper.platform_name == "codeforces": + mock_scraper = Mock() + mock_scraper.get.side_effect = Exception("Network error") + mocker.patch( + "scrapers.codeforces.cloudscraper.create_scraper", + return_value=mock_scraper, + ) + elif scraper.platform_name == "atcoder": + mocker.patch( + "scrapers.atcoder.requests.get", side_effect=Exception("Network error") + ) + elif scraper.platform_name == "cses": + mocker.patch( + "scrapers.cses.make_request", side_effect=Exception("Network error") + ) + + # Test metadata error format + result = scraper.scrape_contest_metadata("test") + assert result.success == False + assert result.error.startswith(f"{platform_name}: ") + + # Test problem tests error format + result = scraper.scrape_problem_tests("test", "A") + assert result.success == False + assert result.error.startswith(f"{platform_name}: ") + + # Test contest list error format + result = scraper.scrape_contest_list() + assert result.success == False + assert result.error.startswith(f"{platform_name}: ") + + @pytest.mark.parametrize("scraper_class", ALL_SCRAPER_CLASSES) + def test_scraper_instantiation(self, scraper_class): + scraper1 = scraper_class() + assert isinstance(scraper1, BaseScraper) + assert scraper1.config is not None + + from scrapers.base import ScraperConfig + + custom_config = ScraperConfig(timeout_seconds=60) + scraper2 = scraper_class(custom_config) + assert isinstance(scraper2, BaseScraper) + assert scraper2.config.timeout_seconds == 60 diff --git a/tests/scrapers/test_registry.py b/tests/scrapers/test_registry.py new file mode 100644 index 0000000..a656d1e --- /dev/null +++ b/tests/scrapers/test_registry.py @@ -0,0 +1,58 @@ +import pytest + +from scrapers import ALL_SCRAPERS, get_scraper, list_platforms +from scrapers.base import BaseScraper +from scrapers.codeforces import CodeforcesScraper + + +class TestScraperRegistry: + def test_get_scraper_valid_platform(self): + scraper_class = get_scraper("codeforces") + assert scraper_class == CodeforcesScraper + assert issubclass(scraper_class, BaseScraper) + + scraper = scraper_class() + assert isinstance(scraper, BaseScraper) + assert scraper.platform_name == "codeforces" + + def test_get_scraper_invalid_platform(self): + with pytest.raises(KeyError) as exc_info: + get_scraper("nonexistent") + + error_msg = str(exc_info.value) + assert "nonexistent" in error_msg + assert "Available platforms" in error_msg + + def test_list_platforms(self): + platforms = list_platforms() + + assert isinstance(platforms, list) + assert len(platforms) > 0 + assert "codeforces" in platforms + + assert set(platforms) == set(ALL_SCRAPERS.keys()) + + def test_all_scrapers_registry(self): + assert isinstance(ALL_SCRAPERS, dict) + assert len(ALL_SCRAPERS) > 0 + + for platform_name, scraper_class in ALL_SCRAPERS.items(): + assert isinstance(platform_name, str) + assert platform_name.islower() + + assert issubclass(scraper_class, BaseScraper) + + scraper = scraper_class() + assert scraper.platform_name == platform_name + + def test_registry_import_consistency(self): + from scrapers.codeforces import CodeforcesScraper as DirectImport + + registry_class = get_scraper("codeforces") + assert registry_class == DirectImport + + def test_all_scrapers_can_be_instantiated(self): + for platform_name, scraper_class in ALL_SCRAPERS.items(): + scraper = scraper_class() + assert isinstance(scraper, BaseScraper) + assert scraper.platform_name == platform_name