feat(scrapers): total refactor

This commit is contained in:
Barrett Ruth 2025-09-22 22:00:20 -04:00
parent eb3f7762de
commit db391da52c
9 changed files with 559 additions and 307 deletions

View file

@ -0,0 +1,56 @@
from .atcoder import AtCoderScraper
from .base import BaseScraper, ScraperConfig
from .codeforces import CodeforcesScraper
from .cses import CSESScraper
from .models import (
ContestListResult,
ContestSummary,
MetadataResult,
ProblemSummary,
TestCase,
TestsResult,
)
# Registry mapping a lowercase platform identifier to its scraper class.
ALL_SCRAPERS: dict[str, type[BaseScraper]] = {
    "atcoder": AtCoderScraper,
    "codeforces": CodeforcesScraper,
    "cses": CSESScraper,
}
# Public names grouped for readability; concatenated into __all__ below.
_SCRAPER_CLASSES = [
    "AtCoderScraper",
    "CodeforcesScraper",
    "CSESScraper",
]
_BASE_EXPORTS = [
    "BaseScraper",
    "ScraperConfig",
    "ContestListResult",
    "ContestSummary",
    "MetadataResult",
    "ProblemSummary",
    "TestCase",
    "TestsResult",
]
# NOTE(review): ALL_SCRAPERS is a dict, not a function, despite this list's name.
_REGISTRY_FUNCTIONS = [
    "get_scraper",
    "list_platforms",
    "ALL_SCRAPERS",
]
__all__ = _BASE_EXPORTS + _SCRAPER_CLASSES + _REGISTRY_FUNCTIONS
def get_scraper(platform: str) -> type[BaseScraper]:
    """Resolve *platform* to its registered scraper class.

    Raises:
        KeyError: if *platform* is not present in ``ALL_SCRAPERS``; the
            message lists the registered platforms.
    """
    try:
        return ALL_SCRAPERS[platform]
    except KeyError:
        available = ", ".join(ALL_SCRAPERS.keys())
        raise KeyError(
            f"Unknown platform '{platform}'. Available platforms: {available}"
        ) from None
def list_platforms() -> list[str]:
    """Return the identifiers of every registered platform, in registry order."""
    return [name for name in ALL_SCRAPERS]

View file

@ -1,5 +1,6 @@
#!/usr/bin/env python3
import concurrent.futures
import json
import re
import sys
@ -9,6 +10,7 @@ import backoff
import requests
from bs4 import BeautifulSoup, Tag
from .base import BaseScraper
from .models import (
ContestListResult,
ContestSummary,
@ -167,8 +169,6 @@ def scrape(url: str) -> list[TestCase]:
def scrape_contests() -> list[ContestSummary]:
import concurrent.futures
def get_max_pages() -> int:
try:
headers = {
@ -296,6 +296,101 @@ def scrape_contests() -> list[ContestSummary]:
return all_contests
class AtCoderScraper(BaseScraper):
    """Scraper for AtCoder (https://atcoder.jp) contests and problems."""

    @property
    def platform_name(self) -> str:
        return "atcoder"

    def scrape_contest_metadata(self, contest_id: str) -> MetadataResult:
        """Return the problem list for *contest_id*, wrapped in a MetadataResult."""
        return self._safe_execute("metadata", self._scrape_metadata_impl, contest_id)

    def scrape_problem_tests(self, contest_id: str, problem_id: str) -> TestsResult:
        """Return sample tests plus time/memory limits for one problem."""
        return self._safe_execute(
            "tests", self._scrape_tests_impl, contest_id, problem_id
        )

    def scrape_contest_list(self) -> ContestListResult:
        """Return all known AtCoder contests as a ContestListResult."""
        return self._safe_execute("contests", self._scrape_contests_impl)

    def _safe_execute(self, operation: str, func, *args):
        """Run ``func(*args)``, converting any exception into a failed result.

        *operation* selects the result type built on failure: "metadata",
        "tests" or "contests".
        """
        try:
            return func(*args)
        except Exception as e:
            error_msg = f"{self.platform_name}: {str(e)}"
            if operation == "metadata":
                return MetadataResult(success=False, error=error_msg)
            if operation == "tests":
                return TestsResult(
                    success=False,
                    error=error_msg,
                    problem_id="",
                    url="",
                    tests=[],
                    timeout_ms=0,
                    memory_mb=0,
                )
            if operation == "contests":
                return ContestListResult(success=False, error=error_msg)
            # BUG FIX: an unrecognized operation previously fell through and
            # silently returned None; raise so programming errors surface.
            raise ValueError(f"Unknown operation: {operation!r}") from e

    def _scrape_metadata_impl(self, contest_id: str) -> MetadataResult:
        """Fetch the contest's problem list; fail when none are found."""
        problems = scrape_contest_problems(contest_id)
        if not problems:
            return MetadataResult(
                success=False,
                error=f"{self.platform_name}: No problems found for contest {contest_id}",
            )
        return MetadataResult(
            success=True, error="", contest_id=contest_id, problems=problems
        )

    def _scrape_tests_impl(self, contest_id: str, problem_id: str) -> TestsResult:
        """Fetch sample tests and the problem's time/memory limits.

        NOTE(review): the problem page is fetched twice — once inside
        scrape() and once here for the limits; consider sharing one fetch.
        """
        problem_letter = problem_id.upper()
        url = parse_problem_url(contest_id, problem_letter)
        tests = scrape(url)
        response = requests.get(
            url,
            headers={
                "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
            },
            timeout=10,
        )
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        timeout_ms, memory_mb = extract_problem_limits(soup)
        if not tests:
            return TestsResult(
                success=False,
                error=f"{self.platform_name}: No tests found for {contest_id} {problem_letter}",
                problem_id=f"{contest_id}_{problem_id.lower()}",
                url=url,
                tests=[],
                timeout_ms=timeout_ms,
                memory_mb=memory_mb,
            )
        return TestsResult(
            success=True,
            error="",
            problem_id=f"{contest_id}_{problem_id.lower()}",
            url=url,
            tests=tests,
            timeout_ms=timeout_ms,
            memory_mb=memory_mb,
        )

    def _scrape_contests_impl(self) -> ContestListResult:
        """Fetch the full contest list; fail when it comes back empty."""
        contests = scrape_contests()
        if not contests:
            return ContestListResult(
                success=False, error=f"{self.platform_name}: No contests found"
            )
        return ContestListResult(success=True, error="", contests=contests)
def main() -> None:
if len(sys.argv) < 2:
result = MetadataResult(
@ -306,6 +401,7 @@ def main() -> None:
sys.exit(1)
mode: str = sys.argv[1]
scraper = AtCoderScraper()
if mode == "metadata":
if len(sys.argv) != 3:
@ -317,23 +413,10 @@ def main() -> None:
sys.exit(1)
contest_id: str = sys.argv[2]
problems: list[ProblemSummary] = scrape_contest_problems(contest_id)
if not problems:
result = MetadataResult(
success=False,
error=f"No problems found for contest {contest_id}",
)
print(json.dumps(asdict(result)))
sys.exit(1)
result = MetadataResult(
success=True,
error="",
contest_id=contest_id,
problems=problems,
)
result = scraper.scrape_contest_metadata(contest_id)
print(json.dumps(asdict(result)))
if not result.success:
sys.exit(1)
elif mode == "tests":
if len(sys.argv) != 4:
@ -351,55 +434,10 @@ def main() -> None:
test_contest_id: str = sys.argv[2]
problem_letter: str = sys.argv[3]
problem_id: str = f"{test_contest_id}_{problem_letter.lower()}"
url: str = parse_problem_url(test_contest_id, problem_letter)
tests: list[TestCase] = scrape(url)
try:
headers = {
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
timeout_ms, memory_mb = extract_problem_limits(soup)
except Exception as e:
tests_result = TestsResult(
success=False,
error=f"Failed to extract constraints: {e}",
problem_id=problem_id,
url=url,
tests=[],
timeout_ms=0,
memory_mb=0,
)
print(json.dumps(asdict(tests_result)))
sys.exit(1)
if not tests:
tests_result = TestsResult(
success=False,
error=f"No tests found for {test_contest_id} {problem_letter}",
problem_id=problem_id,
url=url,
tests=[],
timeout_ms=timeout_ms,
memory_mb=memory_mb,
)
print(json.dumps(asdict(tests_result)))
sys.exit(1)
tests_result = TestsResult(
success=True,
error="",
problem_id=problem_id,
url=url,
tests=tests,
timeout_ms=timeout_ms,
memory_mb=memory_mb,
)
tests_result = scraper.scrape_problem_tests(test_contest_id, problem_letter)
print(json.dumps(asdict(tests_result)))
if not tests_result.success:
sys.exit(1)
elif mode == "contests":
if len(sys.argv) != 2:
@ -409,14 +447,10 @@ def main() -> None:
print(json.dumps(asdict(contest_result)))
sys.exit(1)
contests = scrape_contests()
if not contests:
contest_result = ContestListResult(success=False, error="No contests found")
print(json.dumps(asdict(contest_result)))
sys.exit(1)
contest_result = ContestListResult(success=True, error="", contests=contests)
contest_result = scraper.scrape_contest_list()
print(json.dumps(asdict(contest_result)))
if not contest_result.success:
sys.exit(1)
else:
result = MetadataResult(

View file

@ -1,8 +1,5 @@
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Protocol
import requests
from .models import ContestListResult, MetadataResult, TestsResult
@ -15,23 +12,14 @@ class ScraperConfig:
rate_limit_delay: float = 1.0
class HttpClient(Protocol):
def get(self, url: str, **kwargs) -> requests.Response: ...
def close(self) -> None: ...
class BaseScraper(ABC):
def __init__(self, config: ScraperConfig | None = None):
self.config = config or ScraperConfig()
self._client: HttpClient | None = None
@property
@abstractmethod
def platform_name(self) -> str: ...
@abstractmethod
def _create_client(self) -> HttpClient: ...
@abstractmethod
def scrape_contest_metadata(self, contest_id: str) -> MetadataResult: ...
@ -41,17 +29,6 @@ class BaseScraper(ABC):
@abstractmethod
def scrape_contest_list(self) -> ContestListResult: ...
@property
def client(self) -> HttpClient:
if self._client is None:
self._client = self._create_client()
return self._client
def close(self) -> None:
if self._client is not None:
self._client.close()
self._client = None
def _create_metadata_error(
self, error_msg: str, contest_id: str = ""
) -> MetadataResult:

View file

@ -1,82 +0,0 @@
import time
import backoff
import requests
from .base import HttpClient, ScraperConfig
class RequestsClient:
    """HTTP client backed by requests.Session with retries and rate limiting."""

    def __init__(self, config: ScraperConfig, headers: dict[str, str] | None = None):
        """Create a session using a browser-like User-Agent, merged with *headers*."""
        self.config = config
        self.session = requests.Session()
        default_headers = {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        }
        if headers:
            default_headers.update(headers)
        self.session.headers.update(default_headers)

    @backoff.on_exception(
        backoff.expo,
        (requests.RequestException, requests.HTTPError),
        max_tries=3,
        base=2.0,
        jitter=backoff.random_jitter,
    )
    # NOTE(review): raise_for_status() below raises on 429 before this
    # predicate can see the response, so this decorator looks unreachable —
    # confirm intent before removing.
    @backoff.on_predicate(
        backoff.expo,
        lambda response: response.status_code == 429,
        max_tries=3,
        base=2.0,
        jitter=backoff.random_jitter,
    )
    def get(self, url: str, **kwargs) -> requests.Response:
        """GET *url* with retries; raises for non-2xx, then applies rate limit."""
        # BUG FIX: pop (not get) "timeout" — otherwise it remains in kwargs and
        # session.get() receives the keyword twice, raising TypeError whenever
        # a caller passes timeout= explicitly.
        timeout = kwargs.pop("timeout", self.config.timeout_seconds)
        response = self.session.get(url, timeout=timeout, **kwargs)
        response.raise_for_status()
        if (
            hasattr(self.config, "rate_limit_delay")
            and self.config.rate_limit_delay > 0
        ):
            time.sleep(self.config.rate_limit_delay)
        return response

    def close(self) -> None:
        """Release the underlying session's pooled connections."""
        self.session.close()
class CloudScraperClient:
    """HTTP client built on cloudscraper for Cloudflare-protected sites."""

    def __init__(self, config: ScraperConfig):
        # Local import keeps cloudscraper optional for users of other clients.
        import cloudscraper

        self.config = config
        self.scraper = cloudscraper.create_scraper()

    @backoff.on_exception(
        backoff.expo,
        (requests.RequestException, requests.HTTPError),
        max_tries=3,
        base=2.0,
        jitter=backoff.random_jitter,
    )
    def get(self, url: str, **kwargs) -> requests.Response:
        """GET *url* with retries; raises for non-2xx, then applies rate limit."""
        # BUG FIX: pop (not get) "timeout" — otherwise it remains in kwargs and
        # scraper.get() receives the keyword twice, raising TypeError whenever
        # a caller passes timeout= explicitly.
        timeout = kwargs.pop("timeout", self.config.timeout_seconds)
        response = self.scraper.get(url, timeout=timeout, **kwargs)
        response.raise_for_status()
        if (
            hasattr(self.config, "rate_limit_delay")
            and self.config.rate_limit_delay > 0
        ):
            time.sleep(self.config.rate_limit_delay)
        return response

    def close(self) -> None:
        """Close the scraper session when the backend supports it."""
        if hasattr(self.scraper, "close"):
            self.scraper.close()

View file

@ -5,10 +5,10 @@ import re
import sys
from dataclasses import asdict
import cloudscraper
from bs4 import BeautifulSoup, Tag
from .base import BaseScraper, HttpClient
from .clients import CloudScraperClient
from .base import BaseScraper
from .models import (
ContestListResult,
ContestSummary,
@ -24,9 +24,6 @@ class CodeforcesScraper(BaseScraper):
def platform_name(self) -> str:
return "codeforces"
def _create_client(self) -> HttpClient:
return CloudScraperClient(self.config)
def scrape_contest_metadata(self, contest_id: str) -> MetadataResult:
return self._safe_execute(
"metadata", self._scrape_contest_metadata_impl, contest_id
@ -41,7 +38,7 @@ class CodeforcesScraper(BaseScraper):
return self._safe_execute("contests", self._scrape_contest_list_impl)
def _scrape_contest_metadata_impl(self, contest_id: str) -> MetadataResult:
problems = scrape_contest_problems(contest_id, self.client)
problems = scrape_contest_problems(contest_id)
if not problems:
return self._create_metadata_error(
f"No problems found for contest {contest_id}", contest_id
@ -55,9 +52,11 @@ class CodeforcesScraper(BaseScraper):
) -> TestsResult:
problem_id = contest_id + problem_letter.lower()
url = parse_problem_url(contest_id, problem_letter)
tests = scrape_sample_tests(url, self.client)
tests = scrape_sample_tests(url)
response = self.client.get(url)
scraper = cloudscraper.create_scraper()
response = scraper.get(url, timeout=self.config.timeout_seconds)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
timeout_ms, memory_mb = extract_problem_limits(soup)
@ -77,15 +76,17 @@ class CodeforcesScraper(BaseScraper):
)
def _scrape_contest_list_impl(self) -> ContestListResult:
contests = scrape_contests(self.client)
contests = scrape_contests()
if not contests:
return self._create_contests_error("No contests found")
return ContestListResult(success=True, error="", contests=contests)
def scrape(url: str, client: HttpClient) -> list[TestCase]:
def scrape(url: str) -> list[TestCase]:
try:
response = client.get(url)
scraper = cloudscraper.create_scraper()
response = scraper.get(url, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
input_sections = soup.find_all("div", class_="input")
@ -239,12 +240,12 @@ def extract_problem_limits(soup: BeautifulSoup) -> tuple[int, float]:
return timeout_ms, memory_mb
def scrape_contest_problems(
contest_id: str, client: HttpClient
) -> list[ProblemSummary]:
def scrape_contest_problems(contest_id: str) -> list[ProblemSummary]:
try:
contest_url: str = f"https://codeforces.com/contest/{contest_id}"
response = client.get(contest_url)
scraper = cloudscraper.create_scraper()
response = scraper.get(contest_url, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
problems: list[ProblemSummary] = []
@ -280,13 +281,15 @@ def scrape_contest_problems(
return []
def scrape_sample_tests(url: str, client: HttpClient) -> list[TestCase]:
def scrape_sample_tests(url: str) -> list[TestCase]:
print(f"Scraping: {url}", file=sys.stderr)
return scrape(url, client)
return scrape(url)
def scrape_contests(client: HttpClient) -> list[ContestSummary]:
response = client.get("https://codeforces.com/api/contest.list")
def scrape_contests() -> list[ContestSummary]:
scraper = cloudscraper.create_scraper()
response = scraper.get("https://codeforces.com/api/contest.list", timeout=10)
response.raise_for_status()
data = response.json()
if data["status"] != "OK":
@ -364,8 +367,6 @@ def main() -> None:
print(json.dumps(asdict(result)))
sys.exit(1)
scraper.close()
if __name__ == "__main__":
main()

View file

@ -9,6 +9,7 @@ import backoff
import requests
from bs4 import BeautifulSoup, Tag
from .base import BaseScraper
from .models import (
ContestListResult,
ContestSummary,
@ -322,6 +323,111 @@ def scrape(url: str) -> list[TestCase]:
return []
class CSESScraper(BaseScraper):
    """Scraper for CSES (https://cses.fi) problem-set categories."""

    @property
    def platform_name(self) -> str:
        return "cses"

    def scrape_contest_metadata(self, contest_id: str) -> MetadataResult:
        """Return the problems of a CSES category (*contest_id* is a category id)."""
        return self._safe_execute("metadata", self._scrape_metadata_impl, contest_id)

    def scrape_problem_tests(self, contest_id: str, problem_id: str) -> TestsResult:
        """Return sample tests plus limits for *problem_id* (numeric id or URL)."""
        return self._safe_execute(
            "tests", self._scrape_tests_impl, contest_id, problem_id
        )

    def scrape_contest_list(self) -> ContestListResult:
        """Return CSES categories presented as contests."""
        return self._safe_execute("contests", self._scrape_contests_impl)

    def _safe_execute(self, operation: str, func, *args):
        """Run ``func(*args)``, converting any exception into a failed result.

        *operation* selects the result type built on failure: "metadata",
        "tests" or "contests".
        """
        try:
            return func(*args)
        except Exception as e:
            error_msg = f"{self.platform_name}: {str(e)}"
            if operation == "metadata":
                return MetadataResult(success=False, error=error_msg)
            if operation == "tests":
                return TestsResult(
                    success=False,
                    error=error_msg,
                    problem_id="",
                    url="",
                    tests=[],
                    timeout_ms=0,
                    memory_mb=0,
                )
            if operation == "contests":
                return ContestListResult(success=False, error=error_msg)
            # BUG FIX: an unrecognized operation previously fell through and
            # silently returned None; raise so programming errors surface.
            raise ValueError(f"Unknown operation: {operation!r}") from e

    def _scrape_metadata_impl(self, category_id: str) -> MetadataResult:
        """Fetch the category's problem list; fail when none are found."""
        problems = scrape_category_problems(category_id)
        if not problems:
            return MetadataResult(
                success=False,
                error=f"{self.platform_name}: No problems found for category: {category_id}",
            )
        return MetadataResult(
            success=True, error="", contest_id=category_id, problems=problems
        )

    def _scrape_tests_impl(self, category: str, problem_id: str) -> TestsResult:
        """Fetch sample tests and limits for one problem.

        *category* is accepted for interface symmetry but is not needed to
        resolve a CSES problem URL.
        """
        url = parse_problem_url(problem_id)
        if not url:
            return TestsResult(
                success=False,
                error=f"{self.platform_name}: Invalid problem input: {problem_id}. Use either problem ID (e.g., 1068) or full URL",
                problem_id=problem_id if problem_id.isdigit() else "",
                url="",
                tests=[],
                timeout_ms=0,
                memory_mb=0,
            )
        tests = scrape(url)
        actual_problem_id = (
            problem_id if problem_id.isdigit() else problem_id.split("/")[-1]
        )
        # NOTE(review): this fetch duplicates the one done inside scrape();
        # consider sharing a single response.
        headers = {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        timeout_ms, memory_mb = extract_problem_limits(soup)
        if not tests:
            return TestsResult(
                success=False,
                error=f"{self.platform_name}: No tests found for {problem_id}",
                problem_id=actual_problem_id,
                url=url,
                tests=[],
                timeout_ms=timeout_ms,
                memory_mb=memory_mb,
            )
        return TestsResult(
            success=True,
            error="",
            problem_id=actual_problem_id,
            url=url,
            tests=tests,
            timeout_ms=timeout_ms,
            memory_mb=memory_mb,
        )

    def _scrape_contests_impl(self) -> ContestListResult:
        """Fetch the category list; fail when it comes back empty."""
        categories = scrape_categories()
        if not categories:
            return ContestListResult(
                success=False, error=f"{self.platform_name}: No contests found"
            )
        return ContestListResult(success=True, error="", contests=categories)
def main() -> None:
if len(sys.argv) < 2:
result = MetadataResult(
@ -332,6 +438,7 @@ def main() -> None:
sys.exit(1)
mode: str = sys.argv[1]
scraper = CSESScraper()
if mode == "metadata":
if len(sys.argv) != 3:
@ -343,18 +450,10 @@ def main() -> None:
sys.exit(1)
category_id = sys.argv[2]
problems = scrape_category_problems(category_id)
if not problems:
result = MetadataResult(
success=False,
error=f"No problems found for category: {category_id}",
)
print(json.dumps(asdict(result)))
return
result = MetadataResult(success=True, error="", problems=problems)
result = scraper.scrape_contest_metadata(category_id)
print(json.dumps(asdict(result)))
if not result.success:
sys.exit(1)
elif mode == "tests":
if len(sys.argv) != 4:
@ -370,73 +469,12 @@ def main() -> None:
print(json.dumps(asdict(tests_result)))
sys.exit(1)
problem_input: str = sys.argv[3]
url: str | None = parse_problem_url(problem_input)
if not url:
tests_result = TestsResult(
success=False,
error=f"Invalid problem input: {problem_input}. Use either problem ID (e.g., 1068) or full URL",
problem_id=problem_input if problem_input.isdigit() else "",
url="",
tests=[],
timeout_ms=0,
memory_mb=0,
)
print(json.dumps(asdict(tests_result)))
sys.exit(1)
tests: list[TestCase] = scrape(url)
problem_id: str = (
problem_input if problem_input.isdigit() else problem_input.split("/")[-1]
)
try:
headers = {
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
timeout_ms, memory_mb = extract_problem_limits(soup)
except Exception as e:
tests_result = TestsResult(
success=False,
error=f"Failed to extract constraints: {e}",
problem_id=problem_id,
url=url,
tests=[],
timeout_ms=0,
memory_mb=0,
)
print(json.dumps(asdict(tests_result)))
sys.exit(1)
if not tests:
tests_result = TestsResult(
success=False,
error=f"No tests found for {problem_input}",
problem_id=problem_id,
url=url,
tests=[],
timeout_ms=timeout_ms,
memory_mb=memory_mb,
)
print(json.dumps(asdict(tests_result)))
sys.exit(1)
test_cases = tests
tests_result = TestsResult(
success=True,
error="",
problem_id=problem_id,
url=url,
tests=test_cases,
timeout_ms=timeout_ms,
memory_mb=memory_mb,
)
category = sys.argv[2]
problem_id = sys.argv[3]
tests_result = scraper.scrape_problem_tests(category, problem_id)
print(json.dumps(asdict(tests_result)))
if not tests_result.success:
sys.exit(1)
elif mode == "contests":
if len(sys.argv) != 2:
@ -446,14 +484,10 @@ def main() -> None:
print(json.dumps(asdict(contest_result)))
sys.exit(1)
categories = scrape_categories()
if not categories:
contest_result = ContestListResult(success=False, error="No contests found")
print(json.dumps(asdict(contest_result)))
sys.exit(1)
contest_result = ContestListResult(success=True, error="", contests=categories)
contest_result = scraper.scrape_contest_list()
print(json.dumps(asdict(contest_result)))
if not contest_result.success:
sys.exit(1)
else:
result = MetadataResult(

View file

@ -5,14 +5,16 @@ from scrapers.models import ContestSummary, ProblemSummary
def test_scrape_success(mocker, mock_codeforces_html):
mock_client = Mock()
mock_scraper = Mock()
mock_response = Mock()
mock_response.text = mock_codeforces_html
mock_client.get.return_value = mock_response
mock_scraper.get.return_value = mock_response
mocker.patch(
"scrapers.codeforces.cloudscraper.create_scraper", return_value=mock_scraper
)
scraper = CodeforcesScraper()
mocker.patch.object(scraper, "_create_client", return_value=mock_client)
result = scraper.scrape_problem_tests("1900", "A")
assert result.success == True
@ -22,17 +24,19 @@ def test_scrape_success(mocker, mock_codeforces_html):
def test_scrape_contest_problems(mocker):
mock_client = Mock()
mock_scraper = Mock()
mock_response = Mock()
mock_response.text = """
<a href="/contest/1900/problem/A">A. Problem A</a>
<a href="/contest/1900/problem/B">B. Problem B</a>
"""
mock_client.get.return_value = mock_response
mock_scraper.get.return_value = mock_response
mocker.patch(
"scrapers.codeforces.cloudscraper.create_scraper", return_value=mock_scraper
)
scraper = CodeforcesScraper()
mocker.patch.object(scraper, "_create_client", return_value=mock_client)
result = scraper.scrape_contest_metadata("1900")
assert result.success == True
@ -42,12 +46,14 @@ def test_scrape_contest_problems(mocker):
def test_scrape_network_error(mocker):
mock_client = Mock()
mock_client.get.side_effect = Exception("Network error")
mock_scraper = Mock()
mock_scraper.get.side_effect = Exception("Network error")
mocker.patch(
"scrapers.codeforces.cloudscraper.create_scraper", return_value=mock_scraper
)
scraper = CodeforcesScraper()
mocker.patch.object(scraper, "_create_client", return_value=mock_client)
result = scraper.scrape_problem_tests("1900", "A")
assert result.success == False
@ -55,7 +61,7 @@ def test_scrape_network_error(mocker):
def test_scrape_contests_success(mocker):
mock_client = Mock()
mock_scraper = Mock()
mock_response = Mock()
mock_response.json.return_value = {
"status": "OK",
@ -65,11 +71,13 @@ def test_scrape_contests_success(mocker):
{"id": 1949, "name": "Codeforces Global Round 26"},
],
}
mock_client.get.return_value = mock_response
mock_scraper.get.return_value = mock_response
mocker.patch(
"scrapers.codeforces.cloudscraper.create_scraper", return_value=mock_scraper
)
scraper = CodeforcesScraper()
mocker.patch.object(scraper, "_create_client", return_value=mock_client)
result = scraper.scrape_contest_list()
assert result.success == True
@ -92,14 +100,16 @@ def test_scrape_contests_success(mocker):
def test_scrape_contests_api_error(mocker):
mock_client = Mock()
mock_scraper = Mock()
mock_response = Mock()
mock_response.json.return_value = {"status": "FAILED", "result": []}
mock_client.get.return_value = mock_response
mock_scraper.get.return_value = mock_response
mocker.patch(
"scrapers.codeforces.cloudscraper.create_scraper", return_value=mock_scraper
)
scraper = CodeforcesScraper()
mocker.patch.object(scraper, "_create_client", return_value=mock_client)
result = scraper.scrape_contest_list()
assert result.success == False
@ -107,12 +117,14 @@ def test_scrape_contests_api_error(mocker):
def test_scrape_contests_network_error(mocker):
mock_client = Mock()
mock_client.get.side_effect = Exception("Network error")
mock_scraper = Mock()
mock_scraper.get.side_effect = Exception("Network error")
mocker.patch(
"scrapers.codeforces.cloudscraper.create_scraper", return_value=mock_scraper
)
scraper = CodeforcesScraper()
mocker.patch.object(scraper, "_create_client", return_value=mock_client)
result = scraper.scrape_contest_list()
assert result.success == False

View file

@ -0,0 +1,162 @@
from unittest.mock import Mock
import pytest
from scrapers import ALL_SCRAPERS, BaseScraper
from scrapers.models import ContestListResult, MetadataResult, TestsResult
# All registered scraper classes, used to parametrize the compliance tests.
ALL_SCRAPER_CLASSES = list(ALL_SCRAPERS.values())


class TestScraperInterfaceCompliance:
    """Every registered scraper must honor the BaseScraper contract."""

    @pytest.mark.parametrize("scraper_class", ALL_SCRAPER_CLASSES)
    def test_implements_base_interface(self, scraper_class):
        """Each scraper subclasses BaseScraper and exposes the full API surface."""
        scraper = scraper_class()
        assert isinstance(scraper, BaseScraper)
        assert hasattr(scraper, "platform_name")
        assert hasattr(scraper, "scrape_contest_metadata")
        assert hasattr(scraper, "scrape_problem_tests")
        assert hasattr(scraper, "scrape_contest_list")

    @pytest.mark.parametrize("scraper_class", ALL_SCRAPER_CLASSES)
    def test_platform_name_is_string(self, scraper_class):
        """platform_name is a non-empty, lowercase string."""
        scraper = scraper_class()
        platform_name = scraper.platform_name
        assert isinstance(platform_name, str)
        assert len(platform_name) > 0
        assert platform_name.islower()  # Convention: lowercase platform names

    @pytest.mark.parametrize("scraper_class", ALL_SCRAPER_CLASSES)
    def test_metadata_method_signature(self, scraper_class, mocker):
        """scrape_contest_metadata returns a well-formed MetadataResult."""
        scraper = scraper_class()
        # Mock the underlying HTTP calls to avoid network requests
        # NOTE(review): only codeforces is mocked here — other platforms may
        # hit the real network for this call; confirm that is intended.
        if scraper.platform_name == "codeforces":
            mock_scraper = Mock()
            mock_response = Mock()
            mock_response.text = "<a href='/contest/1900/problem/A'>A. Test</a>"
            mock_scraper.get.return_value = mock_response
            mocker.patch(
                "scrapers.codeforces.cloudscraper.create_scraper",
                return_value=mock_scraper,
            )
        result = scraper.scrape_contest_metadata("test_contest")
        assert isinstance(result, MetadataResult)
        assert hasattr(result, "success")
        assert hasattr(result, "error")
        assert hasattr(result, "problems")
        assert hasattr(result, "contest_id")
        assert isinstance(result.success, bool)
        assert isinstance(result.error, str)

    @pytest.mark.parametrize("scraper_class", ALL_SCRAPER_CLASSES)
    def test_problem_tests_method_signature(self, scraper_class, mocker):
        """scrape_problem_tests returns a well-formed TestsResult."""
        scraper = scraper_class()
        # NOTE(review): as above, only codeforces is mocked.
        if scraper.platform_name == "codeforces":
            mock_scraper = Mock()
            mock_response = Mock()
            mock_response.text = """
            <div class="time-limit">Time limit: 1 seconds</div>
            <div class="memory-limit">Memory limit: 256 megabytes</div>
            <div class="input"><pre><div class="test-example-line-1">3</div></pre></div>
            <div class="output"><pre><div class="test-example-line-1">6</div></pre></div>
            """
            mock_scraper.get.return_value = mock_response
            mocker.patch(
                "scrapers.codeforces.cloudscraper.create_scraper",
                return_value=mock_scraper,
            )
        result = scraper.scrape_problem_tests("test_contest", "A")
        assert isinstance(result, TestsResult)
        assert hasattr(result, "success")
        assert hasattr(result, "error")
        assert hasattr(result, "tests")
        assert hasattr(result, "problem_id")
        assert hasattr(result, "url")
        assert hasattr(result, "timeout_ms")
        assert hasattr(result, "memory_mb")
        assert isinstance(result.success, bool)
        assert isinstance(result.error, str)

    @pytest.mark.parametrize("scraper_class", ALL_SCRAPER_CLASSES)
    def test_contest_list_method_signature(self, scraper_class, mocker):
        """scrape_contest_list returns a well-formed ContestListResult."""
        scraper = scraper_class()
        # NOTE(review): as above, only codeforces is mocked.
        if scraper.platform_name == "codeforces":
            mock_scraper = Mock()
            mock_response = Mock()
            mock_response.json.return_value = {
                "status": "OK",
                "result": [{"id": 1900, "name": "Test Contest"}],
            }
            mock_scraper.get.return_value = mock_response
            mocker.patch(
                "scrapers.codeforces.cloudscraper.create_scraper",
                return_value=mock_scraper,
            )
        result = scraper.scrape_contest_list()
        assert isinstance(result, ContestListResult)
        assert hasattr(result, "success")
        assert hasattr(result, "error")
        assert hasattr(result, "contests")
        assert isinstance(result.success, bool)
        assert isinstance(result.error, str)

    @pytest.mark.parametrize("scraper_class", ALL_SCRAPER_CLASSES)
    def test_error_message_format(self, scraper_class, mocker):
        """Failures from every operation carry a '<platform>: ' error prefix."""
        scraper = scraper_class()
        platform_name = scraper.platform_name
        # Force an error by mocking HTTP failure
        if scraper.platform_name == "codeforces":
            mock_scraper = Mock()
            mock_scraper.get.side_effect = Exception("Network error")
            mocker.patch(
                "scrapers.codeforces.cloudscraper.create_scraper",
                return_value=mock_scraper,
            )
        elif scraper.platform_name == "atcoder":
            mocker.patch(
                "scrapers.atcoder.requests.get", side_effect=Exception("Network error")
            )
        elif scraper.platform_name == "cses":
            mocker.patch(
                "scrapers.cses.make_request", side_effect=Exception("Network error")
            )
        # Test metadata error format
        result = scraper.scrape_contest_metadata("test")
        assert result.success == False
        assert result.error.startswith(f"{platform_name}: ")
        # Test problem tests error format
        result = scraper.scrape_problem_tests("test", "A")
        assert result.success == False
        assert result.error.startswith(f"{platform_name}: ")
        # Test contest list error format
        result = scraper.scrape_contest_list()
        assert result.success == False
        assert result.error.startswith(f"{platform_name}: ")

    @pytest.mark.parametrize("scraper_class", ALL_SCRAPER_CLASSES)
    def test_scraper_instantiation(self, scraper_class):
        """Scrapers work both with the default config and a custom one."""
        scraper1 = scraper_class()
        assert isinstance(scraper1, BaseScraper)
        assert scraper1.config is not None
        from scrapers.base import ScraperConfig

        custom_config = ScraperConfig(timeout_seconds=60)
        scraper2 = scraper_class(custom_config)
        assert isinstance(scraper2, BaseScraper)
        assert scraper2.config.timeout_seconds == 60

View file

@ -0,0 +1,58 @@
import pytest
from scrapers import ALL_SCRAPERS, get_scraper, list_platforms
from scrapers.base import BaseScraper
from scrapers.codeforces import CodeforcesScraper
class TestScraperRegistry:
    """Tests for the get_scraper()/list_platforms() registry helpers."""

    def test_get_scraper_valid_platform(self):
        """A known platform name resolves to its concrete scraper class."""
        scraper_class = get_scraper("codeforces")
        assert scraper_class == CodeforcesScraper
        assert issubclass(scraper_class, BaseScraper)
        scraper = scraper_class()
        assert isinstance(scraper, BaseScraper)
        assert scraper.platform_name == "codeforces"

    def test_get_scraper_invalid_platform(self):
        """An unknown platform raises KeyError listing the valid choices."""
        with pytest.raises(KeyError) as exc_info:
            get_scraper("nonexistent")
        error_msg = str(exc_info.value)
        assert "nonexistent" in error_msg
        assert "Available platforms" in error_msg

    def test_list_platforms(self):
        """list_platforms() mirrors the keys of ALL_SCRAPERS."""
        platforms = list_platforms()
        assert isinstance(platforms, list)
        assert len(platforms) > 0
        assert "codeforces" in platforms
        assert set(platforms) == set(ALL_SCRAPERS.keys())

    def test_all_scrapers_registry(self):
        """Every registry entry is a lowercase name mapped to a matching scraper."""
        assert isinstance(ALL_SCRAPERS, dict)
        assert len(ALL_SCRAPERS) > 0
        for platform_name, scraper_class in ALL_SCRAPERS.items():
            assert isinstance(platform_name, str)
            assert platform_name.islower()
            assert issubclass(scraper_class, BaseScraper)
            # The instance must report the same name it is registered under.
            scraper = scraper_class()
            assert scraper.platform_name == platform_name

    def test_registry_import_consistency(self):
        """get_scraper returns the same class object as a direct import."""
        from scrapers.codeforces import CodeforcesScraper as DirectImport

        registry_class = get_scraper("codeforces")
        assert registry_class == DirectImport

    def test_all_scrapers_can_be_instantiated(self):
        """Every registered scraper constructs without arguments."""
        for platform_name, scraper_class in ALL_SCRAPERS.items():
            scraper = scraper_class()
            assert isinstance(scraper, BaseScraper)
            assert scraper.platform_name == platform_name