feat(scrapers): total refactor
This commit is contained in:
parent
eb3f7762de
commit
db391da52c
9 changed files with 559 additions and 307 deletions
|
|
@ -0,0 +1,56 @@
|
||||||
|
from .atcoder import AtCoderScraper
|
||||||
|
from .base import BaseScraper, ScraperConfig
|
||||||
|
from .codeforces import CodeforcesScraper
|
||||||
|
from .cses import CSESScraper
|
||||||
|
from .models import (
|
||||||
|
ContestListResult,
|
||||||
|
ContestSummary,
|
||||||
|
MetadataResult,
|
||||||
|
ProblemSummary,
|
||||||
|
TestCase,
|
||||||
|
TestsResult,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Registry mapping each supported platform identifier to its scraper class.
ALL_SCRAPERS: dict[str, type[BaseScraper]] = dict(
    atcoder=AtCoderScraper,
    codeforces=CodeforcesScraper,
    cses=CSESScraper,
)

# Concrete scraper classes re-exported by this package.
_SCRAPER_CLASSES = ["AtCoderScraper", "CodeforcesScraper", "CSESScraper"]

# Base abstractions and result/model types re-exported by this package.
_BASE_EXPORTS = [
    "BaseScraper",
    "ScraperConfig",
    "ContestListResult",
    "ContestSummary",
    "MetadataResult",
    "ProblemSummary",
    "TestCase",
    "TestsResult",
]

# Registry helpers (and the registry itself) defined in this module.
_REGISTRY_FUNCTIONS = ["get_scraper", "list_platforms", "ALL_SCRAPERS"]

# Public API: base exports first, then scraper classes, then registry helpers.
__all__ = [*_BASE_EXPORTS, *_SCRAPER_CLASSES, *_REGISTRY_FUNCTIONS]
|
||||||
|
|
||||||
|
|
||||||
|
def get_scraper(platform: str) -> type[BaseScraper]:
    """Look up the scraper class registered for ``platform``.

    Args:
        platform: Key into ``ALL_SCRAPERS``.

    Returns:
        The scraper class registered under ``platform``.

    Raises:
        KeyError: If ``platform`` is not a key of ``ALL_SCRAPERS``; the
            message lists the registered platform names.
    """
    scraper_cls = ALL_SCRAPERS.get(platform)
    if scraper_cls is not None:
        return scraper_cls

    available = ", ".join(ALL_SCRAPERS)
    raise KeyError(
        f"Unknown platform '{platform}'. Available platforms: {available}"
    )
|
||||||
|
|
||||||
|
|
||||||
|
def list_platforms() -> list[str]:
    """Return the registered platform identifiers in registry order."""
    return [*ALL_SCRAPERS]
|
||||||
|
|
@ -1,5 +1,6 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import concurrent.futures
|
||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
|
|
@ -9,6 +10,7 @@ import backoff
|
||||||
import requests
|
import requests
|
||||||
from bs4 import BeautifulSoup, Tag
|
from bs4 import BeautifulSoup, Tag
|
||||||
|
|
||||||
|
from .base import BaseScraper
|
||||||
from .models import (
|
from .models import (
|
||||||
ContestListResult,
|
ContestListResult,
|
||||||
ContestSummary,
|
ContestSummary,
|
||||||
|
|
@ -167,8 +169,6 @@ def scrape(url: str) -> list[TestCase]:
|
||||||
|
|
||||||
|
|
||||||
def scrape_contests() -> list[ContestSummary]:
|
def scrape_contests() -> list[ContestSummary]:
|
||||||
import concurrent.futures
|
|
||||||
|
|
||||||
def get_max_pages() -> int:
|
def get_max_pages() -> int:
|
||||||
try:
|
try:
|
||||||
headers = {
|
headers = {
|
||||||
|
|
@ -296,6 +296,101 @@ def scrape_contests() -> list[ContestSummary]:
|
||||||
return all_contests
|
return all_contests
|
||||||
|
|
||||||
|
|
||||||
|
class AtCoderScraper(BaseScraper):
    """AtCoder scraper built on this module's scraping helper functions.

    The public ``scrape_*`` methods route through ``_safe_execute`` so that
    an exception raised while scraping is reported as a failed result object
    whose ``error`` is prefixed with the platform name.
    """

    @property
    def platform_name(self) -> str:
        """Return the platform identifier ``"atcoder"``."""
        return "atcoder"

    def scrape_contest_metadata(self, contest_id: str) -> MetadataResult:
        """Return the problems of ``contest_id`` wrapped in a ``MetadataResult``."""
        return self._safe_execute("metadata", self._scrape_metadata_impl, contest_id)

    def scrape_problem_tests(self, contest_id: str, problem_id: str) -> TestsResult:
        """Return sample tests and limits for one problem as a ``TestsResult``."""
        return self._safe_execute(
            "tests", self._scrape_tests_impl, contest_id, problem_id
        )

    def scrape_contest_list(self) -> ContestListResult:
        """Return the available contests wrapped in a ``ContestListResult``."""
        return self._safe_execute("contests", self._scrape_contests_impl)

    def _safe_execute(self, operation: str, func, *args):
        """Call ``func(*args)`` and convert any exception into a failed result.

        Args:
            operation: ``"metadata"``, ``"tests"`` or ``"contests"``; selects
                which result type is built when ``func`` raises.
            func: Callable that performs the scrape.
            *args: Positional arguments forwarded to ``func``.

        Returns:
            Whatever ``func`` returns, or a failed result of the type that
            matches ``operation``.

        Raises:
            Exception: The original exception, re-raised unchanged, when
                ``operation`` is not one of the known kinds.
        """
        try:
            return func(*args)
        except Exception as e:
            error_msg = f"{self.platform_name}: {str(e)}"

            if operation == "metadata":
                return MetadataResult(success=False, error=error_msg)
            if operation == "tests":
                return TestsResult(
                    success=False,
                    error=error_msg,
                    problem_id="",
                    url="",
                    tests=[],
                    timeout_ms=0,
                    memory_mb=0,
                )
            if operation == "contests":
                return ContestListResult(success=False, error=error_msg)

            # No result type exists for an unknown operation. Re-raise rather
            # than swallow the error and implicitly return None.
            raise

    def _scrape_metadata_impl(self, contest_id: str) -> MetadataResult:
        """Build the metadata result; an empty problem list counts as failure."""
        problems = scrape_contest_problems(contest_id)
        if not problems:
            return MetadataResult(
                success=False,
                error=f"{self.platform_name}: No problems found for contest {contest_id}",
            )
        return MetadataResult(
            success=True, error="", contest_id=contest_id, problems=problems
        )

    def _scrape_tests_impl(self, contest_id: str, problem_id: str) -> TestsResult:
        """Scrape sample tests and the time/memory limits for one problem.

        The URL is built from the upper-cased problem letter, while the
        reported ``problem_id`` is ``"<contest_id>_<lower-cased problem_id>"``.
        HTTP errors from the limits request propagate to the caller.
        """
        problem_letter = problem_id.upper()
        url = parse_problem_url(contest_id, problem_letter)
        tests = scrape(url)

        # The page is requested here as well because the limits are parsed
        # from a BeautifulSoup document, which scrape() does not hand back.
        response = requests.get(
            url,
            headers={
                "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
            },
            timeout=10,
        )
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html.parser")
        timeout_ms, memory_mb = extract_problem_limits(soup)

        full_problem_id = f"{contest_id}_{problem_id.lower()}"

        if not tests:
            # Limits are still reported even though no samples were found.
            return TestsResult(
                success=False,
                error=f"{self.platform_name}: No tests found for {contest_id} {problem_letter}",
                problem_id=full_problem_id,
                url=url,
                tests=[],
                timeout_ms=timeout_ms,
                memory_mb=memory_mb,
            )

        return TestsResult(
            success=True,
            error="",
            problem_id=full_problem_id,
            url=url,
            tests=tests,
            timeout_ms=timeout_ms,
            memory_mb=memory_mb,
        )

    def _scrape_contests_impl(self) -> ContestListResult:
        """Build the contest-list result; an empty list counts as failure."""
        contests = scrape_contests()
        if not contests:
            return ContestListResult(
                success=False, error=f"{self.platform_name}: No contests found"
            )
        return ContestListResult(success=True, error="", contests=contests)
|
||||||
|
|
||||||
|
|
||||||
def main() -> None:
|
def main() -> None:
|
||||||
if len(sys.argv) < 2:
|
if len(sys.argv) < 2:
|
||||||
result = MetadataResult(
|
result = MetadataResult(
|
||||||
|
|
@ -306,6 +401,7 @@ def main() -> None:
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
mode: str = sys.argv[1]
|
mode: str = sys.argv[1]
|
||||||
|
scraper = AtCoderScraper()
|
||||||
|
|
||||||
if mode == "metadata":
|
if mode == "metadata":
|
||||||
if len(sys.argv) != 3:
|
if len(sys.argv) != 3:
|
||||||
|
|
@ -317,23 +413,10 @@ def main() -> None:
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
contest_id: str = sys.argv[2]
|
contest_id: str = sys.argv[2]
|
||||||
problems: list[ProblemSummary] = scrape_contest_problems(contest_id)
|
result = scraper.scrape_contest_metadata(contest_id)
|
||||||
|
|
||||||
if not problems:
|
|
||||||
result = MetadataResult(
|
|
||||||
success=False,
|
|
||||||
error=f"No problems found for contest {contest_id}",
|
|
||||||
)
|
|
||||||
print(json.dumps(asdict(result)))
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
result = MetadataResult(
|
|
||||||
success=True,
|
|
||||||
error="",
|
|
||||||
contest_id=contest_id,
|
|
||||||
problems=problems,
|
|
||||||
)
|
|
||||||
print(json.dumps(asdict(result)))
|
print(json.dumps(asdict(result)))
|
||||||
|
if not result.success:
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
elif mode == "tests":
|
elif mode == "tests":
|
||||||
if len(sys.argv) != 4:
|
if len(sys.argv) != 4:
|
||||||
|
|
@ -351,55 +434,10 @@ def main() -> None:
|
||||||
|
|
||||||
test_contest_id: str = sys.argv[2]
|
test_contest_id: str = sys.argv[2]
|
||||||
problem_letter: str = sys.argv[3]
|
problem_letter: str = sys.argv[3]
|
||||||
problem_id: str = f"{test_contest_id}_{problem_letter.lower()}"
|
tests_result = scraper.scrape_problem_tests(test_contest_id, problem_letter)
|
||||||
|
|
||||||
url: str = parse_problem_url(test_contest_id, problem_letter)
|
|
||||||
tests: list[TestCase] = scrape(url)
|
|
||||||
|
|
||||||
try:
|
|
||||||
headers = {
|
|
||||||
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
|
||||||
}
|
|
||||||
response = requests.get(url, headers=headers, timeout=10)
|
|
||||||
response.raise_for_status()
|
|
||||||
soup = BeautifulSoup(response.text, "html.parser")
|
|
||||||
timeout_ms, memory_mb = extract_problem_limits(soup)
|
|
||||||
except Exception as e:
|
|
||||||
tests_result = TestsResult(
|
|
||||||
success=False,
|
|
||||||
error=f"Failed to extract constraints: {e}",
|
|
||||||
problem_id=problem_id,
|
|
||||||
url=url,
|
|
||||||
tests=[],
|
|
||||||
timeout_ms=0,
|
|
||||||
memory_mb=0,
|
|
||||||
)
|
|
||||||
print(json.dumps(asdict(tests_result)))
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
if not tests:
|
|
||||||
tests_result = TestsResult(
|
|
||||||
success=False,
|
|
||||||
error=f"No tests found for {test_contest_id} {problem_letter}",
|
|
||||||
problem_id=problem_id,
|
|
||||||
url=url,
|
|
||||||
tests=[],
|
|
||||||
timeout_ms=timeout_ms,
|
|
||||||
memory_mb=memory_mb,
|
|
||||||
)
|
|
||||||
print(json.dumps(asdict(tests_result)))
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
tests_result = TestsResult(
|
|
||||||
success=True,
|
|
||||||
error="",
|
|
||||||
problem_id=problem_id,
|
|
||||||
url=url,
|
|
||||||
tests=tests,
|
|
||||||
timeout_ms=timeout_ms,
|
|
||||||
memory_mb=memory_mb,
|
|
||||||
)
|
|
||||||
print(json.dumps(asdict(tests_result)))
|
print(json.dumps(asdict(tests_result)))
|
||||||
|
if not tests_result.success:
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
elif mode == "contests":
|
elif mode == "contests":
|
||||||
if len(sys.argv) != 2:
|
if len(sys.argv) != 2:
|
||||||
|
|
@ -409,14 +447,10 @@ def main() -> None:
|
||||||
print(json.dumps(asdict(contest_result)))
|
print(json.dumps(asdict(contest_result)))
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
contests = scrape_contests()
|
contest_result = scraper.scrape_contest_list()
|
||||||
if not contests:
|
|
||||||
contest_result = ContestListResult(success=False, error="No contests found")
|
|
||||||
print(json.dumps(asdict(contest_result)))
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
contest_result = ContestListResult(success=True, error="", contests=contests)
|
|
||||||
print(json.dumps(asdict(contest_result)))
|
print(json.dumps(asdict(contest_result)))
|
||||||
|
if not contest_result.success:
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
result = MetadataResult(
|
result = MetadataResult(
|
||||||
|
|
|
||||||
|
|
@ -1,8 +1,5 @@
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import Protocol
|
|
||||||
|
|
||||||
import requests
|
|
||||||
|
|
||||||
from .models import ContestListResult, MetadataResult, TestsResult
|
from .models import ContestListResult, MetadataResult, TestsResult
|
||||||
|
|
||||||
|
|
@ -15,23 +12,14 @@ class ScraperConfig:
|
||||||
rate_limit_delay: float = 1.0
|
rate_limit_delay: float = 1.0
|
||||||
|
|
||||||
|
|
||||||
class HttpClient(Protocol):
|
|
||||||
def get(self, url: str, **kwargs) -> requests.Response: ...
|
|
||||||
def close(self) -> None: ...
|
|
||||||
|
|
||||||
|
|
||||||
class BaseScraper(ABC):
|
class BaseScraper(ABC):
|
||||||
def __init__(self, config: ScraperConfig | None = None):
|
def __init__(self, config: ScraperConfig | None = None):
|
||||||
self.config = config or ScraperConfig()
|
self.config = config or ScraperConfig()
|
||||||
self._client: HttpClient | None = None
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def platform_name(self) -> str: ...
|
def platform_name(self) -> str: ...
|
||||||
|
|
||||||
@abstractmethod
|
|
||||||
def _create_client(self) -> HttpClient: ...
|
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def scrape_contest_metadata(self, contest_id: str) -> MetadataResult: ...
|
def scrape_contest_metadata(self, contest_id: str) -> MetadataResult: ...
|
||||||
|
|
||||||
|
|
@ -41,17 +29,6 @@ class BaseScraper(ABC):
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def scrape_contest_list(self) -> ContestListResult: ...
|
def scrape_contest_list(self) -> ContestListResult: ...
|
||||||
|
|
||||||
@property
|
|
||||||
def client(self) -> HttpClient:
|
|
||||||
if self._client is None:
|
|
||||||
self._client = self._create_client()
|
|
||||||
return self._client
|
|
||||||
|
|
||||||
def close(self) -> None:
|
|
||||||
if self._client is not None:
|
|
||||||
self._client.close()
|
|
||||||
self._client = None
|
|
||||||
|
|
||||||
def _create_metadata_error(
|
def _create_metadata_error(
|
||||||
self, error_msg: str, contest_id: str = ""
|
self, error_msg: str, contest_id: str = ""
|
||||||
) -> MetadataResult:
|
) -> MetadataResult:
|
||||||
|
|
|
||||||
|
|
@ -1,82 +0,0 @@
|
||||||
import time
|
|
||||||
|
|
||||||
import backoff
|
|
||||||
import requests
|
|
||||||
|
|
||||||
from .base import HttpClient, ScraperConfig
|
|
||||||
|
|
||||||
|
|
||||||
class RequestsClient:
    """HTTP client backed by a ``requests.Session``.

    ``get`` is retried with exponential backoff (at most 3 tries) on
    ``requests`` exceptions and sleeps for ``config.rate_limit_delay`` seconds
    after each successful request when that delay is positive.
    """

    def __init__(self, config: ScraperConfig, headers: dict[str, str] | None = None):
        """Create the session and install default headers.

        Args:
            config: Supplies ``timeout_seconds`` and ``rate_limit_delay``.
            headers: Optional extra headers; they override the default
                ``User-Agent`` on key collision.
        """
        self.config = config
        self.session = requests.Session()

        default_headers = {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        }
        if headers:
            default_headers.update(headers)

        self.session.headers.update(default_headers)

    @backoff.on_exception(
        backoff.expo,
        (requests.RequestException, requests.HTTPError),
        max_tries=3,
        base=2.0,
        jitter=backoff.random_jitter,
    )
    # NOTE(review): raise_for_status() below raises before a 429 response can
    # be returned, so this predicate looks unreachable and 429 retries appear
    # to come from on_exception above; confirm before removing it.
    @backoff.on_predicate(
        backoff.expo,
        lambda response: response.status_code == 429,
        max_tries=3,
        base=2.0,
        jitter=backoff.random_jitter,
    )
    def get(self, url: str, **kwargs) -> requests.Response:
        """GET ``url`` through the session.

        Args:
            url: Address to fetch.
            **kwargs: Forwarded to ``Session.get``. ``timeout`` defaults to
                ``config.timeout_seconds`` when the caller does not pass one.

        Returns:
            The response, after ``raise_for_status()`` has passed.

        Raises:
            requests.HTTPError: For an error status once retries are exhausted.
            requests.RequestException: For other request failures once
                retries are exhausted.
        """
        # Put the default into kwargs instead of passing timeout= separately:
        # passing both raised TypeError whenever the caller supplied a timeout.
        kwargs.setdefault("timeout", self.config.timeout_seconds)
        response = self.session.get(url, **kwargs)
        response.raise_for_status()

        if (
            hasattr(self.config, "rate_limit_delay")
            and self.config.rate_limit_delay > 0
        ):
            time.sleep(self.config.rate_limit_delay)

        return response

    def close(self) -> None:
        """Close the underlying session."""
        self.session.close()
|
|
||||||
|
|
||||||
|
|
||||||
class CloudScraperClient:
    """HTTP client backed by a ``cloudscraper`` scraper object.

    ``get`` is retried with exponential backoff (at most 3 tries) on
    ``requests`` exceptions and sleeps for ``config.rate_limit_delay`` seconds
    after each successful request when that delay is positive.
    """

    def __init__(self, config: ScraperConfig):
        """Create the scraper; ``cloudscraper`` is imported only here."""
        import cloudscraper

        self.config = config
        self.scraper = cloudscraper.create_scraper()

    @backoff.on_exception(
        backoff.expo,
        (requests.RequestException, requests.HTTPError),
        max_tries=3,
        base=2.0,
        jitter=backoff.random_jitter,
    )
    def get(self, url: str, **kwargs) -> requests.Response:
        """GET ``url`` through the cloudscraper session.

        Args:
            url: Address to fetch.
            **kwargs: Forwarded to the scraper's ``get``. ``timeout`` defaults
                to ``config.timeout_seconds`` when the caller does not pass one.

        Returns:
            The response, after ``raise_for_status()`` has passed.

        Raises:
            requests.HTTPError: For an error status once retries are exhausted.
            requests.RequestException: For other request failures once
                retries are exhausted.
        """
        # Put the default into kwargs instead of passing timeout= separately:
        # passing both raised TypeError whenever the caller supplied a timeout.
        kwargs.setdefault("timeout", self.config.timeout_seconds)
        response = self.scraper.get(url, **kwargs)
        response.raise_for_status()

        if (
            hasattr(self.config, "rate_limit_delay")
            and self.config.rate_limit_delay > 0
        ):
            time.sleep(self.config.rate_limit_delay)

        return response

    def close(self) -> None:
        """Close the scraper if it exposes a ``close`` method."""
        if hasattr(self.scraper, "close"):
            self.scraper.close()
|
|
||||||
|
|
@ -5,10 +5,10 @@ import re
|
||||||
import sys
|
import sys
|
||||||
from dataclasses import asdict
|
from dataclasses import asdict
|
||||||
|
|
||||||
|
import cloudscraper
|
||||||
from bs4 import BeautifulSoup, Tag
|
from bs4 import BeautifulSoup, Tag
|
||||||
|
|
||||||
from .base import BaseScraper, HttpClient
|
from .base import BaseScraper
|
||||||
from .clients import CloudScraperClient
|
|
||||||
from .models import (
|
from .models import (
|
||||||
ContestListResult,
|
ContestListResult,
|
||||||
ContestSummary,
|
ContestSummary,
|
||||||
|
|
@ -24,9 +24,6 @@ class CodeforcesScraper(BaseScraper):
|
||||||
def platform_name(self) -> str:
|
def platform_name(self) -> str:
|
||||||
return "codeforces"
|
return "codeforces"
|
||||||
|
|
||||||
def _create_client(self) -> HttpClient:
|
|
||||||
return CloudScraperClient(self.config)
|
|
||||||
|
|
||||||
def scrape_contest_metadata(self, contest_id: str) -> MetadataResult:
|
def scrape_contest_metadata(self, contest_id: str) -> MetadataResult:
|
||||||
return self._safe_execute(
|
return self._safe_execute(
|
||||||
"metadata", self._scrape_contest_metadata_impl, contest_id
|
"metadata", self._scrape_contest_metadata_impl, contest_id
|
||||||
|
|
@ -41,7 +38,7 @@ class CodeforcesScraper(BaseScraper):
|
||||||
return self._safe_execute("contests", self._scrape_contest_list_impl)
|
return self._safe_execute("contests", self._scrape_contest_list_impl)
|
||||||
|
|
||||||
def _scrape_contest_metadata_impl(self, contest_id: str) -> MetadataResult:
|
def _scrape_contest_metadata_impl(self, contest_id: str) -> MetadataResult:
|
||||||
problems = scrape_contest_problems(contest_id, self.client)
|
problems = scrape_contest_problems(contest_id)
|
||||||
if not problems:
|
if not problems:
|
||||||
return self._create_metadata_error(
|
return self._create_metadata_error(
|
||||||
f"No problems found for contest {contest_id}", contest_id
|
f"No problems found for contest {contest_id}", contest_id
|
||||||
|
|
@ -55,9 +52,11 @@ class CodeforcesScraper(BaseScraper):
|
||||||
) -> TestsResult:
|
) -> TestsResult:
|
||||||
problem_id = contest_id + problem_letter.lower()
|
problem_id = contest_id + problem_letter.lower()
|
||||||
url = parse_problem_url(contest_id, problem_letter)
|
url = parse_problem_url(contest_id, problem_letter)
|
||||||
tests = scrape_sample_tests(url, self.client)
|
tests = scrape_sample_tests(url)
|
||||||
|
|
||||||
response = self.client.get(url)
|
scraper = cloudscraper.create_scraper()
|
||||||
|
response = scraper.get(url, timeout=self.config.timeout_seconds)
|
||||||
|
response.raise_for_status()
|
||||||
soup = BeautifulSoup(response.text, "html.parser")
|
soup = BeautifulSoup(response.text, "html.parser")
|
||||||
timeout_ms, memory_mb = extract_problem_limits(soup)
|
timeout_ms, memory_mb = extract_problem_limits(soup)
|
||||||
|
|
||||||
|
|
@ -77,15 +76,17 @@ class CodeforcesScraper(BaseScraper):
|
||||||
)
|
)
|
||||||
|
|
||||||
def _scrape_contest_list_impl(self) -> ContestListResult:
|
def _scrape_contest_list_impl(self) -> ContestListResult:
|
||||||
contests = scrape_contests(self.client)
|
contests = scrape_contests()
|
||||||
if not contests:
|
if not contests:
|
||||||
return self._create_contests_error("No contests found")
|
return self._create_contests_error("No contests found")
|
||||||
return ContestListResult(success=True, error="", contests=contests)
|
return ContestListResult(success=True, error="", contests=contests)
|
||||||
|
|
||||||
|
|
||||||
def scrape(url: str, client: HttpClient) -> list[TestCase]:
|
def scrape(url: str) -> list[TestCase]:
|
||||||
try:
|
try:
|
||||||
response = client.get(url)
|
scraper = cloudscraper.create_scraper()
|
||||||
|
response = scraper.get(url, timeout=10)
|
||||||
|
response.raise_for_status()
|
||||||
|
|
||||||
soup = BeautifulSoup(response.text, "html.parser")
|
soup = BeautifulSoup(response.text, "html.parser")
|
||||||
input_sections = soup.find_all("div", class_="input")
|
input_sections = soup.find_all("div", class_="input")
|
||||||
|
|
@ -239,12 +240,12 @@ def extract_problem_limits(soup: BeautifulSoup) -> tuple[int, float]:
|
||||||
return timeout_ms, memory_mb
|
return timeout_ms, memory_mb
|
||||||
|
|
||||||
|
|
||||||
def scrape_contest_problems(
|
def scrape_contest_problems(contest_id: str) -> list[ProblemSummary]:
|
||||||
contest_id: str, client: HttpClient
|
|
||||||
) -> list[ProblemSummary]:
|
|
||||||
try:
|
try:
|
||||||
contest_url: str = f"https://codeforces.com/contest/{contest_id}"
|
contest_url: str = f"https://codeforces.com/contest/{contest_id}"
|
||||||
response = client.get(contest_url)
|
scraper = cloudscraper.create_scraper()
|
||||||
|
response = scraper.get(contest_url, timeout=10)
|
||||||
|
response.raise_for_status()
|
||||||
|
|
||||||
soup = BeautifulSoup(response.text, "html.parser")
|
soup = BeautifulSoup(response.text, "html.parser")
|
||||||
problems: list[ProblemSummary] = []
|
problems: list[ProblemSummary] = []
|
||||||
|
|
@ -280,13 +281,15 @@ def scrape_contest_problems(
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
|
||||||
def scrape_sample_tests(url: str, client: HttpClient) -> list[TestCase]:
|
def scrape_sample_tests(url: str) -> list[TestCase]:
|
||||||
print(f"Scraping: {url}", file=sys.stderr)
|
print(f"Scraping: {url}", file=sys.stderr)
|
||||||
return scrape(url, client)
|
return scrape(url)
|
||||||
|
|
||||||
|
|
||||||
def scrape_contests(client: HttpClient) -> list[ContestSummary]:
|
def scrape_contests() -> list[ContestSummary]:
|
||||||
response = client.get("https://codeforces.com/api/contest.list")
|
scraper = cloudscraper.create_scraper()
|
||||||
|
response = scraper.get("https://codeforces.com/api/contest.list", timeout=10)
|
||||||
|
response.raise_for_status()
|
||||||
|
|
||||||
data = response.json()
|
data = response.json()
|
||||||
if data["status"] != "OK":
|
if data["status"] != "OK":
|
||||||
|
|
@ -364,8 +367,6 @@ def main() -> None:
|
||||||
print(json.dumps(asdict(result)))
|
print(json.dumps(asdict(result)))
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
scraper.close()
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
|
|
||||||
202
scrapers/cses.py
202
scrapers/cses.py
|
|
@ -9,6 +9,7 @@ import backoff
|
||||||
import requests
|
import requests
|
||||||
from bs4 import BeautifulSoup, Tag
|
from bs4 import BeautifulSoup, Tag
|
||||||
|
|
||||||
|
from .base import BaseScraper
|
||||||
from .models import (
|
from .models import (
|
||||||
ContestListResult,
|
ContestListResult,
|
||||||
ContestSummary,
|
ContestSummary,
|
||||||
|
|
@ -322,6 +323,111 @@ def scrape(url: str) -> list[TestCase]:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
|
||||||
|
class CSESScraper(BaseScraper):
    """CSES scraper built on this module's scraping helper functions.

    A problem-set category is reported as a "contest". The public
    ``scrape_*`` methods route through ``_safe_execute`` so that an exception
    raised while scraping is reported as a failed result object whose
    ``error`` is prefixed with the platform name.
    """

    @property
    def platform_name(self) -> str:
        """Return the platform identifier ``"cses"``."""
        return "cses"

    def scrape_contest_metadata(self, contest_id: str) -> MetadataResult:
        """Return the problems of category ``contest_id`` as a ``MetadataResult``."""
        return self._safe_execute("metadata", self._scrape_metadata_impl, contest_id)

    def scrape_problem_tests(self, contest_id: str, problem_id: str) -> TestsResult:
        """Return sample tests and limits for one problem as a ``TestsResult``."""
        return self._safe_execute(
            "tests", self._scrape_tests_impl, contest_id, problem_id
        )

    def scrape_contest_list(self) -> ContestListResult:
        """Return the available categories wrapped in a ``ContestListResult``."""
        return self._safe_execute("contests", self._scrape_contests_impl)

    def _safe_execute(self, operation: str, func, *args):
        """Call ``func(*args)`` and convert any exception into a failed result.

        Args:
            operation: ``"metadata"``, ``"tests"`` or ``"contests"``; selects
                which result type is built when ``func`` raises.
            func: Callable that performs the scrape.
            *args: Positional arguments forwarded to ``func``.

        Returns:
            Whatever ``func`` returns, or a failed result of the type that
            matches ``operation``.

        Raises:
            Exception: The original exception, re-raised unchanged, when
                ``operation`` is not one of the known kinds.
        """
        try:
            return func(*args)
        except Exception as e:
            error_msg = f"{self.platform_name}: {str(e)}"

            if operation == "metadata":
                return MetadataResult(success=False, error=error_msg)
            if operation == "tests":
                return TestsResult(
                    success=False,
                    error=error_msg,
                    problem_id="",
                    url="",
                    tests=[],
                    timeout_ms=0,
                    memory_mb=0,
                )
            if operation == "contests":
                return ContestListResult(success=False, error=error_msg)

            # No result type exists for an unknown operation. Re-raise rather
            # than swallow the error and implicitly return None.
            raise

    def _scrape_metadata_impl(self, category_id: str) -> MetadataResult:
        """Build the metadata result; an empty problem list counts as failure."""
        problems = scrape_category_problems(category_id)
        if not problems:
            return MetadataResult(
                success=False,
                error=f"{self.platform_name}: No problems found for category: {category_id}",
            )
        return MetadataResult(
            success=True, error="", contest_id=category_id, problems=problems
        )

    def _scrape_tests_impl(self, category: str, problem_id: str) -> TestsResult:
        """Scrape sample tests and the time/memory limits for one problem.

        Args:
            category: Accepted for interface parity; not used by the lookup.
            problem_id: Numeric problem id or a problem URL, as accepted by
                ``parse_problem_url``.

        HTTP errors from the limits request propagate to the caller.
        """
        url = parse_problem_url(problem_id)
        if not url:
            return TestsResult(
                success=False,
                error=f"{self.platform_name}: Invalid problem input: {problem_id}. Use either problem ID (e.g., 1068) or full URL",
                problem_id=problem_id if problem_id.isdigit() else "",
                url="",
                tests=[],
                timeout_ms=0,
                memory_mb=0,
            )

        tests = scrape(url)

        if problem_id.isdigit():
            actual_problem_id = problem_id
        else:
            # Strip trailing slashes first so a URL such as ".../task/1068/"
            # yields "1068" instead of an empty final path segment.
            actual_problem_id = problem_id.rstrip("/").split("/")[-1]

        # The page is requested here as well because the limits are parsed
        # from a BeautifulSoup document, which scrape() does not hand back.
        headers = {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html.parser")
        timeout_ms, memory_mb = extract_problem_limits(soup)

        if not tests:
            # Limits are still reported even though no samples were found.
            return TestsResult(
                success=False,
                error=f"{self.platform_name}: No tests found for {problem_id}",
                problem_id=actual_problem_id,
                url=url,
                tests=[],
                timeout_ms=timeout_ms,
                memory_mb=memory_mb,
            )

        return TestsResult(
            success=True,
            error="",
            problem_id=actual_problem_id,
            url=url,
            tests=tests,
            timeout_ms=timeout_ms,
            memory_mb=memory_mb,
        )

    def _scrape_contests_impl(self) -> ContestListResult:
        """Build the contest-list result; an empty list counts as failure."""
        categories = scrape_categories()
        if not categories:
            return ContestListResult(
                success=False, error=f"{self.platform_name}: No contests found"
            )
        return ContestListResult(success=True, error="", contests=categories)
|
||||||
|
|
||||||
|
|
||||||
def main() -> None:
|
def main() -> None:
|
||||||
if len(sys.argv) < 2:
|
if len(sys.argv) < 2:
|
||||||
result = MetadataResult(
|
result = MetadataResult(
|
||||||
|
|
@ -332,6 +438,7 @@ def main() -> None:
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
mode: str = sys.argv[1]
|
mode: str = sys.argv[1]
|
||||||
|
scraper = CSESScraper()
|
||||||
|
|
||||||
if mode == "metadata":
|
if mode == "metadata":
|
||||||
if len(sys.argv) != 3:
|
if len(sys.argv) != 3:
|
||||||
|
|
@ -343,18 +450,10 @@ def main() -> None:
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
category_id = sys.argv[2]
|
category_id = sys.argv[2]
|
||||||
problems = scrape_category_problems(category_id)
|
result = scraper.scrape_contest_metadata(category_id)
|
||||||
|
|
||||||
if not problems:
|
|
||||||
result = MetadataResult(
|
|
||||||
success=False,
|
|
||||||
error=f"No problems found for category: {category_id}",
|
|
||||||
)
|
|
||||||
print(json.dumps(asdict(result)))
|
|
||||||
return
|
|
||||||
|
|
||||||
result = MetadataResult(success=True, error="", problems=problems)
|
|
||||||
print(json.dumps(asdict(result)))
|
print(json.dumps(asdict(result)))
|
||||||
|
if not result.success:
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
elif mode == "tests":
|
elif mode == "tests":
|
||||||
if len(sys.argv) != 4:
|
if len(sys.argv) != 4:
|
||||||
|
|
@ -370,73 +469,12 @@ def main() -> None:
|
||||||
print(json.dumps(asdict(tests_result)))
|
print(json.dumps(asdict(tests_result)))
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
problem_input: str = sys.argv[3]
|
category = sys.argv[2]
|
||||||
url: str | None = parse_problem_url(problem_input)
|
problem_id = sys.argv[3]
|
||||||
|
tests_result = scraper.scrape_problem_tests(category, problem_id)
|
||||||
if not url:
|
|
||||||
tests_result = TestsResult(
|
|
||||||
success=False,
|
|
||||||
error=f"Invalid problem input: {problem_input}. Use either problem ID (e.g., 1068) or full URL",
|
|
||||||
problem_id=problem_input if problem_input.isdigit() else "",
|
|
||||||
url="",
|
|
||||||
tests=[],
|
|
||||||
timeout_ms=0,
|
|
||||||
memory_mb=0,
|
|
||||||
)
|
|
||||||
print(json.dumps(asdict(tests_result)))
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
tests: list[TestCase] = scrape(url)
|
|
||||||
|
|
||||||
problem_id: str = (
|
|
||||||
problem_input if problem_input.isdigit() else problem_input.split("/")[-1]
|
|
||||||
)
|
|
||||||
|
|
||||||
try:
|
|
||||||
headers = {
|
|
||||||
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
|
||||||
}
|
|
||||||
response = requests.get(url, headers=headers, timeout=10)
|
|
||||||
response.raise_for_status()
|
|
||||||
soup = BeautifulSoup(response.text, "html.parser")
|
|
||||||
timeout_ms, memory_mb = extract_problem_limits(soup)
|
|
||||||
except Exception as e:
|
|
||||||
tests_result = TestsResult(
|
|
||||||
success=False,
|
|
||||||
error=f"Failed to extract constraints: {e}",
|
|
||||||
problem_id=problem_id,
|
|
||||||
url=url,
|
|
||||||
tests=[],
|
|
||||||
timeout_ms=0,
|
|
||||||
memory_mb=0,
|
|
||||||
)
|
|
||||||
print(json.dumps(asdict(tests_result)))
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
if not tests:
|
|
||||||
tests_result = TestsResult(
|
|
||||||
success=False,
|
|
||||||
error=f"No tests found for {problem_input}",
|
|
||||||
problem_id=problem_id,
|
|
||||||
url=url,
|
|
||||||
tests=[],
|
|
||||||
timeout_ms=timeout_ms,
|
|
||||||
memory_mb=memory_mb,
|
|
||||||
)
|
|
||||||
print(json.dumps(asdict(tests_result)))
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
test_cases = tests
|
|
||||||
tests_result = TestsResult(
|
|
||||||
success=True,
|
|
||||||
error="",
|
|
||||||
problem_id=problem_id,
|
|
||||||
url=url,
|
|
||||||
tests=test_cases,
|
|
||||||
timeout_ms=timeout_ms,
|
|
||||||
memory_mb=memory_mb,
|
|
||||||
)
|
|
||||||
print(json.dumps(asdict(tests_result)))
|
print(json.dumps(asdict(tests_result)))
|
||||||
|
if not tests_result.success:
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
elif mode == "contests":
|
elif mode == "contests":
|
||||||
if len(sys.argv) != 2:
|
if len(sys.argv) != 2:
|
||||||
|
|
@ -446,14 +484,10 @@ def main() -> None:
|
||||||
print(json.dumps(asdict(contest_result)))
|
print(json.dumps(asdict(contest_result)))
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
categories = scrape_categories()
|
contest_result = scraper.scrape_contest_list()
|
||||||
if not categories:
|
|
||||||
contest_result = ContestListResult(success=False, error="No contests found")
|
|
||||||
print(json.dumps(asdict(contest_result)))
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
contest_result = ContestListResult(success=True, error="", contests=categories)
|
|
||||||
print(json.dumps(asdict(contest_result)))
|
print(json.dumps(asdict(contest_result)))
|
||||||
|
if not contest_result.success:
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
result = MetadataResult(
|
result = MetadataResult(
|
||||||
|
|
|
||||||
|
|
@ -5,14 +5,16 @@ from scrapers.models import ContestSummary, ProblemSummary
|
||||||
|
|
||||||
|
|
||||||
def test_scrape_success(mocker, mock_codeforces_html):
|
def test_scrape_success(mocker, mock_codeforces_html):
|
||||||
mock_client = Mock()
|
mock_scraper = Mock()
|
||||||
mock_response = Mock()
|
mock_response = Mock()
|
||||||
mock_response.text = mock_codeforces_html
|
mock_response.text = mock_codeforces_html
|
||||||
mock_client.get.return_value = mock_response
|
mock_scraper.get.return_value = mock_response
|
||||||
|
|
||||||
|
mocker.patch(
|
||||||
|
"scrapers.codeforces.cloudscraper.create_scraper", return_value=mock_scraper
|
||||||
|
)
|
||||||
|
|
||||||
scraper = CodeforcesScraper()
|
scraper = CodeforcesScraper()
|
||||||
mocker.patch.object(scraper, "_create_client", return_value=mock_client)
|
|
||||||
|
|
||||||
result = scraper.scrape_problem_tests("1900", "A")
|
result = scraper.scrape_problem_tests("1900", "A")
|
||||||
|
|
||||||
assert result.success == True
|
assert result.success == True
|
||||||
|
|
@ -22,17 +24,19 @@ def test_scrape_success(mocker, mock_codeforces_html):
|
||||||
|
|
||||||
|
|
||||||
def test_scrape_contest_problems(mocker):
|
def test_scrape_contest_problems(mocker):
|
||||||
mock_client = Mock()
|
mock_scraper = Mock()
|
||||||
mock_response = Mock()
|
mock_response = Mock()
|
||||||
mock_response.text = """
|
mock_response.text = """
|
||||||
<a href="/contest/1900/problem/A">A. Problem A</a>
|
<a href="/contest/1900/problem/A">A. Problem A</a>
|
||||||
<a href="/contest/1900/problem/B">B. Problem B</a>
|
<a href="/contest/1900/problem/B">B. Problem B</a>
|
||||||
"""
|
"""
|
||||||
mock_client.get.return_value = mock_response
|
mock_scraper.get.return_value = mock_response
|
||||||
|
|
||||||
|
mocker.patch(
|
||||||
|
"scrapers.codeforces.cloudscraper.create_scraper", return_value=mock_scraper
|
||||||
|
)
|
||||||
|
|
||||||
scraper = CodeforcesScraper()
|
scraper = CodeforcesScraper()
|
||||||
mocker.patch.object(scraper, "_create_client", return_value=mock_client)
|
|
||||||
|
|
||||||
result = scraper.scrape_contest_metadata("1900")
|
result = scraper.scrape_contest_metadata("1900")
|
||||||
|
|
||||||
assert result.success == True
|
assert result.success == True
|
||||||
|
|
@ -42,12 +46,14 @@ def test_scrape_contest_problems(mocker):
|
||||||
|
|
||||||
|
|
||||||
def test_scrape_network_error(mocker):
|
def test_scrape_network_error(mocker):
|
||||||
mock_client = Mock()
|
mock_scraper = Mock()
|
||||||
mock_client.get.side_effect = Exception("Network error")
|
mock_scraper.get.side_effect = Exception("Network error")
|
||||||
|
|
||||||
|
mocker.patch(
|
||||||
|
"scrapers.codeforces.cloudscraper.create_scraper", return_value=mock_scraper
|
||||||
|
)
|
||||||
|
|
||||||
scraper = CodeforcesScraper()
|
scraper = CodeforcesScraper()
|
||||||
mocker.patch.object(scraper, "_create_client", return_value=mock_client)
|
|
||||||
|
|
||||||
result = scraper.scrape_problem_tests("1900", "A")
|
result = scraper.scrape_problem_tests("1900", "A")
|
||||||
|
|
||||||
assert result.success == False
|
assert result.success == False
|
||||||
|
|
@ -55,7 +61,7 @@ def test_scrape_network_error(mocker):
|
||||||
|
|
||||||
|
|
||||||
def test_scrape_contests_success(mocker):
|
def test_scrape_contests_success(mocker):
|
||||||
mock_client = Mock()
|
mock_scraper = Mock()
|
||||||
mock_response = Mock()
|
mock_response = Mock()
|
||||||
mock_response.json.return_value = {
|
mock_response.json.return_value = {
|
||||||
"status": "OK",
|
"status": "OK",
|
||||||
|
|
@ -65,11 +71,13 @@ def test_scrape_contests_success(mocker):
|
||||||
{"id": 1949, "name": "Codeforces Global Round 26"},
|
{"id": 1949, "name": "Codeforces Global Round 26"},
|
||||||
],
|
],
|
||||||
}
|
}
|
||||||
mock_client.get.return_value = mock_response
|
mock_scraper.get.return_value = mock_response
|
||||||
|
|
||||||
|
mocker.patch(
|
||||||
|
"scrapers.codeforces.cloudscraper.create_scraper", return_value=mock_scraper
|
||||||
|
)
|
||||||
|
|
||||||
scraper = CodeforcesScraper()
|
scraper = CodeforcesScraper()
|
||||||
mocker.patch.object(scraper, "_create_client", return_value=mock_client)
|
|
||||||
|
|
||||||
result = scraper.scrape_contest_list()
|
result = scraper.scrape_contest_list()
|
||||||
|
|
||||||
assert result.success == True
|
assert result.success == True
|
||||||
|
|
@ -92,14 +100,16 @@ def test_scrape_contests_success(mocker):
|
||||||
|
|
||||||
|
|
||||||
def test_scrape_contests_api_error(mocker):
|
def test_scrape_contests_api_error(mocker):
|
||||||
mock_client = Mock()
|
mock_scraper = Mock()
|
||||||
mock_response = Mock()
|
mock_response = Mock()
|
||||||
mock_response.json.return_value = {"status": "FAILED", "result": []}
|
mock_response.json.return_value = {"status": "FAILED", "result": []}
|
||||||
mock_client.get.return_value = mock_response
|
mock_scraper.get.return_value = mock_response
|
||||||
|
|
||||||
|
mocker.patch(
|
||||||
|
"scrapers.codeforces.cloudscraper.create_scraper", return_value=mock_scraper
|
||||||
|
)
|
||||||
|
|
||||||
scraper = CodeforcesScraper()
|
scraper = CodeforcesScraper()
|
||||||
mocker.patch.object(scraper, "_create_client", return_value=mock_client)
|
|
||||||
|
|
||||||
result = scraper.scrape_contest_list()
|
result = scraper.scrape_contest_list()
|
||||||
|
|
||||||
assert result.success == False
|
assert result.success == False
|
||||||
|
|
@ -107,12 +117,14 @@ def test_scrape_contests_api_error(mocker):
|
||||||
|
|
||||||
|
|
||||||
def test_scrape_contests_network_error(mocker):
|
def test_scrape_contests_network_error(mocker):
|
||||||
mock_client = Mock()
|
mock_scraper = Mock()
|
||||||
mock_client.get.side_effect = Exception("Network error")
|
mock_scraper.get.side_effect = Exception("Network error")
|
||||||
|
|
||||||
|
mocker.patch(
|
||||||
|
"scrapers.codeforces.cloudscraper.create_scraper", return_value=mock_scraper
|
||||||
|
)
|
||||||
|
|
||||||
scraper = CodeforcesScraper()
|
scraper = CodeforcesScraper()
|
||||||
mocker.patch.object(scraper, "_create_client", return_value=mock_client)
|
|
||||||
|
|
||||||
result = scraper.scrape_contest_list()
|
result = scraper.scrape_contest_list()
|
||||||
|
|
||||||
assert result.success == False
|
assert result.success == False
|
||||||
|
|
|
||||||
162
tests/scrapers/test_interface_compliance.py
Normal file
162
tests/scrapers/test_interface_compliance.py
Normal file
|
|
@ -0,0 +1,162 @@
|
||||||
|
from unittest.mock import Mock
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from scrapers import ALL_SCRAPERS, BaseScraper
|
||||||
|
from scrapers.models import ContestListResult, MetadataResult, TestsResult
|
||||||
|
|
||||||
|
# Scraper classes taken from the registry, in registry order; used as the
# parametrize source for the compliance tests.
ALL_SCRAPER_CLASSES = [*ALL_SCRAPERS.values()]
|
||||||
|
|
||||||
|
|
||||||
|
class TestScraperInterfaceCompliance:
    """Contract tests that every scraper class in ``ALL_SCRAPER_CLASSES`` must pass.

    Each test is parametrized over the registered scraper classes and checks
    the shape of the objects returned by the three scraping entry points.
    No test in this class is meant to reach the network: the HTTP entry point
    of every platform is patched before a scraping method is called.
    """

    # Patch targets that make the non-Codeforces scrapers fail fast instead of
    # performing a real HTTP request.
    _FAILING_HTTP_TARGETS = {
        "atcoder": "scrapers.atcoder.requests.get",
        "cses": "scrapers.cses.make_request",
    }

    @staticmethod
    def _stub_codeforces(mocker, *, text=None, json_data=None, error=None):
        """Patch ``cloudscraper.create_scraper`` as seen by ``scrapers.codeforces``.

        Args:
            mocker: The pytest-mock fixture.
            text: Value exposed as ``response.text`` by the fake client.
            json_data: Value returned by ``response.json()`` on the fake client.
            error: When given, ``client.get`` raises it instead of returning
                a response.
        """
        client = Mock()
        if error is not None:
            client.get.side_effect = error
        else:
            response = Mock()
            if text is not None:
                response.text = text
            if json_data is not None:
                response.json.return_value = json_data
            client.get.return_value = response
        mocker.patch(
            "scrapers.codeforces.cloudscraper.create_scraper",
            return_value=client,
        )

    @classmethod
    def _fail_http(cls, mocker, platform_name):
        """Make the HTTP entry point of ``platform_name`` raise, if one is known."""
        target = cls._FAILING_HTTP_TARGETS.get(platform_name)
        if target is not None:
            mocker.patch(target, side_effect=Exception("Network error"))

    @pytest.mark.parametrize("scraper_class", ALL_SCRAPER_CLASSES)
    def test_implements_base_interface(self, scraper_class):
        """Every scraper is a ``BaseScraper`` exposing the required members."""
        scraper = scraper_class()

        assert isinstance(scraper, BaseScraper)
        assert hasattr(scraper, "platform_name")
        assert hasattr(scraper, "scrape_contest_metadata")
        assert hasattr(scraper, "scrape_problem_tests")
        assert hasattr(scraper, "scrape_contest_list")

    @pytest.mark.parametrize("scraper_class", ALL_SCRAPER_CLASSES)
    def test_platform_name_is_string(self, scraper_class):
        """``platform_name`` is a non-empty, lowercase string."""
        scraper = scraper_class()
        platform_name = scraper.platform_name

        assert isinstance(platform_name, str)
        assert len(platform_name) > 0
        assert platform_name.islower()  # Convention: lowercase platform names

    @pytest.mark.parametrize("scraper_class", ALL_SCRAPER_CLASSES)
    def test_metadata_method_signature(self, scraper_class, mocker):
        """``scrape_contest_metadata`` returns a well-formed ``MetadataResult``."""
        scraper = scraper_class()
        platform_name = scraper.platform_name

        # Mock the underlying HTTP calls to avoid network requests. Only the
        # result shape is asserted, so a failing stub is enough for the
        # platforms that have no canned response.
        if platform_name == "codeforces":
            self._stub_codeforces(
                mocker, text="<a href='/contest/1900/problem/A'>A. Test</a>"
            )
        else:
            self._fail_http(mocker, platform_name)

        result = scraper.scrape_contest_metadata("test_contest")

        assert isinstance(result, MetadataResult)
        assert hasattr(result, "success")
        assert hasattr(result, "error")
        assert hasattr(result, "problems")
        assert hasattr(result, "contest_id")
        assert isinstance(result.success, bool)
        assert isinstance(result.error, str)

    @pytest.mark.parametrize("scraper_class", ALL_SCRAPER_CLASSES)
    def test_problem_tests_method_signature(self, scraper_class, mocker):
        """``scrape_problem_tests`` returns a well-formed ``TestsResult``."""
        scraper = scraper_class()
        platform_name = scraper.platform_name

        # Same network isolation as in the metadata test.
        if platform_name == "codeforces":
            self._stub_codeforces(
                mocker,
                text="""
            <div class="time-limit">Time limit: 1 seconds</div>
            <div class="memory-limit">Memory limit: 256 megabytes</div>
            <div class="input"><pre><div class="test-example-line-1">3</div></pre></div>
            <div class="output"><pre><div class="test-example-line-1">6</div></pre></div>
            """,
            )
        else:
            self._fail_http(mocker, platform_name)

        result = scraper.scrape_problem_tests("test_contest", "A")

        assert isinstance(result, TestsResult)
        assert hasattr(result, "success")
        assert hasattr(result, "error")
        assert hasattr(result, "tests")
        assert hasattr(result, "problem_id")
        assert hasattr(result, "url")
        assert hasattr(result, "timeout_ms")
        assert hasattr(result, "memory_mb")
        assert isinstance(result.success, bool)
        assert isinstance(result.error, str)

    @pytest.mark.parametrize("scraper_class", ALL_SCRAPER_CLASSES)
    def test_contest_list_method_signature(self, scraper_class, mocker):
        """``scrape_contest_list`` returns a well-formed ``ContestListResult``."""
        scraper = scraper_class()
        platform_name = scraper.platform_name

        # Same network isolation as in the metadata test.
        if platform_name == "codeforces":
            self._stub_codeforces(
                mocker,
                json_data={
                    "status": "OK",
                    "result": [{"id": 1900, "name": "Test Contest"}],
                },
            )
        else:
            self._fail_http(mocker, platform_name)

        result = scraper.scrape_contest_list()

        assert isinstance(result, ContestListResult)
        assert hasattr(result, "success")
        assert hasattr(result, "error")
        assert hasattr(result, "contests")
        assert isinstance(result.success, bool)
        assert isinstance(result.error, str)

    @pytest.mark.parametrize("scraper_class", ALL_SCRAPER_CLASSES)
    def test_error_message_format(self, scraper_class, mocker):
        """On failure, every entry point reports ``"<platform_name>: ..."``."""
        scraper = scraper_class()
        platform_name = scraper.platform_name

        # Force an error by mocking HTTP failure
        if platform_name == "codeforces":
            self._stub_codeforces(mocker, error=Exception("Network error"))
        else:
            self._fail_http(mocker, platform_name)

        # Test metadata error format
        result = scraper.scrape_contest_metadata("test")
        assert result.success is False
        assert result.error.startswith(f"{platform_name}: ")

        # Test problem tests error format
        result = scraper.scrape_problem_tests("test", "A")
        assert result.success is False
        assert result.error.startswith(f"{platform_name}: ")

        # Test contest list error format
        result = scraper.scrape_contest_list()
        assert result.success is False
        assert result.error.startswith(f"{platform_name}: ")

    @pytest.mark.parametrize("scraper_class", ALL_SCRAPER_CLASSES)
    def test_scraper_instantiation(self, scraper_class):
        """Scrapers build with a default config and accept a custom one."""
        scraper1 = scraper_class()
        assert isinstance(scraper1, BaseScraper)
        assert scraper1.config is not None

        # Imported here because the module-level imports of this file do not
        # bring ScraperConfig into scope.
        from scrapers.base import ScraperConfig

        custom_config = ScraperConfig(timeout_seconds=60)
        scraper2 = scraper_class(custom_config)
        assert isinstance(scraper2, BaseScraper)
        assert scraper2.config.timeout_seconds == 60
|
||||||
58
tests/scrapers/test_registry.py
Normal file
58
tests/scrapers/test_registry.py
Normal file
|
|
@ -0,0 +1,58 @@
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from scrapers import ALL_SCRAPERS, get_scraper, list_platforms
|
||||||
|
from scrapers.base import BaseScraper
|
||||||
|
from scrapers.codeforces import CodeforcesScraper
|
||||||
|
|
||||||
|
|
||||||
|
class TestScraperRegistry:
    """Checks for the ``ALL_SCRAPERS`` registry and its lookup helpers."""

    def test_get_scraper_valid_platform(self):
        """A known platform key resolves to its scraper class."""
        resolved = get_scraper("codeforces")
        assert resolved == CodeforcesScraper
        assert issubclass(resolved, BaseScraper)

        instance = resolved()
        assert isinstance(instance, BaseScraper)
        assert instance.platform_name == "codeforces"

    def test_get_scraper_invalid_platform(self):
        """An unknown key raises ``KeyError`` naming the key and the alternatives."""
        with pytest.raises(KeyError) as excinfo:
            get_scraper("nonexistent")

        message = str(excinfo.value)
        assert "nonexistent" in message
        assert "Available platforms" in message

    def test_list_platforms(self):
        """``list_platforms`` returns exactly the registry keys, as a list."""
        names = list_platforms()

        assert isinstance(names, list)
        assert len(names) > 0
        assert "codeforces" in names

        assert set(names) == set(ALL_SCRAPERS.keys())

    def test_all_scrapers_registry(self):
        """Each registry entry maps a lowercase name to a matching scraper class."""
        assert isinstance(ALL_SCRAPERS, dict)
        assert len(ALL_SCRAPERS) > 0

        for key, cls in ALL_SCRAPERS.items():
            assert isinstance(key, str)
            assert key.islower()

            assert issubclass(cls, BaseScraper)

            instance = cls()
            assert instance.platform_name == key

    def test_registry_import_consistency(self):
        """The registry hands back the same class a direct import provides."""
        from scrapers.codeforces import CodeforcesScraper as DirectImport

        from_registry = get_scraper("codeforces")
        assert from_registry == DirectImport

    def test_all_scrapers_can_be_instantiated(self):
        """Every registered class can be built with no arguments."""
        for key, cls in ALL_SCRAPERS.items():
            instance = cls()
            assert isinstance(instance, BaseScraper)
            assert instance.platform_name == key
|
||||||
Loading…
Add table
Add a link
Reference in a new issue