feat(scrapers): total refactor
This commit is contained in:
parent
eb3f7762de
commit
db391da52c
9 changed files with 559 additions and 307 deletions
|
|
@ -0,0 +1,56 @@
|
|||
from .atcoder import AtCoderScraper
|
||||
from .base import BaseScraper, ScraperConfig
|
||||
from .codeforces import CodeforcesScraper
|
||||
from .cses import CSESScraper
|
||||
from .models import (
|
||||
ContestListResult,
|
||||
ContestSummary,
|
||||
MetadataResult,
|
||||
ProblemSummary,
|
||||
TestCase,
|
||||
TestsResult,
|
||||
)
|
||||
|
||||
# Registry of supported platforms: platform identifier -> scraper class.
ALL_SCRAPERS: dict[str, type[BaseScraper]] = dict(
    atcoder=AtCoderScraper,
    codeforces=CodeforcesScraper,
    cses=CSESScraper,
)

# Names of the concrete scraper classes re-exported by this package.
_SCRAPER_CLASSES = ["AtCoderScraper", "CodeforcesScraper", "CSESScraper"]

# Names of the base class, its config, and the result/model types.
_BASE_EXPORTS = [
    "BaseScraper",
    "ScraperConfig",
    "ContestListResult",
    "ContestSummary",
    "MetadataResult",
    "ProblemSummary",
    "TestCase",
    "TestsResult",
]

# Names of the registry itself and the helpers that query it.
_REGISTRY_FUNCTIONS = ["get_scraper", "list_platforms", "ALL_SCRAPERS"]

# Public API: base/model names first, then scraper classes, then registry names.
__all__ = [*_BASE_EXPORTS, *_SCRAPER_CLASSES, *_REGISTRY_FUNCTIONS]
|
||||
|
||||
|
||||
def get_scraper(platform: str) -> type[BaseScraper]:
    """Return the scraper class registered under *platform*.

    Args:
        platform: Key to look up in ``ALL_SCRAPERS``.

    Returns:
        The scraper class (not an instance) stored for that key.

    Raises:
        KeyError: If *platform* is not registered; the message lists every
            registered platform name.
    """
    scraper_cls = ALL_SCRAPERS.get(platform)
    if scraper_cls is None:
        known = ", ".join(ALL_SCRAPERS)
        raise KeyError(f"Unknown platform '{platform}'. Available platforms: {known}")
    return scraper_cls
|
||||
|
||||
|
||||
def list_platforms() -> list[str]:
    """Return the registered platform names, in ``ALL_SCRAPERS`` order, as a new list."""
    return [*ALL_SCRAPERS]
|
||||
|
|
@ -1,5 +1,6 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
import concurrent.futures
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
|
|
@ -9,6 +10,7 @@ import backoff
|
|||
import requests
|
||||
from bs4 import BeautifulSoup, Tag
|
||||
|
||||
from .base import BaseScraper
|
||||
from .models import (
|
||||
ContestListResult,
|
||||
ContestSummary,
|
||||
|
|
@ -167,8 +169,6 @@ def scrape(url: str) -> list[TestCase]:
|
|||
|
||||
|
||||
def scrape_contests() -> list[ContestSummary]:
|
||||
import concurrent.futures
|
||||
|
||||
def get_max_pages() -> int:
|
||||
try:
|
||||
headers = {
|
||||
|
|
@ -296,6 +296,101 @@ def scrape_contests() -> list[ContestSummary]:
|
|||
return all_contests
|
||||
|
||||
|
||||
class AtCoderScraper(BaseScraper):
    """AtCoder scraper exposed through the ``BaseScraper`` interface.

    Each public ``scrape_*`` method runs its ``_scrape_*_impl`` counterpart
    through ``_safe_execute``, which turns any raised exception into a failed
    result object of the matching type.
    """

    @property
    def platform_name(self) -> str:
        """Platform identifier; also the prefix of every error message built here."""
        return "atcoder"

    def scrape_contest_metadata(self, contest_id: str) -> MetadataResult:
        """Return the problems of ``contest_id`` wrapped in a ``MetadataResult``."""
        return self._safe_execute("metadata", self._scrape_metadata_impl, contest_id)

    def scrape_problem_tests(self, contest_id: str, problem_id: str) -> TestsResult:
        """Return the tests and limits of one problem wrapped in a ``TestsResult``."""
        return self._safe_execute(
            "tests", self._scrape_tests_impl, contest_id, problem_id
        )

    def scrape_contest_list(self) -> ContestListResult:
        """Return the scraped contests wrapped in a ``ContestListResult``."""
        return self._safe_execute("contests", self._scrape_contests_impl)

    def _safe_execute(self, operation: str, func, *args):
        """Call ``func(*args)`` and map any exception to a failed result.

        ``operation`` ("metadata", "tests" or "contests") selects which result
        type is built on failure. For any other value a failure yields ``None``.
        """
        try:
            outcome = func(*args)
        except Exception as exc:
            message = f"{self.platform_name}: {str(exc)}"

            if operation == "metadata":
                return MetadataResult(success=False, error=message)
            if operation == "tests":
                return TestsResult(
                    success=False,
                    error=message,
                    problem_id="",
                    url="",
                    tests=[],
                    timeout_ms=0,
                    memory_mb=0,
                )
            if operation == "contests":
                return ContestListResult(success=False, error=message)
            return None
        return outcome

    def _scrape_metadata_impl(self, contest_id: str) -> MetadataResult:
        """Scrape the contest's problems; an empty result is reported as a failure."""
        found = scrape_contest_problems(contest_id)
        if found:
            return MetadataResult(
                success=True, error="", contest_id=contest_id, problems=found
            )
        return MetadataResult(
            success=False,
            error=f"{self.platform_name}: No problems found for contest {contest_id}",
        )

    def _scrape_tests_impl(self, contest_id: str, problem_id: str) -> TestsResult:
        """Scrape the tests and the time/memory limits of a single problem.

        HTTP errors from the limits request propagate to the caller
        (``_safe_execute`` in normal use).
        """
        letter = problem_id.upper()
        url = parse_problem_url(contest_id, letter)

        # Tests are scraped first, then the problem page is requested for its limits.
        samples = scrape(url)

        user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        page = requests.get(url, headers={"User-Agent": user_agent}, timeout=10)
        page.raise_for_status()
        timeout_ms, memory_mb = extract_problem_limits(
            BeautifulSoup(page.text, "html.parser")
        )

        has_samples = bool(samples)
        if has_samples:
            error = ""
        else:
            error = f"{self.platform_name}: No tests found for {contest_id} {letter}"

        return TestsResult(
            success=has_samples,
            error=error,
            problem_id=f"{contest_id}_{problem_id.lower()}",
            url=url,
            tests=samples if has_samples else [],
            timeout_ms=timeout_ms,
            memory_mb=memory_mb,
        )

    def _scrape_contests_impl(self) -> ContestListResult:
        """Scrape the contest list; an empty list is reported as a failure."""
        found = scrape_contests()
        if found:
            return ContestListResult(success=True, error="", contests=found)
        return ContestListResult(
            success=False, error=f"{self.platform_name}: No contests found"
        )
|
||||
|
||||
|
||||
def main() -> None:
|
||||
if len(sys.argv) < 2:
|
||||
result = MetadataResult(
|
||||
|
|
@ -306,6 +401,7 @@ def main() -> None:
|
|||
sys.exit(1)
|
||||
|
||||
mode: str = sys.argv[1]
|
||||
scraper = AtCoderScraper()
|
||||
|
||||
if mode == "metadata":
|
||||
if len(sys.argv) != 3:
|
||||
|
|
@ -317,23 +413,10 @@ def main() -> None:
|
|||
sys.exit(1)
|
||||
|
||||
contest_id: str = sys.argv[2]
|
||||
problems: list[ProblemSummary] = scrape_contest_problems(contest_id)
|
||||
|
||||
if not problems:
|
||||
result = MetadataResult(
|
||||
success=False,
|
||||
error=f"No problems found for contest {contest_id}",
|
||||
)
|
||||
print(json.dumps(asdict(result)))
|
||||
sys.exit(1)
|
||||
|
||||
result = MetadataResult(
|
||||
success=True,
|
||||
error="",
|
||||
contest_id=contest_id,
|
||||
problems=problems,
|
||||
)
|
||||
result = scraper.scrape_contest_metadata(contest_id)
|
||||
print(json.dumps(asdict(result)))
|
||||
if not result.success:
|
||||
sys.exit(1)
|
||||
|
||||
elif mode == "tests":
|
||||
if len(sys.argv) != 4:
|
||||
|
|
@ -351,55 +434,10 @@ def main() -> None:
|
|||
|
||||
test_contest_id: str = sys.argv[2]
|
||||
problem_letter: str = sys.argv[3]
|
||||
problem_id: str = f"{test_contest_id}_{problem_letter.lower()}"
|
||||
|
||||
url: str = parse_problem_url(test_contest_id, problem_letter)
|
||||
tests: list[TestCase] = scrape(url)
|
||||
|
||||
try:
|
||||
headers = {
|
||||
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
||||
}
|
||||
response = requests.get(url, headers=headers, timeout=10)
|
||||
response.raise_for_status()
|
||||
soup = BeautifulSoup(response.text, "html.parser")
|
||||
timeout_ms, memory_mb = extract_problem_limits(soup)
|
||||
except Exception as e:
|
||||
tests_result = TestsResult(
|
||||
success=False,
|
||||
error=f"Failed to extract constraints: {e}",
|
||||
problem_id=problem_id,
|
||||
url=url,
|
||||
tests=[],
|
||||
timeout_ms=0,
|
||||
memory_mb=0,
|
||||
)
|
||||
print(json.dumps(asdict(tests_result)))
|
||||
sys.exit(1)
|
||||
|
||||
if not tests:
|
||||
tests_result = TestsResult(
|
||||
success=False,
|
||||
error=f"No tests found for {test_contest_id} {problem_letter}",
|
||||
problem_id=problem_id,
|
||||
url=url,
|
||||
tests=[],
|
||||
timeout_ms=timeout_ms,
|
||||
memory_mb=memory_mb,
|
||||
)
|
||||
print(json.dumps(asdict(tests_result)))
|
||||
sys.exit(1)
|
||||
|
||||
tests_result = TestsResult(
|
||||
success=True,
|
||||
error="",
|
||||
problem_id=problem_id,
|
||||
url=url,
|
||||
tests=tests,
|
||||
timeout_ms=timeout_ms,
|
||||
memory_mb=memory_mb,
|
||||
)
|
||||
tests_result = scraper.scrape_problem_tests(test_contest_id, problem_letter)
|
||||
print(json.dumps(asdict(tests_result)))
|
||||
if not tests_result.success:
|
||||
sys.exit(1)
|
||||
|
||||
elif mode == "contests":
|
||||
if len(sys.argv) != 2:
|
||||
|
|
@ -409,14 +447,10 @@ def main() -> None:
|
|||
print(json.dumps(asdict(contest_result)))
|
||||
sys.exit(1)
|
||||
|
||||
contests = scrape_contests()
|
||||
if not contests:
|
||||
contest_result = ContestListResult(success=False, error="No contests found")
|
||||
print(json.dumps(asdict(contest_result)))
|
||||
sys.exit(1)
|
||||
|
||||
contest_result = ContestListResult(success=True, error="", contests=contests)
|
||||
contest_result = scraper.scrape_contest_list()
|
||||
print(json.dumps(asdict(contest_result)))
|
||||
if not contest_result.success:
|
||||
sys.exit(1)
|
||||
|
||||
else:
|
||||
result = MetadataResult(
|
||||
|
|
|
|||
|
|
@ -1,8 +1,5 @@
|
|||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass
|
||||
from typing import Protocol
|
||||
|
||||
import requests
|
||||
|
||||
from .models import ContestListResult, MetadataResult, TestsResult
|
||||
|
||||
|
|
@ -15,23 +12,14 @@ class ScraperConfig:
|
|||
rate_limit_delay: float = 1.0
|
||||
|
||||
|
||||
class HttpClient(Protocol):
|
||||
def get(self, url: str, **kwargs) -> requests.Response: ...
|
||||
def close(self) -> None: ...
|
||||
|
||||
|
||||
class BaseScraper(ABC):
|
||||
def __init__(self, config: ScraperConfig | None = None):
|
||||
self.config = config or ScraperConfig()
|
||||
self._client: HttpClient | None = None
|
||||
|
||||
@property
|
||||
@abstractmethod
|
||||
def platform_name(self) -> str: ...
|
||||
|
||||
@abstractmethod
|
||||
def _create_client(self) -> HttpClient: ...
|
||||
|
||||
@abstractmethod
|
||||
def scrape_contest_metadata(self, contest_id: str) -> MetadataResult: ...
|
||||
|
||||
|
|
@ -41,17 +29,6 @@ class BaseScraper(ABC):
|
|||
@abstractmethod
|
||||
def scrape_contest_list(self) -> ContestListResult: ...
|
||||
|
||||
@property
|
||||
def client(self) -> HttpClient:
|
||||
if self._client is None:
|
||||
self._client = self._create_client()
|
||||
return self._client
|
||||
|
||||
def close(self) -> None:
|
||||
if self._client is not None:
|
||||
self._client.close()
|
||||
self._client = None
|
||||
|
||||
def _create_metadata_error(
|
||||
self, error_msg: str, contest_id: str = ""
|
||||
) -> MetadataResult:
|
||||
|
|
|
|||
|
|
@ -1,82 +0,0 @@
|
|||
import time
|
||||
|
||||
import backoff
|
||||
import requests
|
||||
|
||||
from .base import HttpClient, ScraperConfig
|
||||
|
||||
|
||||
class RequestsClient:
    """HTTP client built on a ``requests.Session`` with retries and a post-request delay.

    ``get`` is retried (up to 3 tries, exponential backoff with jitter) when a
    ``requests`` exception is raised, and sleeps ``config.rate_limit_delay``
    seconds after every successful request.
    """

    def __init__(self, config: ScraperConfig, headers: dict[str, str] | None = None):
        """Create the session and install the default headers.

        Args:
            config: Supplies ``timeout_seconds`` and ``rate_limit_delay``.
            headers: Optional extra headers; they override the default
                ``User-Agent`` when the same key is given.
        """
        self.config = config
        self.session = requests.Session()

        default_headers = {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        }
        if headers:
            default_headers.update(headers)

        self.session.headers.update(default_headers)

    @backoff.on_exception(
        backoff.expo,
        (requests.RequestException, requests.HTTPError),
        max_tries=3,
        base=2.0,
        jitter=backoff.random_jitter,
    )
    # NOTE(review): raise_for_status() below raises before a 429 response can be
    # returned, so this predicate looks unreachable; the decorator is kept as-is.
    @backoff.on_predicate(
        backoff.expo,
        lambda response: response.status_code == 429,
        max_tries=3,
        base=2.0,
        jitter=backoff.random_jitter,
    )
    def get(self, url: str, **kwargs) -> requests.Response:
        """GET ``url`` through the session and return the successful response.

        Args:
            url: Address to request.
            **kwargs: Forwarded to ``Session.get``. A ``timeout`` entry
                overrides ``config.timeout_seconds``.

        Raises:
            requests.RequestException: When the request fails or the response
                status is an error, once the retries are exhausted.
        """
        # pop(), not get(): the key must leave kwargs, otherwise timeout would be
        # passed twice and Session.get would raise TypeError.
        timeout = kwargs.pop("timeout", self.config.timeout_seconds)
        response = self.session.get(url, timeout=timeout, **kwargs)
        response.raise_for_status()

        # Pause after each successful request when a positive delay is configured.
        delay = getattr(self.config, "rate_limit_delay", 0)
        if delay > 0:
            time.sleep(delay)

        return response

    def close(self) -> None:
        """Close the underlying session."""
        self.session.close()
|
||||
|
||||
|
||||
class CloudScraperClient:
    """HTTP client built on a ``cloudscraper`` scraper with retries and a post-request delay.

    ``get`` is retried (up to 3 tries, exponential backoff with jitter) when a
    ``requests`` exception is raised, and sleeps ``config.rate_limit_delay``
    seconds after every successful request.
    """

    def __init__(self, config: ScraperConfig):
        """Create the scraper; ``cloudscraper`` is imported only at this point.

        Args:
            config: Supplies ``timeout_seconds`` and ``rate_limit_delay``.
        """
        import cloudscraper

        self.config = config
        self.scraper = cloudscraper.create_scraper()

    @backoff.on_exception(
        backoff.expo,
        (requests.RequestException, requests.HTTPError),
        max_tries=3,
        base=2.0,
        jitter=backoff.random_jitter,
    )
    def get(self, url: str, **kwargs) -> requests.Response:
        """GET ``url`` through the scraper and return the successful response.

        Args:
            url: Address to request.
            **kwargs: Forwarded to the scraper's ``get``. A ``timeout`` entry
                overrides ``config.timeout_seconds``.

        Raises:
            requests.RequestException: When the request fails or the response
                status is an error, once the retries are exhausted.
        """
        # pop(), not get(): the key must leave kwargs, otherwise timeout would be
        # passed twice and the call would raise TypeError.
        timeout = kwargs.pop("timeout", self.config.timeout_seconds)
        response = self.scraper.get(url, timeout=timeout, **kwargs)
        response.raise_for_status()

        # Pause after each successful request when a positive delay is configured.
        delay = getattr(self.config, "rate_limit_delay", 0)
        if delay > 0:
            time.sleep(delay)

        return response

    def close(self) -> None:
        """Close the scraper if it exposes a ``close`` method."""
        if hasattr(self.scraper, "close"):
            self.scraper.close()
|
||||
|
|
@ -5,10 +5,10 @@ import re
|
|||
import sys
|
||||
from dataclasses import asdict
|
||||
|
||||
import cloudscraper
|
||||
from bs4 import BeautifulSoup, Tag
|
||||
|
||||
from .base import BaseScraper, HttpClient
|
||||
from .clients import CloudScraperClient
|
||||
from .base import BaseScraper
|
||||
from .models import (
|
||||
ContestListResult,
|
||||
ContestSummary,
|
||||
|
|
@ -24,9 +24,6 @@ class CodeforcesScraper(BaseScraper):
|
|||
def platform_name(self) -> str:
|
||||
return "codeforces"
|
||||
|
||||
def _create_client(self) -> HttpClient:
|
||||
return CloudScraperClient(self.config)
|
||||
|
||||
def scrape_contest_metadata(self, contest_id: str) -> MetadataResult:
|
||||
return self._safe_execute(
|
||||
"metadata", self._scrape_contest_metadata_impl, contest_id
|
||||
|
|
@ -41,7 +38,7 @@ class CodeforcesScraper(BaseScraper):
|
|||
return self._safe_execute("contests", self._scrape_contest_list_impl)
|
||||
|
||||
def _scrape_contest_metadata_impl(self, contest_id: str) -> MetadataResult:
|
||||
problems = scrape_contest_problems(contest_id, self.client)
|
||||
problems = scrape_contest_problems(contest_id)
|
||||
if not problems:
|
||||
return self._create_metadata_error(
|
||||
f"No problems found for contest {contest_id}", contest_id
|
||||
|
|
@ -55,9 +52,11 @@ class CodeforcesScraper(BaseScraper):
|
|||
) -> TestsResult:
|
||||
problem_id = contest_id + problem_letter.lower()
|
||||
url = parse_problem_url(contest_id, problem_letter)
|
||||
tests = scrape_sample_tests(url, self.client)
|
||||
tests = scrape_sample_tests(url)
|
||||
|
||||
response = self.client.get(url)
|
||||
scraper = cloudscraper.create_scraper()
|
||||
response = scraper.get(url, timeout=self.config.timeout_seconds)
|
||||
response.raise_for_status()
|
||||
soup = BeautifulSoup(response.text, "html.parser")
|
||||
timeout_ms, memory_mb = extract_problem_limits(soup)
|
||||
|
||||
|
|
@ -77,15 +76,17 @@ class CodeforcesScraper(BaseScraper):
|
|||
)
|
||||
|
||||
def _scrape_contest_list_impl(self) -> ContestListResult:
|
||||
contests = scrape_contests(self.client)
|
||||
contests = scrape_contests()
|
||||
if not contests:
|
||||
return self._create_contests_error("No contests found")
|
||||
return ContestListResult(success=True, error="", contests=contests)
|
||||
|
||||
|
||||
def scrape(url: str, client: HttpClient) -> list[TestCase]:
|
||||
def scrape(url: str) -> list[TestCase]:
|
||||
try:
|
||||
response = client.get(url)
|
||||
scraper = cloudscraper.create_scraper()
|
||||
response = scraper.get(url, timeout=10)
|
||||
response.raise_for_status()
|
||||
|
||||
soup = BeautifulSoup(response.text, "html.parser")
|
||||
input_sections = soup.find_all("div", class_="input")
|
||||
|
|
@ -239,12 +240,12 @@ def extract_problem_limits(soup: BeautifulSoup) -> tuple[int, float]:
|
|||
return timeout_ms, memory_mb
|
||||
|
||||
|
||||
def scrape_contest_problems(
|
||||
contest_id: str, client: HttpClient
|
||||
) -> list[ProblemSummary]:
|
||||
def scrape_contest_problems(contest_id: str) -> list[ProblemSummary]:
|
||||
try:
|
||||
contest_url: str = f"https://codeforces.com/contest/{contest_id}"
|
||||
response = client.get(contest_url)
|
||||
scraper = cloudscraper.create_scraper()
|
||||
response = scraper.get(contest_url, timeout=10)
|
||||
response.raise_for_status()
|
||||
|
||||
soup = BeautifulSoup(response.text, "html.parser")
|
||||
problems: list[ProblemSummary] = []
|
||||
|
|
@ -280,13 +281,15 @@ def scrape_contest_problems(
|
|||
return []
|
||||
|
||||
|
||||
def scrape_sample_tests(url: str, client: HttpClient) -> list[TestCase]:
|
||||
def scrape_sample_tests(url: str) -> list[TestCase]:
|
||||
print(f"Scraping: {url}", file=sys.stderr)
|
||||
return scrape(url, client)
|
||||
return scrape(url)
|
||||
|
||||
|
||||
def scrape_contests(client: HttpClient) -> list[ContestSummary]:
|
||||
response = client.get("https://codeforces.com/api/contest.list")
|
||||
def scrape_contests() -> list[ContestSummary]:
|
||||
scraper = cloudscraper.create_scraper()
|
||||
response = scraper.get("https://codeforces.com/api/contest.list", timeout=10)
|
||||
response.raise_for_status()
|
||||
|
||||
data = response.json()
|
||||
if data["status"] != "OK":
|
||||
|
|
@ -364,8 +367,6 @@ def main() -> None:
|
|||
print(json.dumps(asdict(result)))
|
||||
sys.exit(1)
|
||||
|
||||
scraper.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
|
|||
202
scrapers/cses.py
202
scrapers/cses.py
|
|
@ -9,6 +9,7 @@ import backoff
|
|||
import requests
|
||||
from bs4 import BeautifulSoup, Tag
|
||||
|
||||
from .base import BaseScraper
|
||||
from .models import (
|
||||
ContestListResult,
|
||||
ContestSummary,
|
||||
|
|
@ -322,6 +323,111 @@ def scrape(url: str) -> list[TestCase]:
|
|||
return []
|
||||
|
||||
|
||||
class CSESScraper(BaseScraper):
    """CSES scraper exposed through the ``BaseScraper`` interface.

    Each public ``scrape_*`` method runs its ``_scrape_*_impl`` counterpart
    through ``_safe_execute``, which turns any raised exception into a failed
    result object of the matching type. The ``contest_id`` of the interface is
    passed on as a category id.
    """

    @property
    def platform_name(self) -> str:
        """Platform identifier; also the prefix of every error message built here."""
        return "cses"

    def scrape_contest_metadata(self, contest_id: str) -> MetadataResult:
        """Return the problems of category ``contest_id`` in a ``MetadataResult``."""
        return self._safe_execute("metadata", self._scrape_metadata_impl, contest_id)

    def scrape_problem_tests(self, contest_id: str, problem_id: str) -> TestsResult:
        """Return the tests and limits of one problem wrapped in a ``TestsResult``."""
        return self._safe_execute(
            "tests", self._scrape_tests_impl, contest_id, problem_id
        )

    def scrape_contest_list(self) -> ContestListResult:
        """Return the scraped categories wrapped in a ``ContestListResult``."""
        return self._safe_execute("contests", self._scrape_contests_impl)

    def _safe_execute(self, operation: str, func, *args):
        """Call ``func(*args)`` and map any exception to a failed result.

        ``operation`` ("metadata", "tests" or "contests") selects which result
        type is built on failure. For any other value a failure yields ``None``.
        """
        try:
            outcome = func(*args)
        except Exception as exc:
            message = f"{self.platform_name}: {str(exc)}"

            if operation == "metadata":
                return MetadataResult(success=False, error=message)
            if operation == "tests":
                return TestsResult(
                    success=False,
                    error=message,
                    problem_id="",
                    url="",
                    tests=[],
                    timeout_ms=0,
                    memory_mb=0,
                )
            if operation == "contests":
                return ContestListResult(success=False, error=message)
            return None
        return outcome

    def _scrape_metadata_impl(self, category_id: str) -> MetadataResult:
        """Scrape the category's problems; an empty result is reported as a failure."""
        found = scrape_category_problems(category_id)
        if found:
            return MetadataResult(
                success=True, error="", contest_id=category_id, problems=found
            )
        return MetadataResult(
            success=False,
            error=f"{self.platform_name}: No problems found for category: {category_id}",
        )

    def _scrape_tests_impl(self, category: str, problem_id: str) -> TestsResult:
        """Scrape the tests and the time/memory limits of a single problem.

        ``category`` is accepted for interface compatibility and is not used.
        ``problem_id`` is handed to ``parse_problem_url``; a falsy URL produces
        an "Invalid problem input" failure. HTTP errors from the limits request
        propagate to the caller (``_safe_execute`` in normal use).
        """
        url = parse_problem_url(problem_id)
        if not url:
            return TestsResult(
                success=False,
                error=f"{self.platform_name}: Invalid problem input: {problem_id}. Use either problem ID (e.g., 1068) or full URL",
                problem_id=problem_id if problem_id.isdigit() else "",
                url="",
                tests=[],
                timeout_ms=0,
                memory_mb=0,
            )

        # Tests are scraped first, then the problem page is requested for its limits.
        samples = scrape(url)

        # A non-numeric input is reduced to its last "/"-separated segment.
        if problem_id.isdigit():
            resolved_id = problem_id
        else:
            resolved_id = problem_id.split("/")[-1]

        user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        page = requests.get(url, headers={"User-Agent": user_agent}, timeout=10)
        page.raise_for_status()
        timeout_ms, memory_mb = extract_problem_limits(
            BeautifulSoup(page.text, "html.parser")
        )

        has_samples = bool(samples)
        if has_samples:
            error = ""
        else:
            error = f"{self.platform_name}: No tests found for {problem_id}"

        return TestsResult(
            success=has_samples,
            error=error,
            problem_id=resolved_id,
            url=url,
            tests=samples if has_samples else [],
            timeout_ms=timeout_ms,
            memory_mb=memory_mb,
        )

    def _scrape_contests_impl(self) -> ContestListResult:
        """Scrape the category list; an empty list is reported as a failure."""
        found = scrape_categories()
        if found:
            return ContestListResult(success=True, error="", contests=found)
        return ContestListResult(
            success=False, error=f"{self.platform_name}: No contests found"
        )
|
||||
|
||||
|
||||
def main() -> None:
|
||||
if len(sys.argv) < 2:
|
||||
result = MetadataResult(
|
||||
|
|
@ -332,6 +438,7 @@ def main() -> None:
|
|||
sys.exit(1)
|
||||
|
||||
mode: str = sys.argv[1]
|
||||
scraper = CSESScraper()
|
||||
|
||||
if mode == "metadata":
|
||||
if len(sys.argv) != 3:
|
||||
|
|
@ -343,18 +450,10 @@ def main() -> None:
|
|||
sys.exit(1)
|
||||
|
||||
category_id = sys.argv[2]
|
||||
problems = scrape_category_problems(category_id)
|
||||
|
||||
if not problems:
|
||||
result = MetadataResult(
|
||||
success=False,
|
||||
error=f"No problems found for category: {category_id}",
|
||||
)
|
||||
print(json.dumps(asdict(result)))
|
||||
return
|
||||
|
||||
result = MetadataResult(success=True, error="", problems=problems)
|
||||
result = scraper.scrape_contest_metadata(category_id)
|
||||
print(json.dumps(asdict(result)))
|
||||
if not result.success:
|
||||
sys.exit(1)
|
||||
|
||||
elif mode == "tests":
|
||||
if len(sys.argv) != 4:
|
||||
|
|
@ -370,73 +469,12 @@ def main() -> None:
|
|||
print(json.dumps(asdict(tests_result)))
|
||||
sys.exit(1)
|
||||
|
||||
problem_input: str = sys.argv[3]
|
||||
url: str | None = parse_problem_url(problem_input)
|
||||
|
||||
if not url:
|
||||
tests_result = TestsResult(
|
||||
success=False,
|
||||
error=f"Invalid problem input: {problem_input}. Use either problem ID (e.g., 1068) or full URL",
|
||||
problem_id=problem_input if problem_input.isdigit() else "",
|
||||
url="",
|
||||
tests=[],
|
||||
timeout_ms=0,
|
||||
memory_mb=0,
|
||||
)
|
||||
print(json.dumps(asdict(tests_result)))
|
||||
sys.exit(1)
|
||||
|
||||
tests: list[TestCase] = scrape(url)
|
||||
|
||||
problem_id: str = (
|
||||
problem_input if problem_input.isdigit() else problem_input.split("/")[-1]
|
||||
)
|
||||
|
||||
try:
|
||||
headers = {
|
||||
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
||||
}
|
||||
response = requests.get(url, headers=headers, timeout=10)
|
||||
response.raise_for_status()
|
||||
soup = BeautifulSoup(response.text, "html.parser")
|
||||
timeout_ms, memory_mb = extract_problem_limits(soup)
|
||||
except Exception as e:
|
||||
tests_result = TestsResult(
|
||||
success=False,
|
||||
error=f"Failed to extract constraints: {e}",
|
||||
problem_id=problem_id,
|
||||
url=url,
|
||||
tests=[],
|
||||
timeout_ms=0,
|
||||
memory_mb=0,
|
||||
)
|
||||
print(json.dumps(asdict(tests_result)))
|
||||
sys.exit(1)
|
||||
|
||||
if not tests:
|
||||
tests_result = TestsResult(
|
||||
success=False,
|
||||
error=f"No tests found for {problem_input}",
|
||||
problem_id=problem_id,
|
||||
url=url,
|
||||
tests=[],
|
||||
timeout_ms=timeout_ms,
|
||||
memory_mb=memory_mb,
|
||||
)
|
||||
print(json.dumps(asdict(tests_result)))
|
||||
sys.exit(1)
|
||||
|
||||
test_cases = tests
|
||||
tests_result = TestsResult(
|
||||
success=True,
|
||||
error="",
|
||||
problem_id=problem_id,
|
||||
url=url,
|
||||
tests=test_cases,
|
||||
timeout_ms=timeout_ms,
|
||||
memory_mb=memory_mb,
|
||||
)
|
||||
category = sys.argv[2]
|
||||
problem_id = sys.argv[3]
|
||||
tests_result = scraper.scrape_problem_tests(category, problem_id)
|
||||
print(json.dumps(asdict(tests_result)))
|
||||
if not tests_result.success:
|
||||
sys.exit(1)
|
||||
|
||||
elif mode == "contests":
|
||||
if len(sys.argv) != 2:
|
||||
|
|
@ -446,14 +484,10 @@ def main() -> None:
|
|||
print(json.dumps(asdict(contest_result)))
|
||||
sys.exit(1)
|
||||
|
||||
categories = scrape_categories()
|
||||
if not categories:
|
||||
contest_result = ContestListResult(success=False, error="No contests found")
|
||||
print(json.dumps(asdict(contest_result)))
|
||||
sys.exit(1)
|
||||
|
||||
contest_result = ContestListResult(success=True, error="", contests=categories)
|
||||
contest_result = scraper.scrape_contest_list()
|
||||
print(json.dumps(asdict(contest_result)))
|
||||
if not contest_result.success:
|
||||
sys.exit(1)
|
||||
|
||||
else:
|
||||
result = MetadataResult(
|
||||
|
|
|
|||
|
|
@ -5,14 +5,16 @@ from scrapers.models import ContestSummary, ProblemSummary
|
|||
|
||||
|
||||
def test_scrape_success(mocker, mock_codeforces_html):
|
||||
mock_client = Mock()
|
||||
mock_scraper = Mock()
|
||||
mock_response = Mock()
|
||||
mock_response.text = mock_codeforces_html
|
||||
mock_client.get.return_value = mock_response
|
||||
mock_scraper.get.return_value = mock_response
|
||||
|
||||
mocker.patch(
|
||||
"scrapers.codeforces.cloudscraper.create_scraper", return_value=mock_scraper
|
||||
)
|
||||
|
||||
scraper = CodeforcesScraper()
|
||||
mocker.patch.object(scraper, "_create_client", return_value=mock_client)
|
||||
|
||||
result = scraper.scrape_problem_tests("1900", "A")
|
||||
|
||||
assert result.success == True
|
||||
|
|
@ -22,17 +24,19 @@ def test_scrape_success(mocker, mock_codeforces_html):
|
|||
|
||||
|
||||
def test_scrape_contest_problems(mocker):
|
||||
mock_client = Mock()
|
||||
mock_scraper = Mock()
|
||||
mock_response = Mock()
|
||||
mock_response.text = """
|
||||
<a href="/contest/1900/problem/A">A. Problem A</a>
|
||||
<a href="/contest/1900/problem/B">B. Problem B</a>
|
||||
"""
|
||||
mock_client.get.return_value = mock_response
|
||||
mock_scraper.get.return_value = mock_response
|
||||
|
||||
mocker.patch(
|
||||
"scrapers.codeforces.cloudscraper.create_scraper", return_value=mock_scraper
|
||||
)
|
||||
|
||||
scraper = CodeforcesScraper()
|
||||
mocker.patch.object(scraper, "_create_client", return_value=mock_client)
|
||||
|
||||
result = scraper.scrape_contest_metadata("1900")
|
||||
|
||||
assert result.success == True
|
||||
|
|
@ -42,12 +46,14 @@ def test_scrape_contest_problems(mocker):
|
|||
|
||||
|
||||
def test_scrape_network_error(mocker):
|
||||
mock_client = Mock()
|
||||
mock_client.get.side_effect = Exception("Network error")
|
||||
mock_scraper = Mock()
|
||||
mock_scraper.get.side_effect = Exception("Network error")
|
||||
|
||||
mocker.patch(
|
||||
"scrapers.codeforces.cloudscraper.create_scraper", return_value=mock_scraper
|
||||
)
|
||||
|
||||
scraper = CodeforcesScraper()
|
||||
mocker.patch.object(scraper, "_create_client", return_value=mock_client)
|
||||
|
||||
result = scraper.scrape_problem_tests("1900", "A")
|
||||
|
||||
assert result.success == False
|
||||
|
|
@ -55,7 +61,7 @@ def test_scrape_network_error(mocker):
|
|||
|
||||
|
||||
def test_scrape_contests_success(mocker):
|
||||
mock_client = Mock()
|
||||
mock_scraper = Mock()
|
||||
mock_response = Mock()
|
||||
mock_response.json.return_value = {
|
||||
"status": "OK",
|
||||
|
|
@ -65,11 +71,13 @@ def test_scrape_contests_success(mocker):
|
|||
{"id": 1949, "name": "Codeforces Global Round 26"},
|
||||
],
|
||||
}
|
||||
mock_client.get.return_value = mock_response
|
||||
mock_scraper.get.return_value = mock_response
|
||||
|
||||
mocker.patch(
|
||||
"scrapers.codeforces.cloudscraper.create_scraper", return_value=mock_scraper
|
||||
)
|
||||
|
||||
scraper = CodeforcesScraper()
|
||||
mocker.patch.object(scraper, "_create_client", return_value=mock_client)
|
||||
|
||||
result = scraper.scrape_contest_list()
|
||||
|
||||
assert result.success == True
|
||||
|
|
@ -92,14 +100,16 @@ def test_scrape_contests_success(mocker):
|
|||
|
||||
|
||||
def test_scrape_contests_api_error(mocker):
|
||||
mock_client = Mock()
|
||||
mock_scraper = Mock()
|
||||
mock_response = Mock()
|
||||
mock_response.json.return_value = {"status": "FAILED", "result": []}
|
||||
mock_client.get.return_value = mock_response
|
||||
mock_scraper.get.return_value = mock_response
|
||||
|
||||
mocker.patch(
|
||||
"scrapers.codeforces.cloudscraper.create_scraper", return_value=mock_scraper
|
||||
)
|
||||
|
||||
scraper = CodeforcesScraper()
|
||||
mocker.patch.object(scraper, "_create_client", return_value=mock_client)
|
||||
|
||||
result = scraper.scrape_contest_list()
|
||||
|
||||
assert result.success == False
|
||||
|
|
@ -107,12 +117,14 @@ def test_scrape_contests_api_error(mocker):
|
|||
|
||||
|
||||
def test_scrape_contests_network_error(mocker):
|
||||
mock_client = Mock()
|
||||
mock_client.get.side_effect = Exception("Network error")
|
||||
mock_scraper = Mock()
|
||||
mock_scraper.get.side_effect = Exception("Network error")
|
||||
|
||||
mocker.patch(
|
||||
"scrapers.codeforces.cloudscraper.create_scraper", return_value=mock_scraper
|
||||
)
|
||||
|
||||
scraper = CodeforcesScraper()
|
||||
mocker.patch.object(scraper, "_create_client", return_value=mock_client)
|
||||
|
||||
result = scraper.scrape_contest_list()
|
||||
|
||||
assert result.success == False
|
||||
|
|
|
|||
162
tests/scrapers/test_interface_compliance.py
Normal file
162
tests/scrapers/test_interface_compliance.py
Normal file
|
|
@ -0,0 +1,162 @@
|
|||
from unittest.mock import Mock
|
||||
|
||||
import pytest
|
||||
|
||||
from scrapers import ALL_SCRAPERS, BaseScraper
|
||||
from scrapers.models import ContestListResult, MetadataResult, TestsResult
|
||||
|
||||
ALL_SCRAPER_CLASSES = list(ALL_SCRAPERS.values())
|
||||
|
||||
|
||||
class TestScraperInterfaceCompliance:
    """Contract tests run once per scraper class in ``ALL_SCRAPER_CLASSES``.

    Every test builds the scraper itself and checks the attributes, method
    names and result objects that the scraper is expected to provide.
    """

    @staticmethod
    def _patch_codeforces_client(mocker, *, response=None, error=None):
        """Swap the cloudscraper client used by the codeforces scraper.

        Patches ``scrapers.codeforces.cloudscraper.create_scraper`` so that it
        returns a ``Mock`` client.

        Args:
            mocker: The pytest-mock ``mocker`` fixture of the calling test.
            response: Object returned by the mocked client's ``get``.
            error: Exception raised by the mocked client's ``get``; when
                given, it takes precedence over ``response``.
        """
        client = Mock()
        if error is not None:
            client.get.side_effect = error
        else:
            client.get.return_value = response
        mocker.patch(
            "scrapers.codeforces.cloudscraper.create_scraper",
            return_value=client,
        )

    @pytest.mark.parametrize("scraper_class", ALL_SCRAPER_CLASSES)
    def test_implements_base_interface(self, scraper_class):
        """The scraper is a ``BaseScraper`` and has the expected members."""
        scraper = scraper_class()

        assert isinstance(scraper, BaseScraper)
        assert hasattr(scraper, "platform_name")
        assert hasattr(scraper, "scrape_contest_metadata")
        assert hasattr(scraper, "scrape_problem_tests")
        assert hasattr(scraper, "scrape_contest_list")

    @pytest.mark.parametrize("scraper_class", ALL_SCRAPER_CLASSES)
    def test_platform_name_is_string(self, scraper_class):
        """``platform_name`` is a non-empty, lowercase string."""
        scraper = scraper_class()
        platform_name = scraper.platform_name

        assert isinstance(platform_name, str)
        assert len(platform_name) > 0
        assert platform_name.islower()  # Convention: lowercase platform names

    @pytest.mark.parametrize("scraper_class", ALL_SCRAPER_CLASSES)
    def test_metadata_method_signature(self, scraper_class, mocker):
        """``scrape_contest_metadata`` returns a well-formed ``MetadataResult``."""
        scraper = scraper_class()

        # Only the codeforces HTTP client is mocked in this test.
        # NOTE(review): atcoder and cses are not patched here, so they
        # presumably attempt real network requests - confirm.
        if scraper.platform_name == "codeforces":
            response = Mock()
            response.text = "<a href='/contest/1900/problem/A'>A. Test</a>"
            self._patch_codeforces_client(mocker, response=response)

        result = scraper.scrape_contest_metadata("test_contest")

        assert isinstance(result, MetadataResult)
        assert hasattr(result, "success")
        assert hasattr(result, "error")
        assert hasattr(result, "problems")
        assert hasattr(result, "contest_id")
        assert isinstance(result.success, bool)
        assert isinstance(result.error, str)

    @pytest.mark.parametrize("scraper_class", ALL_SCRAPER_CLASSES)
    def test_problem_tests_method_signature(self, scraper_class, mocker):
        """``scrape_problem_tests`` returns a well-formed ``TestsResult``."""
        scraper = scraper_class()

        # NOTE(review): as above, only codeforces is mocked - confirm whether
        # the other platforms are meant to hit the network here.
        if scraper.platform_name == "codeforces":
            response = Mock()
            response.text = """
            <div class="time-limit">Time limit: 1 seconds</div>
            <div class="memory-limit">Memory limit: 256 megabytes</div>
            <div class="input"><pre><div class="test-example-line-1">3</div></pre></div>
            <div class="output"><pre><div class="test-example-line-1">6</div></pre></div>
            """
            self._patch_codeforces_client(mocker, response=response)

        result = scraper.scrape_problem_tests("test_contest", "A")

        assert isinstance(result, TestsResult)
        assert hasattr(result, "success")
        assert hasattr(result, "error")
        assert hasattr(result, "tests")
        assert hasattr(result, "problem_id")
        assert hasattr(result, "url")
        assert hasattr(result, "timeout_ms")
        assert hasattr(result, "memory_mb")
        assert isinstance(result.success, bool)
        assert isinstance(result.error, str)

    @pytest.mark.parametrize("scraper_class", ALL_SCRAPER_CLASSES)
    def test_contest_list_method_signature(self, scraper_class, mocker):
        """``scrape_contest_list`` returns a well-formed ``ContestListResult``."""
        scraper = scraper_class()

        # NOTE(review): as above, only codeforces is mocked - confirm whether
        # the other platforms are meant to hit the network here.
        if scraper.platform_name == "codeforces":
            response = Mock()
            response.json.return_value = {
                "status": "OK",
                "result": [{"id": 1900, "name": "Test Contest"}],
            }
            self._patch_codeforces_client(mocker, response=response)

        result = scraper.scrape_contest_list()

        assert isinstance(result, ContestListResult)
        assert hasattr(result, "success")
        assert hasattr(result, "error")
        assert hasattr(result, "contests")
        assert isinstance(result.success, bool)
        assert isinstance(result.error, str)

    @pytest.mark.parametrize("scraper_class", ALL_SCRAPER_CLASSES)
    def test_error_message_format(self, scraper_class, mocker):
        """Failed scrapes report ``success`` False and a prefixed error.

        The error string of every result must start with
        ``"<platform_name>: "``.
        """
        scraper = scraper_class()
        platform_name = scraper.platform_name

        # Force an error by making the platform's HTTP entry point raise.
        network_error = Exception("Network error")
        if platform_name == "codeforces":
            self._patch_codeforces_client(mocker, error=network_error)
        elif platform_name == "atcoder":
            mocker.patch("scrapers.atcoder.requests.get", side_effect=network_error)
        elif platform_name == "cses":
            mocker.patch("scrapers.cses.make_request", side_effect=network_error)

        expected_prefix = f"{platform_name}: "

        # Test metadata error format
        result = scraper.scrape_contest_metadata("test")
        assert result.success is False
        assert result.error.startswith(expected_prefix)

        # Test problem tests error format
        result = scraper.scrape_problem_tests("test", "A")
        assert result.success is False
        assert result.error.startswith(expected_prefix)

        # Test contest list error format
        result = scraper.scrape_contest_list()
        assert result.success is False
        assert result.error.startswith(expected_prefix)

    @pytest.mark.parametrize("scraper_class", ALL_SCRAPER_CLASSES)
    def test_scraper_instantiation(self, scraper_class):
        """Scrapers can be built with no arguments or with a custom config."""
        scraper1 = scraper_class()
        assert isinstance(scraper1, BaseScraper)
        assert scraper1.config is not None

        # Imported locally, as in the original test, so this block does not
        # depend on the module's top-level imports.
        from scrapers.base import ScraperConfig

        custom_config = ScraperConfig(timeout_seconds=60)
        scraper2 = scraper_class(custom_config)
        assert isinstance(scraper2, BaseScraper)
        assert scraper2.config.timeout_seconds == 60
|
||||
58
tests/scrapers/test_registry.py
Normal file
58
tests/scrapers/test_registry.py
Normal file
|
|
@ -0,0 +1,58 @@
|
|||
import pytest
|
||||
|
||||
from scrapers import ALL_SCRAPERS, get_scraper, list_platforms
|
||||
from scrapers.base import BaseScraper
|
||||
from scrapers.codeforces import CodeforcesScraper
|
||||
|
||||
|
||||
class TestScraperRegistry:
    """Tests for ``get_scraper``, ``list_platforms`` and ``ALL_SCRAPERS``."""

    def test_get_scraper_valid_platform(self):
        """Looking up "codeforces" yields the codeforces scraper class."""
        resolved = get_scraper("codeforces")
        assert resolved == CodeforcesScraper
        assert issubclass(resolved, BaseScraper)

        instance = resolved()
        assert isinstance(instance, BaseScraper)
        assert instance.platform_name == "codeforces"

    def test_get_scraper_invalid_platform(self):
        """An unknown platform raises ``KeyError`` with a descriptive message."""
        with pytest.raises(KeyError) as raised:
            get_scraper("nonexistent")

        message = str(raised.value)
        assert "nonexistent" in message
        assert "Available platforms" in message

    def test_list_platforms(self):
        """``list_platforms`` returns a non-empty list matching the registry keys."""
        names = list_platforms()

        assert isinstance(names, list)
        assert len(names) > 0
        assert "codeforces" in names

        assert set(names) == set(ALL_SCRAPERS)

    def test_all_scrapers_registry(self):
        """Every registry entry maps a lowercase name to a matching scraper."""
        assert isinstance(ALL_SCRAPERS, dict)
        assert len(ALL_SCRAPERS) > 0

        for key in ALL_SCRAPERS:
            registered = ALL_SCRAPERS[key]

            assert isinstance(key, str)
            assert key.islower()

            assert issubclass(registered, BaseScraper)

            instance = registered()
            assert instance.platform_name == key

    def test_registry_import_consistency(self):
        """The registry hands back the same class a direct import provides."""
        from scrapers.codeforces import CodeforcesScraper as ImportedDirectly

        from_registry = get_scraper("codeforces")
        assert from_registry == ImportedDirectly

    def test_all_scrapers_can_be_instantiated(self):
        """Each registered class builds with no arguments and reports its key."""
        for key, registered in ALL_SCRAPERS.items():
            instance = registered()
            assert isinstance(instance, BaseScraper)
            assert instance.platform_name == key
|
||||
Loading…
Add table
Add a link
Reference in a new issue