feat(scrapers): total refactor

Barrett Ruth 2025-09-22 22:00:20 -04:00
parent eb3f7762de
commit db391da52c
9 changed files with 559 additions and 307 deletions

__init__.py (new file)

@@ -0,0 +1,56 @@
from .atcoder import AtCoderScraper
from .base import BaseScraper, ScraperConfig
from .codeforces import CodeforcesScraper
from .cses import CSESScraper
from .models import (
ContestListResult,
ContestSummary,
MetadataResult,
ProblemSummary,
TestCase,
TestsResult,
)
ALL_SCRAPERS: dict[str, type[BaseScraper]] = {
"atcoder": AtCoderScraper,
"codeforces": CodeforcesScraper,
"cses": CSESScraper,
}
_SCRAPER_CLASSES = [
"AtCoderScraper",
"CodeforcesScraper",
"CSESScraper",
]
_BASE_EXPORTS = [
"BaseScraper",
"ScraperConfig",
"ContestListResult",
"ContestSummary",
"MetadataResult",
"ProblemSummary",
"TestCase",
"TestsResult",
]
_REGISTRY_FUNCTIONS = [
"get_scraper",
"list_platforms",
"ALL_SCRAPERS",
]
__all__ = _BASE_EXPORTS + _SCRAPER_CLASSES + _REGISTRY_FUNCTIONS
def get_scraper(platform: str) -> type[BaseScraper]:
if platform not in ALL_SCRAPERS:
available = ", ".join(ALL_SCRAPERS.keys())
raise KeyError(
f"Unknown platform '{platform}'. Available platforms: {available}"
)
return ALL_SCRAPERS[platform]
def list_platforms() -> list[str]:
return list(ALL_SCRAPERS.keys())
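
A quick usage sketch of the new registry, assuming the package imports as `scrapers` (the actual package path is not shown in this diff):

from scrapers import get_scraper, list_platforms

print(list_platforms())          # ['atcoder', 'codeforces', 'cses']
scraper = get_scraper("cses")()  # resolve the class, then instantiate it
result = scraper.scrape_contest_list()

# Unknown platforms fail fast, listing what is available.
try:
    get_scraper("leetcode")
except KeyError as exc:
    print(exc)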

atcoder.py

@@ -1,5 +1,6 @@
#!/usr/bin/env python3
import concurrent.futures
import json
import re
import sys
@@ -9,6 +10,7 @@ import backoff
import requests
from bs4 import BeautifulSoup, Tag
from .base import BaseScraper
from .models import (
ContestListResult,
ContestSummary,
@@ -167,8 +169,6 @@ def scrape(url: str) -> list[TestCase]:
def scrape_contests() -> list[ContestSummary]:
import concurrent.futures
def get_max_pages() -> int:
try:
headers = {
@@ -296,6 +296,101 @@ def scrape_contests() -> list[ContestSummary]:
return all_contests
class AtCoderScraper(BaseScraper):
@property
def platform_name(self) -> str:
return "atcoder"
def scrape_contest_metadata(self, contest_id: str) -> MetadataResult:
return self._safe_execute("metadata", self._scrape_metadata_impl, contest_id)
def scrape_problem_tests(self, contest_id: str, problem_id: str) -> TestsResult:
return self._safe_execute(
"tests", self._scrape_tests_impl, contest_id, problem_id
)
def scrape_contest_list(self) -> ContestListResult:
return self._safe_execute("contests", self._scrape_contests_impl)
def _safe_execute(self, operation: str, func, *args):
try:
return func(*args)
except Exception as e:
error_msg = f"{self.platform_name}: {str(e)}"
if operation == "metadata":
return MetadataResult(success=False, error=error_msg)
elif operation == "tests":
return TestsResult(
success=False,
error=error_msg,
problem_id="",
url="",
tests=[],
timeout_ms=0,
memory_mb=0,
)
elif operation == "contests":
return ContestListResult(success=False, error=error_msg)
def _scrape_metadata_impl(self, contest_id: str) -> MetadataResult:
problems = scrape_contest_problems(contest_id)
if not problems:
return MetadataResult(
success=False,
error=f"{self.platform_name}: No problems found for contest {contest_id}",
)
return MetadataResult(
success=True, error="", contest_id=contest_id, problems=problems
)
def _scrape_tests_impl(self, contest_id: str, problem_id: str) -> TestsResult:
problem_letter = problem_id.upper()
url = parse_problem_url(contest_id, problem_letter)
tests = scrape(url)
response = requests.get(
url,
headers={
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
},
timeout=10,
)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
timeout_ms, memory_mb = extract_problem_limits(soup)
if not tests:
return TestsResult(
success=False,
error=f"{self.platform_name}: No tests found for {contest_id} {problem_letter}",
problem_id=f"{contest_id}_{problem_id.lower()}",
url=url,
tests=[],
timeout_ms=timeout_ms,
memory_mb=memory_mb,
)
return TestsResult(
success=True,
error="",
problem_id=f"{contest_id}_{problem_id.lower()}",
url=url,
tests=tests,
timeout_ms=timeout_ms,
memory_mb=memory_mb,
)
def _scrape_contests_impl(self) -> ContestListResult:
contests = scrape_contests()
if not contests:
return ContestListResult(
success=False, error=f"{self.platform_name}: No contests found"
)
return ContestListResult(success=True, error="", contests=contests)
def main() -> None:
if len(sys.argv) < 2:
result = MetadataResult(
@@ -306,6 +401,7 @@ def main() -> None:
sys.exit(1)
mode: str = sys.argv[1]
scraper = AtCoderScraper()
if mode == "metadata":
if len(sys.argv) != 3:
@@ -317,23 +413,10 @@ def main() -> None:
sys.exit(1)
contest_id: str = sys.argv[2]
problems: list[ProblemSummary] = scrape_contest_problems(contest_id)
if not problems:
result = MetadataResult(
success=False,
error=f"No problems found for contest {contest_id}",
)
print(json.dumps(asdict(result)))
sys.exit(1)
result = MetadataResult(
success=True,
error="",
contest_id=contest_id,
problems=problems,
)
result = scraper.scrape_contest_metadata(contest_id)
print(json.dumps(asdict(result)))
if not result.success:
sys.exit(1)
elif mode == "tests":
if len(sys.argv) != 4:
@@ -351,55 +434,10 @@ def main() -> None:
test_contest_id: str = sys.argv[2]
problem_letter: str = sys.argv[3]
problem_id: str = f"{test_contest_id}_{problem_letter.lower()}"
url: str = parse_problem_url(test_contest_id, problem_letter)
tests: list[TestCase] = scrape(url)
try:
headers = {
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
timeout_ms, memory_mb = extract_problem_limits(soup)
except Exception as e:
tests_result = TestsResult(
success=False,
error=f"Failed to extract constraints: {e}",
problem_id=problem_id,
url=url,
tests=[],
timeout_ms=0,
memory_mb=0,
)
print(json.dumps(asdict(tests_result)))
sys.exit(1)
if not tests:
tests_result = TestsResult(
success=False,
error=f"No tests found for {test_contest_id} {problem_letter}",
problem_id=problem_id,
url=url,
tests=[],
timeout_ms=timeout_ms,
memory_mb=memory_mb,
)
print(json.dumps(asdict(tests_result)))
sys.exit(1)
tests_result = TestsResult(
success=True,
error="",
problem_id=problem_id,
url=url,
tests=tests,
timeout_ms=timeout_ms,
memory_mb=memory_mb,
)
tests_result = scraper.scrape_problem_tests(test_contest_id, problem_letter)
print(json.dumps(asdict(tests_result)))
if not tests_result.success:
sys.exit(1)
elif mode == "contests":
if len(sys.argv) != 2:
@@ -409,14 +447,10 @@ def main() -> None:
print(json.dumps(asdict(contest_result)))
sys.exit(1)
contests = scrape_contests()
if not contests:
contest_result = ContestListResult(success=False, error="No contests found")
print(json.dumps(asdict(contest_result)))
sys.exit(1)
contest_result = ContestListResult(success=True, error="", contests=contests)
contest_result = scraper.scrape_contest_list()
print(json.dumps(asdict(contest_result)))
if not contest_result.success:
sys.exit(1)
else:
result = MetadataResult(
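
A sketch of how main() now delegates to the class-based API introduced above (the contest id here is a hypothetical example):

import json
from dataclasses import asdict

scraper = AtCoderScraper()
# Every scrape_* entry point is routed through _safe_execute, so network or
# parsing failures come back as a result object with success=False instead
# of propagating as exceptions.
result = scraper.scrape_contest_metadata("abc300")  # hypothetical contest id
print(json.dumps(asdict(result)))
if not result.success:
    raise SystemExit(1)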

base.py

@@ -1,8 +1,5 @@
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Protocol
import requests
from .models import ContestListResult, MetadataResult, TestsResult
@@ -15,23 +12,14 @@ class ScraperConfig:
rate_limit_delay: float = 1.0
class HttpClient(Protocol):
def get(self, url: str, **kwargs) -> requests.Response: ...
def close(self) -> None: ...
class BaseScraper(ABC):
def __init__(self, config: ScraperConfig | None = None):
self.config = config or ScraperConfig()
self._client: HttpClient | None = None
@property
@abstractmethod
def platform_name(self) -> str: ...
@abstractmethod
def _create_client(self) -> HttpClient: ...
@abstractmethod
def scrape_contest_metadata(self, contest_id: str) -> MetadataResult: ...
@@ -41,17 +29,6 @@ class BaseScraper(ABC):
@abstractmethod
def scrape_contest_list(self) -> ContestListResult: ...
@property
def client(self) -> HttpClient:
if self._client is None:
self._client = self._create_client()
return self._client
def close(self) -> None:
if self._client is not None:
self._client.close()
self._client = None
def _create_metadata_error(
self, error_msg: str, contest_id: str = ""
) -> MetadataResult:
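
With HttpClient and _create_client removed, the abstract surface of BaseScraper reduces to platform_name plus the three scrape methods. A minimal conforming subclass, sketched from the signatures visible in this diff (the result dataclass fields are taken from how they are constructed elsewhere in the commit):

class ExampleScraper(BaseScraper):  # hypothetical platform, for illustration only
    @property
    def platform_name(self) -> str:
        return "example"

    def scrape_contest_metadata(self, contest_id: str) -> MetadataResult:
        return MetadataResult(success=True, error="", contest_id=contest_id, problems=[])

    def scrape_problem_tests(self, contest_id: str, problem_id: str) -> TestsResult:
        return TestsResult(success=False, error="not implemented", problem_id=problem_id,
                           url="", tests=[], timeout_ms=0, memory_mb=0)

    def scrape_contest_list(self) -> ContestListResult:
        return ContestListResult(success=True, error="", contests=[])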

clients.py (deleted)

@@ -1,82 +0,0 @@
import time
import backoff
import requests
from .base import HttpClient, ScraperConfig
class RequestsClient:
def __init__(self, config: ScraperConfig, headers: dict[str, str] | None = None):
self.config = config
self.session = requests.Session()
default_headers = {
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}
if headers:
default_headers.update(headers)
self.session.headers.update(default_headers)
@backoff.on_exception(
backoff.expo,
(requests.RequestException, requests.HTTPError),
max_tries=3,
base=2.0,
jitter=backoff.random_jitter,
)
@backoff.on_predicate(
backoff.expo,
lambda response: response.status_code == 429,
max_tries=3,
base=2.0,
jitter=backoff.random_jitter,
)
def get(self, url: str, **kwargs) -> requests.Response:
timeout = kwargs.get("timeout", self.config.timeout_seconds)
response = self.session.get(url, timeout=timeout, **kwargs)
response.raise_for_status()
if (
hasattr(self.config, "rate_limit_delay")
and self.config.rate_limit_delay > 0
):
time.sleep(self.config.rate_limit_delay)
return response
def close(self) -> None:
self.session.close()
class CloudScraperClient:
def __init__(self, config: ScraperConfig):
import cloudscraper
self.config = config
self.scraper = cloudscraper.create_scraper()
@backoff.on_exception(
backoff.expo,
(requests.RequestException, requests.HTTPError),
max_tries=3,
base=2.0,
jitter=backoff.random_jitter,
)
def get(self, url: str, **kwargs) -> requests.Response:
timeout = kwargs.get("timeout", self.config.timeout_seconds)
response = self.scraper.get(url, timeout=timeout, **kwargs)
response.raise_for_status()
if (
hasattr(self.config, "rate_limit_delay")
and self.config.rate_limit_delay > 0
):
time.sleep(self.config.rate_limit_delay)
return response
def close(self) -> None:
if hasattr(self.scraper, "close"):
self.scraper.close()
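
The deleted clients module was the one place retries lived; atcoder.py and cses.py still import backoff and apply the same pattern at module level. For reference, a minimal sketch of that retry decoration on a bare requests call:

import backoff
import requests

@backoff.on_exception(
    backoff.expo,
    requests.RequestException,  # HTTPError subclasses this, so one entry covers both
    max_tries=3,
    base=2.0,
    jitter=backoff.random_jitter,
)
def fetch(url: str) -> requests.Response:
    response = requests.get(url, timeout=10)
    response.raise_for_status()  # the raise here is what triggers a retried attempt
    return response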

codeforces.py

@@ -5,10 +5,10 @@ import re
import sys
from dataclasses import asdict
import cloudscraper
from bs4 import BeautifulSoup, Tag
from .base import BaseScraper, HttpClient
from .clients import CloudScraperClient
from .base import BaseScraper
from .models import (
ContestListResult,
ContestSummary,
@@ -24,9 +24,6 @@ class CodeforcesScraper(BaseScraper):
def platform_name(self) -> str:
return "codeforces"
def _create_client(self) -> HttpClient:
return CloudScraperClient(self.config)
def scrape_contest_metadata(self, contest_id: str) -> MetadataResult:
return self._safe_execute(
"metadata", self._scrape_contest_metadata_impl, contest_id
@@ -41,7 +38,7 @@ class CodeforcesScraper(BaseScraper):
return self._safe_execute("contests", self._scrape_contest_list_impl)
def _scrape_contest_metadata_impl(self, contest_id: str) -> MetadataResult:
problems = scrape_contest_problems(contest_id, self.client)
problems = scrape_contest_problems(contest_id)
if not problems:
return self._create_metadata_error(
f"No problems found for contest {contest_id}", contest_id
@@ -55,9 +52,11 @@ class CodeforcesScraper(BaseScraper):
) -> TestsResult:
problem_id = contest_id + problem_letter.lower()
url = parse_problem_url(contest_id, problem_letter)
tests = scrape_sample_tests(url, self.client)
tests = scrape_sample_tests(url)
response = self.client.get(url)
scraper = cloudscraper.create_scraper()
response = scraper.get(url, timeout=self.config.timeout_seconds)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
timeout_ms, memory_mb = extract_problem_limits(soup)
@@ -77,15 +76,17 @@ class CodeforcesScraper(BaseScraper):
)
def _scrape_contest_list_impl(self) -> ContestListResult:
contests = scrape_contests(self.client)
contests = scrape_contests()
if not contests:
return self._create_contests_error("No contests found")
return ContestListResult(success=True, error="", contests=contests)
def scrape(url: str, client: HttpClient) -> list[TestCase]:
def scrape(url: str) -> list[TestCase]:
try:
response = client.get(url)
scraper = cloudscraper.create_scraper()
response = scraper.get(url, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
input_sections = soup.find_all("div", class_="input")
@@ -239,12 +240,12 @@ def extract_problem_limits(soup: BeautifulSoup) -> tuple[int, float]:
return timeout_ms, memory_mb
def scrape_contest_problems(
contest_id: str, client: HttpClient
) -> list[ProblemSummary]:
def scrape_contest_problems(contest_id: str) -> list[ProblemSummary]:
try:
contest_url: str = f"https://codeforces.com/contest/{contest_id}"
response = client.get(contest_url)
scraper = cloudscraper.create_scraper()
response = scraper.get(contest_url, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
problems: list[ProblemSummary] = []
@@ -280,13 +281,15 @@ def scrape_contest_problems(
return []
def scrape_sample_tests(url: str, client: HttpClient) -> list[TestCase]:
def scrape_sample_tests(url: str) -> list[TestCase]:
print(f"Scraping: {url}", file=sys.stderr)
return scrape(url, client)
return scrape(url)
def scrape_contests(client: HttpClient) -> list[ContestSummary]:
response = client.get("https://codeforces.com/api/contest.list")
def scrape_contests() -> list[ContestSummary]:
scraper = cloudscraper.create_scraper()
response = scraper.get("https://codeforces.com/api/contest.list", timeout=10)
response.raise_for_status()
data = response.json()
if data["status"] != "OK":
@@ -364,8 +367,6 @@ def main() -> None:
print(json.dumps(asdict(result)))
sys.exit(1)
scraper.close()
if __name__ == "__main__":
main()
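
With CloudScraperClient gone, each call site builds its own cloudscraper session inline, as the hunks above show. The pattern in isolation (create_scraper() returns a requests.Session-compatible object):

import cloudscraper

scraper = cloudscraper.create_scraper()  # drop-in replacement for requests.Session
response = scraper.get("https://codeforces.com/api/contest.list", timeout=10)
response.raise_for_status()
data = response.json()
if data["status"] != "OK":
    raise RuntimeError("Codeforces API returned non-OK status")

One trade-off worth noting: constructing a scraper per call means any Cloudflare challenge is re-solved on every request rather than amortized over a shared session.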

cses.py

@@ -9,6 +9,7 @@ import backoff
import requests
from bs4 import BeautifulSoup, Tag
from .base import BaseScraper
from .models import (
ContestListResult,
ContestSummary,
@@ -322,6 +323,111 @@ def scrape(url: str) -> list[TestCase]:
return []
class CSESScraper(BaseScraper):
@property
def platform_name(self) -> str:
return "cses"
def scrape_contest_metadata(self, contest_id: str) -> MetadataResult:
return self._safe_execute("metadata", self._scrape_metadata_impl, contest_id)
def scrape_problem_tests(self, contest_id: str, problem_id: str) -> TestsResult:
return self._safe_execute(
"tests", self._scrape_tests_impl, contest_id, problem_id
)
def scrape_contest_list(self) -> ContestListResult:
return self._safe_execute("contests", self._scrape_contests_impl)
def _safe_execute(self, operation: str, func, *args):
try:
return func(*args)
except Exception as e:
error_msg = f"{self.platform_name}: {str(e)}"
if operation == "metadata":
return MetadataResult(success=False, error=error_msg)
elif operation == "tests":
return TestsResult(
success=False,
error=error_msg,
problem_id="",
url="",
tests=[],
timeout_ms=0,
memory_mb=0,
)
elif operation == "contests":
return ContestListResult(success=False, error=error_msg)
def _scrape_metadata_impl(self, category_id: str) -> MetadataResult:
problems = scrape_category_problems(category_id)
if not problems:
return MetadataResult(
success=False,
error=f"{self.platform_name}: No problems found for category: {category_id}",
)
return MetadataResult(
success=True, error="", contest_id=category_id, problems=problems
)
def _scrape_tests_impl(self, category: str, problem_id: str) -> TestsResult:
url = parse_problem_url(problem_id)
if not url:
return TestsResult(
success=False,
error=f"{self.platform_name}: Invalid problem input: {problem_id}. Use either problem ID (e.g., 1068) or full URL",
problem_id=problem_id if problem_id.isdigit() else "",
url="",
tests=[],
timeout_ms=0,
memory_mb=0,
)
tests = scrape(url)
actual_problem_id = (
problem_id if problem_id.isdigit() else problem_id.split("/")[-1]
)
headers = {
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
timeout_ms, memory_mb = extract_problem_limits(soup)
if not tests:
return TestsResult(
success=False,
error=f"{self.platform_name}: No tests found for {problem_id}",
problem_id=actual_problem_id,
url=url,
tests=[],
timeout_ms=timeout_ms,
memory_mb=memory_mb,
)
return TestsResult(
success=True,
error="",
problem_id=actual_problem_id,
url=url,
tests=tests,
timeout_ms=timeout_ms,
memory_mb=memory_mb,
)
def _scrape_contests_impl(self) -> ContestListResult:
categories = scrape_categories()
if not categories:
return ContestListResult(
success=False, error=f"{self.platform_name}: No contests found"
)
return ContestListResult(success=True, error="", contests=categories)
def main() -> None:
if len(sys.argv) < 2:
result = MetadataResult(
@@ -332,6 +438,7 @@ def main() -> None:
sys.exit(1)
mode: str = sys.argv[1]
scraper = CSESScraper()
if mode == "metadata":
if len(sys.argv) != 3:
@@ -343,18 +450,10 @@ def main() -> None:
sys.exit(1)
category_id = sys.argv[2]
problems = scrape_category_problems(category_id)
if not problems:
result = MetadataResult(
success=False,
error=f"No problems found for category: {category_id}",
)
print(json.dumps(asdict(result)))
return
result = MetadataResult(success=True, error="", problems=problems)
result = scraper.scrape_contest_metadata(category_id)
print(json.dumps(asdict(result)))
if not result.success:
sys.exit(1)
elif mode == "tests":
if len(sys.argv) != 4:
@@ -370,73 +469,12 @@ def main() -> None:
print(json.dumps(asdict(tests_result)))
sys.exit(1)
problem_input: str = sys.argv[3]
url: str | None = parse_problem_url(problem_input)
if not url:
tests_result = TestsResult(
success=False,
error=f"Invalid problem input: {problem_input}. Use either problem ID (e.g., 1068) or full URL",
problem_id=problem_input if problem_input.isdigit() else "",
url="",
tests=[],
timeout_ms=0,
memory_mb=0,
)
print(json.dumps(asdict(tests_result)))
sys.exit(1)
tests: list[TestCase] = scrape(url)
problem_id: str = (
problem_input if problem_input.isdigit() else problem_input.split("/")[-1]
)
try:
headers = {
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
timeout_ms, memory_mb = extract_problem_limits(soup)
except Exception as e:
tests_result = TestsResult(
success=False,
error=f"Failed to extract constraints: {e}",
problem_id=problem_id,
url=url,
tests=[],
timeout_ms=0,
memory_mb=0,
)
print(json.dumps(asdict(tests_result)))
sys.exit(1)
if not tests:
tests_result = TestsResult(
success=False,
error=f"No tests found for {problem_input}",
problem_id=problem_id,
url=url,
tests=[],
timeout_ms=timeout_ms,
memory_mb=memory_mb,
)
print(json.dumps(asdict(tests_result)))
sys.exit(1)
test_cases = tests
tests_result = TestsResult(
success=True,
error="",
problem_id=problem_id,
url=url,
tests=test_cases,
timeout_ms=timeout_ms,
memory_mb=memory_mb,
)
category = sys.argv[2]
problem_id = sys.argv[3]
tests_result = scraper.scrape_problem_tests(category, problem_id)
print(json.dumps(asdict(tests_result)))
if not tests_result.success:
sys.exit(1)
elif mode == "contests":
if len(sys.argv) != 2:
@@ -446,14 +484,10 @@ def main() -> None:
print(json.dumps(asdict(contest_result)))
sys.exit(1)
categories = scrape_categories()
if not categories:
contest_result = ContestListResult(success=False, error="No contests found")
print(json.dumps(asdict(contest_result)))
sys.exit(1)
contest_result = ContestListResult(success=True, error="", contests=categories)
contest_result = scraper.scrape_contest_list()
print(json.dumps(asdict(contest_result)))
if not contest_result.success:
sys.exit(1)
else:
result = MetadataResult(
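
parse_problem_url itself is outside this diff; judging only from its call sites (a numeric task id or a full URL in, a normalized URL or None out), a plausible reconstruction looks like this — the regex and URL shape are assumptions:

import re

def parse_problem_url(problem_input: str) -> str | None:
    # Hypothetical reconstruction; the real helper may differ.
    if problem_input.isdigit():
        return f"https://cses.fi/problemset/task/{problem_input}"
    if re.fullmatch(r"https?://cses\.fi/problemset/task/\d+/?", problem_input):
        return problem_input
    return None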