feat(scrapers): refactor

This commit is contained in:
Barrett Ruth 2026-01-27 14:44:08 -05:00
parent 7dafb7ea43
commit 5293515aca
3 changed files with 83 additions and 221 deletions

View file

@ -1,9 +1,8 @@
import asyncio
import sys
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from typing import Any, Awaitable, Callable, ParamSpec, cast
from .models import ContestListResult, MetadataResult, TestsResult from .models import CombinedTest, ContestListResult, MetadataResult, TestsResult
P = ParamSpec("P")
class BaseScraper(ABC): class BaseScraper(ABC):
@ -20,57 +19,65 @@ class BaseScraper(ABC):
@abstractmethod @abstractmethod
async def stream_tests_for_category_async(self, category_id: str) -> None: ... async def stream_tests_for_category_async(self, category_id: str) -> None: ...
def _create_metadata_error( def _usage(self) -> str:
self, error_msg: str, contest_id: str = "" name = self.platform_name
) -> MetadataResult: return f"Usage: {name}.py metadata <id> | tests <id> | contests"
return MetadataResult(
success=False,
error=f"{self.platform_name}: {error_msg}",
contest_id=contest_id,
problems=[],
url="",
)
def _create_tests_error( def _metadata_error(self, msg: str) -> MetadataResult:
self, error_msg: str, problem_id: str = "", url: str = "" return MetadataResult(success=False, error=msg, url="")
) -> TestsResult:
from .models import CombinedTest
def _tests_error(self, msg: str) -> TestsResult:
return TestsResult( return TestsResult(
success=False, success=False,
error=f"{self.platform_name}: {error_msg}", error=msg,
problem_id=problem_id, problem_id="",
combined=CombinedTest(input="", expected=""), combined=CombinedTest(input="", expected=""),
tests=[], tests=[],
timeout_ms=0, timeout_ms=0,
memory_mb=0, memory_mb=0,
interactive=False,
) )
def _create_contests_error(self, error_msg: str) -> ContestListResult: def _contests_error(self, msg: str) -> ContestListResult:
return ContestListResult( return ContestListResult(success=False, error=msg)
success=False,
error=f"{self.platform_name}: {error_msg}",
contests=[],
)
async def _safe_execute( async def _run_cli_async(self, args: list[str]) -> int:
self, if len(args) < 2:
operation: str, print(self._metadata_error(self._usage()).model_dump_json())
func: Callable[P, Awaitable[Any]], return 1
*args: P.args,
**kwargs: P.kwargs, mode = args[1]
):
try: match mode:
return await func(*args, **kwargs) case "metadata":
except Exception as e: if len(args) != 3:
if operation == "metadata": print(self._metadata_error(self._usage()).model_dump_json())
contest_id = cast(str, args[0]) if args else "" return 1
return self._create_metadata_error(str(e), contest_id) result = await self.scrape_contest_metadata(args[2])
elif operation == "tests": print(result.model_dump_json())
problem_id = cast(str, args[1]) if len(args) > 1 else "" return 0 if result.success else 1
return self._create_tests_error(str(e), problem_id)
elif operation == "contests": case "tests":
return self._create_contests_error(str(e)) if len(args) != 3:
else: print(self._tests_error(self._usage()).model_dump_json())
raise return 1
await self.stream_tests_for_category_async(args[2])
return 0
case "contests":
if len(args) != 2:
print(self._contests_error(self._usage()).model_dump_json())
return 1
result = await self.scrape_contest_list()
print(result.model_dump_json())
return 0 if result.success else 1
case _:
print(
self._metadata_error(
f"Unknown mode: {mode}. {self._usage()}"
).model_dump_json()
)
return 1
def run_cli(self) -> None:
    """Run the command-line interface and terminate the process.

    Passes ``sys.argv`` to ``_run_cli_async``, drives that coroutine to
    completion with ``asyncio.run``, and hands the integer it returns to
    ``sys.exit`` as the process exit status.
    """
    exit_code = asyncio.run(self._run_cli_async(sys.argv))
    sys.exit(exit_code)

View file

@ -4,7 +4,6 @@ import asyncio
import json import json
import logging import logging
import re import re
import sys
from typing import Any from typing import Any
import requests import requests
@ -13,13 +12,11 @@ from scrapling.fetchers import Fetcher
from .base import BaseScraper from .base import BaseScraper
from .models import ( from .models import (
CombinedTest,
ContestListResult, ContestListResult,
ContestSummary, ContestSummary,
MetadataResult, MetadataResult,
ProblemSummary, ProblemSummary,
TestCase, TestCase,
TestsResult,
) )
# suppress scrapling logging - https://github.com/D4Vinci/Scrapling/issues/31 # suppress scrapling logging - https://github.com/D4Vinci/Scrapling/issues/31
@ -209,49 +206,46 @@ class CodeforcesScraper(BaseScraper):
return "codeforces" return "codeforces"
async def scrape_contest_metadata(self, contest_id: str) -> MetadataResult: async def scrape_contest_metadata(self, contest_id: str) -> MetadataResult:
async def impl(cid: str) -> MetadataResult: try:
problems = await asyncio.to_thread(_scrape_contest_problems_sync, cid) problems = await asyncio.to_thread(
_scrape_contest_problems_sync, contest_id
)
if not problems: if not problems:
return self._create_metadata_error( return self._metadata_error(
f"No problems found for contest {cid}", cid f"No problems found for contest {contest_id}"
) )
return MetadataResult( return MetadataResult(
success=True, success=True,
error="", error="",
contest_id=cid, contest_id=contest_id,
problems=problems, problems=problems,
url=f"https://codeforces.com/contest/{contest_id}/problem/%s", url=f"https://codeforces.com/contest/{contest_id}/problem/%s",
) )
except Exception as e:
return await self._safe_execute("metadata", impl, contest_id) return self._metadata_error(str(e))
async def scrape_contest_list(self) -> ContestListResult: async def scrape_contest_list(self) -> ContestListResult:
async def impl() -> ContestListResult: try:
try: r = requests.get(API_CONTEST_LIST_URL, timeout=TIMEOUT_SECONDS)
r = requests.get(API_CONTEST_LIST_URL, timeout=TIMEOUT_SECONDS) r.raise_for_status()
r.raise_for_status() data = r.json()
data = r.json() if data.get("status") != "OK":
if data.get("status") != "OK": return self._contests_error("Invalid API response")
return self._create_contests_error("Invalid API response")
contests: list[ContestSummary] = [] contests: list[ContestSummary] = []
for c in data["result"]: for c in data["result"]:
if c.get("phase") != "FINISHED": if c.get("phase") != "FINISHED":
continue continue
cid = str(c["id"]) cid = str(c["id"])
name = c["name"] name = c["name"]
contests.append( contests.append(ContestSummary(id=cid, name=name, display_name=name))
ContestSummary(id=cid, name=name, display_name=name)
)
if not contests: if not contests:
return self._create_contests_error("No contests found") return self._contests_error("No contests found")
return ContestListResult(success=True, error="", contests=contests) return ContestListResult(success=True, error="", contests=contests)
except Exception as e: except Exception as e:
return self._create_contests_error(str(e)) return self._contests_error(str(e))
return await self._safe_execute("contests", impl)
async def stream_tests_for_category_async(self, category_id: str) -> None: async def stream_tests_for_category_async(self, category_id: str) -> None:
html = await asyncio.to_thread(_fetch_problems_html, category_id) html = await asyncio.to_thread(_fetch_problems_html, category_id)
@ -281,73 +275,5 @@ class CodeforcesScraper(BaseScraper):
) )
async def main_async() -> int:
    """Dispatch the Codeforces scraper command line and report the outcome.

    The mode is read from ``sys.argv[1]``:

    * ``metadata <contest_id>`` - scrape contest metadata and print it as JSON.
    * ``tests <contest_id>`` - delegate to
      ``stream_tests_for_category_async``; nothing is printed here.
    * ``contests`` - scrape the contest list and print it as JSON.

    Usage errors and unknown modes are also reported as a JSON result object
    with ``success=False`` so callers always get machine-readable output.

    Returns:
        ``0`` when the requested operation succeeded, ``1`` on a usage error,
        an unknown mode, or a result whose ``success`` flag is false. The
        ``tests`` mode returns ``0`` whenever streaming completes without
        raising.
    """
    if len(sys.argv) < 2:
        result = MetadataResult(
            success=False,
            error="Usage: codeforces.py metadata <contest_id> OR codeforces.py tests <contest_id> OR codeforces.py contests",
            url="",
        )
        print(result.model_dump_json())
        return 1

    mode: str = sys.argv[1]
    scraper = CodeforcesScraper()

    if mode == "metadata":
        if len(sys.argv) != 3:
            result = MetadataResult(
                success=False,
                error="Usage: codeforces.py metadata <contest_id>",
                url="",
            )
            print(result.model_dump_json())
            return 1
        contest_id = sys.argv[2]
        result = await scraper.scrape_contest_metadata(contest_id)
        print(result.model_dump_json())
        return 0 if result.success else 1

    if mode == "tests":
        if len(sys.argv) != 3:
            tests_result = TestsResult(
                success=False,
                error="Usage: codeforces.py tests <contest_id>",
                problem_id="",
                combined=CombinedTest(input="", expected=""),
                tests=[],
                timeout_ms=0,
                memory_mb=0,
            )
            print(tests_result.model_dump_json())
            return 1
        contest_id = sys.argv[2]
        await scraper.stream_tests_for_category_async(contest_id)
        return 0

    if mode == "contests":
        if len(sys.argv) != 2:
            contest_result = ContestListResult(
                success=False, error="Usage: codeforces.py contests"
            )
            print(contest_result.model_dump_json())
            return 1
        contest_result = await scraper.scrape_contest_list()
        print(contest_result.model_dump_json())
        return 0 if contest_result.success else 1

    # Name the rejected mode in the message so the caller can see what was
    # actually passed, rather than only the list of accepted modes.
    result = MetadataResult(
        success=False,
        error=f"Unknown mode: {mode}. Use 'metadata <contest_id>', 'tests <contest_id>', or 'contests'",
        url="",
    )
    print(result.model_dump_json())
    return 1
def main() -> None:
    """Synchronous entry point: run ``main_async`` and exit with its result.

    The integer returned by ``main_async`` becomes the process exit status.
    """
    exit_code = asyncio.run(main_async())
    sys.exit(exit_code)
if __name__ == "__main__": if __name__ == "__main__":
main() CodeforcesScraper().run_cli()

View file

@ -3,20 +3,17 @@
import asyncio import asyncio
import json import json
import re import re
import sys
from typing import Any from typing import Any
import httpx import httpx
from .base import BaseScraper from .base import BaseScraper
from .models import ( from .models import (
CombinedTest,
ContestListResult, ContestListResult,
ContestSummary, ContestSummary,
MetadataResult, MetadataResult,
ProblemSummary, ProblemSummary,
TestCase, TestCase,
TestsResult,
) )
BASE_URL = "https://cses.fi" BASE_URL = "https://cses.fi"
@ -261,73 +258,5 @@ class CSESScraper(BaseScraper):
print(json.dumps(payload), flush=True) print(json.dumps(payload), flush=True)
async def main_async() -> int:
if len(sys.argv) < 2:
result = MetadataResult(
success=False,
error="Usage: cses.py metadata <category_id> OR cses.py tests <category> OR cses.py contests",
url="",
)
print(result.model_dump_json())
return 1
mode: str = sys.argv[1]
scraper = CSESScraper()
if mode == "metadata":
if len(sys.argv) != 3:
result = MetadataResult(
success=False,
error="Usage: cses.py metadata <category_id>",
url="",
)
print(result.model_dump_json())
return 1
category_id = sys.argv[2]
result = await scraper.scrape_contest_metadata(category_id)
print(result.model_dump_json())
return 0 if result.success else 1
if mode == "tests":
if len(sys.argv) != 3:
tests_result = TestsResult(
success=False,
error="Usage: cses.py tests <category>",
problem_id="",
combined=CombinedTest(input="", expected=""),
tests=[],
timeout_ms=0,
memory_mb=0,
)
print(tests_result.model_dump_json())
return 1
category = sys.argv[2]
await scraper.stream_tests_for_category_async(category)
return 0
if mode == "contests":
if len(sys.argv) != 2:
contest_result = ContestListResult(
success=False, error="Usage: cses.py contests"
)
print(contest_result.model_dump_json())
return 1
contest_result = await scraper.scrape_contest_list()
print(contest_result.model_dump_json())
return 0 if contest_result.success else 1
result = MetadataResult(
success=False,
error=f"Unknown mode: {mode}. Use 'metadata <category>', 'tests <category>', or 'contests'",
url="",
)
print(result.model_dump_json())
return 1
def main() -> None:
    """Synchronous entry point: run ``main_async`` and exit with its result.

    The integer returned by ``main_async`` becomes the process exit status.
    """
    status = asyncio.run(main_async())
    sys.exit(status)
if __name__ == "__main__": if __name__ == "__main__":
main() CSESScraper().run_cli()