Merge pull request #101 from barrett-ruth/refactor/scraper-reorganize
scraper qol
This commit is contained in:
commit
5707a28d58
4 changed files with 80 additions and 172 deletions
|
|
@ -11,20 +11,11 @@ from .models import (
|
||||||
TestsResult,
|
TestsResult,
|
||||||
)
|
)
|
||||||
|
|
||||||
ALL_SCRAPERS: dict[str, type[BaseScraper]] = {
|
__all__ = [
|
||||||
"atcoder": AtCoderScraper,
|
|
||||||
"codeforces": CodeforcesScraper,
|
|
||||||
"cses": CSESScraper,
|
|
||||||
}
|
|
||||||
|
|
||||||
_SCRAPER_CLASSES = [
|
|
||||||
"AtCoderScraper",
|
"AtCoderScraper",
|
||||||
|
"BaseScraper",
|
||||||
"CodeforcesScraper",
|
"CodeforcesScraper",
|
||||||
"CSESScraper",
|
"CSESScraper",
|
||||||
]
|
|
||||||
|
|
||||||
_BASE_EXPORTS = [
|
|
||||||
"BaseScraper",
|
|
||||||
"ScraperConfig",
|
"ScraperConfig",
|
||||||
"ContestListResult",
|
"ContestListResult",
|
||||||
"ContestSummary",
|
"ContestSummary",
|
||||||
|
|
@ -33,34 +24,3 @@ _BASE_EXPORTS = [
|
||||||
"TestCase",
|
"TestCase",
|
||||||
"TestsResult",
|
"TestsResult",
|
||||||
]
|
]
|
||||||
|
|
||||||
_REGISTRY_FUNCTIONS = [
|
|
||||||
"get_scraper",
|
|
||||||
"list_platforms",
|
|
||||||
"ALL_SCRAPERS",
|
|
||||||
]
|
|
||||||
|
|
||||||
__all__ = _BASE_EXPORTS + _SCRAPER_CLASSES + _REGISTRY_FUNCTIONS
|
|
||||||
|
|
||||||
_exported_types = (
|
|
||||||
ScraperConfig,
|
|
||||||
ContestListResult,
|
|
||||||
ContestSummary,
|
|
||||||
MetadataResult,
|
|
||||||
ProblemSummary,
|
|
||||||
TestCase,
|
|
||||||
TestsResult,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def get_scraper(platform: str) -> type[BaseScraper]:
|
|
||||||
if platform not in ALL_SCRAPERS:
|
|
||||||
available = ", ".join(ALL_SCRAPERS.keys())
|
|
||||||
raise KeyError(
|
|
||||||
f"Unknown platform '{platform}'. Available platforms: {available}"
|
|
||||||
)
|
|
||||||
return ALL_SCRAPERS[platform]
|
|
||||||
|
|
||||||
|
|
||||||
def list_platforms() -> list[str]:
|
|
||||||
return list(ALL_SCRAPERS.keys())
|
|
||||||
|
|
|
||||||
|
|
@ -19,69 +19,6 @@ from .models import (
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
class CodeforcesScraper(BaseScraper):
|
|
||||||
@property
|
|
||||||
def platform_name(self) -> str:
|
|
||||||
return "codeforces"
|
|
||||||
|
|
||||||
def scrape_contest_metadata(self, contest_id: str) -> MetadataResult:
|
|
||||||
return self._safe_execute(
|
|
||||||
"metadata", self._scrape_contest_metadata_impl, contest_id
|
|
||||||
)
|
|
||||||
|
|
||||||
def scrape_problem_tests(self, contest_id: str, problem_id: str) -> TestsResult:
|
|
||||||
return self._safe_execute(
|
|
||||||
"tests", self._scrape_problem_tests_impl, contest_id, problem_id
|
|
||||||
)
|
|
||||||
|
|
||||||
def scrape_contest_list(self) -> ContestListResult:
|
|
||||||
return self._safe_execute("contests", self._scrape_contest_list_impl)
|
|
||||||
|
|
||||||
def _scrape_contest_metadata_impl(self, contest_id: str) -> MetadataResult:
|
|
||||||
problems = scrape_contest_problems(contest_id)
|
|
||||||
if not problems:
|
|
||||||
return self._create_metadata_error(
|
|
||||||
f"No problems found for contest {contest_id}", contest_id
|
|
||||||
)
|
|
||||||
return MetadataResult(
|
|
||||||
success=True, error="", contest_id=contest_id, problems=problems
|
|
||||||
)
|
|
||||||
|
|
||||||
def _scrape_problem_tests_impl(
|
|
||||||
self, contest_id: str, problem_letter: str
|
|
||||||
) -> TestsResult:
|
|
||||||
problem_id = contest_id + problem_letter.lower()
|
|
||||||
url = parse_problem_url(contest_id, problem_letter)
|
|
||||||
tests = scrape_sample_tests(url)
|
|
||||||
|
|
||||||
scraper = cloudscraper.create_scraper()
|
|
||||||
response = scraper.get(url, timeout=self.config.timeout_seconds)
|
|
||||||
response.raise_for_status()
|
|
||||||
soup = BeautifulSoup(response.text, "html.parser")
|
|
||||||
timeout_ms, memory_mb = extract_problem_limits(soup)
|
|
||||||
|
|
||||||
if not tests:
|
|
||||||
return self._create_tests_error(
|
|
||||||
f"No tests found for {contest_id} {problem_letter}", problem_id, url
|
|
||||||
)
|
|
||||||
|
|
||||||
return TestsResult(
|
|
||||||
success=True,
|
|
||||||
error="",
|
|
||||||
problem_id=problem_id,
|
|
||||||
url=url,
|
|
||||||
tests=tests,
|
|
||||||
timeout_ms=timeout_ms,
|
|
||||||
memory_mb=memory_mb,
|
|
||||||
)
|
|
||||||
|
|
||||||
def _scrape_contest_list_impl(self) -> ContestListResult:
|
|
||||||
contests = scrape_contests()
|
|
||||||
if not contests:
|
|
||||||
return self._create_contests_error("No contests found")
|
|
||||||
return ContestListResult(success=True, error="", contests=contests)
|
|
||||||
|
|
||||||
|
|
||||||
def scrape(url: str) -> list[TestCase]:
|
def scrape(url: str) -> list[TestCase]:
|
||||||
try:
|
try:
|
||||||
scraper = cloudscraper.create_scraper()
|
scraper = cloudscraper.create_scraper()
|
||||||
|
|
@ -305,6 +242,69 @@ def scrape_contests() -> list[ContestSummary]:
|
||||||
return contests
|
return contests
|
||||||
|
|
||||||
|
|
||||||
|
class CodeforcesScraper(BaseScraper):
|
||||||
|
@property
|
||||||
|
def platform_name(self) -> str:
|
||||||
|
return "codeforces"
|
||||||
|
|
||||||
|
def scrape_contest_metadata(self, contest_id: str) -> MetadataResult:
|
||||||
|
return self._safe_execute(
|
||||||
|
"metadata", self._scrape_contest_metadata_impl, contest_id
|
||||||
|
)
|
||||||
|
|
||||||
|
def scrape_problem_tests(self, contest_id: str, problem_id: str) -> TestsResult:
|
||||||
|
return self._safe_execute(
|
||||||
|
"tests", self._scrape_problem_tests_impl, contest_id, problem_id
|
||||||
|
)
|
||||||
|
|
||||||
|
def scrape_contest_list(self) -> ContestListResult:
|
||||||
|
return self._safe_execute("contests", self._scrape_contest_list_impl)
|
||||||
|
|
||||||
|
def _scrape_contest_metadata_impl(self, contest_id: str) -> MetadataResult:
|
||||||
|
problems = scrape_contest_problems(contest_id)
|
||||||
|
if not problems:
|
||||||
|
return self._create_metadata_error(
|
||||||
|
f"No problems found for contest {contest_id}", contest_id
|
||||||
|
)
|
||||||
|
return MetadataResult(
|
||||||
|
success=True, error="", contest_id=contest_id, problems=problems
|
||||||
|
)
|
||||||
|
|
||||||
|
def _scrape_problem_tests_impl(
|
||||||
|
self, contest_id: str, problem_letter: str
|
||||||
|
) -> TestsResult:
|
||||||
|
problem_id = contest_id + problem_letter.lower()
|
||||||
|
url = parse_problem_url(contest_id, problem_letter)
|
||||||
|
tests = scrape_sample_tests(url)
|
||||||
|
|
||||||
|
scraper = cloudscraper.create_scraper()
|
||||||
|
response = scraper.get(url, timeout=self.config.timeout_seconds)
|
||||||
|
response.raise_for_status()
|
||||||
|
soup = BeautifulSoup(response.text, "html.parser")
|
||||||
|
timeout_ms, memory_mb = extract_problem_limits(soup)
|
||||||
|
|
||||||
|
if not tests:
|
||||||
|
return self._create_tests_error(
|
||||||
|
f"No tests found for {contest_id} {problem_letter}", problem_id, url
|
||||||
|
)
|
||||||
|
|
||||||
|
return TestsResult(
|
||||||
|
success=True,
|
||||||
|
error="",
|
||||||
|
problem_id=problem_id,
|
||||||
|
url=url,
|
||||||
|
tests=tests,
|
||||||
|
timeout_ms=timeout_ms,
|
||||||
|
memory_mb=memory_mb,
|
||||||
|
)
|
||||||
|
|
||||||
|
def _scrape_contest_list_impl(self) -> ContestListResult:
|
||||||
|
contests = scrape_contests()
|
||||||
|
if not contests:
|
||||||
|
return self._create_contests_error("No contests found")
|
||||||
|
return ContestListResult(success=True, error="", contests=contests)
|
||||||
|
|
||||||
|
|
||||||
def main() -> None:
|
def main() -> None:
|
||||||
if len(sys.argv) < 2:
|
if len(sys.argv) < 2:
|
||||||
result = MetadataResult(
|
result = MetadataResult(
|
||||||
|
|
|
||||||
|
|
@ -1,15 +1,21 @@
|
||||||
|
import inspect
|
||||||
from unittest.mock import Mock
|
from unittest.mock import Mock
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from scrapers import ALL_SCRAPERS, BaseScraper
|
import scrapers
|
||||||
|
from scrapers.base import BaseScraper
|
||||||
from scrapers.models import ContestListResult, MetadataResult, TestsResult
|
from scrapers.models import ContestListResult, MetadataResult, TestsResult
|
||||||
|
|
||||||
ALL_SCRAPER_CLASSES = list(ALL_SCRAPERS.values())
|
SCRAPERS = [
|
||||||
|
cls
|
||||||
|
for name, cls in inspect.getmembers(scrapers, inspect.isclass)
|
||||||
|
if issubclass(cls, BaseScraper) and cls != BaseScraper
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
class TestScraperInterfaceCompliance:
|
class TestScraperInterfaceCompliance:
|
||||||
@pytest.mark.parametrize("scraper_class", ALL_SCRAPER_CLASSES)
|
@pytest.mark.parametrize("scraper_class", SCRAPERS)
|
||||||
def test_implements_base_interface(self, scraper_class):
|
def test_implements_base_interface(self, scraper_class):
|
||||||
scraper = scraper_class()
|
scraper = scraper_class()
|
||||||
|
|
||||||
|
|
@ -19,7 +25,7 @@ class TestScraperInterfaceCompliance:
|
||||||
assert hasattr(scraper, "scrape_problem_tests")
|
assert hasattr(scraper, "scrape_problem_tests")
|
||||||
assert hasattr(scraper, "scrape_contest_list")
|
assert hasattr(scraper, "scrape_contest_list")
|
||||||
|
|
||||||
@pytest.mark.parametrize("scraper_class", ALL_SCRAPER_CLASSES)
|
@pytest.mark.parametrize("scraper_class", SCRAPERS)
|
||||||
def test_platform_name_is_string(self, scraper_class):
|
def test_platform_name_is_string(self, scraper_class):
|
||||||
scraper = scraper_class()
|
scraper = scraper_class()
|
||||||
platform_name = scraper.platform_name
|
platform_name = scraper.platform_name
|
||||||
|
|
@ -28,7 +34,7 @@ class TestScraperInterfaceCompliance:
|
||||||
assert len(platform_name) > 0
|
assert len(platform_name) > 0
|
||||||
assert platform_name.islower() # Convention: lowercase platform names
|
assert platform_name.islower() # Convention: lowercase platform names
|
||||||
|
|
||||||
@pytest.mark.parametrize("scraper_class", ALL_SCRAPER_CLASSES)
|
@pytest.mark.parametrize("scraper_class", SCRAPERS)
|
||||||
def test_metadata_method_signature(self, scraper_class, mocker):
|
def test_metadata_method_signature(self, scraper_class, mocker):
|
||||||
scraper = scraper_class()
|
scraper = scraper_class()
|
||||||
|
|
||||||
|
|
@ -53,7 +59,7 @@ class TestScraperInterfaceCompliance:
|
||||||
assert isinstance(result.success, bool)
|
assert isinstance(result.success, bool)
|
||||||
assert isinstance(result.error, str)
|
assert isinstance(result.error, str)
|
||||||
|
|
||||||
@pytest.mark.parametrize("scraper_class", ALL_SCRAPER_CLASSES)
|
@pytest.mark.parametrize("scraper_class", SCRAPERS)
|
||||||
def test_problem_tests_method_signature(self, scraper_class, mocker):
|
def test_problem_tests_method_signature(self, scraper_class, mocker):
|
||||||
scraper = scraper_class()
|
scraper = scraper_class()
|
||||||
|
|
||||||
|
|
@ -85,7 +91,7 @@ class TestScraperInterfaceCompliance:
|
||||||
assert isinstance(result.success, bool)
|
assert isinstance(result.success, bool)
|
||||||
assert isinstance(result.error, str)
|
assert isinstance(result.error, str)
|
||||||
|
|
||||||
@pytest.mark.parametrize("scraper_class", ALL_SCRAPER_CLASSES)
|
@pytest.mark.parametrize("scraper_class", SCRAPERS)
|
||||||
def test_contest_list_method_signature(self, scraper_class, mocker):
|
def test_contest_list_method_signature(self, scraper_class, mocker):
|
||||||
scraper = scraper_class()
|
scraper = scraper_class()
|
||||||
|
|
||||||
|
|
@ -111,7 +117,7 @@ class TestScraperInterfaceCompliance:
|
||||||
assert isinstance(result.success, bool)
|
assert isinstance(result.success, bool)
|
||||||
assert isinstance(result.error, str)
|
assert isinstance(result.error, str)
|
||||||
|
|
||||||
@pytest.mark.parametrize("scraper_class", ALL_SCRAPER_CLASSES)
|
@pytest.mark.parametrize("scraper_class", SCRAPERS)
|
||||||
def test_error_message_format(self, scraper_class, mocker):
|
def test_error_message_format(self, scraper_class, mocker):
|
||||||
scraper = scraper_class()
|
scraper = scraper_class()
|
||||||
platform_name = scraper.platform_name
|
platform_name = scraper.platform_name
|
||||||
|
|
@ -148,7 +154,7 @@ class TestScraperInterfaceCompliance:
|
||||||
assert not result.success
|
assert not result.success
|
||||||
assert result.error.startswith(f"{platform_name}: ")
|
assert result.error.startswith(f"{platform_name}: ")
|
||||||
|
|
||||||
@pytest.mark.parametrize("scraper_class", ALL_SCRAPER_CLASSES)
|
@pytest.mark.parametrize("scraper_class", SCRAPERS)
|
||||||
def test_scraper_instantiation(self, scraper_class):
|
def test_scraper_instantiation(self, scraper_class):
|
||||||
scraper1 = scraper_class()
|
scraper1 = scraper_class()
|
||||||
assert isinstance(scraper1, BaseScraper)
|
assert isinstance(scraper1, BaseScraper)
|
||||||
|
|
|
||||||
|
|
@ -1,58 +0,0 @@
|
||||||
import pytest
|
|
||||||
|
|
||||||
from scrapers import ALL_SCRAPERS, get_scraper, list_platforms
|
|
||||||
from scrapers.base import BaseScraper
|
|
||||||
from scrapers.codeforces import CodeforcesScraper
|
|
||||||
|
|
||||||
|
|
||||||
class TestScraperRegistry:
|
|
||||||
def test_get_scraper_valid_platform(self):
|
|
||||||
scraper_class = get_scraper("codeforces")
|
|
||||||
assert scraper_class == CodeforcesScraper
|
|
||||||
assert issubclass(scraper_class, BaseScraper)
|
|
||||||
|
|
||||||
scraper = scraper_class()
|
|
||||||
assert isinstance(scraper, BaseScraper)
|
|
||||||
assert scraper.platform_name == "codeforces"
|
|
||||||
|
|
||||||
def test_get_scraper_invalid_platform(self):
|
|
||||||
with pytest.raises(KeyError) as exc_info:
|
|
||||||
get_scraper("nonexistent")
|
|
||||||
|
|
||||||
error_msg = str(exc_info.value)
|
|
||||||
assert "nonexistent" in error_msg
|
|
||||||
assert "Available platforms" in error_msg
|
|
||||||
|
|
||||||
def test_list_platforms(self):
|
|
||||||
platforms = list_platforms()
|
|
||||||
|
|
||||||
assert isinstance(platforms, list)
|
|
||||||
assert len(platforms) > 0
|
|
||||||
assert "codeforces" in platforms
|
|
||||||
|
|
||||||
assert set(platforms) == set(ALL_SCRAPERS.keys())
|
|
||||||
|
|
||||||
def test_all_scrapers_registry(self):
|
|
||||||
assert isinstance(ALL_SCRAPERS, dict)
|
|
||||||
assert len(ALL_SCRAPERS) > 0
|
|
||||||
|
|
||||||
for platform_name, scraper_class in ALL_SCRAPERS.items():
|
|
||||||
assert isinstance(platform_name, str)
|
|
||||||
assert platform_name.islower()
|
|
||||||
|
|
||||||
assert issubclass(scraper_class, BaseScraper)
|
|
||||||
|
|
||||||
scraper = scraper_class()
|
|
||||||
assert scraper.platform_name == platform_name
|
|
||||||
|
|
||||||
def test_registry_import_consistency(self):
|
|
||||||
from scrapers.codeforces import CodeforcesScraper as DirectImport
|
|
||||||
|
|
||||||
registry_class = get_scraper("codeforces")
|
|
||||||
assert registry_class == DirectImport
|
|
||||||
|
|
||||||
def test_all_scrapers_can_be_instantiated(self):
|
|
||||||
for platform_name, scraper_class in ALL_SCRAPERS.items():
|
|
||||||
scraper = scraper_class()
|
|
||||||
assert isinstance(scraper, BaseScraper)
|
|
||||||
assert scraper.platform_name == platform_name
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue