feat(scrapers): total refactor

This commit is contained in:
Barrett Ruth 2025-09-22 22:00:20 -04:00
parent eb3f7762de
commit db391da52c
9 changed files with 559 additions and 307 deletions

View file

@ -5,14 +5,16 @@ from scrapers.models import ContestSummary, ProblemSummary
def test_scrape_success(mocker, mock_codeforces_html):
mock_client = Mock()
mock_scraper = Mock()
mock_response = Mock()
mock_response.text = mock_codeforces_html
mock_client.get.return_value = mock_response
mock_scraper.get.return_value = mock_response
mocker.patch(
"scrapers.codeforces.cloudscraper.create_scraper", return_value=mock_scraper
)
scraper = CodeforcesScraper()
mocker.patch.object(scraper, "_create_client", return_value=mock_client)
result = scraper.scrape_problem_tests("1900", "A")
assert result.success == True
@ -22,17 +24,19 @@ def test_scrape_success(mocker, mock_codeforces_html):
def test_scrape_contest_problems(mocker):
mock_client = Mock()
mock_scraper = Mock()
mock_response = Mock()
mock_response.text = """
<a href="/contest/1900/problem/A">A. Problem A</a>
<a href="/contest/1900/problem/B">B. Problem B</a>
"""
mock_client.get.return_value = mock_response
mock_scraper.get.return_value = mock_response
mocker.patch(
"scrapers.codeforces.cloudscraper.create_scraper", return_value=mock_scraper
)
scraper = CodeforcesScraper()
mocker.patch.object(scraper, "_create_client", return_value=mock_client)
result = scraper.scrape_contest_metadata("1900")
assert result.success == True
@ -42,12 +46,14 @@ def test_scrape_contest_problems(mocker):
def test_scrape_network_error(mocker):
mock_client = Mock()
mock_client.get.side_effect = Exception("Network error")
mock_scraper = Mock()
mock_scraper.get.side_effect = Exception("Network error")
mocker.patch(
"scrapers.codeforces.cloudscraper.create_scraper", return_value=mock_scraper
)
scraper = CodeforcesScraper()
mocker.patch.object(scraper, "_create_client", return_value=mock_client)
result = scraper.scrape_problem_tests("1900", "A")
assert result.success == False
@ -55,7 +61,7 @@ def test_scrape_network_error(mocker):
def test_scrape_contests_success(mocker):
mock_client = Mock()
mock_scraper = Mock()
mock_response = Mock()
mock_response.json.return_value = {
"status": "OK",
@ -65,11 +71,13 @@ def test_scrape_contests_success(mocker):
{"id": 1949, "name": "Codeforces Global Round 26"},
],
}
mock_client.get.return_value = mock_response
mock_scraper.get.return_value = mock_response
mocker.patch(
"scrapers.codeforces.cloudscraper.create_scraper", return_value=mock_scraper
)
scraper = CodeforcesScraper()
mocker.patch.object(scraper, "_create_client", return_value=mock_client)
result = scraper.scrape_contest_list()
assert result.success == True
@ -92,14 +100,16 @@ def test_scrape_contests_success(mocker):
def test_scrape_contests_api_error(mocker):
mock_client = Mock()
mock_scraper = Mock()
mock_response = Mock()
mock_response.json.return_value = {"status": "FAILED", "result": []}
mock_client.get.return_value = mock_response
mock_scraper.get.return_value = mock_response
mocker.patch(
"scrapers.codeforces.cloudscraper.create_scraper", return_value=mock_scraper
)
scraper = CodeforcesScraper()
mocker.patch.object(scraper, "_create_client", return_value=mock_client)
result = scraper.scrape_contest_list()
assert result.success == False
@ -107,12 +117,14 @@ def test_scrape_contests_api_error(mocker):
def test_scrape_contests_network_error(mocker):
mock_client = Mock()
mock_client.get.side_effect = Exception("Network error")
mock_scraper = Mock()
mock_scraper.get.side_effect = Exception("Network error")
mocker.patch(
"scrapers.codeforces.cloudscraper.create_scraper", return_value=mock_scraper
)
scraper = CodeforcesScraper()
mocker.patch.object(scraper, "_create_client", return_value=mock_client)
result = scraper.scrape_contest_list()
assert result.success == False

View file

@ -0,0 +1,162 @@
from unittest.mock import Mock
import pytest
from scrapers import ALL_SCRAPERS, BaseScraper
from scrapers.models import ContestListResult, MetadataResult, TestsResult
# Snapshot of the classes held in the ALL_SCRAPERS registry, materialized once
# so every parametrized test below iterates over the same list.
ALL_SCRAPER_CLASSES = list(ALL_SCRAPERS.values())
class TestScraperInterfaceCompliance:
    """Contract tests run against every class in ``ALL_SCRAPER_CLASSES``.

    Each test is parametrized over the registered scraper classes and checks
    that the instance is a ``BaseScraper``, exposes the three scrape methods,
    returns the expected result types, and prefixes error messages with its
    platform name.
    """

    @staticmethod
    def _stub_codeforces_http(mocker, response=None, error=None):
        """Patch ``cloudscraper.create_scraper`` in ``scrapers.codeforces``.

        The patched factory returns a ``Mock`` whose ``get`` either returns
        ``response`` or, when ``error`` is given, raises ``error``.
        """
        stub = Mock()
        if error is not None:
            stub.get.side_effect = error
        else:
            stub.get.return_value = response
        mocker.patch(
            "scrapers.codeforces.cloudscraper.create_scraper",
            return_value=stub,
        )

    @pytest.mark.parametrize("scraper_class", ALL_SCRAPER_CLASSES)
    def test_implements_base_interface(self, scraper_class):
        """Every scraper is a ``BaseScraper`` exposing the required members."""
        scraper = scraper_class()

        assert isinstance(scraper, BaseScraper)
        assert hasattr(scraper, "platform_name")
        assert hasattr(scraper, "scrape_contest_metadata")
        assert hasattr(scraper, "scrape_problem_tests")
        assert hasattr(scraper, "scrape_contest_list")

    @pytest.mark.parametrize("scraper_class", ALL_SCRAPER_CLASSES)
    def test_platform_name_is_string(self, scraper_class):
        """``platform_name`` is a non-empty, lowercase string."""
        platform_name = scraper_class().platform_name

        assert isinstance(platform_name, str)
        assert platform_name
        assert platform_name.islower()  # Convention: lowercase platform names

    @pytest.mark.parametrize("scraper_class", ALL_SCRAPER_CLASSES)
    def test_metadata_method_signature(self, scraper_class, mocker):
        """``scrape_contest_metadata`` returns a well-formed ``MetadataResult``."""
        scraper = scraper_class()

        # Stub the HTTP layer so no network request is made.
        # NOTE(review): only codeforces is stubbed here; other registered
        # platforms appear to run against their real HTTP layer - confirm.
        if scraper.platform_name == "codeforces":
            response = Mock()
            response.text = "<a href='/contest/1900/problem/A'>A. Test</a>"
            self._stub_codeforces_http(mocker, response=response)

        result = scraper.scrape_contest_metadata("test_contest")

        assert isinstance(result, MetadataResult)
        assert hasattr(result, "success")
        assert hasattr(result, "error")
        assert hasattr(result, "problems")
        assert hasattr(result, "contest_id")
        assert isinstance(result.success, bool)
        assert isinstance(result.error, str)

    @pytest.mark.parametrize("scraper_class", ALL_SCRAPER_CLASSES)
    def test_problem_tests_method_signature(self, scraper_class, mocker):
        """``scrape_problem_tests`` returns a well-formed ``TestsResult``."""
        scraper = scraper_class()

        # NOTE(review): only codeforces is stubbed here; other registered
        # platforms appear to run against their real HTTP layer - confirm.
        if scraper.platform_name == "codeforces":
            response = Mock()
            response.text = (
                "\n"
                '<div class="time-limit">Time limit: 1 seconds</div>\n'
                '<div class="memory-limit">Memory limit: 256 megabytes</div>\n'
                '<div class="input"><pre><div class="test-example-line-1">3</div></pre></div>\n'
                '<div class="output"><pre><div class="test-example-line-1">6</div></pre></div>\n'
            )
            self._stub_codeforces_http(mocker, response=response)

        result = scraper.scrape_problem_tests("test_contest", "A")

        assert isinstance(result, TestsResult)
        assert hasattr(result, "success")
        assert hasattr(result, "error")
        assert hasattr(result, "tests")
        assert hasattr(result, "problem_id")
        assert hasattr(result, "url")
        assert hasattr(result, "timeout_ms")
        assert hasattr(result, "memory_mb")
        assert isinstance(result.success, bool)
        assert isinstance(result.error, str)

    @pytest.mark.parametrize("scraper_class", ALL_SCRAPER_CLASSES)
    def test_contest_list_method_signature(self, scraper_class, mocker):
        """``scrape_contest_list`` returns a well-formed ``ContestListResult``."""
        scraper = scraper_class()

        # NOTE(review): only codeforces is stubbed here; other registered
        # platforms appear to run against their real HTTP layer - confirm.
        if scraper.platform_name == "codeforces":
            response = Mock()
            response.json.return_value = {
                "status": "OK",
                "result": [{"id": 1900, "name": "Test Contest"}],
            }
            self._stub_codeforces_http(mocker, response=response)

        result = scraper.scrape_contest_list()

        assert isinstance(result, ContestListResult)
        assert hasattr(result, "success")
        assert hasattr(result, "error")
        assert hasattr(result, "contests")
        assert isinstance(result.success, bool)
        assert isinstance(result.error, str)

    @pytest.mark.parametrize("scraper_class", ALL_SCRAPER_CLASSES)
    def test_error_message_format(self, scraper_class, mocker):
        """Failures are reported as ``"<platform_name>: ..."`` by every method."""
        scraper = scraper_class()
        platform_name = scraper.platform_name
        expected_prefix = f"{platform_name}: "

        # Force an error by making the platform's HTTP entry point raise.
        if platform_name == "codeforces":
            self._stub_codeforces_http(mocker, error=Exception("Network error"))
        elif platform_name == "atcoder":
            mocker.patch(
                "scrapers.atcoder.requests.get",
                side_effect=Exception("Network error"),
            )
        elif platform_name == "cses":
            mocker.patch(
                "scrapers.cses.make_request",
                side_effect=Exception("Network error"),
            )

        # Metadata error format
        result = scraper.scrape_contest_metadata("test")
        assert result.success is False
        assert result.error.startswith(expected_prefix)

        # Problem tests error format
        result = scraper.scrape_problem_tests("test", "A")
        assert result.success is False
        assert result.error.startswith(expected_prefix)

        # Contest list error format
        result = scraper.scrape_contest_list()
        assert result.success is False
        assert result.error.startswith(expected_prefix)

    @pytest.mark.parametrize("scraper_class", ALL_SCRAPER_CLASSES)
    def test_scraper_instantiation(self, scraper_class):
        """Scrapers build with a default config and accept a custom one."""
        from scrapers.base import ScraperConfig

        default_scraper = scraper_class()
        assert isinstance(default_scraper, BaseScraper)
        assert default_scraper.config is not None

        custom_config = ScraperConfig(timeout_seconds=60)
        custom_scraper = scraper_class(custom_config)
        assert isinstance(custom_scraper, BaseScraper)
        assert custom_scraper.config.timeout_seconds == 60

View file

@ -0,0 +1,58 @@
import pytest
from scrapers import ALL_SCRAPERS, get_scraper, list_platforms
from scrapers.base import BaseScraper
from scrapers.codeforces import CodeforcesScraper
class TestScraperRegistry:
    """Tests for the scraper registry: ``ALL_SCRAPERS``, ``get_scraper`` and
    ``list_platforms``."""

    def test_get_scraper_valid_platform(self):
        """``get_scraper("codeforces")`` returns the ``CodeforcesScraper`` class."""
        scraper_class = get_scraper("codeforces")

        assert scraper_class is CodeforcesScraper
        assert issubclass(scraper_class, BaseScraper)

        scraper = scraper_class()
        assert isinstance(scraper, BaseScraper)
        assert scraper.platform_name == "codeforces"

    def test_get_scraper_invalid_platform(self):
        """An unknown platform raises ``KeyError`` with a descriptive message."""
        with pytest.raises(KeyError) as exc_info:
            get_scraper("nonexistent")

        # These checks must sit after the ``with`` block: statements following
        # the raising call inside it would never execute.
        error_msg = str(exc_info.value)
        assert "nonexistent" in error_msg
        assert "Available platforms" in error_msg

    def test_list_platforms(self):
        """``list_platforms`` returns exactly the keys of ``ALL_SCRAPERS``."""
        platforms = list_platforms()

        assert isinstance(platforms, list)
        assert len(platforms) > 0
        assert "codeforces" in platforms
        assert set(platforms) == set(ALL_SCRAPERS.keys())

    def test_all_scrapers_registry(self):
        """Registry keys are lowercase strings matching each ``platform_name``."""
        assert isinstance(ALL_SCRAPERS, dict)
        assert len(ALL_SCRAPERS) > 0

        for platform_name, scraper_class in ALL_SCRAPERS.items():
            assert isinstance(platform_name, str)
            assert platform_name.islower()
            assert issubclass(scraper_class, BaseScraper)

            scraper = scraper_class()
            assert scraper.platform_name == platform_name

    def test_registry_import_consistency(self):
        """The registry hands back the same class a direct import provides."""
        from scrapers.codeforces import CodeforcesScraper as DirectImport

        registry_class = get_scraper("codeforces")
        assert registry_class is DirectImport

    def test_all_scrapers_can_be_instantiated(self):
        """Every registered class constructs without arguments."""
        for platform_name, scraper_class in ALL_SCRAPERS.items():
            scraper = scraper_class()
            assert isinstance(scraper, BaseScraper)
            assert scraper.platform_name == platform_name