fix(ci): typing

Barrett Ruth 2025-09-22 20:46:27 -04:00
parent 87f9439607
commit eb3f7762de
9 changed files with 339 additions and 155 deletions

scrapers/base.py (new file)
@@ -0,0 +1,95 @@
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Protocol

import requests

from .models import ContestListResult, MetadataResult, TestsResult


@dataclass
class ScraperConfig:
    timeout_seconds: int = 30
    max_retries: int = 3
    backoff_base: float = 2.0
    rate_limit_delay: float = 1.0


class HttpClient(Protocol):
    def get(self, url: str, **kwargs) -> requests.Response: ...

    def close(self) -> None: ...


class BaseScraper(ABC):
    def __init__(self, config: ScraperConfig | None = None):
        self.config = config or ScraperConfig()
        self._client: HttpClient | None = None

    @property
    @abstractmethod
    def platform_name(self) -> str: ...

    @abstractmethod
    def _create_client(self) -> HttpClient: ...

    @abstractmethod
    def scrape_contest_metadata(self, contest_id: str) -> MetadataResult: ...

    @abstractmethod
    def scrape_problem_tests(self, contest_id: str, problem_id: str) -> TestsResult: ...

    @abstractmethod
    def scrape_contest_list(self) -> ContestListResult: ...

    @property
    def client(self) -> HttpClient:
        if self._client is None:
            self._client = self._create_client()
        return self._client

    def close(self) -> None:
        if self._client is not None:
            self._client.close()
            self._client = None

    def _create_metadata_error(
        self, error_msg: str, contest_id: str = ""
    ) -> MetadataResult:
        return MetadataResult(
            success=False,
            error=f"{self.platform_name}: {error_msg}",
            contest_id=contest_id,
        )

    def _create_tests_error(
        self, error_msg: str, problem_id: str = "", url: str = ""
    ) -> TestsResult:
        return TestsResult(
            success=False,
            error=f"{self.platform_name}: {error_msg}",
            problem_id=problem_id,
            url=url,
            tests=[],
            timeout_ms=0,
            memory_mb=0,
        )

    def _create_contests_error(self, error_msg: str) -> ContestListResult:
        return ContestListResult(
            success=False, error=f"{self.platform_name}: {error_msg}"
        )

    def _safe_execute(self, operation: str, func, *args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            if operation == "metadata":
                contest_id = args[0] if args else ""
                return self._create_metadata_error(str(e), contest_id)
            elif operation == "tests":
                problem_id = args[1] if len(args) > 1 else ""
                return self._create_tests_error(str(e), problem_id)
            elif operation == "contests":
                return self._create_contests_error(str(e))
            else:
                raise
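
For orientation, here is a minimal sketch of how a concrete scraper plugs into this base class. EchoScraper, its failing metadata impl, and the contest id are hypothetical; the imports assume the package layout introduced in this commit.

# Hypothetical illustration, not part of the commit.
from scrapers.base import BaseScraper, HttpClient
from scrapers.clients import RequestsClient
from scrapers.models import ContestListResult, MetadataResult, TestsResult


class EchoScraper(BaseScraper):
    @property
    def platform_name(self) -> str:
        return "echo"  # hypothetical platform

    def _create_client(self) -> HttpClient:
        # Built lazily on first access to self.client.
        return RequestsClient(self.config)

    def scrape_contest_metadata(self, contest_id: str) -> MetadataResult:
        # Exceptions raised by the impl are converted by _safe_execute
        # into MetadataResult(success=False, error="echo: ...").
        return self._safe_execute("metadata", self._metadata_impl, contest_id)

    def _metadata_impl(self, contest_id: str) -> MetadataResult:
        raise RuntimeError("boom")

    def scrape_problem_tests(self, contest_id: str, problem_id: str) -> TestsResult:
        return self._create_tests_error("not implemented", problem_id)

    def scrape_contest_list(self) -> ContestListResult:
        return self._create_contests_error("not implemented")


result = EchoScraper().scrape_contest_metadata("123")
assert not result.success and result.error == "echo: boom"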

scrapers/clients.py (new file)
@@ -0,0 +1,82 @@
import time

import backoff
import requests

from .base import HttpClient, ScraperConfig


class RequestsClient:
    def __init__(self, config: ScraperConfig, headers: dict[str, str] | None = None):
        self.config = config
        self.session = requests.Session()
        default_headers = {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        }
        if headers:
            default_headers.update(headers)
        self.session.headers.update(default_headers)

    @backoff.on_exception(
        backoff.expo,
        (requests.RequestException, requests.HTTPError),
        max_tries=3,
        base=2.0,
        jitter=backoff.random_jitter,
    )
    @backoff.on_predicate(
        backoff.expo,
        lambda response: response.status_code == 429,
        max_tries=3,
        base=2.0,
        jitter=backoff.random_jitter,
    )
    def get(self, url: str, **kwargs) -> requests.Response:
        # pop, not get: otherwise an explicit timeout kwarg would be
        # passed twice via **kwargs below.
        timeout = kwargs.pop("timeout", self.config.timeout_seconds)
        response = self.session.get(url, timeout=timeout, **kwargs)
        response.raise_for_status()
        if (
            hasattr(self.config, "rate_limit_delay")
            and self.config.rate_limit_delay > 0
        ):
            time.sleep(self.config.rate_limit_delay)
        return response

    def close(self) -> None:
        self.session.close()


class CloudScraperClient:
    def __init__(self, config: ScraperConfig):
        import cloudscraper

        self.config = config
        self.scraper = cloudscraper.create_scraper()

    @backoff.on_exception(
        backoff.expo,
        (requests.RequestException, requests.HTTPError),
        max_tries=3,
        base=2.0,
        jitter=backoff.random_jitter,
    )
    def get(self, url: str, **kwargs) -> requests.Response:
        # pop to avoid a duplicate timeout kwarg, as above.
        timeout = kwargs.pop("timeout", self.config.timeout_seconds)
        response = self.scraper.get(url, timeout=timeout, **kwargs)
        response.raise_for_status()
        if (
            hasattr(self.config, "rate_limit_delay")
            and self.config.rate_limit_delay > 0
        ):
            time.sleep(self.config.rate_limit_delay)
        return response

    def close(self) -> None:
        if hasattr(self.scraper, "close"):
            self.scraper.close()
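
A quick usage sketch under the defaults above; the URL is a placeholder. Both clients expose the same get/close shape, so either satisfies the HttpClient protocol. Note that because get() calls raise_for_status(), a 429 surfaces as an HTTPError and is retried by the on_exception hook; the on_predicate hook would only see a 429 response that did not raise.

# Hypothetical usage, not part of the commit.
from scrapers.base import HttpClient, ScraperConfig
from scrapers.clients import RequestsClient

config = ScraperConfig(timeout_seconds=10, rate_limit_delay=0.5)
client: HttpClient = RequestsClient(config)
try:
    # Up to 3 tries with exponential backoff and jitter on request
    # errors; sleeps rate_limit_delay after each successful call.
    response = client.get("https://example.com")  # placeholder URL
    print(response.status_code)
finally:
    client.close()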

scrapers/codeforces.py
@@ -5,9 +5,10 @@ import re
 import sys
 from dataclasses import asdict
 
-import cloudscraper
 from bs4 import BeautifulSoup, Tag
 
+from .base import BaseScraper, HttpClient
+from .clients import CloudScraperClient
 from .models import (
     ContestListResult,
     ContestSummary,
@@ -18,11 +19,73 @@ from .models import (
 )
 
 
-def scrape(url: str) -> list[TestCase]:
+class CodeforcesScraper(BaseScraper):
+    @property
+    def platform_name(self) -> str:
+        return "codeforces"
+
+    def _create_client(self) -> HttpClient:
+        return CloudScraperClient(self.config)
+
+    def scrape_contest_metadata(self, contest_id: str) -> MetadataResult:
+        return self._safe_execute(
+            "metadata", self._scrape_contest_metadata_impl, contest_id
+        )
+
+    def scrape_problem_tests(self, contest_id: str, problem_id: str) -> TestsResult:
+        return self._safe_execute(
+            "tests", self._scrape_problem_tests_impl, contest_id, problem_id
+        )
+
+    def scrape_contest_list(self) -> ContestListResult:
+        return self._safe_execute("contests", self._scrape_contest_list_impl)
+
+    def _scrape_contest_metadata_impl(self, contest_id: str) -> MetadataResult:
+        problems = scrape_contest_problems(contest_id, self.client)
+        if not problems:
+            return self._create_metadata_error(
+                f"No problems found for contest {contest_id}", contest_id
+            )
+        return MetadataResult(
+            success=True, error="", contest_id=contest_id, problems=problems
+        )
+
+    def _scrape_problem_tests_impl(
+        self, contest_id: str, problem_letter: str
+    ) -> TestsResult:
+        problem_id = contest_id + problem_letter.lower()
+        url = parse_problem_url(contest_id, problem_letter)
+        tests = scrape_sample_tests(url, self.client)
+        response = self.client.get(url)
+        soup = BeautifulSoup(response.text, "html.parser")
+        timeout_ms, memory_mb = extract_problem_limits(soup)
+        if not tests:
+            return self._create_tests_error(
+                f"No tests found for {contest_id} {problem_letter}", problem_id, url
+            )
+        return TestsResult(
+            success=True,
+            error="",
+            problem_id=problem_id,
+            url=url,
+            tests=tests,
+            timeout_ms=timeout_ms,
+            memory_mb=memory_mb,
+        )
+
+    def _scrape_contest_list_impl(self) -> ContestListResult:
+        contests = scrape_contests(self.client)
+        if not contests:
+            return self._create_contests_error("No contests found")
+        return ContestListResult(success=True, error="", contests=contests)
+
+
+def scrape(url: str, client: HttpClient) -> list[TestCase]:
     try:
-        scraper = cloudscraper.create_scraper()
-        response = scraper.get(url, timeout=10)
-        response.raise_for_status()
+        response = client.get(url)
         soup = BeautifulSoup(response.text, "html.parser")
 
         input_sections = soup.find_all("div", class_="input")
@@ -176,12 +239,12 @@ def extract_problem_limits(soup: BeautifulSoup) -> tuple[int, float]:
     return timeout_ms, memory_mb
 
 
-def scrape_contest_problems(contest_id: str) -> list[ProblemSummary]:
+def scrape_contest_problems(
+    contest_id: str, client: HttpClient
+) -> list[ProblemSummary]:
     try:
         contest_url: str = f"https://codeforces.com/contest/{contest_id}"
-        scraper = cloudscraper.create_scraper()
-        response = scraper.get(contest_url, timeout=10)
-        response.raise_for_status()
+        response = client.get(contest_url)
        soup = BeautifulSoup(response.text, "html.parser")
 
         problems: list[ProblemSummary] = []
@@ -217,16 +280,13 @@ def scrape_contest_problems(contest_id: str) -> list[ProblemSummary]:
         return []
 
 
-def scrape_sample_tests(url: str) -> list[TestCase]:
+def scrape_sample_tests(url: str, client: HttpClient) -> list[TestCase]:
     print(f"Scraping: {url}", file=sys.stderr)
-    return scrape(url)
+    return scrape(url, client)
 
 
-def scrape_contests() -> list[ContestSummary]:
-    try:
-        scraper = cloudscraper.create_scraper()
-        response = scraper.get("https://codeforces.com/api/contest.list", timeout=10)
-        response.raise_for_status()
+def scrape_contests(client: HttpClient) -> list[ContestSummary]:
+    response = client.get("https://codeforces.com/api/contest.list")
     data = response.json()
 
     if data["status"] != "OK":
@@ -241,10 +301,6 @@ def scrape_contests() -> list[ContestSummary]:
     return contests
-    except Exception as e:
-        print(f"Failed to fetch contests: {e}", file=sys.stderr)
-        return []
 
 
 def main() -> None:
     if len(sys.argv) < 2:
@@ -255,6 +311,7 @@ def main() -> None:
         print(json.dumps(asdict(result)))
         sys.exit(1)
 
+    scraper = CodeforcesScraper()
     mode: str = sys.argv[1]
 
     if mode == "metadata":
@@ -266,18 +323,7 @@ def main() -> None:
             sys.exit(1)
 
         contest_id: str = sys.argv[2]
-        problems: list[ProblemSummary] = scrape_contest_problems(contest_id)
-
-        if not problems:
-            result = MetadataResult(
-                success=False, error=f"No problems found for contest {contest_id}"
-            )
-            print(json.dumps(asdict(result)))
-            sys.exit(1)
-
-        result = MetadataResult(
-            success=True, error="", contest_id=contest_id, problems=problems
-        )
+        result = scraper.scrape_contest_metadata(contest_id)
         print(json.dumps(asdict(result)))
 
     elif mode == "tests":
@@ -296,52 +342,7 @@ def main() -> None:
         tests_contest_id: str = sys.argv[2]
         problem_letter: str = sys.argv[3]
-        problem_id: str = tests_contest_id + problem_letter.lower()
-        url: str = parse_problem_url(tests_contest_id, problem_letter)
-        tests: list[TestCase] = scrape_sample_tests(url)
-
-        try:
-            scraper = cloudscraper.create_scraper()
-            response = scraper.get(url, timeout=10)
-            response.raise_for_status()
-            soup = BeautifulSoup(response.text, "html.parser")
-            timeout_ms, memory_mb = extract_problem_limits(soup)
-        except Exception as e:
-            tests_result = TestsResult(
-                success=False,
-                error=f"Failed to extract constraints: {e}",
-                problem_id=problem_id,
-                url=url,
-                tests=[],
-                timeout_ms=0,
-                memory_mb=0,
-            )
-            print(json.dumps(asdict(tests_result)))
-            sys.exit(1)
-
-        if not tests:
-            tests_result = TestsResult(
-                success=False,
-                error=f"No tests found for {tests_contest_id} {problem_letter}",
-                problem_id=problem_id,
-                url=url,
-                tests=[],
-                timeout_ms=timeout_ms,
-                memory_mb=memory_mb,
-            )
-            print(json.dumps(asdict(tests_result)))
-            sys.exit(1)
-
-        tests_result = TestsResult(
-            success=True,
-            error="",
-            problem_id=problem_id,
-            url=url,
-            tests=tests,
-            timeout_ms=timeout_ms,
-            memory_mb=memory_mb,
-        )
+        tests_result = scraper.scrape_problem_tests(tests_contest_id, problem_letter)
         print(json.dumps(asdict(tests_result)))
 
     elif mode == "contests":
@@ -352,13 +353,7 @@ def main() -> None:
             print(json.dumps(asdict(contest_result)))
             sys.exit(1)
 
-        contests = scrape_contests()
-
-        if not contests:
-            contest_result = ContestListResult(success=False, error="No contests found")
-            print(json.dumps(asdict(contest_result)))
-            sys.exit(1)
-
-        contest_result = ContestListResult(success=True, error="", contests=contests)
+        contest_result = scraper.scrape_contest_list()
         print(json.dumps(asdict(contest_result)))
 
     else:
@@ -369,6 +364,8 @@ def main() -> None:
         print(json.dumps(asdict(result)))
         sys.exit(1)
 
+    scraper.close()
+
 
 if __name__ == "__main__":
     main()
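
End to end, main() now delegates each mode to the class. A hedged sketch of the equivalent library-level use (contest 1900 and problem A are just examples):

# Hypothetical library-level use mirroring the refactored main().
import json
from dataclasses import asdict

from scrapers.codeforces import CodeforcesScraper

scraper = CodeforcesScraper()
try:
    metadata = scraper.scrape_contest_metadata("1900")  # example contest
    tests = scraper.scrape_problem_tests("1900", "A")   # example problem
    contests = scraper.scrape_contest_list()
    print(json.dumps(asdict(tests)))  # same JSON shape main() emits
finally:
    scraper.close()  # releases the lazily created cloudscraper session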


@@ -158,9 +158,7 @@ describe('cp.picker', function()
         end,
       }
 
-      package.loaded['cp.pickers.init'] = nil
-      package.loaded['cp.pickers'] = nil
-      picker = require('cp.pickers')
+      picker = spec_helper.fresh_require('cp.pickers', { 'cp.pickers.init' })
 
       local problems = picker.get_problems_for_contest('test_platform', 'test_contest')
       assert.is_table(problems)
@@ -183,6 +181,8 @@ describe('cp.picker', function()
       }
     end
 
+    picker = spec_helper.fresh_require('cp.pickers', { 'cp.pickers.init' })
+
     local problems = picker.get_problems_for_contest('test_platform', 'test_contest')
     assert.is_table(problems)
     assert.equals(0, #problems)


@@ -56,8 +56,7 @@ describe('cp.scrape', function()
     package.loaded['cp.cache'] = mock_cache
     package.loaded['cp.utils'] = mock_utils
 
-    package.loaded['cp.scrape'] = nil
-    scrape = require('cp.scrape')
+    scrape = spec_helper.fresh_require('cp.scrape')
 
     local original_fn = vim.fn
     vim.fn = vim.tbl_extend('force', vim.fn, {
@@ -125,8 +124,7 @@ describe('cp.scrape', function()
       stored_data = { platform = platform, contest_id = contest_id, problems = problems }
     end
 
-    package.loaded['cp.scrape'] = nil
-    scrape = require('cp.scrape')
+    scrape = spec_helper.fresh_require('cp.scrape')
 
     local result = scrape.scrape_contest_metadata('atcoder', 'abc123')


@@ -5,8 +5,7 @@ describe('cp.snippets', function()
   before_each(function()
     spec_helper.setup()
 
-    package.loaded['cp.snippets'] = nil
-    snippets = require('cp.snippets')
+    snippets = spec_helper.fresh_require('cp.snippets')
 
     mock_luasnip = {
       snippet = function(trigger, body)
        return { trigger = trigger, body = body }


@@ -121,6 +121,17 @@ function M.find_logged_message(pattern)
   return nil
 end
 
+function M.fresh_require(module_name, additional_clears)
+  additional_clears = additional_clears or {}
+
+  for _, clear_module in ipairs(additional_clears) do
+    package.loaded[clear_module] = nil
+  end
+
+  package.loaded[module_name] = nil
+  return require(module_name)
+end
+
 function M.teardown()
   package.loaded['cp.log'] = nil
   package.loaded['cp.scrape'] = nil


@@ -4,6 +4,8 @@ import pytest
 @pytest.fixture
 def mock_codeforces_html():
     return """
+    <div class="time-limit">Time limit: 1 seconds</div>
+    <div class="memory-limit">Memory limit: 256 megabytes</div>
     <div class="input">
         <pre>
             <div class="test-example-line-1">3</div>


@@ -1,61 +1,61 @@
 from unittest.mock import Mock
 
-from scrapers.codeforces import scrape, scrape_contest_problems, scrape_contests
+from scrapers.codeforces import CodeforcesScraper
 from scrapers.models import ContestSummary, ProblemSummary
 
 
 def test_scrape_success(mocker, mock_codeforces_html):
-    mock_scraper = Mock()
+    mock_client = Mock()
     mock_response = Mock()
     mock_response.text = mock_codeforces_html
-    mock_scraper.get.return_value = mock_response
+    mock_client.get.return_value = mock_response
 
-    mocker.patch(
-        "scrapers.codeforces.cloudscraper.create_scraper", return_value=mock_scraper
-    )
+    scraper = CodeforcesScraper()
+    mocker.patch.object(scraper, "_create_client", return_value=mock_client)
 
-    result = scrape("https://codeforces.com/contest/1900/problem/A")
+    result = scraper.scrape_problem_tests("1900", "A")
 
-    assert len(result) == 1
-    assert result[0].input == "1\n3\n1 2 3"
-    assert result[0].expected == "6"
+    assert result.success == True
+    assert len(result.tests) == 1
+    assert result.tests[0].input == "1\n3\n1 2 3"
+    assert result.tests[0].expected == "6"
 
 
 def test_scrape_contest_problems(mocker):
-    mock_scraper = Mock()
+    mock_client = Mock()
     mock_response = Mock()
     mock_response.text = """
     <a href="/contest/1900/problem/A">A. Problem A</a>
     <a href="/contest/1900/problem/B">B. Problem B</a>
     """
-    mock_scraper.get.return_value = mock_response
+    mock_client.get.return_value = mock_response
 
-    mocker.patch(
-        "scrapers.codeforces.cloudscraper.create_scraper", return_value=mock_scraper
-    )
+    scraper = CodeforcesScraper()
+    mocker.patch.object(scraper, "_create_client", return_value=mock_client)
 
-    result = scrape_contest_problems("1900")
+    result = scraper.scrape_contest_metadata("1900")
 
-    assert len(result) == 2
-    assert result[0] == ProblemSummary(id="a", name="A. Problem A")
-    assert result[1] == ProblemSummary(id="b", name="B. Problem B")
+    assert result.success == True
+    assert len(result.problems) == 2
+    assert result.problems[0] == ProblemSummary(id="a", name="A. Problem A")
+    assert result.problems[1] == ProblemSummary(id="b", name="B. Problem B")
 
 
 def test_scrape_network_error(mocker):
-    mock_scraper = Mock()
-    mock_scraper.get.side_effect = Exception("Network error")
+    mock_client = Mock()
+    mock_client.get.side_effect = Exception("Network error")
 
-    mocker.patch(
-        "scrapers.codeforces.cloudscraper.create_scraper", return_value=mock_scraper
-    )
+    scraper = CodeforcesScraper()
+    mocker.patch.object(scraper, "_create_client", return_value=mock_client)
 
-    result = scrape("https://codeforces.com/contest/1900/problem/A")
+    result = scraper.scrape_problem_tests("1900", "A")
 
-    assert result == []
+    assert result.success == False
+    assert "network error" in result.error.lower()
 
 
 def test_scrape_contests_success(mocker):
-    mock_scraper = Mock()
+    mock_client = Mock()
     mock_response = Mock()
     mock_response.json.return_value = {
         "status": "OK",
@@ -65,26 +65,26 @@ def test_scrape_contests_success(mocker):
             {"id": 1949, "name": "Codeforces Global Round 26"},
         ],
     }
-    mock_scraper.get.return_value = mock_response
+    mock_client.get.return_value = mock_response
 
-    mocker.patch(
-        "scrapers.codeforces.cloudscraper.create_scraper", return_value=mock_scraper
-    )
+    scraper = CodeforcesScraper()
+    mocker.patch.object(scraper, "_create_client", return_value=mock_client)
 
-    result = scrape_contests()
+    result = scraper.scrape_contest_list()
 
-    assert len(result) == 3
-    assert result[0] == ContestSummary(
+    assert result.success == True
+    assert len(result.contests) == 3
+    assert result.contests[0] == ContestSummary(
         id="1951",
         name="Educational Codeforces Round 168 (Rated for Div. 2)",
         display_name="Educational Codeforces Round 168 (Rated for Div. 2)",
     )
-    assert result[1] == ContestSummary(
+    assert result.contests[1] == ContestSummary(
         id="1950",
         name="Codeforces Round 936 (Div. 2)",
         display_name="Codeforces Round 936 (Div. 2)",
     )
-    assert result[2] == ContestSummary(
+    assert result.contests[2] == ContestSummary(
         id="1949",
         name="Codeforces Global Round 26",
         display_name="Codeforces Global Round 26",
@@ -92,28 +92,28 @@ def test_scrape_contests_success(mocker):
 
 
 def test_scrape_contests_api_error(mocker):
-    mock_scraper = Mock()
+    mock_client = Mock()
     mock_response = Mock()
     mock_response.json.return_value = {"status": "FAILED", "result": []}
-    mock_scraper.get.return_value = mock_response
+    mock_client.get.return_value = mock_response
 
-    mocker.patch(
-        "scrapers.codeforces.cloudscraper.create_scraper", return_value=mock_scraper
-    )
+    scraper = CodeforcesScraper()
+    mocker.patch.object(scraper, "_create_client", return_value=mock_client)
 
-    result = scrape_contests()
+    result = scraper.scrape_contest_list()
 
-    assert result == []
+    assert result.success == False
+    assert "no contests found" in result.error.lower()
 
 
 def test_scrape_contests_network_error(mocker):
-    mock_scraper = Mock()
-    mock_scraper.get.side_effect = Exception("Network error")
+    mock_client = Mock()
+    mock_client.get.side_effect = Exception("Network error")
 
-    mocker.patch(
-        "scrapers.codeforces.cloudscraper.create_scraper", return_value=mock_scraper
-    )
+    scraper = CodeforcesScraper()
+    mocker.patch.object(scraper, "_create_client", return_value=mock_client)
 
-    result = scrape_contests()
+    result = scraper.scrape_contest_list()
 
-    assert result == []
+    assert result.success == False
+    assert "network error" in result.error.lower()