diff --git a/scrapers/base.py b/scrapers/base.py new file mode 100644 index 0000000..bf96241 --- /dev/null +++ b/scrapers/base.py @@ -0,0 +1,95 @@ +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import Protocol + +import requests + +from .models import ContestListResult, MetadataResult, TestsResult + + +@dataclass +class ScraperConfig: + timeout_seconds: int = 30 + max_retries: int = 3 + backoff_base: float = 2.0 + rate_limit_delay: float = 1.0 + + +class HttpClient(Protocol): + def get(self, url: str, **kwargs) -> requests.Response: ... + def close(self) -> None: ... + + +class BaseScraper(ABC): + def __init__(self, config: ScraperConfig | None = None): + self.config = config or ScraperConfig() + self._client: HttpClient | None = None + + @property + @abstractmethod + def platform_name(self) -> str: ... + + @abstractmethod + def _create_client(self) -> HttpClient: ... + + @abstractmethod + def scrape_contest_metadata(self, contest_id: str) -> MetadataResult: ... + + @abstractmethod + def scrape_problem_tests(self, contest_id: str, problem_id: str) -> TestsResult: ... + + @abstractmethod + def scrape_contest_list(self) -> ContestListResult: ... 
+ + @property + def client(self) -> HttpClient: + if self._client is None: + self._client = self._create_client() + return self._client + + def close(self) -> None: + if self._client is not None: + self._client.close() + self._client = None + + def _create_metadata_error( + self, error_msg: str, contest_id: str = "" + ) -> MetadataResult: + return MetadataResult( + success=False, + error=f"{self.platform_name}: {error_msg}", + contest_id=contest_id, + ) + + def _create_tests_error( + self, error_msg: str, problem_id: str = "", url: str = "" + ) -> TestsResult: + return TestsResult( + success=False, + error=f"{self.platform_name}: {error_msg}", + problem_id=problem_id, + url=url, + tests=[], + timeout_ms=0, + memory_mb=0, + ) + + def _create_contests_error(self, error_msg: str) -> ContestListResult: + return ContestListResult( + success=False, error=f"{self.platform_name}: {error_msg}" + ) + + def _safe_execute(self, operation: str, func, *args, **kwargs): + try: + return func(*args, **kwargs) + except Exception as e: + if operation == "metadata": + contest_id = args[0] if args else "" + return self._create_metadata_error(str(e), contest_id) + elif operation == "tests": + problem_id = args[1] if len(args) > 1 else "" + return self._create_tests_error(str(e), problem_id) + elif operation == "contests": + return self._create_contests_error(str(e)) + else: + raise diff --git a/scrapers/clients.py b/scrapers/clients.py new file mode 100644 index 0000000..d5bd232 --- /dev/null +++ b/scrapers/clients.py @@ -0,0 +1,82 @@ +import time + +import backoff +import requests + +from .base import HttpClient, ScraperConfig + + +class RequestsClient: + def __init__(self, config: ScraperConfig, headers: dict[str, str] | None = None): + self.config = config + self.session = requests.Session() + + default_headers = { + "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" + } + if headers: + default_headers.update(headers) 
+ + self.session.headers.update(default_headers) + + @backoff.on_exception( + backoff.expo, + (requests.RequestException, requests.HTTPError), + max_tries=3, + base=2.0, + jitter=backoff.random_jitter, + ) + @backoff.on_predicate( + backoff.expo, + lambda response: response.status_code == 429, + max_tries=3, + base=2.0, + jitter=backoff.random_jitter, + ) + def get(self, url: str, **kwargs) -> requests.Response: + timeout = kwargs.pop("timeout", self.config.timeout_seconds) + response = self.session.get(url, timeout=timeout, **kwargs) + response.raise_for_status() + + if ( + hasattr(self.config, "rate_limit_delay") + and self.config.rate_limit_delay > 0 + ): + time.sleep(self.config.rate_limit_delay) + + return response + + def close(self) -> None: + self.session.close() + + +class CloudScraperClient: + def __init__(self, config: ScraperConfig): + import cloudscraper + + self.config = config + self.scraper = cloudscraper.create_scraper() + + @backoff.on_exception( + backoff.expo, + (requests.RequestException, requests.HTTPError), + max_tries=3, + base=2.0, + jitter=backoff.random_jitter, + ) + def get(self, url: str, **kwargs) -> requests.Response: + timeout = kwargs.pop("timeout", self.config.timeout_seconds) + response = self.scraper.get(url, timeout=timeout, **kwargs) + response.raise_for_status() + + if ( + hasattr(self.config, "rate_limit_delay") + and self.config.rate_limit_delay > 0 + ): + time.sleep(self.config.rate_limit_delay) + + return response + + def close(self) -> None: + if hasattr(self.scraper, "close"): + self.scraper.close() diff --git a/scrapers/codeforces.py b/scrapers/codeforces.py index 89d568e..3bacaf5 100644 --- a/scrapers/codeforces.py +++ b/scrapers/codeforces.py @@ -5,9 +5,10 @@ import re import sys from dataclasses import asdict -import cloudscraper from bs4 import BeautifulSoup, Tag +from .base import BaseScraper, HttpClient +from .clients import CloudScraperClient from .models import ( ContestListResult, ContestSummary, @@ -18,11
+19,73 @@ from .models import ( ) -def scrape(url: str) -> list[TestCase]: +class CodeforcesScraper(BaseScraper): + @property + def platform_name(self) -> str: + return "codeforces" + + def _create_client(self) -> HttpClient: + return CloudScraperClient(self.config) + + def scrape_contest_metadata(self, contest_id: str) -> MetadataResult: + return self._safe_execute( + "metadata", self._scrape_contest_metadata_impl, contest_id + ) + + def scrape_problem_tests(self, contest_id: str, problem_id: str) -> TestsResult: + return self._safe_execute( + "tests", self._scrape_problem_tests_impl, contest_id, problem_id + ) + + def scrape_contest_list(self) -> ContestListResult: + return self._safe_execute("contests", self._scrape_contest_list_impl) + + def _scrape_contest_metadata_impl(self, contest_id: str) -> MetadataResult: + problems = scrape_contest_problems(contest_id, self.client) + if not problems: + return self._create_metadata_error( + f"No problems found for contest {contest_id}", contest_id + ) + return MetadataResult( + success=True, error="", contest_id=contest_id, problems=problems + ) + + def _scrape_problem_tests_impl( + self, contest_id: str, problem_letter: str + ) -> TestsResult: + problem_id = contest_id + problem_letter.lower() + url = parse_problem_url(contest_id, problem_letter) + tests = scrape_sample_tests(url, self.client) + + response = self.client.get(url) + soup = BeautifulSoup(response.text, "html.parser") + timeout_ms, memory_mb = extract_problem_limits(soup) + + if not tests: + return self._create_tests_error( + f"No tests found for {contest_id} {problem_letter}", problem_id, url + ) + + return TestsResult( + success=True, + error="", + problem_id=problem_id, + url=url, + tests=tests, + timeout_ms=timeout_ms, + memory_mb=memory_mb, + ) + + def _scrape_contest_list_impl(self) -> ContestListResult: + contests = scrape_contests(self.client) + if not contests: + return self._create_contests_error("No contests found") + return 
ContestListResult(success=True, error="", contests=contests) + + +def scrape(url: str, client: HttpClient) -> list[TestCase]: try: - scraper = cloudscraper.create_scraper() - response = scraper.get(url, timeout=10) - response.raise_for_status() + response = client.get(url) soup = BeautifulSoup(response.text, "html.parser") input_sections = soup.find_all("div", class_="input") @@ -176,12 +239,12 @@ def extract_problem_limits(soup: BeautifulSoup) -> tuple[int, float]: return timeout_ms, memory_mb -def scrape_contest_problems(contest_id: str) -> list[ProblemSummary]: +def scrape_contest_problems( + contest_id: str, client: HttpClient +) -> list[ProblemSummary]: try: contest_url: str = f"https://codeforces.com/contest/{contest_id}" - scraper = cloudscraper.create_scraper() - response = scraper.get(contest_url, timeout=10) - response.raise_for_status() + response = client.get(contest_url) soup = BeautifulSoup(response.text, "html.parser") problems: list[ProblemSummary] = [] @@ -217,34 +280,27 @@ def scrape_contest_problems(contest_id: str) -> list[ProblemSummary]: return [] -def scrape_sample_tests(url: str) -> list[TestCase]: +def scrape_sample_tests(url: str, client: HttpClient) -> list[TestCase]: print(f"Scraping: {url}", file=sys.stderr) - return scrape(url) + return scrape(url, client) -def scrape_contests() -> list[ContestSummary]: - try: - scraper = cloudscraper.create_scraper() - response = scraper.get("https://codeforces.com/api/contest.list", timeout=10) - response.raise_for_status() +def scrape_contests(client: HttpClient) -> list[ContestSummary]: + response = client.get("https://codeforces.com/api/contest.list") - data = response.json() - if data["status"] != "OK": - return [] - - contests = [] - for contest in data["result"]: - contest_id = str(contest["id"]) - name = contest["name"] - - contests.append(ContestSummary(id=contest_id, name=name, display_name=name)) - - return contests - - except Exception as e: - print(f"Failed to fetch contests: {e}", 
file=sys.stderr) + data = response.json() + if data["status"] != "OK": return [] + contests = [] + for contest in data["result"]: + contest_id = str(contest["id"]) + name = contest["name"] + + contests.append(ContestSummary(id=contest_id, name=name, display_name=name)) + + return contests + def main() -> None: if len(sys.argv) < 2: @@ -255,6 +311,7 @@ def main() -> None: print(json.dumps(asdict(result))) sys.exit(1) + scraper = CodeforcesScraper() mode: str = sys.argv[1] if mode == "metadata": @@ -266,18 +323,7 @@ def main() -> None: sys.exit(1) contest_id: str = sys.argv[2] - problems: list[ProblemSummary] = scrape_contest_problems(contest_id) - - if not problems: - result = MetadataResult( - success=False, error=f"No problems found for contest {contest_id}" - ) - print(json.dumps(asdict(result))) - sys.exit(1) - - result = MetadataResult( - success=True, error="", contest_id=contest_id, problems=problems - ) + result = scraper.scrape_contest_metadata(contest_id) print(json.dumps(asdict(result))) elif mode == "tests": @@ -296,52 +342,7 @@ def main() -> None: tests_contest_id: str = sys.argv[2] problem_letter: str = sys.argv[3] - problem_id: str = tests_contest_id + problem_letter.lower() - - url: str = parse_problem_url(tests_contest_id, problem_letter) - tests: list[TestCase] = scrape_sample_tests(url) - - try: - scraper = cloudscraper.create_scraper() - response = scraper.get(url, timeout=10) - response.raise_for_status() - soup = BeautifulSoup(response.text, "html.parser") - timeout_ms, memory_mb = extract_problem_limits(soup) - except Exception as e: - tests_result = TestsResult( - success=False, - error=f"Failed to extract constraints: {e}", - problem_id=problem_id, - url=url, - tests=[], - timeout_ms=0, - memory_mb=0, - ) - print(json.dumps(asdict(tests_result))) - sys.exit(1) - - if not tests: - tests_result = TestsResult( - success=False, - error=f"No tests found for {tests_contest_id} {problem_letter}", - problem_id=problem_id, - url=url, - tests=[], - 
timeout_ms=timeout_ms, - memory_mb=memory_mb, - ) - print(json.dumps(asdict(tests_result))) - sys.exit(1) - - tests_result = TestsResult( - success=True, - error="", - problem_id=problem_id, - url=url, - tests=tests, - timeout_ms=timeout_ms, - memory_mb=memory_mb, - ) + tests_result = scraper.scrape_problem_tests(tests_contest_id, problem_letter) print(json.dumps(asdict(tests_result))) elif mode == "contests": @@ -352,13 +353,7 @@ def main() -> None: print(json.dumps(asdict(contest_result))) sys.exit(1) - contests = scrape_contests() - if not contests: - contest_result = ContestListResult(success=False, error="No contests found") - print(json.dumps(asdict(contest_result))) - sys.exit(1) - - contest_result = ContestListResult(success=True, error="", contests=contests) + contest_result = scraper.scrape_contest_list() print(json.dumps(asdict(contest_result))) else: @@ -369,6 +364,8 @@ def main() -> None: print(json.dumps(asdict(result))) sys.exit(1) + scraper.close() + if __name__ == "__main__": main() diff --git a/spec/picker_spec.lua b/spec/picker_spec.lua index 106fd03..6fd5a81 100644 --- a/spec/picker_spec.lua +++ b/spec/picker_spec.lua @@ -158,9 +158,7 @@ describe('cp.picker', function() end, } - package.loaded['cp.pickers.init'] = nil - package.loaded['cp.pickers'] = nil - picker = require('cp.pickers') + picker = spec_helper.fresh_require('cp.pickers', { 'cp.pickers.init' }) local problems = picker.get_problems_for_contest('test_platform', 'test_contest') assert.is_table(problems) @@ -183,6 +181,8 @@ describe('cp.picker', function() } end + picker = spec_helper.fresh_require('cp.pickers', { 'cp.pickers.init' }) + local problems = picker.get_problems_for_contest('test_platform', 'test_contest') assert.is_table(problems) assert.equals(0, #problems) diff --git a/spec/scraper_spec.lua b/spec/scraper_spec.lua index cc02b6b..c81f8e2 100644 --- a/spec/scraper_spec.lua +++ b/spec/scraper_spec.lua @@ -56,8 +56,7 @@ describe('cp.scrape', function() 
package.loaded['cp.cache'] = mock_cache package.loaded['cp.utils'] = mock_utils - package.loaded['cp.scrape'] = nil - scrape = require('cp.scrape') + scrape = spec_helper.fresh_require('cp.scrape') local original_fn = vim.fn vim.fn = vim.tbl_extend('force', vim.fn, { @@ -125,8 +124,7 @@ describe('cp.scrape', function() stored_data = { platform = platform, contest_id = contest_id, problems = problems } end - package.loaded['cp.scrape'] = nil - scrape = require('cp.scrape') + scrape = spec_helper.fresh_require('cp.scrape') local result = scrape.scrape_contest_metadata('atcoder', 'abc123') diff --git a/spec/snippets_spec.lua b/spec/snippets_spec.lua index ce34d3d..944e0d9 100644 --- a/spec/snippets_spec.lua +++ b/spec/snippets_spec.lua @@ -5,8 +5,7 @@ describe('cp.snippets', function() before_each(function() spec_helper.setup() - package.loaded['cp.snippets'] = nil - snippets = require('cp.snippets') + snippets = spec_helper.fresh_require('cp.snippets') mock_luasnip = { snippet = function(trigger, body) return { trigger = trigger, body = body } diff --git a/spec/spec_helper.lua b/spec/spec_helper.lua index e238a07..6f87157 100644 --- a/spec/spec_helper.lua +++ b/spec/spec_helper.lua @@ -121,6 +121,17 @@ function M.find_logged_message(pattern) return nil end +function M.fresh_require(module_name, additional_clears) + additional_clears = additional_clears or {} + + for _, clear_module in ipairs(additional_clears) do + package.loaded[clear_module] = nil + end + package.loaded[module_name] = nil + + return require(module_name) +end + function M.teardown() package.loaded['cp.log'] = nil package.loaded['cp.scrape'] = nil diff --git a/tests/scrapers/conftest.py b/tests/scrapers/conftest.py index 3248ec2..ecb8c77 100644 --- a/tests/scrapers/conftest.py +++ b/tests/scrapers/conftest.py @@ -4,6 +4,8 @@ import pytest @pytest.fixture def mock_codeforces_html(): return """ +
3
diff --git a/tests/scrapers/test_codeforces.py b/tests/scrapers/test_codeforces.py
index 14b263c..fd98b1b 100644
--- a/tests/scrapers/test_codeforces.py
+++ b/tests/scrapers/test_codeforces.py
@@ -1,61 +1,61 @@
from unittest.mock import Mock
-from scrapers.codeforces import scrape, scrape_contest_problems, scrape_contests
+from scrapers.codeforces import CodeforcesScraper
from scrapers.models import ContestSummary, ProblemSummary
def test_scrape_success(mocker, mock_codeforces_html):
- mock_scraper = Mock()
+ mock_client = Mock()
mock_response = Mock()
mock_response.text = mock_codeforces_html
- mock_scraper.get.return_value = mock_response
+ mock_client.get.return_value = mock_response
- mocker.patch(
- "scrapers.codeforces.cloudscraper.create_scraper", return_value=mock_scraper
- )
+ scraper = CodeforcesScraper()
+ mocker.patch.object(scraper, "_create_client", return_value=mock_client)
- result = scrape("https://codeforces.com/contest/1900/problem/A")
+ result = scraper.scrape_problem_tests("1900", "A")
- assert len(result) == 1
- assert result[0].input == "1\n3\n1 2 3"
- assert result[0].expected == "6"
+ assert result.success is True
+ assert len(result.tests) == 1
+ assert result.tests[0].input == "1\n3\n1 2 3"
+ assert result.tests[0].expected == "6"
def test_scrape_contest_problems(mocker):
- mock_scraper = Mock()
+ mock_client = Mock()
mock_response = Mock()
mock_response.text = """
A. Problem A
B. Problem B
"""
- mock_scraper.get.return_value = mock_response
+ mock_client.get.return_value = mock_response
- mocker.patch(
- "scrapers.codeforces.cloudscraper.create_scraper", return_value=mock_scraper
- )
+ scraper = CodeforcesScraper()
+ mocker.patch.object(scraper, "_create_client", return_value=mock_client)
- result = scrape_contest_problems("1900")
+ result = scraper.scrape_contest_metadata("1900")
- assert len(result) == 2
- assert result[0] == ProblemSummary(id="a", name="A. Problem A")
- assert result[1] == ProblemSummary(id="b", name="B. Problem B")
+ assert result.success is True
+ assert len(result.problems) == 2
+ assert result.problems[0] == ProblemSummary(id="a", name="A. Problem A")
+ assert result.problems[1] == ProblemSummary(id="b", name="B. Problem B")
def test_scrape_network_error(mocker):
- mock_scraper = Mock()
- mock_scraper.get.side_effect = Exception("Network error")
+ mock_client = Mock()
+ mock_client.get.side_effect = Exception("Network error")
- mocker.patch(
- "scrapers.codeforces.cloudscraper.create_scraper", return_value=mock_scraper
- )
+ scraper = CodeforcesScraper()
+ mocker.patch.object(scraper, "_create_client", return_value=mock_client)
- result = scrape("https://codeforces.com/contest/1900/problem/A")
+ result = scraper.scrape_problem_tests("1900", "A")
- assert result == []
+ assert result.success is False
+ assert "network error" in result.error.lower()
def test_scrape_contests_success(mocker):
- mock_scraper = Mock()
+ mock_client = Mock()
mock_response = Mock()
mock_response.json.return_value = {
"status": "OK",
@@ -65,26 +65,26 @@ def test_scrape_contests_success(mocker):
{"id": 1949, "name": "Codeforces Global Round 26"},
],
}
- mock_scraper.get.return_value = mock_response
+ mock_client.get.return_value = mock_response
- mocker.patch(
- "scrapers.codeforces.cloudscraper.create_scraper", return_value=mock_scraper
- )
+ scraper = CodeforcesScraper()
+ mocker.patch.object(scraper, "_create_client", return_value=mock_client)
- result = scrape_contests()
+ result = scraper.scrape_contest_list()
- assert len(result) == 3
- assert result[0] == ContestSummary(
+ assert result.success is True
+ assert len(result.contests) == 3
+ assert result.contests[0] == ContestSummary(
id="1951",
name="Educational Codeforces Round 168 (Rated for Div. 2)",
display_name="Educational Codeforces Round 168 (Rated for Div. 2)",
)
- assert result[1] == ContestSummary(
+ assert result.contests[1] == ContestSummary(
id="1950",
name="Codeforces Round 936 (Div. 2)",
display_name="Codeforces Round 936 (Div. 2)",
)
- assert result[2] == ContestSummary(
+ assert result.contests[2] == ContestSummary(
id="1949",
name="Codeforces Global Round 26",
display_name="Codeforces Global Round 26",
@@ -92,28 +92,28 @@ def test_scrape_contests_success(mocker):
def test_scrape_contests_api_error(mocker):
- mock_scraper = Mock()
+ mock_client = Mock()
mock_response = Mock()
mock_response.json.return_value = {"status": "FAILED", "result": []}
- mock_scraper.get.return_value = mock_response
+ mock_client.get.return_value = mock_response
- mocker.patch(
- "scrapers.codeforces.cloudscraper.create_scraper", return_value=mock_scraper
- )
+ scraper = CodeforcesScraper()
+ mocker.patch.object(scraper, "_create_client", return_value=mock_client)
- result = scrape_contests()
+ result = scraper.scrape_contest_list()
- assert result == []
+ assert result.success is False
+ assert "no contests found" in result.error.lower()
def test_scrape_contests_network_error(mocker):
- mock_scraper = Mock()
- mock_scraper.get.side_effect = Exception("Network error")
+ mock_client = Mock()
+ mock_client.get.side_effect = Exception("Network error")
- mocker.patch(
- "scrapers.codeforces.cloudscraper.create_scraper", return_value=mock_scraper
- )
+ scraper = CodeforcesScraper()
+ mocker.patch.object(scraper, "_create_client", return_value=mock_client)
- result = scrape_contests()
+ result = scraper.scrape_contest_list()
- assert result == []
+ assert result.success is False
+ assert "network error" in result.error.lower()