Merge pull request #120 from barrett-ruth/fix/docs

better scraper config
This commit is contained in:
Barrett Ruth 2025-10-01 04:39:08 +02:00 committed by GitHub
commit 1b0b5e5039
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 18 additions and 29 deletions

View file

@@ -28,7 +28,7 @@ COMMANDS *cp-commands*
cp.nvim uses a single :CP command with intelligent argument parsing: cp.nvim uses a single :CP command with intelligent argument parsing:
State Restoration ~ State Restoration ~
:CP Restore contest context from current file. :CP Restore state from current file.
Automatically detects platform, contest, problem, Automatically detects platform, contest, problem,
and language from cached state. Use this after and language from cached state. Use this after
switching files to restore your CP environment. switching files to restore your CP environment.

View file

@@ -7,7 +7,7 @@ from dataclasses import asdict
import requests import requests
from bs4 import BeautifulSoup, Tag from bs4 import BeautifulSoup, Tag
from scrapling.fetchers import StealthySession from scrapling.fetchers import StealthyFetcher
from .base import BaseScraper from .base import BaseScraper
from .models import ( from .models import (
@@ -22,9 +22,8 @@ from .models import (
def scrape(url: str) -> list[TestCase]: def scrape(url: str) -> list[TestCase]:
try: try:
with StealthySession(headless=True, solve_cloudflare=True) as session: page = StealthyFetcher.fetch(url, headless=True, solve_cloudflare=True)
page = session.fetch(url, google_search=False) html = page.html_content
html = page.html_content
soup = BeautifulSoup(html, "html.parser") soup = BeautifulSoup(html, "html.parser")
input_sections = soup.find_all("div", class_="input") input_sections = soup.find_all("div", class_="input")
@@ -181,9 +180,8 @@ def extract_problem_limits(soup: BeautifulSoup) -> tuple[int, float]:
def scrape_contest_problems(contest_id: str) -> list[ProblemSummary]: def scrape_contest_problems(contest_id: str) -> list[ProblemSummary]:
try: try:
contest_url: str = f"https://codeforces.com/contest/{contest_id}" contest_url: str = f"https://codeforces.com/contest/{contest_id}"
with StealthySession(headless=True, solve_cloudflare=True) as session: page = StealthyFetcher.fetch(contest_url, headless=True, solve_cloudflare=True)
page = session.fetch(contest_url, google_search=False) html = page.html_content
html = page.html_content
soup = BeautifulSoup(html, "html.parser") soup = BeautifulSoup(html, "html.parser")
problems: list[ProblemSummary] = [] problems: list[ProblemSummary] = []
@@ -276,9 +274,8 @@ class CodeforcesScraper(BaseScraper):
url = parse_problem_url(contest_id, problem_letter) url = parse_problem_url(contest_id, problem_letter)
tests = scrape_sample_tests(url) tests = scrape_sample_tests(url)
with StealthySession(headless=True, solve_cloudflare=True) as session: page = StealthyFetcher.fetch(url, headless=True, solve_cloudflare=True)
page = session.fetch(url, google_search=False) html = page.html_content
html = page.html_content
soup = BeautifulSoup(html, "html.parser") soup = BeautifulSoup(html, "html.parser")
timeout_ms, memory_mb = extract_problem_limits(soup) timeout_ms, memory_mb = extract_problem_limits(soup)

View file

@@ -4,18 +4,10 @@ from scrapers.codeforces import CodeforcesScraper
from scrapers.models import ContestSummary, ProblemSummary from scrapers.models import ContestSummary, ProblemSummary
def make_mock_session(html: str):
"""Return a mock StealthySession that yields the given HTML."""
mock_session = Mock()
mock_session.fetch.return_value.html_content = html
mock_session.__enter__ = lambda s: s
mock_session.__exit__ = lambda s, exc_type, exc_val, exc_tb: None
return mock_session
def test_scrape_success(mocker, mock_codeforces_html): def test_scrape_success(mocker, mock_codeforces_html):
mock_session = make_mock_session(mock_codeforces_html) mock_page = Mock()
mocker.patch("scrapers.codeforces.StealthySession", return_value=mock_session) mock_page.html_content = mock_codeforces_html
mocker.patch("scrapers.codeforces.StealthyFetcher.fetch", return_value=mock_page)
scraper = CodeforcesScraper() scraper = CodeforcesScraper()
result = scraper.scrape_problem_tests("1900", "A") result = scraper.scrape_problem_tests("1900", "A")
@@ -31,8 +23,9 @@ def test_scrape_contest_problems(mocker):
<a href="/contest/1900/problem/A">A. Problem A</a> <a href="/contest/1900/problem/A">A. Problem A</a>
<a href="/contest/1900/problem/B">B. Problem B</a> <a href="/contest/1900/problem/B">B. Problem B</a>
""" """
mock_session = make_mock_session(html) mock_page = Mock()
mocker.patch("scrapers.codeforces.StealthySession", return_value=mock_session) mock_page.html_content = html
mocker.patch("scrapers.codeforces.StealthyFetcher.fetch", return_value=mock_page)
scraper = CodeforcesScraper() scraper = CodeforcesScraper()
result = scraper.scrape_contest_metadata("1900") result = scraper.scrape_contest_metadata("1900")
@@ -44,11 +37,10 @@ def test_scrape_contest_problems(mocker):
def test_scrape_network_error(mocker): def test_scrape_network_error(mocker):
mock_session = Mock() mocker.patch(
mock_session.fetch.side_effect = Exception("Network error") "scrapers.codeforces.StealthyFetcher.fetch",
mock_session.__enter__ = lambda s: s side_effect=Exception("Network error"),
mock_session.__exit__ = lambda s, exc_type, exc_val, exc_tb: None )
mocker.patch("scrapers.codeforces.StealthySession", return_value=mock_session)
scraper = CodeforcesScraper() scraper = CodeforcesScraper()
result = scraper.scrape_problem_tests("1900", "A") result = scraper.scrape_problem_tests("1900", "A")