fix(scraper): use scrapling

This commit is contained in:
Barrett Ruth 2025-09-30 20:16:59 -04:00
parent 5d7719ec4a
commit 49ba922ff7
3 changed files with 978 additions and 48 deletions

View file

@@ -7,8 +7,10 @@ requires-python = ">=3.11"
 dependencies = [
     "backoff>=2.2.1",
     "beautifulsoup4>=4.13.5",
-    "cloudscraper>=1.2.71",
+    "curl-cffi>=0.13.0",
+    "playwright>=1.55.0",
     "requests>=2.32.5",
+    "scrapling[fetchers]>=0.3.5",
     "scrapy>=2.13.3",
 ]

View file

@@ -5,8 +5,9 @@ import re
 import sys
 from dataclasses import asdict

-import cloudscraper
+import requests
 from bs4 import BeautifulSoup, Tag
+from scrapling.fetchers import StealthySession

 from .base import BaseScraper
 from .models import (
@@ -21,11 +22,11 @@ from .models import (
 def scrape(url: str) -> list[TestCase]:
     try:
-        scraper = cloudscraper.create_scraper()
-        response = scraper.get(url, timeout=10)
-        response.raise_for_status()
-        soup = BeautifulSoup(response.text, "html.parser")
+        with StealthySession(headless=True, solve_cloudflare=True) as session:
+            page = session.fetch(url, google_search=False)
+            html = page.html_content
+        soup = BeautifulSoup(html, "html.parser")

         input_sections = soup.find_all("div", class_="input")
         output_sections = soup.find_all("div", class_="output")
@@ -139,7 +140,7 @@ def scrape(url: str) -> list[TestCase]:
         return [TestCase(input=combined_input, expected=combined_output)]
     except Exception as e:
-        print(f"CloudScraper failed: {e}", file=sys.stderr)
+        print(f"Scrapling failed: {e}", file=sys.stderr)
         return []
@@ -180,11 +181,11 @@ def extract_problem_limits(soup: BeautifulSoup) -> tuple[int, float]:
 def scrape_contest_problems(contest_id: str) -> list[ProblemSummary]:
     try:
         contest_url: str = f"https://codeforces.com/contest/{contest_id}"
-        scraper = cloudscraper.create_scraper()
-        response = scraper.get(contest_url, timeout=10)
-        response.raise_for_status()
-        soup = BeautifulSoup(response.text, "html.parser")
+        with StealthySession(headless=True, solve_cloudflare=True) as session:
+            page = session.fetch(contest_url, google_search=False)
+            html = page.html_content
+        soup = BeautifulSoup(html, "html.parser")

         problems: list[ProblemSummary] = []
         problem_links = soup.find_all(
@@ -224,8 +225,7 @@ def scrape_sample_tests(url: str) -> list[TestCase]:
 def scrape_contests() -> list[ContestSummary]:
-    scraper = cloudscraper.create_scraper()
-    response = scraper.get("https://codeforces.com/api/contest.list", timeout=10)
+    response = requests.get("https://codeforces.com/api/contest.list", timeout=10)
     response.raise_for_status()
     data = response.json()
@@ -236,7 +236,6 @@ def scrape_contests() -> list[ContestSummary]:
     for contest in data["result"]:
        contest_id = str(contest["id"])
        name = contest["name"]
-
        contests.append(ContestSummary(id=contest_id, name=name, display_name=name))
    return contests
(NOTE: the hunk counts -7/+6 indicate one pure deletion; the deleted line is not visible in the side-by-side extraction — a removed blank line is the only reading consistent with the surrounding unchanged pairs. Verify against the original commit.)
@@ -277,10 +276,10 @@ class CodeforcesScraper(BaseScraper):
         url = parse_problem_url(contest_id, problem_letter)
         tests = scrape_sample_tests(url)

-        scraper = cloudscraper.create_scraper()
-        response = scraper.get(url, timeout=self.config.timeout_seconds)
-        response.raise_for_status()
-        soup = BeautifulSoup(response.text, "html.parser")
+        with StealthySession(headless=True, solve_cloudflare=True) as session:
+            page = session.fetch(url, google_search=False)
+            html = page.html_content
+        soup = BeautifulSoup(html, "html.parser")

         timeout_ms, memory_mb = extract_problem_limits(soup)
         problem_statement_div = soup.find("div", class_="problem-statement")

uv.lock (generated): 989 lines changed — file diff suppressed because it is too large. Load diff.