fix(ci): type scrapers

This commit is contained in:
Barrett Ruth 2025-09-18 22:14:13 -04:00
parent 8a6b5dc373
commit ffaec3b947
4 changed files with 92 additions and 85 deletions

View file

@ -52,7 +52,7 @@ def scrape_contest_problems(contest_id: str) -> list[dict[str, str]]:
if not task_table or not isinstance(task_table, Tag): if not task_table or not isinstance(task_table, Tag):
return [] return []
rows = task_table.find_all("tr")[1:] # skip header rows = task_table.find_all("tr")[1:]
problems: list[dict[str, str]] = [] problems: list[dict[str, str]] = []
for row in rows: for row in rows:
problem = extract_problem_from_row(row, contest_id) problem = extract_problem_from_row(row, contest_id)
@ -115,7 +115,7 @@ def scrape(url: str) -> list[tuple[str, str]]:
test_case = extract_test_case_from_headers(sample_headers, i) test_case = extract_test_case_from_headers(sample_headers, i)
if test_case: if test_case:
tests.append(test_case) tests.append(test_case)
i += 2 # move from "Sample Input n" to after "Sample Output n" i += 2
else: else:
i += 1 i += 1

View file

@ -2,12 +2,15 @@
import json import json
import sys import sys
from dataclasses import asdict
import cloudscraper import cloudscraper
from bs4 import BeautifulSoup, Tag from bs4 import BeautifulSoup, Tag
from .models import MetadataResult, Problem, TestCase, TestsResult
def scrape(url: str) -> list[tuple[str, str]]:
def scrape(url: str) -> list[TestCase]:
try: try:
scraper = cloudscraper.create_scraper() scraper = cloudscraper.create_scraper()
response = scraper.get(url, timeout=10) response = scraper.get(url, timeout=10)
@ -88,7 +91,7 @@ def scrape(url: str) -> list[tuple[str, str]]:
input_text = "\n".join(individual_inputs[test_num]) input_text = "\n".join(individual_inputs[test_num])
output_text = "\n".join(individual_outputs[test_num]) output_text = "\n".join(individual_outputs[test_num])
prefixed_input = "1\n" + input_text prefixed_input = "1\n" + input_text
tests.append((prefixed_input, output_text)) tests.append(TestCase(input=prefixed_input, expected=output_text))
return tests return tests
all_inputs = [] all_inputs = []
all_outputs = [] all_outputs = []
@ -124,7 +127,7 @@ def scrape(url: str) -> list[tuple[str, str]]:
combined_input = "\n".join(all_inputs) combined_input = "\n".join(all_inputs)
combined_output = "\n".join(all_outputs) combined_output = "\n".join(all_outputs)
return [(combined_input, combined_output)] return [TestCase(input=combined_input, expected=combined_output)]
except Exception as e: except Exception as e:
print(f"CloudScraper failed: {e}", file=sys.stderr) print(f"CloudScraper failed: {e}", file=sys.stderr)
@ -137,7 +140,7 @@ def parse_problem_url(contest_id: str, problem_letter: str) -> str:
) )
def scrape_contest_problems(contest_id: str) -> list[dict[str, str]]: def scrape_contest_problems(contest_id: str) -> list[Problem]:
try: try:
contest_url: str = f"https://codeforces.com/contest/{contest_id}" contest_url: str = f"https://codeforces.com/contest/{contest_id}"
scraper = cloudscraper.create_scraper() scraper = cloudscraper.create_scraper()
@ -145,7 +148,7 @@ def scrape_contest_problems(contest_id: str) -> list[dict[str, str]]:
response.raise_for_status() response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser") soup = BeautifulSoup(response.text, "html.parser")
problems: list[dict[str, str]] = [] problems: list[Problem] = []
problem_links = soup.find_all( problem_links = soup.find_all(
"a", href=lambda x: x and f"/contest/{contest_id}/problem/" in x "a", href=lambda x: x and f"/contest/{contest_id}/problem/" in x
@ -160,15 +163,15 @@ def scrape_contest_problems(contest_id: str) -> list[dict[str, str]]:
problem_name: str = link.get_text(strip=True) problem_name: str = link.get_text(strip=True)
if problem_letter and problem_name: if problem_letter and problem_name:
problems.append({"id": problem_letter, "name": problem_name}) problems.append(Problem(id=problem_letter, name=problem_name))
problems.sort(key=lambda x: x["id"]) problems.sort(key=lambda x: x.id)
seen: set[str] = set() seen: set[str] = set()
unique_problems: list[dict[str, str]] = [] unique_problems: list[Problem] = []
for p in problems: for p in problems:
if p["id"] not in seen: if p.id not in seen:
seen.add(p["id"]) seen.add(p.id)
unique_problems.append(p) unique_problems.append(p)
return unique_problems return unique_problems
@ -178,71 +181,50 @@ def scrape_contest_problems(contest_id: str) -> list[dict[str, str]]:
return [] return []
def scrape_sample_tests(url: str) -> list[tuple[str, str]]: def scrape_sample_tests(url: str) -> list[TestCase]:
print(f"Scraping: {url}", file=sys.stderr) print(f"Scraping: {url}", file=sys.stderr)
return scrape(url) return scrape(url)
def main() -> None: def main() -> None:
if len(sys.argv) < 2: if len(sys.argv) < 2:
print( result = MetadataResult(
json.dumps( success=False,
{ error="Usage: codeforces.py metadata <contest_id> OR codeforces.py tests <contest_id> <problem_letter>",
"success": False,
"error": "Usage: codeforces.py metadata <contest_id> OR codeforces.py tests <contest_id> <problem_letter>",
}
)
) )
print(json.dumps(asdict(result)))
sys.exit(1) sys.exit(1)
mode: str = sys.argv[1] mode: str = sys.argv[1]
if mode == "metadata": if mode == "metadata":
if len(sys.argv) != 3: if len(sys.argv) != 3:
print( result = MetadataResult(
json.dumps( success=False, error="Usage: codeforces.py metadata <contest_id>"
{
"success": False,
"error": "Usage: codeforces.py metadata <contest_id>",
}
)
) )
print(json.dumps(asdict(result)))
sys.exit(1) sys.exit(1)
contest_id: str = sys.argv[2] contest_id: str = sys.argv[2]
problems: list[dict[str, str]] = scrape_contest_problems(contest_id) problems: list[Problem] = scrape_contest_problems(contest_id)
if not problems: if not problems:
print( result = MetadataResult(
json.dumps( success=False, error=f"No problems found for contest {contest_id}"
{
"success": False,
"error": f"No problems found for contest {contest_id}",
}
)
) )
print(json.dumps(asdict(result)))
sys.exit(1) sys.exit(1)
print( result = MetadataResult(success=True, contest_id=contest_id, problems=problems)
json.dumps( print(json.dumps(asdict(result)))
{
"success": True,
"contest_id": contest_id,
"problems": problems,
}
)
)
elif mode == "tests": elif mode == "tests":
if len(sys.argv) != 4: if len(sys.argv) != 4:
print( tests_result = TestsResult(
json.dumps( success=False,
{ error="Usage: codeforces.py tests <contest_id> <problem_letter>",
"success": False,
"error": "Usage: codeforces.py tests <contest_id> <problem_letter>",
}
)
) )
print(json.dumps(asdict(tests_result)))
sys.exit(1) sys.exit(1)
tests_contest_id: str = sys.argv[2] tests_contest_id: str = sys.argv[2]
@ -250,45 +232,28 @@ def main() -> None:
problem_id: str = tests_contest_id + problem_letter.lower() problem_id: str = tests_contest_id + problem_letter.lower()
url: str = parse_problem_url(tests_contest_id, problem_letter) url: str = parse_problem_url(tests_contest_id, problem_letter)
tests: list[tuple[str, str]] = scrape_sample_tests(url) tests: list[TestCase] = scrape_sample_tests(url)
if not tests: if not tests:
print( tests_result = TestsResult(
json.dumps( success=False,
{ error=f"No tests found for {tests_contest_id} {problem_letter}",
"success": False, problem_id=problem_id,
"error": f"No tests found for {tests_contest_id} {problem_letter}", url=url,
"problem_id": problem_id,
"url": url,
}
)
) )
print(json.dumps(asdict(tests_result)))
sys.exit(1) sys.exit(1)
test_list: list[dict[str, str]] = [] tests_result = TestsResult(
for input_data, output_data in tests: success=True, problem_id=problem_id, url=url, tests=tests
test_list.append({"input": input_data, "expected": output_data})
print(
json.dumps(
{
"success": True,
"problem_id": problem_id,
"url": url,
"tests": test_list,
}
)
) )
print(json.dumps(asdict(tests_result)))
else: else:
print( result = MetadataResult(
json.dumps( success=False, error=f"Unknown mode: {mode}. Use 'metadata' or 'tests'"
{
"success": False,
"error": f"Unknown mode: {mode}. Use 'metadata' or 'tests'",
}
)
) )
print(json.dumps(asdict(result)))
sys.exit(1) sys.exit(1)

41
scrapers/models.py Normal file
View file

@ -0,0 +1,41 @@
from dataclasses import dataclass
@dataclass
class TestCase:
    """One scraped sample test: the raw stdin text and the expected stdout."""

    # Field names match the keys emitted in the scrapers' JSON output.
    input: str
    expected: str
@dataclass
class Problem:
    """A single contest problem: its letter/identifier and display name."""

    # `id` is the problem letter (e.g. "a"); kept lowercase by the scrapers.
    id: str
    name: str
@dataclass
class ScrapingResult:
success: bool
error: str | None = None
@dataclass
class MetadataResult(ScrapingResult):
    """Result payload for `metadata` mode: the contest's problem list.

    NOTE(review): unlike `problems`, `categories` is NOT normalized to a
    container in `__post_init__` — presumably None means "no grouping";
    confirm against callers before relying on it.
    """

    contest_id: str | None = None
    problems: list[Problem] | None = None
    categories: dict[str, list[Problem]] | None = None

    def __post_init__(self) -> None:
        # Normalize the mutable default: a missing problem list becomes [].
        self.problems = [] if self.problems is None else self.problems
@dataclass
class TestsResult(ScrapingResult):
    """Result payload for `tests` mode: scraped sample test cases."""

    problem_id: str = ""
    url: str = ""
    tests: list[TestCase] | None = None

    def __post_init__(self) -> None:
        # Normalize the mutable default: a missing test list becomes [].
        self.tests = [] if self.tests is None else self.tests

View file

@ -1,5 +1,6 @@
from unittest.mock import Mock from unittest.mock import Mock
from scrapers.codeforces import scrape, scrape_contest_problems from scrapers.codeforces import scrape, scrape_contest_problems
from scrapers.models import Problem, TestCase
def test_scrape_success(mocker, mock_codeforces_html): def test_scrape_success(mocker, mock_codeforces_html):
@ -15,8 +16,8 @@ def test_scrape_success(mocker, mock_codeforces_html):
result = scrape("https://codeforces.com/contest/1900/problem/A") result = scrape("https://codeforces.com/contest/1900/problem/A")
assert len(result) == 1 assert len(result) == 1
assert result[0][0] == "1\n3\n1 2 3" assert result[0].input == "1\n3\n1 2 3"
assert result[0][1] == "6" assert result[0].expected == "6"
def test_scrape_contest_problems(mocker): def test_scrape_contest_problems(mocker):
@ -35,8 +36,8 @@ def test_scrape_contest_problems(mocker):
result = scrape_contest_problems("1900") result = scrape_contest_problems("1900")
assert len(result) == 2 assert len(result) == 2
assert result[0] == {"id": "a", "name": "A. Problem A"} assert result[0] == Problem(id="a", name="A. Problem A")
assert result[1] == {"id": "b", "name": "B. Problem B"} assert result[1] == Problem(id="b", name="B. Problem B")
def test_scrape_network_error(mocker): def test_scrape_network_error(mocker):