From 8e13b8c61d2e9bf1d0b380d82d02d2b80e70b2c3 Mon Sep 17 00:00:00 2001 From: Barrett Ruth Date: Sat, 20 Sep 2025 14:01:18 -0400 Subject: [PATCH] feat(cses): update cses with concept of a category --- scrapers/atcoder.py | 10 ++- scrapers/codeforces.py | 74 +++++++++++++++++++- scrapers/cses.py | 134 +++++++++++++++++++++++++++++++----- scrapers/models.py | 12 ++++ tests/scrapers/test_cses.py | 97 ++++++++++++++++++++++++-- 5 files changed, 299 insertions(+), 28 deletions(-) diff --git a/scrapers/atcoder.py b/scrapers/atcoder.py index e251c44..fbc1453 100644 --- a/scrapers/atcoder.py +++ b/scrapers/atcoder.py @@ -3,12 +3,20 @@ import json import re import sys +import time from dataclasses import asdict import requests from bs4 import BeautifulSoup, Tag -from .models import MetadataResult, ProblemSummary, TestCase, TestsResult +from .models import ( + ContestListResult, + ContestSummary, + MetadataResult, + ProblemSummary, + TestCase, + TestsResult, +) def extract_problem_limits(soup: BeautifulSoup) -> tuple[int, float]: diff --git a/scrapers/codeforces.py b/scrapers/codeforces.py index a66acbd..b4f6409 100644 --- a/scrapers/codeforces.py +++ b/scrapers/codeforces.py @@ -7,7 +7,14 @@ from dataclasses import asdict import cloudscraper from bs4 import BeautifulSoup, Tag -from .models import MetadataResult, ProblemSummary, TestCase, TestsResult +from .models import ( + ContestListResult, + ContestSummary, + MetadataResult, + ProblemSummary, + TestCase, + TestsResult, +) def scrape(url: str) -> list[TestCase]: @@ -218,11 +225,54 @@ def scrape_sample_tests(url: str) -> list[TestCase]: return scrape(url) +def scrape_contests() -> list[ContestSummary]: + try: + scraper = cloudscraper.create_scraper() + response = scraper.get("https://codeforces.com/api/contest.list", timeout=10) + response.raise_for_status() + + data = response.json() + if data["status"] != "OK": + return [] + + contests = [] + for contest in data["result"]: + contest_id = str(contest["id"]) + name = contest["name"] + + # Clean up contest names for display + display_name = name + if "Educational Codeforces Round" in name: + import re + + match = re.search(r"Educational Codeforces Round (\d+)", name) + if match: + display_name = f"Educational Round {match.group(1)}" + elif "Codeforces Round" in name and "Div" in name: + match = re.search(r"Codeforces Round (\d+) \(Div\. (\d+)\)", name) + if match: + display_name = f"Round {match.group(1)} (Div. {match.group(2)})" + elif "Codeforces Global Round" in name: + match = re.search(r"Codeforces Global Round (\d+)", name) + if match: + display_name = f"Global Round {match.group(1)}" + + contests.append( + ContestSummary(id=contest_id, name=name, display_name=display_name) + ) + + return contests[:100] # Limit to recent 100 contests + + except Exception as e: + print(f"Failed to fetch contests: {e}", file=sys.stderr) + return [] + + def main() -> None: if len(sys.argv) < 2: result = MetadataResult( success=False, - error="Usage: codeforces.py metadata OR codeforces.py tests ", + error="Usage: codeforces.py metadata OR codeforces.py tests OR codeforces.py contests", ) print(json.dumps(asdict(result))) sys.exit(1) @@ -316,9 +366,27 @@ def main() -> None: ) print(json.dumps(asdict(tests_result))) + elif mode == "contests": + if len(sys.argv) != 2: + contest_result = ContestListResult( + success=False, error="Usage: codeforces.py contests" + ) + print(json.dumps(asdict(contest_result))) + sys.exit(1) + + contests = scrape_contests() + if not contests: + contest_result = ContestListResult(success=False, error="No contests found") + print(json.dumps(asdict(contest_result))) + sys.exit(1) + + contest_result = ContestListResult(success=True, error="", contests=contests) + print(json.dumps(asdict(contest_result))) + else: result = MetadataResult( - success=False, error=f"Unknown mode: {mode}. Use 'metadata' or 'tests'" + success=False, + error=f"Unknown mode: {mode}. Use 'metadata', 'tests', or 'contests'", ) print(json.dumps(asdict(result))) sys.exit(1) diff --git a/scrapers/cses.py b/scrapers/cses.py index edf3224..e07e2ec 100755 --- a/scrapers/cses.py +++ b/scrapers/cses.py @@ -11,6 +11,85 @@ from bs4 import BeautifulSoup, Tag from .models import MetadataResult, ProblemSummary, TestCase, TestsResult +def normalize_category_name(category_name: str) -> str: + return category_name.lower().replace(" ", "_").replace("&", "and") + + +def denormalize_category_name(category_id: str) -> str: + category_map = { + "introductory_problems": "Introductory Problems", + "sorting_and_searching": "Sorting and Searching", + "dynamic_programming": "Dynamic Programming", + "graph_algorithms": "Graph Algorithms", + "range_queries": "Range Queries", + "tree_algorithms": "Tree Algorithms", + "mathematics": "Mathematics", + "string_algorithms": "String Algorithms", + "geometry": "Geometry", + "advanced_techniques": "Advanced Techniques", + } + + return category_map.get(category_id, category_id.replace("_", " ").title()) + + +def scrape_category_problems(category_id: str) -> list[ProblemSummary]: + category_name = denormalize_category_name(category_id) + + try: + problemset_url = "https://cses.fi/problemset/" + headers = { + "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" + } + + response = requests.get(problemset_url, headers=headers, timeout=10) + response.raise_for_status() + + soup = BeautifulSoup(response.text, "html.parser") + + current_category = None + problems = [] + target_found = False + + for element in soup.find_all(["h1", "h2", "ul"]): + if not isinstance(element, Tag): + continue + if element.name in ["h1", "h2"]: + text = element.get_text(strip=True) + if not text or text.startswith("CSES") or text == "CSES Problem Set": + continue + + if target_found and current_category != text: + break + + current_category = text + if text.lower() == category_name.lower(): + target_found = True + + elif element.name == "ul" and current_category and target_found: + problem_links = element.find_all( + "a", href=lambda x: x and "/problemset/task/" in x + ) + for link in problem_links: + href = link.get("href", "") + if not href: + continue + + problem_id = href.split("/")[-1] + problem_name = link.get_text(strip=True) + + if not problem_id.isdigit() or not problem_name: + continue + + problems.append(ProblemSummary(id=problem_id, name=problem_name)) + + problems.sort(key=lambda x: int(x.id)) + return problems + + except Exception as e: + print(f"Failed to scrape CSES category {category_id}: {e}", file=sys.stderr) + return [] + + def parse_problem_url(problem_input: str) -> str | None: if problem_input.startswith("https://cses.fi/problemset/task/"): return problem_input @@ -94,21 +173,39 @@ def scrape_all_problems() -> dict[str, list[ProblemSummary]]: soup = BeautifulSoup(response.text, "html.parser") all_categories: dict[str, list[ProblemSummary]] = {} - problem_links = soup.find_all( - "a", href=lambda x: x and "/problemset/task/" in x - ) - print(f"Found {len(problem_links)} problem links", file=sys.stderr) - current_category = None - for element in soup.find_all(["h1", "a"]): - current_category = process_problem_element( - element, current_category, all_categories - ) + for element in soup.find_all(["h1", "h2", "ul"]): + if not isinstance(element, Tag): + continue + if element.name in ["h1", "h2"]: + text = element.get_text(strip=True) + if text and not text.startswith("CSES") and text != "CSES Problem Set": + current_category = text + if current_category not in all_categories: + all_categories[current_category] = [] + print(f"Found category: {current_category}", file=sys.stderr) + + elif element.name == "ul" and current_category: + problem_links = element.find_all( + "a", href=lambda x: x and "/problemset/task/" in x + ) + for link in problem_links: + href = link.get("href", "") + if href: + problem_id = href.split("/")[-1] + problem_name = link.get_text(strip=True) + + if problem_id.isdigit() and problem_name: + problem = ProblemSummary(id=problem_id, name=problem_name) + all_categories[current_category].append(problem) for category in all_categories: all_categories[category].sort(key=lambda x: int(x.id)) - print(f"Found {len(all_categories)} categories", file=sys.stderr) + print( + f"Found {len(all_categories)} categories with {sum(len(probs) for probs in all_categories.values())} problems", + file=sys.stderr, + ) return all_categories except Exception as e: @@ -170,7 +267,7 @@ def main() -> None: if len(sys.argv) < 2: result = MetadataResult( success=False, - error="Usage: cses.py metadata OR cses.py tests ", + error="Usage: cses.py metadata OR cses.py tests ", ) print(json.dumps(asdict(result))) sys.exit(1) @@ -178,25 +275,26 @@ def main() -> None: mode: str = sys.argv[1] if mode == "metadata": - if len(sys.argv) != 2: + if len(sys.argv) != 3: result = MetadataResult( success=False, - error="Usage: cses.py metadata", + error="Usage: cses.py metadata ", ) print(json.dumps(asdict(result))) sys.exit(1) - all_categories: dict[str, list[ProblemSummary]] = scrape_all_problems() + category_id = sys.argv[2] + problems = scrape_category_problems(category_id) - if not all_categories: + if not problems: result = MetadataResult( success=False, - error="Failed to scrape CSES problem categories", + error=f"No problems found for category: {category_id}", ) print(json.dumps(asdict(result))) - sys.exit(1) + return - result = MetadataResult(success=True, error="", categories=all_categories) + result = MetadataResult(success=True, error="", problems=problems) print(json.dumps(asdict(result))) elif mode == "tests": diff --git a/scrapers/models.py b/scrapers/models.py index 728e9bb..318404d 100644 --- a/scrapers/models.py +++ b/scrapers/models.py @@ -13,6 +13,13 @@ class ProblemSummary: name: str +@dataclass +class ContestSummary: + id: str + name: str + display_name: str + + @dataclass class ScrapingResult: success: bool @@ -26,6 +33,11 @@ class MetadataResult(ScrapingResult): categories: dict[str, list[ProblemSummary]] = field(default_factory=dict) +@dataclass +class ContestListResult(ScrapingResult): + contests: list[ContestSummary] = field(default_factory=list) + + @dataclass class TestsResult(ScrapingResult): problem_id: str diff --git a/tests/scrapers/test_cses.py b/tests/scrapers/test_cses.py index c91b0f8..68af557 100644 --- a/tests/scrapers/test_cses.py +++ b/tests/scrapers/test_cses.py @@ -1,5 +1,12 @@ from unittest.mock import Mock -from scrapers.cses import scrape, scrape_all_problems + +from scrapers.cses import ( + denormalize_category_name, + normalize_category_name, + scrape, + scrape_all_problems, + scrape_category_problems, +) from scrapers.models import ProblemSummary @@ -19,12 +26,19 @@ def test_scrape_success(mocker, mock_cses_html): def test_scrape_all_problems(mocker): mock_response = Mock() mock_response.text = """ -

Introductory Problems

- Weird Algorithm - Missing Number -

Sorting and Searching

- Apartments +
+

Introductory Problems

+ +

Sorting and Searching

+ +
""" + mock_response.raise_for_status = Mock() mocker.patch("scrapers.cses.requests.get", return_value=mock_response) @@ -45,3 +59,74 @@ def test_scrape_network_error(mocker): result = scrape("https://cses.fi/problemset/task/1068") assert result == [] + + +def test_normalize_category_name(): + assert normalize_category_name("Sorting and Searching") == "sorting_and_searching" + assert normalize_category_name("Dynamic Programming") == "dynamic_programming" + assert normalize_category_name("Graph Algorithms") == "graph_algorithms" + + +def test_denormalize_category_name(): + assert denormalize_category_name("sorting_and_searching") == "Sorting and Searching" + assert denormalize_category_name("dynamic_programming") == "Dynamic Programming" + assert denormalize_category_name("graph_algorithms") == "Graph Algorithms" + + +def test_scrape_category_problems_success(mocker): + mock_response = Mock() + mock_response.text = """ +
+

General

+ +

Sorting and Searching

+ +

Dynamic Programming

+ +
+ """ + mock_response.raise_for_status = Mock() + + mocker.patch("scrapers.cses.requests.get", return_value=mock_response) + + result = scrape_category_problems("sorting_and_searching") + + assert len(result) == 2 + assert result[0].id == "1640" + assert result[0].name == "Sum of Two Values" + assert result[1].id == "1643" + assert result[1].name == "Maximum Subarray Sum" + + +def test_scrape_category_problems_not_found(mocker): + mock_response = Mock() + mock_response.text = """ +
+

Some Other Category

+ +
+ """ + mock_response.raise_for_status = Mock() + + mocker.patch("scrapers.cses.requests.get", return_value=mock_response) + + result = scrape_category_problems("nonexistent_category") + + assert result == [] + + +def test_scrape_category_problems_network_error(mocker): + mocker.patch("scrapers.cses.requests.get", side_effect=Exception("Network error")) + + result = scrape_category_problems("sorting_and_searching") + + assert result == []