feat(cses): update cses with concept of a category

This commit is contained in:
Barrett Ruth 2025-09-20 14:01:18 -04:00
parent 8df38d0ca8
commit 8e13b8c61d
5 changed files with 299 additions and 28 deletions

View file

@ -3,12 +3,20 @@
import json import json
import re import re
import sys import sys
import time
from dataclasses import asdict from dataclasses import asdict
import requests import requests
from bs4 import BeautifulSoup, Tag from bs4 import BeautifulSoup, Tag
from .models import MetadataResult, ProblemSummary, TestCase, TestsResult from .models import (
ContestListResult,
ContestSummary,
MetadataResult,
ProblemSummary,
TestCase,
TestsResult,
)
def extract_problem_limits(soup: BeautifulSoup) -> tuple[int, float]: def extract_problem_limits(soup: BeautifulSoup) -> tuple[int, float]:

View file

@ -7,7 +7,14 @@ from dataclasses import asdict
import cloudscraper import cloudscraper
from bs4 import BeautifulSoup, Tag from bs4 import BeautifulSoup, Tag
from .models import MetadataResult, ProblemSummary, TestCase, TestsResult from .models import (
ContestListResult,
ContestSummary,
MetadataResult,
ProblemSummary,
TestCase,
TestsResult,
)
def scrape(url: str) -> list[TestCase]: def scrape(url: str) -> list[TestCase]:
@ -218,11 +225,54 @@ def scrape_sample_tests(url: str) -> list[TestCase]:
return scrape(url) return scrape(url)
def scrape_contests() -> list[ContestSummary]:
    """Fetch the Codeforces contest list from the official API.

    Returns at most 100 ContestSummary entries in API order, each carrying
    a shortened display_name for the common contest formats (Educational,
    numbered Div rounds, Global rounds). Returns [] on any failure
    (network error, non-OK API status, unexpected payload shape).
    """
    # Hoisted to function scope: the previous version ran `import re`
    # inside only the "Educational" branch, so the other branches raised
    # NameError on `re` whenever they executed first — swallowed by the
    # broad except below, silently yielding an empty contest list.
    import re

    try:
        scraper = cloudscraper.create_scraper()
        response = scraper.get("https://codeforces.com/api/contest.list", timeout=10)
        response.raise_for_status()

        data = response.json()
        if data["status"] != "OK":
            return []

        contests = []
        for contest in data["result"]:
            contest_id = str(contest["id"])
            name = contest["name"]

            # Clean up contest names for display
            display_name = name
            if "Educational Codeforces Round" in name:
                match = re.search(r"Educational Codeforces Round (\d+)", name)
                if match:
                    display_name = f"Educational Round {match.group(1)}"
            elif "Codeforces Round" in name and "Div" in name:
                match = re.search(r"Codeforces Round (\d+) \(Div\. (\d+)\)", name)
                if match:
                    display_name = f"Round {match.group(1)} (Div. {match.group(2)})"
            elif "Codeforces Global Round" in name:
                match = re.search(r"Codeforces Global Round (\d+)", name)
                if match:
                    display_name = f"Global Round {match.group(1)}"

            contests.append(
                ContestSummary(id=contest_id, name=name, display_name=display_name)
            )

        return contests[:100]  # Limit to recent 100 contests
    except Exception as e:
        print(f"Failed to fetch contests: {e}", file=sys.stderr)
        return []
def main() -> None: def main() -> None:
if len(sys.argv) < 2: if len(sys.argv) < 2:
result = MetadataResult( result = MetadataResult(
success=False, success=False,
error="Usage: codeforces.py metadata <contest_id> OR codeforces.py tests <contest_id> <problem_letter>", error="Usage: codeforces.py metadata <contest_id> OR codeforces.py tests <contest_id> <problem_letter> OR codeforces.py contests",
) )
print(json.dumps(asdict(result))) print(json.dumps(asdict(result)))
sys.exit(1) sys.exit(1)
@ -316,9 +366,27 @@ def main() -> None:
) )
print(json.dumps(asdict(tests_result))) print(json.dumps(asdict(tests_result)))
elif mode == "contests":
if len(sys.argv) != 2:
contest_result = ContestListResult(
success=False, error="Usage: codeforces.py contests"
)
print(json.dumps(asdict(contest_result)))
sys.exit(1)
contests = scrape_contests()
if not contests:
contest_result = ContestListResult(success=False, error="No contests found")
print(json.dumps(asdict(contest_result)))
sys.exit(1)
contest_result = ContestListResult(success=True, error="", contests=contests)
print(json.dumps(asdict(contest_result)))
else: else:
result = MetadataResult( result = MetadataResult(
success=False, error=f"Unknown mode: {mode}. Use 'metadata' or 'tests'" success=False,
error=f"Unknown mode: {mode}. Use 'metadata', 'tests', or 'contests'",
) )
print(json.dumps(asdict(result))) print(json.dumps(asdict(result)))
sys.exit(1) sys.exit(1)

View file

@ -11,6 +11,85 @@ from bs4 import BeautifulSoup, Tag
from .models import MetadataResult, ProblemSummary, TestCase, TestsResult from .models import MetadataResult, ProblemSummary, TestCase, TestsResult
def normalize_category_name(category_name: str) -> str:
    """Turn a display category name into its id form.

    Lowercases, converts spaces to underscores, and spells out "&",
    e.g. "Sorting and Searching" -> "sorting_and_searching".
    """
    lowered = category_name.lower()
    underscored = lowered.replace(" ", "_")
    return underscored.replace("&", "and")
def denormalize_category_name(category_id: str) -> str:
    """Map a category id back to its CSES display name.

    Known ids resolve through a fixed table; any other id falls back to
    replacing underscores with spaces and title-casing the result.
    """
    known_names = {
        "introductory_problems": "Introductory Problems",
        "sorting_and_searching": "Sorting and Searching",
        "dynamic_programming": "Dynamic Programming",
        "graph_algorithms": "Graph Algorithms",
        "range_queries": "Range Queries",
        "tree_algorithms": "Tree Algorithms",
        "mathematics": "Mathematics",
        "string_algorithms": "String Algorithms",
        "geometry": "Geometry",
        "advanced_techniques": "Advanced Techniques",
    }
    if category_id in known_names:
        return known_names[category_id]
    return category_id.replace("_", " ").title()
def scrape_category_problems(category_id: str) -> list[ProblemSummary]:
    """Scrape the CSES problemset page for one category's problems.

    Walks the page's headings and lists in document order, collecting
    problem links only while inside the heading that matches the
    denormalized *category_id*, and stops at the next distinct heading.
    Returns the problems sorted by numeric id, or [] on any failure.
    """
    wanted_name = denormalize_category_name(category_id)
    try:
        response = requests.get(
            "https://cses.fi/problemset/",
            headers={
                "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
            },
            timeout=10,
        )
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        collected: list[ProblemSummary] = []
        heading: str | None = None
        inside_target = False

        for node in soup.find_all(["h1", "h2", "ul"]):
            if not isinstance(node, Tag):
                continue
            if node.name in ["h1", "h2"]:
                text = node.get_text(strip=True)
                # Skip the page title and empty headings.
                if not text or text.startswith("CSES") or text == "CSES Problem Set":
                    continue
                # A different heading after the target means we've passed it.
                if inside_target and heading != text:
                    break
                heading = text
                if text.lower() == wanted_name.lower():
                    inside_target = True
            elif node.name == "ul" and heading and inside_target:
                for link in node.find_all(
                    "a", href=lambda x: x and "/problemset/task/" in x
                ):
                    href = link.get("href", "")
                    if not href:
                        continue
                    pid = href.split("/")[-1]
                    title = link.get_text(strip=True)
                    if pid.isdigit() and title:
                        collected.append(ProblemSummary(id=pid, name=title))

        collected.sort(key=lambda p: int(p.id))
        return collected
    except Exception as e:
        print(f"Failed to scrape CSES category {category_id}: {e}", file=sys.stderr)
        return []
def parse_problem_url(problem_input: str) -> str | None: def parse_problem_url(problem_input: str) -> str | None:
if problem_input.startswith("https://cses.fi/problemset/task/"): if problem_input.startswith("https://cses.fi/problemset/task/"):
return problem_input return problem_input
@ -94,21 +173,39 @@ def scrape_all_problems() -> dict[str, list[ProblemSummary]]:
soup = BeautifulSoup(response.text, "html.parser") soup = BeautifulSoup(response.text, "html.parser")
all_categories: dict[str, list[ProblemSummary]] = {} all_categories: dict[str, list[ProblemSummary]] = {}
problem_links = soup.find_all(
"a", href=lambda x: x and "/problemset/task/" in x
)
print(f"Found {len(problem_links)} problem links", file=sys.stderr)
current_category = None current_category = None
for element in soup.find_all(["h1", "a"]): for element in soup.find_all(["h1", "h2", "ul"]):
current_category = process_problem_element( if not isinstance(element, Tag):
element, current_category, all_categories continue
) if element.name in ["h1", "h2"]:
text = element.get_text(strip=True)
if text and not text.startswith("CSES") and text != "CSES Problem Set":
current_category = text
if current_category not in all_categories:
all_categories[current_category] = []
print(f"Found category: {current_category}", file=sys.stderr)
elif element.name == "ul" and current_category:
problem_links = element.find_all(
"a", href=lambda x: x and "/problemset/task/" in x
)
for link in problem_links:
href = link.get("href", "")
if href:
problem_id = href.split("/")[-1]
problem_name = link.get_text(strip=True)
if problem_id.isdigit() and problem_name:
problem = ProblemSummary(id=problem_id, name=problem_name)
all_categories[current_category].append(problem)
for category in all_categories: for category in all_categories:
all_categories[category].sort(key=lambda x: int(x.id)) all_categories[category].sort(key=lambda x: int(x.id))
print(f"Found {len(all_categories)} categories", file=sys.stderr) print(
f"Found {len(all_categories)} categories with {sum(len(probs) for probs in all_categories.values())} problems",
file=sys.stderr,
)
return all_categories return all_categories
except Exception as e: except Exception as e:
@ -170,7 +267,7 @@ def main() -> None:
if len(sys.argv) < 2: if len(sys.argv) < 2:
result = MetadataResult( result = MetadataResult(
success=False, success=False,
error="Usage: cses.py metadata OR cses.py tests <problem_id_or_url>", error="Usage: cses.py metadata <category_id> OR cses.py tests <problem_id_or_url>",
) )
print(json.dumps(asdict(result))) print(json.dumps(asdict(result)))
sys.exit(1) sys.exit(1)
@ -178,25 +275,26 @@ def main() -> None:
mode: str = sys.argv[1] mode: str = sys.argv[1]
if mode == "metadata": if mode == "metadata":
if len(sys.argv) != 2: if len(sys.argv) != 3:
result = MetadataResult( result = MetadataResult(
success=False, success=False,
error="Usage: cses.py metadata", error="Usage: cses.py metadata <category_id>",
) )
print(json.dumps(asdict(result))) print(json.dumps(asdict(result)))
sys.exit(1) sys.exit(1)
all_categories: dict[str, list[ProblemSummary]] = scrape_all_problems() category_id = sys.argv[2]
problems = scrape_category_problems(category_id)
if not all_categories: if not problems:
result = MetadataResult( result = MetadataResult(
success=False, success=False,
error="Failed to scrape CSES problem categories", error=f"No problems found for category: {category_id}",
) )
print(json.dumps(asdict(result))) print(json.dumps(asdict(result)))
sys.exit(1) return
result = MetadataResult(success=True, error="", categories=all_categories) result = MetadataResult(success=True, error="", problems=problems)
print(json.dumps(asdict(result))) print(json.dumps(asdict(result)))
elif mode == "tests": elif mode == "tests":

View file

@ -13,6 +13,13 @@ class ProblemSummary:
name: str name: str
@dataclass
class ContestSummary:
    """A single Codeforces contest entry for listing/selection."""

    id: str  # contest id as a string (stringified from the API's integer id)
    name: str  # full contest name exactly as returned by the API
    display_name: str  # shortened, human-friendly variant of name
@dataclass @dataclass
class ScrapingResult: class ScrapingResult:
success: bool success: bool
@ -26,6 +33,11 @@ class MetadataResult(ScrapingResult):
categories: dict[str, list[ProblemSummary]] = field(default_factory=dict) categories: dict[str, list[ProblemSummary]] = field(default_factory=dict)
@dataclass
class ContestListResult(ScrapingResult):
    """ScrapingResult carrying the list of scraped contests (empty on failure)."""

    contests: list[ContestSummary] = field(default_factory=list)
@dataclass @dataclass
class TestsResult(ScrapingResult): class TestsResult(ScrapingResult):
problem_id: str problem_id: str

View file

@ -1,5 +1,12 @@
from unittest.mock import Mock from unittest.mock import Mock
from scrapers.cses import scrape, scrape_all_problems
from scrapers.cses import (
denormalize_category_name,
normalize_category_name,
scrape,
scrape_all_problems,
scrape_category_problems,
)
from scrapers.models import ProblemSummary from scrapers.models import ProblemSummary
@ -19,12 +26,19 @@ def test_scrape_success(mocker, mock_cses_html):
def test_scrape_all_problems(mocker): def test_scrape_all_problems(mocker):
mock_response = Mock() mock_response = Mock()
mock_response.text = """ mock_response.text = """
<h1>Introductory Problems</h1> <div class="content">
<a href="/problemset/task/1068">Weird Algorithm</a> <h1>Introductory Problems</h1>
<a href="/problemset/task/1083">Missing Number</a> <ul>
<h1>Sorting and Searching</h1> <li><a href="/problemset/task/1068">Weird Algorithm</a></li>
<a href="/problemset/task/1084">Apartments</a> <li><a href="/problemset/task/1083">Missing Number</a></li>
</ul>
<h1>Sorting and Searching</h1>
<ul>
<li><a href="/problemset/task/1084">Apartments</a></li>
</ul>
</div>
""" """
mock_response.raise_for_status = Mock()
mocker.patch("scrapers.cses.requests.get", return_value=mock_response) mocker.patch("scrapers.cses.requests.get", return_value=mock_response)
@ -45,3 +59,74 @@ def test_scrape_network_error(mocker):
result = scrape("https://cses.fi/problemset/task/1068") result = scrape("https://cses.fi/problemset/task/1068")
assert result == [] assert result == []
def test_normalize_category_name():
    """Display names map to lowercase, underscore-separated ids."""
    expected_by_display = {
        "Sorting and Searching": "sorting_and_searching",
        "Dynamic Programming": "dynamic_programming",
        "Graph Algorithms": "graph_algorithms",
    }
    for display, expected in expected_by_display.items():
        assert normalize_category_name(display) == expected
def test_denormalize_category_name():
    """Known category ids round-trip back to their display names."""
    expected_by_id = {
        "sorting_and_searching": "Sorting and Searching",
        "dynamic_programming": "Dynamic Programming",
        "graph_algorithms": "Graph Algorithms",
    }
    for category_id, expected in expected_by_id.items():
        assert denormalize_category_name(category_id) == expected
def test_scrape_category_problems_success(mocker):
    """scrape_category_problems returns only the requested category's
    problems from the mocked problemset HTML, sorted by numeric id —
    surrounding categories ("General", "Dynamic Programming") are excluded.
    """
    mock_response = Mock()
    mock_response.text = """
    <div class="content">
        <h1>General</h1>
        <ul>
            <li><a href="/problemset/task/1000">Test Problem</a></li>
        </ul>
        <h1>Sorting and Searching</h1>
        <ul>
            <li><a href="/problemset/task/1640">Sum of Two Values</a></li>
            <li><a href="/problemset/task/1643">Maximum Subarray Sum</a></li>
        </ul>
        <h1>Dynamic Programming</h1>
        <ul>
            <li><a href="/problemset/task/1633">Dice Combinations</a></li>
        </ul>
    </div>
    """
    mock_response.raise_for_status = Mock()
    mocker.patch("scrapers.cses.requests.get", return_value=mock_response)
    result = scrape_category_problems("sorting_and_searching")
    assert len(result) == 2
    assert result[0].id == "1640"
    assert result[0].name == "Sum of Two Values"
    assert result[1].id == "1643"
    assert result[1].name == "Maximum Subarray Sum"
def test_scrape_category_problems_not_found(mocker):
    """Requesting a category absent from the page yields an empty list."""
    mock_response = Mock()
    mock_response.text = """
    <div class="content">
        <h1>Some Other Category</h1>
        <ul>
            <li><a href="/problemset/task/1000">Test Problem</a></li>
        </ul>
    </div>
    """
    mock_response.raise_for_status = Mock()
    mocker.patch("scrapers.cses.requests.get", return_value=mock_response)
    result = scrape_category_problems("nonexistent_category")
    assert result == []
def test_scrape_category_problems_network_error(mocker):
    """A request failure is swallowed and reported as an empty list."""
    mocker.patch("scrapers.cses.requests.get", side_effect=Exception("Network error"))
    result = scrape_category_problems("sorting_and_searching")
    assert result == []