feat(cses): update cses with concept of a category

2025-09-20 14:01:18 -04:00 · 2025-09-20 14:01:18 -04:00 · 8e13b8c61d
commit 8e13b8c61d
parent 8df38d0ca8
5 changed files with 299 additions and 28 deletions
--- a/scrapers/atcoder.py
+++ b/scrapers/atcoder.py
@ -3,12 +3,20 @@
 import json
 import re
 import sys
+import time
 from dataclasses import asdict

 import requests
 from bs4 import BeautifulSoup, Tag

-from .models import MetadataResult, ProblemSummary, TestCase, TestsResult
+from .models import (
+    ContestListResult,
+    ContestSummary,
+    MetadataResult,
+    ProblemSummary,
+    TestCase,
+    TestsResult,
+)


 def extract_problem_limits(soup: BeautifulSoup) -> tuple[int, float]:
--- a/scrapers/codeforces.py
+++ b/scrapers/codeforces.py
@ -7,7 +7,14 @@ from dataclasses import asdict
 import cloudscraper
 from bs4 import BeautifulSoup, Tag

-from .models import MetadataResult, ProblemSummary, TestCase, TestsResult
+from .models import (
+    ContestListResult,
+    ContestSummary,
+    MetadataResult,
+    ProblemSummary,
+    TestCase,
+    TestsResult,
+)


 def scrape(url: str) -> list[TestCase]:
@ -218,11 +225,54 @@ def scrape_sample_tests(url: str) -> list[TestCase]:
    return scrape(url)


+def scrape_contests() -> list[ContestSummary]:
+    """Fetch the Codeforces contest list and attach shortened display names.
+
+    Queries the public ``contest.list`` API endpoint and converts every entry
+    into a ``ContestSummary``. Educational, "(Div. N)" and Global round names
+    are shortened for display; any other name is used unchanged.
+
+    Returns:
+        The first 100 contests of the API response, or an empty list when the
+        request fails, the payload is malformed, or the API status is not "OK".
+    """
+    # Import once at function scope. Importing inside only the "Educational"
+    # branch makes `re` a function-local name that is still unbound when one of
+    # the other branches runs first, so `re.search` raised UnboundLocalError and
+    # the broad handler below turned the whole listing into [].
+    import re
+
+    try:
+        scraper = cloudscraper.create_scraper()
+        response = scraper.get("https://codeforces.com/api/contest.list", timeout=10)
+        response.raise_for_status()
+
+        data = response.json()
+        if data["status"] != "OK":
+            return []
+
+        contests = []
+        for contest in data["result"]:
+            contest_id = str(contest["id"])
+            name = contest["name"]
+
+            # Clean up contest names for display; fall back to the full name
+            # whenever the expected pattern is not present.
+            display_name = name
+            if "Educational Codeforces Round" in name:
+                match = re.search(r"Educational Codeforces Round (\d+)", name)
+                if match:
+                    display_name = f"Educational Round {match.group(1)}"
+            elif "Codeforces Round" in name and "Div" in name:
+                match = re.search(r"Codeforces Round (\d+) \(Div\. (\d+)\)", name)
+                if match:
+                    display_name = f"Round {match.group(1)} (Div. {match.group(2)})"
+            elif "Codeforces Global Round" in name:
+                match = re.search(r"Codeforces Global Round (\d+)", name)
+                if match:
+                    display_name = f"Global Round {match.group(1)}"
+
+            contests.append(
+                ContestSummary(id=contest_id, name=name, display_name=display_name)
+            )
+
+        return contests[:100]  # Limit to recent 100 contests
+
+    except Exception as e:
+        # Best-effort scraper: report the failure on stderr and return no data.
+        print(f"Failed to fetch contests: {e}", file=sys.stderr)
+        return []
+
+
 def main() -> None:
    if len(sys.argv) < 2:
        result = MetadataResult(
            success=False,
-            error="Usage: codeforces.py metadata <contest_id> OR codeforces.py tests <contest_id> <problem_letter>",
+            error="Usage: codeforces.py metadata <contest_id> OR codeforces.py tests <contest_id> <problem_letter> OR codeforces.py contests",
        )
        print(json.dumps(asdict(result)))
        sys.exit(1)
@ -316,9 +366,27 @@ def main() -> None:
        )
        print(json.dumps(asdict(tests_result)))

+    elif mode == "contests":
+        if len(sys.argv) != 2:
+            contest_result = ContestListResult(
+                success=False, error="Usage: codeforces.py contests"
+            )
+            print(json.dumps(asdict(contest_result)))
+            sys.exit(1)
+
+        contests = scrape_contests()
+        if not contests:
+            contest_result = ContestListResult(success=False, error="No contests found")
+            print(json.dumps(asdict(contest_result)))
+            sys.exit(1)
+
+        contest_result = ContestListResult(success=True, error="", contests=contests)
+        print(json.dumps(asdict(contest_result)))
+
    else:
        result = MetadataResult(
-            success=False, error=f"Unknown mode: {mode}. Use 'metadata' or 'tests'"
+            success=False,
+            error=f"Unknown mode: {mode}. Use 'metadata', 'tests', or 'contests'",
        )
        print(json.dumps(asdict(result)))
        sys.exit(1)
--- a/scrapers/cses.py
+++ b/scrapers/cses.py
@ -11,6 +11,85 @@ from bs4 import BeautifulSoup, Tag
 from .models import MetadataResult, ProblemSummary, TestCase, TestsResult


+def normalize_category_name(category_name: str) -> str:
+    """Convert a category display name into its snake_case identifier.
+
+    The name is lowercased, spaces become underscores and "&" is spelled out
+    as "and".
+    """
+    lowered = category_name.lower()
+    underscored = lowered.replace(" ", "_")
+    return underscored.replace("&", "and")
+
+
+def denormalize_category_name(category_id: str) -> str:
+    """Map a snake_case category identifier back to its display name.
+
+    Known identifiers are resolved through a fixed table; any other identifier
+    has its underscores replaced by spaces and is title-cased.
+    """
+    known_names = {
+        "introductory_problems": "Introductory Problems",
+        "sorting_and_searching": "Sorting and Searching",
+        "dynamic_programming": "Dynamic Programming",
+        "graph_algorithms": "Graph Algorithms",
+        "range_queries": "Range Queries",
+        "tree_algorithms": "Tree Algorithms",
+        "mathematics": "Mathematics",
+        "string_algorithms": "String Algorithms",
+        "geometry": "Geometry",
+        "advanced_techniques": "Advanced Techniques",
+    }
+
+    if category_id in known_names:
+        return known_names[category_id]
+
+    # Unknown identifier: derive a best-effort display name.
+    return category_id.replace("_", " ").title()
+
+
+def scrape_category_problems(category_id: str) -> list[ProblemSummary]:
+    """List the problems of one CSES problem-set category.
+
+    The category identifier is converted to its display name, the problem-set
+    index page is downloaded, and the task links that follow the matching
+    heading (compared case-insensitively) are collected until a different
+    heading appears.
+
+    Returns:
+        The category's problems sorted by numeric id; an empty list when the
+        category is not on the page or any error occurs (errors are reported
+        on stderr).
+    """
+    category_name = denormalize_category_name(category_id)
+
+    try:
+        response = requests.get(
+            "https://cses.fi/problemset/",
+            headers={
+                "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
+            },
+            timeout=10,
+        )
+        response.raise_for_status()
+
+        page = BeautifulSoup(response.text, "html.parser")
+        wanted = category_name.lower()
+
+        heading = None
+        in_target = False
+        collected = []
+
+        for node in page.find_all(["h1", "h2", "ul"]):
+            if not isinstance(node, Tag):
+                continue
+
+            if node.name in ("h1", "h2"):
+                title = node.get_text(strip=True)
+                # Skip empty headings and headings that start with "CSES".
+                if not title or title.startswith("CSES") or title == "CSES Problem Set":
+                    continue
+
+                # A different heading after the target ends the category.
+                if in_target and heading != title:
+                    break
+
+                heading = title
+                in_target = in_target or title.lower() == wanted
+
+            elif node.name == "ul" and heading and in_target:
+                task_links = node.find_all(
+                    "a", href=lambda x: x and "/problemset/task/" in x
+                )
+                for anchor in task_links:
+                    href = anchor.get("href", "")
+                    if not href:
+                        continue
+
+                    # The last path segment of the link is the task id.
+                    task_id = href.split("/")[-1]
+                    task_name = anchor.get_text(strip=True)
+                    if task_id.isdigit() and task_name:
+                        collected.append(ProblemSummary(id=task_id, name=task_name))
+
+        return sorted(collected, key=lambda problem: int(problem.id))
+
+    except Exception as e:
+        print(f"Failed to scrape CSES category {category_id}: {e}", file=sys.stderr)
+        return []
+
+
 def parse_problem_url(problem_input: str) -> str | None:
    if problem_input.startswith("https://cses.fi/problemset/task/"):
        return problem_input
@ -94,21 +173,39 @@ def scrape_all_problems() -> dict[str, list[ProblemSummary]]:
        soup = BeautifulSoup(response.text, "html.parser")
        all_categories: dict[str, list[ProblemSummary]] = {}

-        problem_links = soup.find_all(
-            "a", href=lambda x: x and "/problemset/task/" in x
-        )
-        print(f"Found {len(problem_links)} problem links", file=sys.stderr)
-
        current_category = None
-        for element in soup.find_all(["h1", "a"]):
-            current_category = process_problem_element(
-                element, current_category, all_categories
-            )
+        for element in soup.find_all(["h1", "h2", "ul"]):
+            if not isinstance(element, Tag):
+                continue
+            if element.name in ["h1", "h2"]:
+                text = element.get_text(strip=True)
+                if text and not text.startswith("CSES") and text != "CSES Problem Set":
+                    current_category = text
+                    if current_category not in all_categories:
+                        all_categories[current_category] = []
+                        print(f"Found category: {current_category}", file=sys.stderr)
+
+            elif element.name == "ul" and current_category:
+                problem_links = element.find_all(
+                    "a", href=lambda x: x and "/problemset/task/" in x
+                )
+                for link in problem_links:
+                    href = link.get("href", "")
+                    if href:
+                        problem_id = href.split("/")[-1]
+                        problem_name = link.get_text(strip=True)
+
+                        if problem_id.isdigit() and problem_name:
+                            problem = ProblemSummary(id=problem_id, name=problem_name)
+                            all_categories[current_category].append(problem)

        for category in all_categories:
            all_categories[category].sort(key=lambda x: int(x.id))

-        print(f"Found {len(all_categories)} categories", file=sys.stderr)
+        print(
+            f"Found {len(all_categories)} categories with {sum(len(probs) for probs in all_categories.values())} problems",
+            file=sys.stderr,
+        )
        return all_categories

    except Exception as e:
@ -170,7 +267,7 @@ def main() -> None:
    if len(sys.argv) < 2:
        result = MetadataResult(
            success=False,
-            error="Usage: cses.py metadata OR cses.py tests <problem_id_or_url>",
+            error="Usage: cses.py metadata <category_id> OR cses.py tests <problem_id_or_url>",
        )
        print(json.dumps(asdict(result)))
        sys.exit(1)
@ -178,25 +275,26 @@ def main() -> None:
    mode: str = sys.argv[1]

    if mode == "metadata":
-        if len(sys.argv) != 2:
+        if len(sys.argv) != 3:
            result = MetadataResult(
                success=False,
-                error="Usage: cses.py metadata",
+                error="Usage: cses.py metadata <category_id>",
            )
            print(json.dumps(asdict(result)))
            sys.exit(1)

-        all_categories: dict[str, list[ProblemSummary]] = scrape_all_problems()
+        category_id = sys.argv[2]
+        problems = scrape_category_problems(category_id)

-        if not all_categories:
+        if not problems:
            result = MetadataResult(
                success=False,
-                error="Failed to scrape CSES problem categories",
+                error=f"No problems found for category: {category_id}",
            )
            print(json.dumps(asdict(result)))
-            sys.exit(1)
+            return

-        result = MetadataResult(success=True, error="", categories=all_categories)
+        result = MetadataResult(success=True, error="", problems=problems)
        print(json.dumps(asdict(result)))

    elif mode == "tests":
--- a/scrapers/models.py
+++ b/scrapers/models.py
@ -13,6 +13,13 @@ class ProblemSummary:
    name: str


+@dataclass
+class ContestSummary:
+    """Identifying information for a single contest in a contest listing."""
+
+    # Contest identifier, kept as a string.
+    id: str
+    # Contest name as reported by the source.
+    name: str
+    # Name to show to users; may be a shortened form of `name`.
+    display_name: str
+
+
@dataclass
 class ScrapingResult:
    success: bool
@ -26,6 +33,11 @@ class MetadataResult(ScrapingResult):
    categories: dict[str, list[ProblemSummary]] = field(default_factory=dict)


+@dataclass
+class ContestListResult(ScrapingResult):
+    """Scraping result that carries a list of contests."""
+
+    # Contests found; defaults to an empty list when none are supplied.
+    contests: list[ContestSummary] = field(default_factory=list)
+
+
@dataclass
 class TestsResult(ScrapingResult):
    problem_id: str
--- a/tests/scrapers/test_cses.py
+++ b/tests/scrapers/test_cses.py
@ -1,5 +1,12 @@
 from unittest.mock import Mock
-from scrapers.cses import scrape, scrape_all_problems
+
+from scrapers.cses import (
+    denormalize_category_name,
+    normalize_category_name,
+    scrape,
+    scrape_all_problems,
+    scrape_category_problems,
+)
 from scrapers.models import ProblemSummary


@ -19,12 +26,19 @@ def test_scrape_success(mocker, mock_cses_html):
 def test_scrape_all_problems(mocker):
    mock_response = Mock()
    mock_response.text = """
-    <h1>Introductory Problems</h1>
-    <a href="/problemset/task/1068">Weird Algorithm</a>
-    <a href="/problemset/task/1083">Missing Number</a>
-    <h1>Sorting and Searching</h1>
-    <a href="/problemset/task/1084">Apartments</a>
+    <div class="content">
+        <h1>Introductory Problems</h1>
+        <ul>
+            <li><a href="/problemset/task/1068">Weird Algorithm</a></li>
+            <li><a href="/problemset/task/1083">Missing Number</a></li>
+        </ul>
+        <h1>Sorting and Searching</h1>
+        <ul>
+            <li><a href="/problemset/task/1084">Apartments</a></li>
+        </ul>
+    </div>
    """
+    mock_response.raise_for_status = Mock()

    mocker.patch("scrapers.cses.requests.get", return_value=mock_response)

@ -45,3 +59,74 @@ def test_scrape_network_error(mocker):
    result = scrape("https://cses.fi/problemset/task/1068")

    assert result == []
+
+
+def test_normalize_category_name():
+    """Display names are lowercased and joined with underscores."""
+    expected_ids = {
+        "Sorting and Searching": "sorting_and_searching",
+        "Dynamic Programming": "dynamic_programming",
+        "Graph Algorithms": "graph_algorithms",
+    }
+
+    for display_name, category_id in expected_ids.items():
+        assert normalize_category_name(display_name) == category_id
+
+
+def test_denormalize_category_name():
+    """Known category identifiers map back to their display names."""
+    expected_names = {
+        "sorting_and_searching": "Sorting and Searching",
+        "dynamic_programming": "Dynamic Programming",
+        "graph_algorithms": "Graph Algorithms",
+    }
+
+    for category_id, display_name in expected_names.items():
+        assert denormalize_category_name(category_id) == display_name
+
+
+def test_scrape_category_problems_success(mocker):
+    """Only the problems listed under the requested category are returned."""
+    fake_page = Mock()
+    fake_page.raise_for_status = Mock()
+    fake_page.text = """
+    <div class="content">
+        <h1>General</h1>
+        <ul>
+            <li><a href="/problemset/task/1000">Test Problem</a></li>
+        </ul>
+        <h1>Sorting and Searching</h1>
+        <ul>
+            <li><a href="/problemset/task/1640">Sum of Two Values</a></li>
+            <li><a href="/problemset/task/1643">Maximum Subarray Sum</a></li>
+        </ul>
+        <h1>Dynamic Programming</h1>
+        <ul>
+            <li><a href="/problemset/task/1633">Dice Combinations</a></li>
+        </ul>
+    </div>
+    """
+    mocker.patch("scrapers.cses.requests.get", return_value=fake_page)
+
+    problems = scrape_category_problems("sorting_and_searching")
+
+    # Neighbouring categories ("General", "Dynamic Programming") must not leak in.
+    assert [(problem.id, problem.name) for problem in problems] == [
+        ("1640", "Sum of Two Values"),
+        ("1643", "Maximum Subarray Sum"),
+    ]
+
+
+def test_scrape_category_problems_not_found(mocker):
+    """A category that is absent from the page yields an empty list."""
+    fake_page = Mock()
+    fake_page.raise_for_status = Mock()
+    fake_page.text = """
+    <div class="content">
+        <h1>Some Other Category</h1>
+        <ul>
+            <li><a href="/problemset/task/1000">Test Problem</a></li>
+        </ul>
+    </div>
+    """
+    mocker.patch("scrapers.cses.requests.get", return_value=fake_page)
+
+    problems = scrape_category_problems("nonexistent_category")
+
+    assert problems == []
+
+
+def test_scrape_category_problems_network_error(mocker):
+    """A failing HTTP request is swallowed and reported as an empty list."""
+    network_failure = Exception("Network error")
+    mocker.patch("scrapers.cses.requests.get", side_effect=network_failure)
+
+    problems = scrape_category_problems("sorting_and_searching")
+
+    assert problems == []