feat(cses): update cses with concept of a category

This commit is contained in:
Barrett Ruth 2025-09-20 14:01:18 -04:00
parent 8df38d0ca8
commit 8e13b8c61d
5 changed files with 299 additions and 28 deletions

View file

@ -3,12 +3,20 @@
import json import json
import re import re
import sys import sys
import time
from dataclasses import asdict from dataclasses import asdict
import requests import requests
from bs4 import BeautifulSoup, Tag from bs4 import BeautifulSoup, Tag
from .models import MetadataResult, ProblemSummary, TestCase, TestsResult from .models import (
ContestListResult,
ContestSummary,
MetadataResult,
ProblemSummary,
TestCase,
TestsResult,
)
def extract_problem_limits(soup: BeautifulSoup) -> tuple[int, float]: def extract_problem_limits(soup: BeautifulSoup) -> tuple[int, float]:

View file

@ -7,7 +7,14 @@ from dataclasses import asdict
import cloudscraper import cloudscraper
from bs4 import BeautifulSoup, Tag from bs4 import BeautifulSoup, Tag
from .models import MetadataResult, ProblemSummary, TestCase, TestsResult from .models import (
ContestListResult,
ContestSummary,
MetadataResult,
ProblemSummary,
TestCase,
TestsResult,
)
def scrape(url: str) -> list[TestCase]: def scrape(url: str) -> list[TestCase]:
@ -218,11 +225,54 @@ def scrape_sample_tests(url: str) -> list[TestCase]:
return scrape(url) return scrape(url)
def scrape_contests() -> list[ContestSummary]:
    """Fetch the Codeforces contest list from the official API.

    Returns at most 100 ContestSummary entries in API order, each carrying
    a shortened display_name for the common contest formats (Educational,
    numbered Div rounds, Global rounds). Returns [] on any failure
    (network error, non-OK API status, unexpected payload shape).
    """
    # Hoisted to function scope: the previous version ran `import re`
    # inside only the "Educational" branch, so the other branches raised
    # NameError on `re` whenever they executed first — swallowed by the
    # broad except below, silently yielding an empty contest list.
    import re

    try:
        scraper = cloudscraper.create_scraper()
        response = scraper.get("https://codeforces.com/api/contest.list", timeout=10)
        response.raise_for_status()

        data = response.json()
        if data["status"] != "OK":
            return []

        contests = []
        for contest in data["result"]:
            contest_id = str(contest["id"])
            name = contest["name"]

            # Clean up contest names for display
            display_name = name
            if "Educational Codeforces Round" in name:
                match = re.search(r"Educational Codeforces Round (\d+)", name)
                if match:
                    display_name = f"Educational Round {match.group(1)}"
            elif "Codeforces Round" in name and "Div" in name:
                match = re.search(r"Codeforces Round (\d+) \(Div\. (\d+)\)", name)
                if match:
                    display_name = f"Round {match.group(1)} (Div. {match.group(2)})"
            elif "Codeforces Global Round" in name:
                match = re.search(r"Codeforces Global Round (\d+)", name)
                if match:
                    display_name = f"Global Round {match.group(1)}"

            contests.append(
                ContestSummary(id=contest_id, name=name, display_name=display_name)
            )

        return contests[:100]  # Limit to recent 100 contests
    except Exception as e:
        print(f"Failed to fetch contests: {e}", file=sys.stderr)
        return []
def main() -> None: def main() -> None:
if len(sys.argv) < 2: if len(sys.argv) < 2:
result = MetadataResult( result = MetadataResult(
success=False, success=False,
error="Usage: codeforces.py metadata <contest_id> OR codeforces.py tests <contest_id> <problem_letter>", error="Usage: codeforces.py metadata <contest_id> OR codeforces.py tests <contest_id> <problem_letter> OR codeforces.py contests",
) )
print(json.dumps(asdict(result))) print(json.dumps(asdict(result)))
sys.exit(1) sys.exit(1)
@ -316,9 +366,27 @@ def main() -> None:
) )
print(json.dumps(asdict(tests_result))) print(json.dumps(asdict(tests_result)))
elif mode == "contests":
if len(sys.argv) != 2:
contest_result = ContestListResult(
success=False, error="Usage: codeforces.py contests"
)
print(json.dumps(asdict(contest_result)))
sys.exit(1)
contests = scrape_contests()
if not contests:
contest_result = ContestListResult(success=False, error="No contests found")
print(json.dumps(asdict(contest_result)))
sys.exit(1)
contest_result = ContestListResult(success=True, error="", contests=contests)
print(json.dumps(asdict(contest_result)))
else: else:
result = MetadataResult( result = MetadataResult(
success=False, error=f"Unknown mode: {mode}. Use 'metadata' or 'tests'" success=False,
error=f"Unknown mode: {mode}. Use 'metadata', 'tests', or 'contests'",
) )
print(json.dumps(asdict(result))) print(json.dumps(asdict(result)))
sys.exit(1) sys.exit(1)

View file

@ -11,6 +11,85 @@ from bs4 import BeautifulSoup, Tag
from .models import MetadataResult, ProblemSummary, TestCase, TestsResult from .models import MetadataResult, ProblemSummary, TestCase, TestsResult
def normalize_category_name(category_name: str) -> str:
    """Turn a display category name into its id form.

    Lowercases, converts spaces to underscores, and spells out "&",
    e.g. "Sorting and Searching" -> "sorting_and_searching".
    """
    lowered = category_name.lower()
    underscored = lowered.replace(" ", "_")
    return underscored.replace("&", "and")
def denormalize_category_name(category_id: str) -> str:
    """Map a category id back to its CSES display name.

    Known ids resolve through a fixed table; any other id falls back to
    replacing underscores with spaces and title-casing the result.
    """
    known_names = {
        "introductory_problems": "Introductory Problems",
        "sorting_and_searching": "Sorting and Searching",
        "dynamic_programming": "Dynamic Programming",
        "graph_algorithms": "Graph Algorithms",
        "range_queries": "Range Queries",
        "tree_algorithms": "Tree Algorithms",
        "mathematics": "Mathematics",
        "string_algorithms": "String Algorithms",
        "geometry": "Geometry",
        "advanced_techniques": "Advanced Techniques",
    }
    if category_id in known_names:
        return known_names[category_id]
    return category_id.replace("_", " ").title()
def scrape_category_problems(category_id: str) -> list[ProblemSummary]:
    """Scrape the CSES problemset page for one category's problems.

    Walks the page's headings and lists in document order, collecting
    problem links only while inside the heading that matches the
    denormalized *category_id*, and stops at the next distinct heading.
    Returns the problems sorted by numeric id, or [] on any failure.
    """
    wanted_name = denormalize_category_name(category_id)
    try:
        response = requests.get(
            "https://cses.fi/problemset/",
            headers={
                "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
            },
            timeout=10,
        )
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        collected: list[ProblemSummary] = []
        heading: str | None = None
        inside_target = False

        for node in soup.find_all(["h1", "h2", "ul"]):
            if not isinstance(node, Tag):
                continue
            if node.name in ["h1", "h2"]:
                text = node.get_text(strip=True)
                # Skip the page title and empty headings.
                if not text or text.startswith("CSES") or text == "CSES Problem Set":
                    continue
                # A different heading after the target means we've passed it.
                if inside_target and heading != text:
                    break
                heading = text
                if text.lower() == wanted_name.lower():
                    inside_target = True
            elif node.name == "ul" and heading and inside_target:
                for link in node.find_all(
                    "a", href=lambda x: x and "/problemset/task/" in x
                ):
                    href = link.get("href", "")
                    if not href:
                        continue
                    pid = href.split("/")[-1]
                    title = link.get_text(strip=True)
                    if pid.isdigit() and title:
                        collected.append(ProblemSummary(id=pid, name=title))

        collected.sort(key=lambda p: int(p.id))
        return collected
    except Exception as e:
        print(f"Failed to scrape CSES category {category_id}: {e}", file=sys.stderr)
        return []
def parse_problem_url(problem_input: str) -> str | None: def parse_problem_url(problem_input: str) -> str | None:
if problem_input.startswith("https://cses.fi/problemset/task/"): if problem_input.startswith("https://cses.fi/problemset/task/"):
return problem_input return problem_input
@ -94,21 +173,39 @@ def scrape_all_problems() -> dict[str, list[ProblemSummary]]:
soup = BeautifulSoup(response.text, "html.parser") soup = BeautifulSoup(response.text, "html.parser")
all_categories: dict[str, list[ProblemSummary]] = {} all_categories: dict[str, list[ProblemSummary]] = {}
problem_links = soup.find_all(
"a", href=lambda x: x and "/problemset/task/" in x
)
print(f"Found {len(problem_links)} problem links", file=sys.stderr)
current_category = None current_category = None
for element in soup.find_all(["h1", "a"]): for element in soup.find_all(["h1", "h2", "ul"]):
current_category = process_problem_element( if not isinstance(element, Tag):
element, current_category, all_categories continue
) if element.name in ["h1", "h2"]:
text = element.get_text(strip=True)
if text and not text.startswith("CSES") and text != "CSES Problem Set":
current_category = text
if current_category not in all_categories:
all_categories[current_category] = []
print(f"Found category: {current_category}", file=sys.stderr)
elif element.name == "ul" and current_category:
problem_links = element.find_all(
"a", href=lambda x: x and "/problemset/task/" in x
)
for link in problem_links:
href = link.get("href", "")
if href:
problem_id = href.split("/")[-1]
problem_name = link.get_text(strip=True)
if problem_id.isdigit() and problem_name:
problem = ProblemSummary(id=problem_id, name=problem_name)
all_categories[current_category].append(problem)
for category in all_categories: for category in all_categories:
all_categories[category].sort(key=lambda x: int(x.id)) all_categories[category].sort(key=lambda x: int(x.id))
print(f"Found {len(all_categories)} categories", file=sys.stderr) print(
f"Found {len(all_categories)} categories with {sum(len(probs) for probs in all_categories.values())} problems",
file=sys.stderr,
)
return all_categories return all_categories
except Exception as e: except Exception as e:
@ -170,7 +267,7 @@ def main() -> None:
if len(sys.argv) < 2: if len(sys.argv) < 2:
result = MetadataResult( result = MetadataResult(
success=False, success=False,
error="Usage: cses.py metadata OR cses.py tests <problem_id_or_url>", error="Usage: cses.py metadata <category_id> OR cses.py tests <problem_id_or_url>",
) )
print(json.dumps(asdict(result))) print(json.dumps(asdict(result)))
sys.exit(1) sys.exit(1)
@ -178,25 +275,26 @@ def main() -> None:
mode: str = sys.argv[1] mode: str = sys.argv[1]
if mode == "metadata": if mode == "metadata":
if len(sys.argv) != 2: if len(sys.argv) != 3:
result = MetadataResult( result = MetadataResult(
success=False, success=False,
error="Usage: cses.py metadata", error="Usage: cses.py metadata <category_id>",
) )
print(json.dumps(asdict(result))) print(json.dumps(asdict(result)))
sys.exit(1) sys.exit(1)
all_categories: dict[str, list[ProblemSummary]] = scrape_all_problems() category_id = sys.argv[2]
problems = scrape_category_problems(category_id)
if not all_categories: if not problems:
result = MetadataResult( result = MetadataResult(
success=False, success=False,
error="Failed to scrape CSES problem categories", error=f"No problems found for category: {category_id}",
) )
print(json.dumps(asdict(result))) print(json.dumps(asdict(result)))
sys.exit(1) return
result = MetadataResult(success=True, error="", categories=all_categories) result = MetadataResult(success=True, error="", problems=problems)
print(json.dumps(asdict(result))) print(json.dumps(asdict(result)))
elif mode == "tests": elif mode == "tests":

View file

@ -13,6 +13,13 @@ class ProblemSummary:
name: str name: str
@dataclass
class ContestSummary:
    """A single Codeforces contest entry for listing/selection."""

    id: str  # contest id as a string (stringified from the API's integer id)
    name: str  # full contest name exactly as returned by the API
    display_name: str  # shortened, human-friendly variant of name
@dataclass @dataclass
class ScrapingResult: class ScrapingResult:
success: bool success: bool
@ -26,6 +33,11 @@ class MetadataResult(ScrapingResult):
categories: dict[str, list[ProblemSummary]] = field(default_factory=dict) categories: dict[str, list[ProblemSummary]] = field(default_factory=dict)
@dataclass
class ContestListResult(ScrapingResult):
    """ScrapingResult carrying the list of scraped contests (empty on failure)."""

    contests: list[ContestSummary] = field(default_factory=list)
@dataclass @dataclass
class TestsResult(ScrapingResult): class TestsResult(ScrapingResult):
problem_id: str problem_id: str

View file

@ -1,5 +1,12 @@
from unittest.mock import Mock from unittest.mock import Mock
from scrapers.cses import scrape, scrape_all_problems
from scrapers.cses import (
denormalize_category_name,
normalize_category_name,
scrape,
scrape_all_problems,
scrape_category_problems,
)
from scrapers.models import ProblemSummary from scrapers.models import ProblemSummary
@ -19,12 +26,19 @@ def test_scrape_success(mocker, mock_cses_html):
def test_scrape_all_problems(mocker): def test_scrape_all_problems(mocker):
mock_response = Mock() mock_response = Mock()
mock_response.text = """ mock_response.text = """
<h1>Introductory Problems</h1> <div class="content">
<a href="/problemset/task/1068">Weird Algorithm</a> <h1>Introductory Problems</h1>
<a href="/problemset/task/1083">Missing Number</a> <ul>
<h1>Sorting and Searching</h1> <li><a href="/problemset/task/1068">Weird Algorithm</a></li>
<a href="/problemset/task/1084">Apartments</a> <li><a href="/problemset/task/1083">Missing Number</a></li>
</ul>
<h1>Sorting and Searching</h1>
<ul>
<li><a href="/problemset/task/1084">Apartments</a></li>
</ul>
</div>
""" """
mock_response.raise_for_status = Mock()
mocker.patch("scrapers.cses.requests.get", return_value=mock_response) mocker.patch("scrapers.cses.requests.get", return_value=mock_response)
@ -45,3 +59,74 @@ def test_scrape_network_error(mocker):
result = scrape("https://cses.fi/problemset/task/1068") result = scrape("https://cses.fi/problemset/task/1068")
assert result == [] assert result == []
def test_normalize_category_name():
    """Display names map to lowercase, underscore-separated ids."""
    expected_by_display = {
        "Sorting and Searching": "sorting_and_searching",
        "Dynamic Programming": "dynamic_programming",
        "Graph Algorithms": "graph_algorithms",
    }
    for display, expected in expected_by_display.items():
        assert normalize_category_name(display) == expected
def test_denormalize_category_name():
    """Known category ids round-trip back to their display names."""
    expected_by_id = {
        "sorting_and_searching": "Sorting and Searching",
        "dynamic_programming": "Dynamic Programming",
        "graph_algorithms": "Graph Algorithms",
    }
    for category_id, expected in expected_by_id.items():
        assert denormalize_category_name(category_id) == expected
def test_scrape_category_problems_success(mocker):
    """scrape_category_problems returns only the requested category's
    problems from the mocked problemset HTML, sorted by numeric id —
    surrounding categories ("General", "Dynamic Programming") are excluded.
    """
    mock_response = Mock()
    mock_response.text = """
    <div class="content">
        <h1>General</h1>
        <ul>
            <li><a href="/problemset/task/1000">Test Problem</a></li>
        </ul>
        <h1>Sorting and Searching</h1>
        <ul>
            <li><a href="/problemset/task/1640">Sum of Two Values</a></li>
            <li><a href="/problemset/task/1643">Maximum Subarray Sum</a></li>
        </ul>
        <h1>Dynamic Programming</h1>
        <ul>
            <li><a href="/problemset/task/1633">Dice Combinations</a></li>
        </ul>
    </div>
    """
    mock_response.raise_for_status = Mock()
    mocker.patch("scrapers.cses.requests.get", return_value=mock_response)
    result = scrape_category_problems("sorting_and_searching")
    assert len(result) == 2
    assert result[0].id == "1640"
    assert result[0].name == "Sum of Two Values"
    assert result[1].id == "1643"
    assert result[1].name == "Maximum Subarray Sum"
def test_scrape_category_problems_not_found(mocker):
    """Requesting a category absent from the page yields an empty list."""
    mock_response = Mock()
    mock_response.text = """
    <div class="content">
        <h1>Some Other Category</h1>
        <ul>
            <li><a href="/problemset/task/1000">Test Problem</a></li>
        </ul>
    </div>
    """
    mock_response.raise_for_status = Mock()
    mocker.patch("scrapers.cses.requests.get", return_value=mock_response)
    result = scrape_category_problems("nonexistent_category")
    assert result == []
def test_scrape_category_problems_network_error(mocker):
    """A request failure is swallowed and reported as an empty list."""
    mocker.patch("scrapers.cses.requests.get", side_effect=Exception("Network error"))
    result = scrape_category_problems("sorting_and_searching")
    assert result == []