From 52c50cde795164a03e8d2556d254c7bb039844a8 Mon Sep 17 00:00:00 2001 From: Barrett Ruth Date: Wed, 24 Sep 2025 21:23:06 -0400 Subject: [PATCH 1/2] lol --- scrapers/__init__.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/scrapers/__init__.py b/scrapers/__init__.py index 4749123..e69de29 100644 --- a/scrapers/__init__.py +++ b/scrapers/__init__.py @@ -1,5 +0,0 @@ -from .atcoder import AtCoderScraper -from .codeforces import CodeforcesScraper -from .cses import CSESScraper - -__all__ = ["CodeforcesScraper", "CSESScraper", "AtCoderScraper"] From 7711788d3dc77ead68f316a7575ce0acd17fd653 Mon Sep 17 00:00:00 2001 From: Barrett Ruth Date: Wed, 24 Sep 2025 21:35:57 -0400 Subject: [PATCH 2/2] cleanup --- scrapers/cses.py | 130 ++++++++++++++++------------------------------- 1 file changed, 43 insertions(+), 87 deletions(-) diff --git a/scrapers/cses.py b/scrapers/cses.py index 8326b71..09b949a 100644 --- a/scrapers/cses.py +++ b/scrapers/cses.py @@ -46,7 +46,6 @@ def snake_to_title(name: str) -> str: "vs", "via", } - words: list[str] = name.split("_") n = len(words) @@ -85,21 +84,16 @@ def make_request(url: str, headers: dict) -> requests.Response: def scrape_category_problems(category_id: str) -> list[ProblemSummary]: category_name = snake_to_title(category_id) - try: problemset_url = "https://cses.fi/problemset/" headers = { "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" } - response = make_request(problemset_url, headers) - soup = BeautifulSoup(response.text, "html.parser") - current_category = None problems = [] target_found = False - for element in soup.find_all(["h1", "h2", "ul"]): if not isinstance(element, Tag): continue @@ -107,14 +101,11 @@ def scrape_category_problems(category_id: str) -> list[ProblemSummary]: text = element.get_text(strip=True) if not text or text.startswith("CSES") or text == "CSES Problem Set": continue - if target_found and current_category != text: break - current_category = text if text.lower() == category_name.lower(): target_found = True - elif element.name == "ul" and current_category and target_found: problem_links = element.find_all( "a", href=lambda x: x and "/problemset/task/" in x @@ -123,17 +114,12 @@ def scrape_category_problems(category_id: str) -> list[ProblemSummary]: href = link.get("href", "") if not href: continue - problem_id = href.split("/")[-1] problem_name = link.get_text(strip=True) - if not problem_id.isdigit() or not problem_name: continue - problems.append(ProblemSummary(id=problem_id, name=problem_name)) - return problems - except Exception as e: print(f"Failed to scrape CSES category {category_id}: {e}", file=sys.stderr) return [] @@ -141,7 +127,7 @@ def scrape_category_problems(category_id: str) -> list[ProblemSummary]: def parse_problem_url(problem_input: str) -> str | None: if problem_input.startswith("https://cses.fi/problemset/task/"): - return problem_input + return problem_input.rstrip("/") elif problem_input.isdigit(): return f"https://cses.fi/problemset/task/{problem_input}" return None @@ -150,33 +136,26 @@ def parse_problem_url(problem_input: str) -> str | None: def extract_problem_limits(soup: BeautifulSoup) -> tuple[int, float]: timeout_ms = None memory_mb = None - constraints_ul = soup.find("ul", class_="task-constraints") if not constraints_ul or not isinstance(constraints_ul, Tag): raise ValueError("Could not find task-constraints section") - for li in constraints_ul.find_all("li"): text = li.get_text() - if "Time limit:" in text: match = re.search(r"Time limit:\s*(\d+(?:\.\d+)?)\s*s", text) if match: seconds = float(match.group(1)) timeout_ms = int(seconds * 1000) - if "Memory limit:" in text: match = re.search(r"Memory limit:\s*(\d+)\s*MB", text) if match: memory_mb = float(match.group(1)) - if timeout_ms is None: raise ValueError("Could not find valid timeout in task-constraints section") - if memory_mb is None: raise ValueError( "Could not find valid memory limit in task-constraints section" ) - return timeout_ms, memory_mb @@ -186,27 +165,20 @@ def scrape_categories() -> list[ContestSummary]: "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" } response = make_request("https://cses.fi/problemset/", headers) - soup = BeautifulSoup(response.text, "html.parser") categories = [] - for h2 in soup.find_all("h2"): category_name = h2.get_text().strip() if category_name == "General": continue - category_id = normalize_category_name(category_name) - display_name = category_name - categories.append( ContestSummary( id=category_id, name=category_name, display_name=display_name ) ) - return categories - except Exception as e: print(f"Failed to scrape CSES categories: {e}", file=sys.stderr) return [] @@ -222,20 +194,15 @@ def process_problem_element( if category_name not in all_categories: all_categories[category_name] = [] return category_name - if element.name != "a" or "/problemset/task/" not in element.get("href", ""): return current_category - href = element.get("href", "") if not href: return current_category - problem_id = href.split("/")[-1] problem_name = element.get_text(strip=True) - if not (problem_id.isdigit() and problem_name and current_category): return current_category - problem = ProblemSummary(id=problem_id, name=problem_name) all_categories[current_category].append(problem) return current_category @@ -247,13 +214,10 @@ def scrape_all_problems() -> dict[str, list[ProblemSummary]]: headers = { "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" } - response = requests.get(problemset_url, headers=headers, timeout=10) response.raise_for_status() - soup = BeautifulSoup(response.text, "html.parser") all_categories: dict[str, list[ProblemSummary]] = {} - current_category = None for element in soup.find_all(["h1", "h2", "ul"]): if not isinstance(element, Tag): @@ -265,7 +229,6 @@ def scrape_all_problems() -> dict[str, list[ProblemSummary]]: if current_category not in all_categories: all_categories[current_category] = [] print(f"Found category: {current_category}", file=sys.stderr) - elif element.name == "ul" and current_category: problem_links = element.find_all( "a", href=lambda x: x and "/problemset/task/" in x @@ -275,47 +238,61 @@ def scrape_all_problems() -> dict[str, list[ProblemSummary]]: if href: problem_id = href.split("/")[-1] problem_name = link.get_text(strip=True) - if problem_id.isdigit() and problem_name: problem = ProblemSummary(id=problem_id, name=problem_name) all_categories[current_category].append(problem) - print( f"Found {len(all_categories)} categories with {sum(len(probs) for probs in all_categories.values())} problems", file=sys.stderr, ) return all_categories - except Exception as e: print(f"Failed to scrape CSES problems: {e}", file=sys.stderr) return {} -def extract_example_test_case(soup) -> tuple[str, str] | None: - example_header = soup.find("h1", string="Example") - if not example_header: - return None +def _collect_section_after(header: Tag) -> list[Tag]: + out: list[Tag] = [] + cur = header.find_next_sibling() + while cur and not (isinstance(cur, Tag) and cur.name in ("h1", "h2", "h3")): + if isinstance(cur, Tag): + out.append(cur) + cur = cur.find_next_sibling() + return out - current = example_header.find_next_sibling() - input_text = None - output_text = None - while current: - if current.name == "p" and "Input:" in current.get_text(): - input_pre = current.find_next_sibling("pre") - if input_pre: - input_text = input_pre.get_text().strip() - elif current.name == "p" and "Output:" in current.get_text(): - output_pre = current.find_next_sibling("pre") - if output_pre: - output_text = output_pre.get_text().strip() - break - current = current.find_next_sibling() +def extract_example_test_cases(soup: BeautifulSoup) -> list[tuple[str, str]]: + example_headers = soup.find_all( + lambda t: isinstance(t, Tag) + and t.name in ("h1", "h2", "h3") + and t.get_text(strip=True).lower().startswith("example") + ) + cases: list[tuple[str, str]] = [] + for hdr in example_headers: + section = _collect_section_after(hdr) - if not input_text or not output_text: - return None + def find_labeled(label: str) -> str | None: + for node in section: + if not isinstance(node, Tag): + continue + if node.name in ("p", "h4", "h5", "h6"): + txt = node.get_text(strip=True).lower().rstrip(":") + if txt == label: + pre = node.find_next_sibling("pre") + if pre: + return pre.get_text().strip() + return None - return (input_text, output_text) + inp = find_labeled("input") + out = find_labeled("output") + if not inp or not out: + pres = [n for n in section if isinstance(n, Tag) and n.name == "pre"] + if len(pres) >= 2: + inp = inp or pres[0].get_text().strip() + out = out or pres[1].get_text().strip() + if inp and out: + cases.append((inp, out)) + return cases def scrape(url: str) -> list[TestCase]: @@ -323,18 +300,10 @@ def scrape(url: str) -> list[TestCase]: headers = { "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" } - response = make_request(url, headers) - soup = BeautifulSoup(response.text, "html.parser") - - test_case = extract_example_test_case(soup) - if not test_case: - return [] - - input_text, output_text = test_case - return [TestCase(input=input_text, expected=output_text)] - + pairs = extract_example_test_cases(soup) + return [TestCase(input=inp, expected=out) for (inp, out) in pairs] except Exception as e: print(f"Error scraping CSES: {e}", file=sys.stderr) return [] @@ -361,7 +330,6 @@ class CSESScraper(BaseScraper): return func(*args) except Exception as e: error_msg = f"{self.platform_name}: {str(e)}" - if operation == "metadata": return MetadataResult(success=False, error=error_msg) elif operation == "tests": @@ -400,21 +368,18 @@ class CSESScraper(BaseScraper): timeout_ms=0, memory_mb=0, ) - tests = scrape(url) + m = re.search(r"/task/(\d+)", url) actual_problem_id = ( - problem_id if problem_id.isdigit() else problem_id.split("/")[-1] + problem_id if problem_id.isdigit() else (m.group(1) if m else "") ) - headers = { "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" } response = requests.get(url, headers=headers, timeout=10) response.raise_for_status() - soup = BeautifulSoup(response.text, "html.parser") timeout_ms, memory_mb = extract_problem_limits(soup) - if not tests: return TestsResult( success=False, @@ -425,7 +390,6 @@ class CSESScraper(BaseScraper): timeout_ms=timeout_ms, memory_mb=memory_mb, ) - return TestsResult( success=True, error="", @@ -453,10 +417,8 @@ def main() -> None: ) print(json.dumps(asdict(result))) sys.exit(1) - mode: str = sys.argv[1] scraper = CSESScraper() - if mode == "metadata": if len(sys.argv) != 3: result = MetadataResult( @@ -465,13 +427,11 @@ def main() -> None: ) print(json.dumps(asdict(result))) sys.exit(1) - category_id = sys.argv[2] result = scraper.scrape_contest_metadata(category_id) print(json.dumps(asdict(result))) if not result.success: sys.exit(1) - elif mode == "tests": if len(sys.argv) != 4: tests_result = TestsResult( @@ -485,14 +445,12 @@ def main() -> None: ) print(json.dumps(asdict(tests_result))) sys.exit(1) - category = sys.argv[2] problem_id = sys.argv[3] tests_result = scraper.scrape_problem_tests(category, problem_id) print(json.dumps(asdict(tests_result))) if not tests_result.success: sys.exit(1) - elif mode == "contests": if len(sys.argv) != 2: contest_result = ContestListResult( @@ -500,12 +458,10 @@ def main() -> None: ) print(json.dumps(asdict(contest_result))) sys.exit(1) - contest_result = scraper.scrape_contest_list() print(json.dumps(asdict(contest_result))) if not contest_result.success: sys.exit(1) - else: result = MetadataResult( success=False,