From 52c50cde795164a03e8d2556d254c7bb039844a8 Mon Sep 17 00:00:00 2001
From: Barrett Ruth <br.barrettruth@gmail.com>
Date: Wed, 24 Sep 2025 21:23:06 -0400
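Subject: [PATCH 1/2] scrapers: drop scraper re-exports from __init__.py

Empty scrapers/__init__.py by removing the AtCoderScraper,
CodeforcesScraper and CSESScraper imports together with __all__.
The package no longer re-exports these classes, so callers must
import them from their submodules (scrapers.atcoder,
scrapers.codeforces, scrapers.cses) instead.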

---
 scrapers/__init__.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/scrapers/__init__.py b/scrapers/__init__.py
index 4749123..e69de29 100644
--- a/scrapers/__init__.py
+++ b/scrapers/__init__.py
@@ -1,5 +0,0 @@
-from .atcoder import AtCoderScraper
-from .codeforces import CodeforcesScraper
-from .cses import CSESScraper
-
-__all__ = ["CodeforcesScraper", "CSESScraper", "AtCoderScraper"]

From 7711788d3dc77ead68f316a7575ce0acd17fd653 Mon Sep 17 00:00:00 2001
From: Barrett Ruth <br.barrettruth@gmail.com>
Date: Wed, 24 Sep 2025 21:35:57 -0400
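Subject: [PATCH 2/2] cses: scrape all example test cases, tidy whitespace

Besides removing blank lines inside function bodies in
scrapers/cses.py, this changes behaviour in three places:

* parse_problem_url() strips trailing "/" from a full task URL.
* extract_example_test_case() is replaced by
  extract_example_test_cases(), which collects one (input, output)
  pair per h1/h2/h3 heading whose text starts with "example" and
  falls back to the first two <pre> blocks of the section when the
  "Input"/"Output" labels are missing. scrape() now returns every
  pair found rather than at most one.
* scrape_problem_tests() takes the problem id from a /task/<digits>
  match on the URL when problem_id is not purely numeric, rather than
  from the last "/"-separated segment of problem_id.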

---
 scrapers/cses.py | 130 ++++++++++++++++-------------------------------
 1 file changed, 43 insertions(+), 87 deletions(-)

diff --git a/scrapers/cses.py b/scrapers/cses.py
index 8326b71..09b949a 100644
--- a/scrapers/cses.py
+++ b/scrapers/cses.py
@@ -46,7 +46,6 @@ def snake_to_title(name: str) -> str:
         "vs",
         "via",
     }
-
     words: list[str] = name.split("_")
     n = len(words)
 
@@ -85,21 +84,16 @@ def make_request(url: str, headers: dict) -> requests.Response:
 
 def scrape_category_problems(category_id: str) -> list[ProblemSummary]:
     category_name = snake_to_title(category_id)
-
     try:
         problemset_url = "https://cses.fi/problemset/"
         headers = {
             "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
         }
-
         response = make_request(problemset_url, headers)
-
         soup = BeautifulSoup(response.text, "html.parser")
-
         current_category = None
         problems = []
         target_found = False
-
         for element in soup.find_all(["h1", "h2", "ul"]):
             if not isinstance(element, Tag):
                 continue
@@ -107,14 +101,11 @@ def scrape_category_problems(category_id: str) -> list[ProblemSummary]:
                 text = element.get_text(strip=True)
                 if not text or text.startswith("CSES") or text == "CSES Problem Set":
                     continue
-
                 if target_found and current_category != text:
                     break
-
                 current_category = text
                 if text.lower() == category_name.lower():
                     target_found = True
-
             elif element.name == "ul" and current_category and target_found:
                 problem_links = element.find_all(
                     "a", href=lambda x: x and "/problemset/task/" in x
@@ -123,17 +114,12 @@ def scrape_category_problems(category_id: str) -> list[ProblemSummary]:
                     href = link.get("href", "")
                     if not href:
                         continue
-
                     problem_id = href.split("/")[-1]
                     problem_name = link.get_text(strip=True)
-
                     if not problem_id.isdigit() or not problem_name:
                         continue
-
                     problems.append(ProblemSummary(id=problem_id, name=problem_name))
-
         return problems
-
     except Exception as e:
         print(f"Failed to scrape CSES category {category_id}: {e}", file=sys.stderr)
         return []
@@ -141,7 +127,7 @@ def scrape_category_problems(category_id: str) -> list[ProblemSummary]:
 
 def parse_problem_url(problem_input: str) -> str | None:
     if problem_input.startswith("https://cses.fi/problemset/task/"):
-        return problem_input
+        return problem_input.rstrip("/")
     elif problem_input.isdigit():
         return f"https://cses.fi/problemset/task/{problem_input}"
     return None
@@ -150,33 +136,26 @@ def parse_problem_url(problem_input: str) -> str | None:
 def extract_problem_limits(soup: BeautifulSoup) -> tuple[int, float]:
     timeout_ms = None
     memory_mb = None
-
     constraints_ul = soup.find("ul", class_="task-constraints")
     if not constraints_ul or not isinstance(constraints_ul, Tag):
         raise ValueError("Could not find task-constraints section")
-
     for li in constraints_ul.find_all("li"):
         text = li.get_text()
-
         if "Time limit:" in text:
             match = re.search(r"Time limit:\s*(\d+(?:\.\d+)?)\s*s", text)
             if match:
                 seconds = float(match.group(1))
                 timeout_ms = int(seconds * 1000)
-
         if "Memory limit:" in text:
             match = re.search(r"Memory limit:\s*(\d+)\s*MB", text)
             if match:
                 memory_mb = float(match.group(1))
-
     if timeout_ms is None:
         raise ValueError("Could not find valid timeout in task-constraints section")
-
     if memory_mb is None:
         raise ValueError(
             "Could not find valid memory limit in task-constraints section"
         )
-
     return timeout_ms, memory_mb
 
 
@@ -186,27 +165,20 @@ def scrape_categories() -> list[ContestSummary]:
             "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
         }
         response = make_request("https://cses.fi/problemset/", headers)
-
         soup = BeautifulSoup(response.text, "html.parser")
         categories = []
-
         for h2 in soup.find_all("h2"):
             category_name = h2.get_text().strip()
             if category_name == "General":
                 continue
-
             category_id = normalize_category_name(category_name)
-
             display_name = category_name
-
             categories.append(
                 ContestSummary(
                     id=category_id, name=category_name, display_name=display_name
                 )
             )
-
         return categories
-
     except Exception as e:
         print(f"Failed to scrape CSES categories: {e}", file=sys.stderr)
         return []
@@ -222,20 +194,15 @@ def process_problem_element(
         if category_name not in all_categories:
             all_categories[category_name] = []
         return category_name
-
     if element.name != "a" or "/problemset/task/" not in element.get("href", ""):
         return current_category
-
     href = element.get("href", "")
     if not href:
         return current_category
-
     problem_id = href.split("/")[-1]
     problem_name = element.get_text(strip=True)
-
     if not (problem_id.isdigit() and problem_name and current_category):
         return current_category
-
     problem = ProblemSummary(id=problem_id, name=problem_name)
     all_categories[current_category].append(problem)
     return current_category
@@ -247,13 +214,10 @@ def scrape_all_problems() -> dict[str, list[ProblemSummary]]:
         headers = {
             "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
         }
-
         response = requests.get(problemset_url, headers=headers, timeout=10)
         response.raise_for_status()
-
         soup = BeautifulSoup(response.text, "html.parser")
         all_categories: dict[str, list[ProblemSummary]] = {}
-
         current_category = None
         for element in soup.find_all(["h1", "h2", "ul"]):
             if not isinstance(element, Tag):
@@ -265,7 +229,6 @@ def scrape_all_problems() -> dict[str, list[ProblemSummary]]:
                     if current_category not in all_categories:
                         all_categories[current_category] = []
                         print(f"Found category: {current_category}", file=sys.stderr)
-
             elif element.name == "ul" and current_category:
                 problem_links = element.find_all(
                     "a", href=lambda x: x and "/problemset/task/" in x
@@ -275,47 +238,61 @@ def scrape_all_problems() -> dict[str, list[ProblemSummary]]:
                     if href:
                         problem_id = href.split("/")[-1]
                         problem_name = link.get_text(strip=True)
-
                         if problem_id.isdigit() and problem_name:
                             problem = ProblemSummary(id=problem_id, name=problem_name)
                             all_categories[current_category].append(problem)
-
         print(
             f"Found {len(all_categories)} categories with {sum(len(probs) for probs in all_categories.values())} problems",
             file=sys.stderr,
         )
         return all_categories
-
     except Exception as e:
         print(f"Failed to scrape CSES problems: {e}", file=sys.stderr)
         return {}
 
 
-def extract_example_test_case(soup) -> tuple[str, str] | None:
-    example_header = soup.find("h1", string="Example")
-    if not example_header:
-        return None
+def _collect_section_after(header: Tag) -> list[Tag]:
+    out: list[Tag] = []
+    cur = header.find_next_sibling()
+    while cur and not (isinstance(cur, Tag) and cur.name in ("h1", "h2", "h3")):
+        if isinstance(cur, Tag):
+            out.append(cur)
+        cur = cur.find_next_sibling()
+    return out
 
-    current = example_header.find_next_sibling()
-    input_text = None
-    output_text = None
 
-    while current:
-        if current.name == "p" and "Input:" in current.get_text():
-            input_pre = current.find_next_sibling("pre")
-            if input_pre:
-                input_text = input_pre.get_text().strip()
-        elif current.name == "p" and "Output:" in current.get_text():
-            output_pre = current.find_next_sibling("pre")
-            if output_pre:
-                output_text = output_pre.get_text().strip()
-                break
-        current = current.find_next_sibling()
+def extract_example_test_cases(soup: BeautifulSoup) -> list[tuple[str, str]]:
+    example_headers = soup.find_all(
+        lambda t: isinstance(t, Tag)
+        and t.name in ("h1", "h2", "h3")
+        and t.get_text(strip=True).lower().startswith("example")
+    )
+    cases: list[tuple[str, str]] = []
+    for hdr in example_headers:
+        section = _collect_section_after(hdr)
 
-    if not input_text or not output_text:
-        return None
+        def find_labeled(label: str) -> str | None:
+            for node in section:
+                if not isinstance(node, Tag):
+                    continue
+                if node.name in ("p", "h4", "h5", "h6"):
+                    txt = node.get_text(strip=True).lower().rstrip(":")
+                    if txt == label:
+                        pre = node.find_next_sibling("pre")
+                        if pre:
+                            return pre.get_text().strip()
+            return None
 
-    return (input_text, output_text)
+        inp = find_labeled("input")
+        out = find_labeled("output")
+        if not inp or not out:
+            pres = [n for n in section if isinstance(n, Tag) and n.name == "pre"]
+            if len(pres) >= 2:
+                inp = inp or pres[0].get_text().strip()
+                out = out or pres[1].get_text().strip()
+        if inp and out:
+            cases.append((inp, out))
+    return cases
 
 
 def scrape(url: str) -> list[TestCase]:
@@ -323,18 +300,10 @@ def scrape(url: str) -> list[TestCase]:
         headers = {
             "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
         }
-
         response = make_request(url, headers)
-
         soup = BeautifulSoup(response.text, "html.parser")
-
-        test_case = extract_example_test_case(soup)
-        if not test_case:
-            return []
-
-        input_text, output_text = test_case
-        return [TestCase(input=input_text, expected=output_text)]
-
+        pairs = extract_example_test_cases(soup)
+        return [TestCase(input=inp, expected=out) for (inp, out) in pairs]
     except Exception as e:
         print(f"Error scraping CSES: {e}", file=sys.stderr)
         return []
@@ -361,7 +330,6 @@ class CSESScraper(BaseScraper):
             return func(*args)
         except Exception as e:
             error_msg = f"{self.platform_name}: {str(e)}"
-
             if operation == "metadata":
                 return MetadataResult(success=False, error=error_msg)
             elif operation == "tests":
@@ -400,21 +368,18 @@ class CSESScraper(BaseScraper):
                 timeout_ms=0,
                 memory_mb=0,
             )
-
         tests = scrape(url)
+        m = re.search(r"/task/(\d+)", url)
         actual_problem_id = (
-            problem_id if problem_id.isdigit() else problem_id.split("/")[-1]
+            problem_id if problem_id.isdigit() else (m.group(1) if m else "")
         )
-
         headers = {
             "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
         }
         response = requests.get(url, headers=headers, timeout=10)
         response.raise_for_status()
-
         soup = BeautifulSoup(response.text, "html.parser")
         timeout_ms, memory_mb = extract_problem_limits(soup)
-
         if not tests:
             return TestsResult(
                 success=False,
@@ -425,7 +390,6 @@ class CSESScraper(BaseScraper):
                 timeout_ms=timeout_ms,
                 memory_mb=memory_mb,
             )
-
         return TestsResult(
             success=True,
             error="",
@@ -453,10 +417,8 @@ def main() -> None:
         )
         print(json.dumps(asdict(result)))
         sys.exit(1)
-
     mode: str = sys.argv[1]
     scraper = CSESScraper()
-
     if mode == "metadata":
         if len(sys.argv) != 3:
             result = MetadataResult(
@@ -465,13 +427,11 @@ def main() -> None:
             )
             print(json.dumps(asdict(result)))
             sys.exit(1)
-
         category_id = sys.argv[2]
         result = scraper.scrape_contest_metadata(category_id)
         print(json.dumps(asdict(result)))
         if not result.success:
             sys.exit(1)
-
     elif mode == "tests":
         if len(sys.argv) != 4:
             tests_result = TestsResult(
@@ -485,14 +445,12 @@ def main() -> None:
             )
             print(json.dumps(asdict(tests_result)))
             sys.exit(1)
-
         category = sys.argv[2]
         problem_id = sys.argv[3]
         tests_result = scraper.scrape_problem_tests(category, problem_id)
         print(json.dumps(asdict(tests_result)))
         if not tests_result.success:
             sys.exit(1)
-
     elif mode == "contests":
         if len(sys.argv) != 2:
             contest_result = ContestListResult(
@@ -500,12 +458,10 @@ def main() -> None:
             )
             print(json.dumps(asdict(contest_result)))
             sys.exit(1)
-
         contest_result = scraper.scrape_contest_list()
         print(json.dumps(asdict(contest_result)))
         if not contest_result.success:
             sys.exit(1)
-
     else:
         result = MetadataResult(
             success=False,