initial commit
This commit is contained in:
commit
dcb7debff6
29 changed files with 1276 additions and 0 deletions
87
templates/scrapers/atcoder.py
Normal file
87
templates/scrapers/atcoder.py
Normal file
|
|
@ -0,0 +1,87 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
import sys
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
|
||||
def parse_problem_url(contest_id: str, problem_letter: str) -> str:
    """Build the AtCoder task URL for a contest/problem pair.

    AtCoder task slugs concatenate contest id and problem letter with an
    underscore, e.g. contest ``abc042`` problem ``a`` -> ``abc042_a``.
    """
    return f"https://atcoder.jp/contests/{contest_id}/tasks/{contest_id}_{problem_letter}"
|
||||
|
||||
|
||||
def scrape(url: str) -> list[tuple[str, str]]:
    """Fetch an AtCoder problem page and extract its sample tests.

    Sample sections are ``<h3>`` headers containing "sample"; an "input"
    header's following ``<pre>`` is paired with the next "output" header's
    ``<pre>``. Returns a list of ``(input_text, output_text)`` pairs, or an
    empty list on any failure.
    """
    try:
        request_headers = {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        }

        response = requests.get(url, headers=request_headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html.parser")

        tests: list[tuple[str, str]] = []

        sample_headers = soup.find_all(
            "h3", string=lambda text: bool(text) and "sample" in text.lower()
        )

        idx = 0
        total = len(sample_headers)
        while idx < total:
            current = sample_headers[idx]
            # Guard clauses: anything that is not a well-formed input/output
            # header pair just advances by one and is re-examined.
            if "input" not in current.get_text().lower():
                idx += 1
                continue
            pre_in = current.find_next("pre")
            if pre_in is None or idx + 1 >= total:
                idx += 1
                continue
            follower = sample_headers[idx + 1]
            if "output" not in follower.get_text().lower():
                idx += 1
                continue
            pre_out = follower.find_next("pre")
            if pre_out is None:
                idx += 1
                continue

            given = pre_in.get_text().strip().replace("\r", "")
            expected = pre_out.get_text().strip().replace("\r", "")
            if given and expected:
                tests.append((given, expected))
            # Consumed both the input and the output header.
            idx += 2

        return tests

    except Exception as e:
        print(f"Error scraping AtCoder: {e}", file=sys.stderr)
        return []
|
||||
|
||||
|
||||
def main() -> None:
    """CLI entry point: scrape AtCoder samples and dump them to stdout.

    Output format: ``---INPUT---``, the test count, all inputs, then
    ``---OUTPUT---``, all outputs, and ``---END---``. Diagnostics go to
    stderr; exits with status 1 on bad usage or when no tests are found.
    """
    if len(sys.argv) != 3:
        print("Usage: atcoder.py <contest_id> <problem_letter>", file=sys.stderr)
        print("Example: atcoder.py abc042 a", file=sys.stderr)
        sys.exit(1)

    contest_id, problem_letter = sys.argv[1], sys.argv[2]

    url = parse_problem_url(contest_id, problem_letter)
    print(f"Scraping: {url}", file=sys.stderr)

    tests = scrape(url)

    if not tests:
        print(f"No tests found for {contest_id} {problem_letter}", file=sys.stderr)
        sys.exit(1)

    print("---INPUT---")
    print(len(tests))
    for given, _ in tests:
        print(given)
    print("---OUTPUT---")
    for _, expected in tests:
        print(expected)
    print("---END---")


if __name__ == "__main__":
    main()
|
||||
104
templates/scrapers/codeforces.py
Normal file
104
templates/scrapers/codeforces.py
Normal file
|
|
@ -0,0 +1,104 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
import sys
|
||||
|
||||
import cloudscraper
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
|
||||
def scrape(url: str):
    """Fetch a Codeforces problem page and extract its sample tests.

    Returns a list of ``(input_text, output_text)`` pairs, or an empty list
    on any failure (network error, Cloudflare block, unexpected markup).
    """
    try:
        # cloudscraper transparently solves the Cloudflare JS challenge that
        # a plain requests session would fail on.
        scraper = cloudscraper.create_scraper()
        response = scraper.get(url, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html.parser")
        tests = []

        input_sections = soup.find_all("div", class_="input")
        output_sections = soup.find_all("div", class_="output")

        for inp_section, out_section in zip(input_sections, output_sections):
            inp_pre = inp_section.find("pre")
            out_pre = out_section.find("pre")

            if not (inp_pre and out_pre):
                continue

            # BUGFIX: the original only fell back to raw <pre> text for the
            # OUTPUT side. Older Codeforces pages have no per-line
            # div.test-example-line wrappers at all, so input_lines stayed
            # empty and every sample was silently dropped. Use the same
            # extraction (divs first, raw text fallback) for both sides.
            input_lines = _pre_lines(inp_pre)
            output_lines = _pre_lines(out_pre)

            if not (input_lines and output_lines):
                continue

            # Heuristic for multi-test samples: a leading integer count with
            # one input line and one output line per test case. NOTE(review):
            # this breaks for multi-line test cases — kept as-is to preserve
            # existing behavior.
            if len(input_lines) > 1 and input_lines[0].isdigit():
                test_count = int(input_lines[0])
                remaining_input = input_lines[1:]
                for i in range(min(test_count, len(output_lines))):
                    if i < len(remaining_input):
                        tests.append((remaining_input[i], output_lines[i]))
            else:
                tests.append(("\n".join(input_lines), "\n".join(output_lines)))

        return tests

    except Exception as e:
        print(f"CloudScraper failed: {e}", file=sys.stderr)
        return []


def _pre_lines(pre) -> list[str]:
    """Return the stripped lines of a sample <pre> element.

    Prefers the per-line ``div.test-example-line`` wrappers used by newer
    problem pages; falls back to splitting the raw text (dropping blank
    lines) for older pages.
    """
    line_divs = pre.find_all("div", class_="test-example-line")
    if line_divs:
        return [d.get_text().strip() for d in line_divs]
    raw = pre.get_text().strip().replace("\r", "")
    return [line.strip() for line in raw.split("\n") if line.strip()]
|
||||
|
||||
|
||||
def parse_problem_url(contest_id: str, problem_letter: str) -> str:
    """Return the canonical Codeforces problem URL.

    The problem letter is normalized to upper case, as Codeforces URLs use
    capital problem indices.
    """
    letter = problem_letter.upper()
    return f"https://codeforces.com/contest/{contest_id}/problem/{letter}"
|
||||
|
||||
|
||||
def scrape_sample_tests(url: str):
    """Announce the target URL on stderr, then delegate to scrape()."""
    sys.stderr.write(f"Scraping: {url}\n")
    return scrape(url)
|
||||
|
||||
|
||||
def main() -> None:
    """CLI entry point: scrape Codeforces samples and dump them to stdout.

    Output format: ``---INPUT---``, the test count, all inputs, then
    ``---OUTPUT---``, all outputs, and ``---END---``. Diagnostics go to
    stderr; exits with status 1 on bad usage or when no tests are found.
    """
    if len(sys.argv) != 3:
        print("Usage: codeforces.py <contest_id> <problem_letter>", file=sys.stderr)
        print("Example: codeforces.py 1234 A", file=sys.stderr)
        sys.exit(1)

    contest_id, problem_letter = sys.argv[1], sys.argv[2]

    tests = scrape_sample_tests(parse_problem_url(contest_id, problem_letter))

    if not tests:
        print(f"No tests found for {contest_id} {problem_letter}", file=sys.stderr)
        print(
            "Consider adding test cases manually to the io/ directory", file=sys.stderr
        )
        sys.exit(1)

    print("---INPUT---")
    print(len(tests))
    for given, _ in tests:
        print(given)
    print("---OUTPUT---")
    for _, expected in tests:
        print(expected)
    print("---END---")


if __name__ == "__main__":
    main()
|
||||
88
templates/scrapers/cses.py
Executable file
88
templates/scrapers/cses.py
Executable file
|
|
@ -0,0 +1,88 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
import sys
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
|
||||
def parse_problem_url(problem_input: str) -> str | None:
|
||||
if problem_input.startswith("https://cses.fi/problemset/task/"):
|
||||
return problem_input
|
||||
elif problem_input.isdigit():
|
||||
return f"https://cses.fi/problemset/task/{problem_input}"
|
||||
return None
|
||||
|
||||
|
||||
def scrape(url: str) -> list[tuple[str, str]]:
    """Fetch a CSES task page and extract its Example input/output pair.

    CSES pages carry a single ``<h1>Example</h1>`` section where an
    "Input:" paragraph precedes the input ``<pre>`` and an "Output:"
    paragraph precedes the output ``<pre>``. Returns at most one
    ``(input_text, output_text)`` pair; an empty list on any failure.
    """
    try:
        request_headers = {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        }

        response = requests.get(url, headers=request_headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html.parser")

        tests: list[tuple[str, str]] = []
        example_header = soup.find("h1", string="Example")

        if example_header:
            sample_in = None
            sample_out = None
            node = example_header.find_next_sibling()

            # Walk forward through the header's siblings, picking up the
            # <pre> that follows each marker paragraph; stop once the
            # output has been captured.
            while node:
                if node.name == "p" and "Input:" in node.get_text():
                    pre = node.find_next_sibling("pre")
                    if pre:
                        sample_in = pre.get_text().strip()
                elif node.name == "p" and "Output:" in node.get_text():
                    pre = node.find_next_sibling("pre")
                    if pre:
                        sample_out = pre.get_text().strip()
                        break
                node = node.find_next_sibling()

            if sample_in and sample_out:
                tests.append((sample_in, sample_out))

        return tests

    except Exception as e:
        print(f"Error scraping CSES: {e}", file=sys.stderr)
        return []
|
||||
|
||||
|
||||
def main() -> None:
    """CLI entry point: resolve the problem reference, scrape, and print tests.

    Output format: ``---INPUT---``, the test count, all inputs, then
    ``---OUTPUT---``, all outputs, and ``---END---``. Diagnostics go to
    stderr; exits with status 1 on bad usage, an unrecognized problem
    reference, or when no tests are found.
    """
    if len(sys.argv) != 2:
        print("Usage: cses.py <problem_id_or_url>", file=sys.stderr)
        sys.exit(1)

    problem_input = sys.argv[1]
    url = parse_problem_url(problem_input)

    if not url:
        print(f"Invalid problem input: {problem_input}", file=sys.stderr)
        print("Use either problem ID (e.g., 1068) or full URL", file=sys.stderr)
        sys.exit(1)

    tests = scrape(url)

    if not tests:
        print(f"No tests found for {problem_input}", file=sys.stderr)
        sys.exit(1)

    print("---INPUT---")
    print(len(tests))
    for given, _ in tests:
        print(given)
    print("---OUTPUT---")
    for _, expected in tests:
        print(expected)
    print("---END---")


if __name__ == "__main__":
    main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue