diff --git a/scrapers/atcoder.py b/scrapers/atcoder.py index 0a8225a..2166f55 100644 --- a/scrapers/atcoder.py +++ b/scrapers/atcoder.py @@ -12,10 +12,36 @@ def parse_problem_url(contest_id: str, problem_letter: str) -> str: return f"https://atcoder.jp/contests/{contest_id}/tasks/{task_id}" +def extract_problem_from_row(row, contest_id: str) -> dict[str, str] | None: + cells = row.find_all("td") + if len(cells) < 2: + return None + + task_link = cells[1].find("a") + if not task_link: + return None + + task_name = task_link.get_text(strip=True) + task_href = task_link.get("href", "") + + if not task_href: + return None + + task_id = task_href.split("/")[-1] + if not task_id.startswith(contest_id + "_"): + return None + + problem_letter = task_id[len(contest_id) + 1 :] + if not problem_letter or not task_name: + return None + + return {"id": problem_letter.lower(), "name": task_name} + + def scrape_contest_problems(contest_id: str) -> list[dict[str, str]]: try: - contest_url: str = f"https://atcoder.jp/contests/{contest_id}/tasks" - headers: dict[str, str] = { + contest_url = f"https://atcoder.jp/contests/{contest_id}/tasks" + headers = { "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" } @@ -23,31 +49,18 @@ def scrape_contest_problems(contest_id: str) -> list[dict[str, str]]: response.raise_for_status() soup = BeautifulSoup(response.text, "html.parser") - problems: list[dict[str, str]] = [] - task_table = soup.find("table", class_="table") + if not task_table: return [] - rows = task_table.find_all("tr")[1:] # Skip header row + rows = task_table.find_all("tr")[1:] + problems = [] for row in rows: - cells = row.find_all("td") - if len(cells) >= 2: - task_link = cells[1].find("a") - if task_link: - task_name: str = task_link.get_text(strip=True) - task_href: str = task_link.get("href", "") - - # Extract problem letter from task name or URL - task_id: str = task_href.split("/")[-1] if task_href else "" - if task_id.startswith(contest_id + "_"): - problem_letter: str = task_id[len(contest_id) + 1 :] - - if problem_letter and task_name: - problems.append( - {"id": problem_letter.lower(), "name": task_name} - ) + problem = extract_problem_from_row(row, contest_id) + if problem: + problems.append(problem) problems.sort(key=lambda x: x["id"]) return problems @@ -57,9 +70,38 @@ def scrape_contest_problems(contest_id: str) -> list[dict[str, str]]: return [] +def extract_test_case_from_headers(sample_headers, i: int) -> tuple[str, str] | None: + if i >= len(sample_headers): + return None + + header = sample_headers[i] + if "input" not in header.get_text().lower(): + return None + + input_pre = header.find_next("pre") + if not input_pre or i + 1 >= len(sample_headers): + return None + + next_header = sample_headers[i + 1] + if "output" not in next_header.get_text().lower(): + return None + + output_pre = next_header.find_next("pre") + if not output_pre: + return None + + input_text = input_pre.get_text().strip().replace("\r", "") + output_text = output_pre.get_text().strip().replace("\r", "") + + if not input_text or not output_text: + return None + + return (input_text, output_text) + + def scrape(url: str) -> list[tuple[str, str]]: try: - headers: dict[str, str] = { + headers = { "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" } @@ -68,33 +110,20 @@ def scrape(url: str) -> list[tuple[str, str]]: soup = BeautifulSoup(response.text, "html.parser") - tests: list[tuple[str, str]] = [] - sample_headers = soup.find_all( "h3", string=lambda x: x and "sample" in x.lower() if x else False ) + tests = [] i = 0 + while i < len(sample_headers): - header = sample_headers[i] - if "input" in header.get_text().lower(): - input_pre = header.find_next("pre") - if input_pre and i + 1 < len(sample_headers): - next_header = sample_headers[i + 1] - if "output" in next_header.get_text().lower(): - output_pre = next_header.find_next("pre") - if output_pre: - input_text: str = ( - input_pre.get_text().strip().replace("\r", "") - ) - output_text: str = ( - output_pre.get_text().strip().replace("\r", "") - ) - if input_text and output_text: - tests.append((input_text, output_text)) - i += 2 - continue - i += 1 + test_case = extract_test_case_from_headers(sample_headers, i) + if test_case: + tests.append(test_case) + i += 2 + else: + i += 1 return tests diff --git a/scrapers/cses.py b/scrapers/cses.py index 5d84927..c91f671 100755 --- a/scrapers/cses.py +++ b/scrapers/cses.py @@ -15,10 +15,36 @@ def parse_problem_url(problem_input: str) -> str | None: return None +def process_problem_element( + element, current_category: str, all_categories: dict +) -> str | None: + if element.name == "h1": + category_name = element.get_text().strip() + if category_name not in all_categories: + all_categories[category_name] = [] + return category_name + + if element.name != "a" or "/problemset/task/" not in element.get("href", ""): + return current_category + + href = element.get("href", "") + if not href: + return current_category + + problem_id = href.split("/")[-1] + problem_name = element.get_text(strip=True) + + if not (problem_id.isdigit() and problem_name and current_category): + return current_category + + all_categories[current_category].append({"id": problem_id, "name": problem_name}) + return current_category + + def scrape_all_problems() -> dict[str, list[dict[str, str]]]: try: - problemset_url: str = "https://cses.fi/problemset/" - headers: dict[str, str] = { + problemset_url = "https://cses.fi/problemset/" + headers = { "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" } @@ -26,28 +52,18 @@ def scrape_all_problems() -> dict[str, list[dict[str, str]]]: response.raise_for_status() soup = BeautifulSoup(response.text, "html.parser") - all_categories: dict[str, list[dict[str, str]]] = {} + all_categories = {} problem_links = soup.find_all( "a", href=lambda x: x and "/problemset/task/" in x ) print(f"Found {len(problem_links)} problem links", file=sys.stderr) - current_category: str | None = None + current_category = None for element in soup.find_all(["h1", "a"]): - if element.name == "h1": - current_category = element.get_text().strip() - if current_category not in all_categories: - all_categories[current_category] = [] - elif element.name == "a" and "/problemset/task/" in element.get("href", ""): - href: str = element.get("href", "") - problem_id: str = href.split("/")[-1] - problem_name: str = element.get_text(strip=True) - - if problem_id.isdigit() and problem_name and current_category: - all_categories[current_category].append( - {"id": problem_id, "name": problem_name} - ) + current_category = process_problem_element( + element, current_category, all_categories + ) for category in all_categories: all_categories[category].sort(key=lambda x: int(x["id"])) @@ -60,9 +76,36 @@ def scrape_all_problems() -> dict[str, list[dict[str, str]]]: return {} +def extract_example_test_case(soup) -> tuple[str, str] | None: + example_header = soup.find("h1", string="Example") + if not example_header: + return None + + current = example_header.find_next_sibling() + input_text = None + output_text = None + + while current: + if current.name == "p" and "Input:" in current.get_text(): + input_pre = current.find_next_sibling("pre") + if input_pre: + input_text = input_pre.get_text().strip() + elif current.name == "p" and "Output:" in current.get_text(): + output_pre = current.find_next_sibling("pre") + if output_pre: + output_text = output_pre.get_text().strip() + break + current = current.find_next_sibling() + + if not input_text or not output_text: + return None + + return (input_text, output_text) + + def scrape(url: str) -> list[tuple[str, str]]: try: - headers: dict[str, str] = { + headers = { "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" } @@ -71,30 +114,11 @@ def scrape(url: str) -> list[tuple[str, str]]: soup = BeautifulSoup(response.text, "html.parser") - tests: list[tuple[str, str]] = [] - example_header = soup.find("h1", string="Example") + test_case = extract_example_test_case(soup) + if not test_case: + return [] - if example_header: - current = example_header.find_next_sibling() - input_text: str | None = None - output_text: str | None = None - - while current: - if current.name == "p" and "Input:" in current.get_text(): - input_pre = current.find_next_sibling("pre") - if input_pre: - input_text = input_pre.get_text().strip() - elif current.name == "p" and "Output:" in current.get_text(): - output_pre = current.find_next_sibling("pre") - if output_pre: - output_text = output_pre.get_text().strip() - break - current = current.find_next_sibling() - - if input_text and output_text: - tests.append((input_text, output_text)) - - return tests + return [test_case] except Exception as e: print(f"Error scraping CSES: {e}", file=sys.stderr)