From b2e1ea2c58319ca770f2cccedbbbb29b7c9a0978 Mon Sep 17 00:00:00 2001 From: Barrett Ruth Date: Thu, 18 Sep 2025 09:56:54 -0400 Subject: [PATCH] feat(scraper): update scraper i/o for single test panel --- scrapers/atcoder.py | 15 ++--- scrapers/codeforces.py | 126 ++++++++++++++++++++++++++++++++--------- scrapers/cses.py | 6 +- 3 files changed, 108 insertions(+), 39 deletions(-) diff --git a/scrapers/atcoder.py b/scrapers/atcoder.py index 46d673d..0a8225a 100644 --- a/scrapers/atcoder.py +++ b/scrapers/atcoder.py @@ -169,21 +169,16 @@ def main() -> None: print(json.dumps(result)) sys.exit(1) - individual_test_cases: list[dict[str, str]] = [] - for index, (input_data, output_data) in enumerate(tests, 1): - individual_test_cases.append( - {"index": index, "input": input_data, "output": output_data} - ) - - combined_input = "\n".join(tc["input"] for tc in individual_test_cases) - combined_output = "\n".join(tc["output"] for tc in individual_test_cases) + test_list: list[dict[str, str]] = [] + for input_data, output_data in tests: + normalized_input = "1\n" + input_data + test_list.append({"input": normalized_input, "expected": output_data}) result = { "success": True, "problem_id": problem_id, "url": url, - "test_cases": individual_test_cases, - "combined": {"input": combined_input, "output": combined_output}, + "tests": test_list, } print(json.dumps(result)) diff --git a/scrapers/codeforces.py b/scrapers/codeforces.py index 8e89b0a..2a4e502 100644 --- a/scrapers/codeforces.py +++ b/scrapers/codeforces.py @@ -14,42 +14,116 @@ def scrape(url: str) -> list[tuple[str, str]]: response.raise_for_status() soup = BeautifulSoup(response.text, "html.parser") - tests: list[tuple[str, str]] = [] - input_sections = soup.find_all("div", class_="input") output_sections = soup.find_all("div", class_="output") + individual_inputs = {} + individual_outputs = {} + + for inp_section in input_sections: + inp_pre = inp_section.find("pre") + if not inp_pre: + continue + + test_line_divs = inp_pre.find_all( + "div", class_=lambda x: x and "test-example-line-" in x + ) + if not test_line_divs: + continue + + for div in test_line_divs: + class_name = next( + ( + cls + for cls in div.get("class", []) + if "test-example-line-" in cls + ), + None, + ) + if not class_name: + continue + + test_num = class_name.replace("test-example-line-", "") + if test_num not in individual_inputs: + individual_inputs[test_num] = [] + individual_inputs[test_num].append(div.get_text().strip()) + + for out_section in output_sections: + out_pre = out_section.find("pre") + if not out_pre: + continue + + test_line_divs = out_pre.find_all( + "div", class_=lambda x: x and "test-example-line-" in x + ) + if not test_line_divs: + continue + + for div in test_line_divs: + class_name = next( + ( + cls + for cls in div.get("class", []) + if "test-example-line-" in cls + ), + None, + ) + if not class_name: + continue + + test_num = class_name.replace("test-example-line-", "") + if test_num not in individual_outputs: + individual_outputs[test_num] = [] + individual_outputs[test_num].append(div.get_text().strip()) + + if individual_inputs and individual_outputs: + common_tests = set(individual_inputs.keys()) & set( + individual_outputs.keys() + ) + if common_tests: + tests = [] + for test_num in sorted(common_tests): + input_text = "\n".join(individual_inputs[test_num]) + output_text = "\n".join(individual_outputs[test_num]) + prefixed_input = "1\n" + input_text + tests.append((prefixed_input, output_text)) + return tests + all_inputs = [] all_outputs = [] for inp_section in input_sections: inp_pre = inp_section.find("pre") - if inp_pre: - divs = inp_pre.find_all("div") - if divs: - lines = [div.get_text().strip() for div in divs] - text = "\n".join(lines) - else: - text = inp_pre.get_text().replace("\r", "").strip() - all_inputs.append(text) + if not inp_pre: + continue + + divs = inp_pre.find_all("div") + if divs: + lines = [div.get_text().strip() for div in divs] + text = "\n".join(lines) + else: + text = inp_pre.get_text().replace("\r", "").strip() + all_inputs.append(text) for out_section in output_sections: out_pre = out_section.find("pre") - if out_pre: - divs = out_pre.find_all("div") - if divs: - lines = [div.get_text().strip() for div in divs] - text = "\n".join(lines) - else: - text = out_pre.get_text().replace("\r", "").strip() - all_outputs.append(text) + if not out_pre: + continue - if all_inputs and all_outputs: - combined_input = "\n".join(all_inputs) - combined_output = "\n".join(all_outputs) - tests.append((combined_input, combined_output)) + divs = out_pre.find_all("div") + if divs: + lines = [div.get_text().strip() for div in divs] + text = "\n".join(lines) + else: + text = out_pre.get_text().replace("\r", "").strip() + all_outputs.append(text) - return tests + if not all_inputs or not all_outputs: + return [] + + combined_input = "\n".join(all_inputs) + combined_output = "\n".join(all_outputs) + return [(combined_input, combined_output)] except Exception as e: print(f"CloudScraper failed: {e}", file=sys.stderr) @@ -170,15 +244,15 @@ def main() -> None: print(json.dumps(result)) sys.exit(1) - test_cases: list[dict[str, str]] = [] + test_list: list[dict[str, str]] = [] for input_data, output_data in tests: - test_cases.append({"input": input_data, "output": output_data}) + test_list.append({"input": input_data, "expected": output_data}) result: dict[str, str | bool | list] = { "success": True, "problem_id": problem_id, "url": url, - "test_cases": test_cases, + "tests": test_list, } print(json.dumps(result)) diff --git a/scrapers/cses.py b/scrapers/cses.py index 8cd6020..5d84927 100755 --- a/scrapers/cses.py +++ b/scrapers/cses.py @@ -174,15 +174,15 @@ def main() -> None: print(json.dumps(result)) sys.exit(1) - test_cases: list[dict[str, str]] = [] + test_list: list[dict[str, str]] = [] for input_data, output_data in tests: - test_cases.append({"input": input_data, "output": output_data}) + test_list.append({"input": input_data, "expected": output_data}) result = { "success": True, "problem_id": problem_id, "url": url, - "test_cases": test_cases, + "tests": test_list, } print(json.dumps(result))