fix scrapers
This commit is contained in:
parent 34ef7bafd6
commit 4498c4a7fa

11 changed files with 294 additions and 1701 deletions
@@ -1,375 +0,0 @@
#!/usr/bin/env python3

import json
import re
import sys
from dataclasses import asdict

import requests
from bs4 import BeautifulSoup, Tag
from scrapling.fetchers import StealthyFetcher

# The relative imports mean this file must run as part of its package
# (e.g. via python -m), despite the script shebang.
from .base import BaseScraper
from .models import (
    ContestListResult,
    ContestSummary,
    MetadataResult,
    ProblemSummary,
    TestCase,
    TestsResult,
)

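
# Scrape sample tests from a Codeforces problem page. Newer problem pages tag
# every example line with a "test-example-line-<N>" class, which lets us split
# the combined sample block into one TestCase per example; older pages carry
# no markers, so we fall back to a single combined TestCase.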
def scrape(url: str) -> list[TestCase]:
    try:
        # Headless browser fetch; solve_cloudflare handles the anti-bot check.
        page = StealthyFetcher.fetch(url, headless=True, solve_cloudflare=True)
        html = page.html_content

        soup = BeautifulSoup(html, "html.parser")
        input_sections = soup.find_all("div", class_="input")
        output_sections = soup.find_all("div", class_="output")

        # First pass: group lines by their "test-example-line-<N>" marker class.
        individual_inputs: dict[str, list[str]] = {}
        individual_outputs: dict[str, list[str]] = {}

        for inp_section in input_sections:
            inp_pre = inp_section.find("pre")
            if not inp_pre or not isinstance(inp_pre, Tag):
                continue

            test_line_divs = inp_pre.find_all(
                "div", class_=lambda x: x and "test-example-line-" in x
            )
            if not test_line_divs:
                continue

            for div in test_line_divs:
                classes = div.get("class", [])
                class_name = next(
                    (
                        cls
                        for cls in classes
                        if "test-example-line-" in cls and cls.split("-")[-1].isdigit()
                    ),
                    None,
                )
                if not class_name:
                    continue

                test_num = class_name.replace("test-example-line-", "")
                if test_num not in individual_inputs:
                    individual_inputs[test_num] = []
                individual_inputs[test_num].append(div.get_text().strip())

        for out_section in output_sections:
            out_pre = out_section.find("pre")
            if not out_pre or not isinstance(out_pre, Tag):
                continue

            test_line_divs = out_pre.find_all(
                "div", class_=lambda x: x and "test-example-line-" in x
            )
            if not test_line_divs:
                continue

            for div in test_line_divs:
                classes = div.get("class", [])
                class_name = next(
                    (
                        cls
                        for cls in classes
                        if "test-example-line-" in cls and cls.split("-")[-1].isdigit()
                    ),
                    None,
                )
                if not class_name:
                    continue

                test_num = class_name.replace("test-example-line-", "")
                if test_num not in individual_outputs:
                    individual_outputs[test_num] = []
                individual_outputs[test_num].append(div.get_text().strip())

        if individual_inputs and individual_outputs:
            common_tests = set(individual_inputs.keys()) & set(
                individual_outputs.keys()
            )
            if common_tests:
                tests = []
                # Sort numerically: the markers are strings, and a plain sort
                # would put "10" before "2".
                for test_num in sorted(common_tests, key=int):
                    input_text = "\n".join(individual_inputs[test_num])
                    output_text = "\n".join(individual_outputs[test_num])
                    # Each split example becomes a standalone run, so prepend a
                    # test count of 1 for the multi-test input format.
                    prefixed_input = "1\n" + input_text
                    tests.append(TestCase(input=prefixed_input, expected=output_text))
                return tests

        # Fallback for pages without per-line markers: concatenate every
        # sample block into a single combined test case.
        all_inputs = []
        all_outputs = []

        for inp_section in input_sections:
            inp_pre = inp_section.find("pre")
            if not inp_pre or not isinstance(inp_pre, Tag):
                continue

            divs = inp_pre.find_all("div")
            if divs:
                lines = [div.get_text().strip() for div in divs if isinstance(div, Tag)]
                text = "\n".join(lines)
            else:
                text = inp_pre.get_text().replace("\r", "").strip()
            all_inputs.append(text)

        for out_section in output_sections:
            out_pre = out_section.find("pre")
            if not out_pre or not isinstance(out_pre, Tag):
                continue

            divs = out_pre.find_all("div")
            if divs:
                lines = [div.get_text().strip() for div in divs if isinstance(div, Tag)]
                text = "\n".join(lines)
            else:
                text = out_pre.get_text().replace("\r", "").strip()
            all_outputs.append(text)

        if not all_inputs or not all_outputs:
            return []

        combined_input = "\n".join(all_inputs)
        combined_output = "\n".join(all_outputs)
        return [TestCase(input=combined_input, expected=combined_output)]

    except Exception as e:
        print(f"Scrapling failed: {e}", file=sys.stderr)
        return []


def parse_problem_url(contest_id: str, problem_letter: str) -> str:
    return (
        f"https://codeforces.com/contest/{contest_id}/problem/{problem_letter.upper()}"
    )

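
# Pull the per-test time and memory limits from the statement header (the
# "time-limit" and "memory-limit" divs); raise rather than guess a default if
# either is missing.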
def extract_problem_limits(soup: BeautifulSoup) -> tuple[int, float]:
    timeout_ms = None
    memory_mb = None

    time_limit_div = soup.find("div", class_="time-limit")
    if time_limit_div:
        text = time_limit_div.get_text().strip()
        # Match fractional limits too ("1.5 seconds"), not just whole seconds.
        match = re.search(r"(\d+(?:\.\d+)?) seconds?", text)
        if match:
            seconds = float(match.group(1))
            timeout_ms = int(seconds * 1000)

    if timeout_ms is None:
        raise ValueError("Could not find valid timeout in time-limit section")

    memory_limit_div = soup.find("div", class_="memory-limit")
    if memory_limit_div:
        text = memory_limit_div.get_text().strip()
        match = re.search(r"(\d+) megabytes", text)
        if match:
            memory_mb = float(match.group(1))

    if memory_mb is None:
        raise ValueError("Could not find valid memory limit in memory-limit section")

    return timeout_ms, memory_mb

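
# List a contest's problems by collecting every /contest/<id>/problem/<letter>
# link on the contest page.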
def scrape_contest_problems(contest_id: str) -> list[ProblemSummary]:
    try:
        contest_url: str = f"https://codeforces.com/contest/{contest_id}"
        page = StealthyFetcher.fetch(contest_url, headless=True, solve_cloudflare=True)
        html = page.html_content

        soup = BeautifulSoup(html, "html.parser")
        problems: list[ProblemSummary] = []

        problem_links = soup.find_all(
            "a", href=lambda x: x and f"/contest/{contest_id}/problem/" in x
        )

        for link in problem_links:
            if not isinstance(link, Tag):
                continue
            href: str = str(link.get("href", ""))
            if f"/contest/{contest_id}/problem/" in href:
                problem_letter: str = href.split("/")[-1].lower()
                problem_name: str = link.get_text(strip=True)

                if not (problem_letter and problem_name):
                    continue

                problems.append(ProblemSummary(id=problem_letter, name=problem_name))

        # A problem is typically linked more than once on the contest page;
        # keep only the first occurrence of each id.
        seen: set[str] = set()
        unique_problems: list[ProblemSummary] = []
        for p in problems:
            if p.id not in seen:
                seen.add(p.id)
                unique_problems.append(p)

        return unique_problems

    except Exception as e:
        print(f"Failed to scrape contest problems: {e}", file=sys.stderr)
        return []


def scrape_sample_tests(url: str) -> list[TestCase]:
    print(f"Scraping: {url}", file=sys.stderr)
    return scrape(url)

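
# The contest list comes from the official contest.list JSON API; no browser
# or HTML parsing is needed here.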
def scrape_contests() -> list[ContestSummary]:
    response = requests.get("https://codeforces.com/api/contest.list", timeout=10)
    response.raise_for_status()

    data = response.json()
    if data["status"] != "OK":
        return []

    contests = []
    for contest in data["result"]:
        contest_id = str(contest["id"])
        name = contest["name"]
        contests.append(ContestSummary(id=contest_id, name=name, display_name=name))

    return contests

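
# Adapts the module-level helpers to the BaseScraper interface. _safe_execute
# and the _create_*_error helpers are defined on BaseScraper and are expected
# to give all three entry points uniform error handling.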
class CodeforcesScraper(BaseScraper):
    @property
    def platform_name(self) -> str:
        return "codeforces"

    def scrape_contest_metadata(self, contest_id: str) -> MetadataResult:
        return self._safe_execute(
            "metadata", self._scrape_contest_metadata_impl, contest_id
        )

    def scrape_problem_tests(self, contest_id: str, problem_id: str) -> TestsResult:
        return self._safe_execute(
            "tests", self._scrape_problem_tests_impl, contest_id, problem_id
        )

    def scrape_contest_list(self) -> ContestListResult:
        return self._safe_execute("contests", self._scrape_contest_list_impl)

    def _scrape_contest_metadata_impl(self, contest_id: str) -> MetadataResult:
        problems = scrape_contest_problems(contest_id)
        if not problems:
            return self._create_metadata_error(
                f"No problems found for contest {contest_id}", contest_id
            )
        return MetadataResult(
            success=True, error="", contest_id=contest_id, problems=problems
        )

    def _scrape_problem_tests_impl(
        self, contest_id: str, problem_letter: str
    ) -> TestsResult:
        problem_id = contest_id + problem_letter.lower()
        url = parse_problem_url(contest_id, problem_letter)
        tests = scrape_sample_tests(url)

        # Fetch the page a second time to read the limits and the
        # interactive-problem marker.
        page = StealthyFetcher.fetch(url, headless=True, solve_cloudflare=True)
        html = page.html_content
        soup = BeautifulSoup(html, "html.parser")
        timeout_ms, memory_mb = extract_problem_limits(soup)

        problem_statement_div = soup.find("div", class_="problem-statement")
        interactive = bool(
            problem_statement_div
            and "This is an interactive problem" in problem_statement_div.get_text()
        )

        if not tests:
            return self._create_tests_error(
                f"No tests found for {contest_id} {problem_letter}", problem_id, url
            )

        return TestsResult(
            success=True,
            error="",
            problem_id=problem_id,
            url=url,
            tests=tests,
            timeout_ms=timeout_ms,
            memory_mb=memory_mb,
            interactive=interactive,
        )

    def _scrape_contest_list_impl(self) -> ContestListResult:
        contests = scrape_contests()
        if not contests:
            return self._create_contests_error("No contests found")
        return ContestListResult(success=True, error="", contests=contests)

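
# CLI entry point: each mode prints exactly one JSON object to stdout, success
# or error, so callers can always parse the output; usage errors exit non-zero.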
def main() -> None:
    if len(sys.argv) < 2:
        result = MetadataResult(
            success=False,
            error="Usage: codeforces.py metadata <contest_id> OR codeforces.py tests <contest_id> <problem_letter> OR codeforces.py contests",
        )
        print(json.dumps(asdict(result)))
        sys.exit(1)

    scraper = CodeforcesScraper()
    mode: str = sys.argv[1]

    if mode == "metadata":
        if len(sys.argv) != 3:
            result = MetadataResult(
                success=False, error="Usage: codeforces.py metadata <contest_id>"
            )
            print(json.dumps(asdict(result)))
            sys.exit(1)

        contest_id: str = sys.argv[2]
        result = scraper.scrape_contest_metadata(contest_id)
        print(json.dumps(asdict(result)))

    elif mode == "tests":
        if len(sys.argv) != 4:
            tests_result = TestsResult(
                success=False,
                error="Usage: codeforces.py tests <contest_id> <problem_letter>",
                problem_id="",
                url="",
                tests=[],
                timeout_ms=0,
                memory_mb=0,
            )
            print(json.dumps(asdict(tests_result)))
            sys.exit(1)

        tests_contest_id: str = sys.argv[2]
        problem_letter: str = sys.argv[3]
        tests_result = scraper.scrape_problem_tests(tests_contest_id, problem_letter)
        print(json.dumps(asdict(tests_result)))

    elif mode == "contests":
        if len(sys.argv) != 2:
            contest_result = ContestListResult(
                success=False, error="Usage: codeforces.py contests"
            )
            print(json.dumps(asdict(contest_result)))
            sys.exit(1)

        contest_result = scraper.scrape_contest_list()
        print(json.dumps(asdict(contest_result)))

    else:
        result = MetadataResult(
            success=False,
            error=f"Unknown mode: {mode}. Use 'metadata', 'tests', or 'contests'",
        )
        print(json.dumps(asdict(result)))
        sys.exit(1)


if __name__ == "__main__":
    main()