fix(ci): move imports

2025-09-20 23:52:32 -04:00 · 2025-09-20 23:52:32 -04:00 · 7b8aae7921
commit 7b8aae7921
parent 847307bd1f
5 changed files with 475 additions and 95 deletions
--- a/lua/cp/init.lua
+++ b/lua/cp/init.lua
@ -145,6 +145,42 @@ local function setup_problem(contest_id, problem_id, language)
  logger.log(('switched to problem %s'):format(ctx.problem_name))
 end
 --- Make sure the `io` directory exists (relative to the current working
 --- directory). The 'p' flag creates missing parents and makes the call a
 --- no-op when the directory is already there.
 local function ensure_io_directory()
  local io_dir = 'io'
  vim.fn.mkdir(io_dir, 'p')
 end
 --- Scrape test cases for problems that have no cached tests and log a summary.
 ---@param contest_id string
 ---@param missing_problems table[] problem entries; each is read through its `id` field
 local function scrape_missing_problems(contest_id, missing_problems)
  ensure_io_directory()
  logger.log(('scraping %d uncached problems...'):format(#missing_problems))
  local results =
    scrape.scrape_problems_parallel(state.platform, contest_id, missing_problems, config)
  -- Walk the requested problems instead of `results`: the scraper returns an
  -- empty table when it bails out early, so a problem without an entry has to
  -- be counted as failed. Iterating the list also keeps the report order
  -- stable, which `pairs` over the result map does not guarantee.
  local success_count = 0
  local failed_problems = {}
  for _, problem in ipairs(missing_problems) do
    local result = results[problem.id]
    if result and result.success then
      success_count = success_count + 1
    else
      table.insert(failed_problems, problem.id)
    end
  end
  if #failed_problems > 0 then
    logger.log(
      ('scraping complete: %d/%d successful, failed: %s'):format(
        success_count,
        #missing_problems,
        table.concat(failed_problems, ', ')
      ),
      vim.log.levels.WARN
    )
  else
    logger.log(('scraping complete: %d/%d successful'):format(success_count, #missing_problems))
  end
 end
 local function get_current_problem()
  local filename = vim.fn.expand('%:t:r')
  if filename == '' then
@ -557,6 +593,62 @@ end
 --- Prepare a whole contest and open its first problem.
 ---
 --- Loads the contest metadata for the active platform, scrapes test cases for
 --- every problem that has none cached, records the contest in `state` and then
 --- switches to the first problem of the contest.
 ---@param contest_id string
 ---@param language? string
 ---@return boolean ok false when no platform is set, scraping is disabled for it, or no problems could be loaded
 local function setup_contest(contest_id, language)
  if not state.platform then
    logger.log('no platform set', vim.log.levels.ERROR)
    return false
  end
  if not vim.tbl_contains(config.scrapers, state.platform) then
    logger.log('scraping disabled for ' .. state.platform, vim.log.levels.WARN)
    return false
  end
  logger.log(('setting up contest %s %s'):format(state.platform, contest_id))
  local metadata_result = scrape.scrape_contest_metadata(state.platform, contest_id)
  if not metadata_result.success then
    logger.log(
      'failed to load contest metadata: ' .. (metadata_result.error or 'unknown error'),
      vim.log.levels.ERROR
    )
    return false
  end
  local problems = metadata_result.problems
  if not problems or #problems == 0 then
    logger.log('no problems found in contest', vim.log.levels.ERROR)
    return false
  end
  logger.log(('found %d problems, checking cache...'):format(#problems))
  cache.load()
  -- Collect the problems that have no cached test cases yet.
  local missing_problems = {}
  for _, problem in ipairs(problems) do
    local cached_tests = cache.get_test_cases(state.platform, contest_id, problem.id)
    if not cached_tests then
      table.insert(missing_problems, problem)
    end
  end
  if #missing_problems > 0 then
    -- scrape_missing_problems logs the "scraping N uncached problems..." line
    -- itself, so it is not repeated here.
    scrape_missing_problems(contest_id, missing_problems)
  else
    logger.log('all problems already cached')
  end
  state.contest_id = contest_id
  if state.platform == 'cses' then
    -- For cses the problem id is passed in the contest_id position and no
    -- separate problem_id is given.
    setup_problem(problems[1].id, nil, language)
  else
    setup_problem(contest_id, problems[1].id, language)
  end
  return true
 end
 ---@param delta number 1 for next, -1 for prev
 ---@param language? string
 local function navigate_problem(delta, language)
  if not state.platform or not state.contest_id then
    logger.log('no contest set. run :CP <platform> <contest> first', vim.log.levels.ERROR)
@ -701,20 +793,12 @@ local function parse_command(args)
        language = language,
      }
    elseif #filtered_args == 2 then
-      if first == 'cses' then
+      return {
-        logger.log(
+        type = 'contest_setup',
-          'CSES requires both category and problem ID. Usage: :CP cses <category> <problem_id>',
+        platform = first,
-          vim.log.levels.ERROR
+        contest = filtered_args[2],
-        )
+        language = language,
-        return { type = 'error' }
+      }
      else
        return {
          type = 'contest_setup',
          platform = first,
          contest = filtered_args[2],
          language = language,
        }
      end
    elseif #filtered_args == 3 then
      return {
        type = 'full_setup',
@ -779,24 +863,7 @@ function M.handle_command(opts)
  if cmd.type == 'contest_setup' then
    if set_platform(cmd.platform) then
-      state.contest_id = cmd.contest
+      setup_contest(cmd.contest, cmd.language)
      if vim.tbl_contains(config.scrapers, cmd.platform) then
        local metadata_result = scrape.scrape_contest_metadata(cmd.platform, cmd.contest)
        if not metadata_result.success then
          logger.log(
            'failed to load contest metadata: ' .. (metadata_result.error or 'unknown error'),
            vim.log.levels.WARN
          )
        else
          logger.log(
            ('loaded %d problems for %s %s'):format(
              #metadata_result.problems,
              cmd.platform,
              cmd.contest
            )
          )
        end
      end
    end
    return
  end
--- a/lua/cp/scrape.lua
+++ b/lua/cp/scrape.lua
@ -14,6 +14,7 @@
 local M = {}
 local cache = require('cp.cache')
 local logger = require('cp.log')
 local problem = require('cp.problem')
 local function get_plugin_path()
  local plugin_path = debug.getinfo(1, 'S').source:sub(2)
@ -294,4 +295,127 @@ function M.scrape_problem(ctx)
  }
 end
 --- Scrape the test cases of several problems concurrently.
 ---
 --- One scraper process is started per problem and all of them are launched
 --- before any is awaited. For every problem whose scraper reports success with
 --- at least one test, the tests are written to numbered `.cpin` / `.cpout`
 --- files and stored in the cache.
 ---@param platform string
 ---@param contest_id string
 ---@param problems table[] problem entries; each is read through its `id` field
 ---@param config table
 ---@return table<string, table> results keyed by problem id; empty when the connectivity or python environment check fails
 function M.scrape_problems_parallel(platform, contest_id, problems, config)
  vim.validate({
    platform = { platform, 'string' },
    contest_id = { contest_id, 'string' },
    problems = { problems, 'table' },
    config = { config, 'table' },
  })
  if not check_internet_connectivity() then
    return {}
  end
  if not setup_python_env() then
    return {}
  end
  local plugin_path = get_plugin_path()
  local jobs = {}
  -- The loop variable is named `entry` so it does not shadow the `cp.problem`
  -- module, which is used further down as `problem.create_context`.
  for _, entry in ipairs(problems) do
    local args = {
      'uv',
      'run',
      '--directory',
      plugin_path,
      '-m',
      'scrapers.' .. platform,
      'tests',
    }
    -- The cses scraper is invoked with the problem id only; the other
    -- platforms get the contest id followed by the problem id.
    if platform ~= 'cses' then
      table.insert(args, contest_id)
    end
    table.insert(args, entry.id)
    local job = vim.system(args, {
      cwd = plugin_path,
      text = true,
      timeout = 30000,
    })
    jobs[entry.id] = {
      job = job,
      problem = entry,
    }
  end
  local results = {}
  for problem_id, job_data in pairs(jobs) do
    local result = job_data.job:wait()
    local scrape_result = {
      success = false,
      problem_id = problem_id,
      error = 'Unknown error',
    }
    if result.code == 0 then
      local ok, data = pcall(vim.json.decode, result.stdout)
      -- Valid JSON that is not an object (null, a number, a string) cannot be
      -- indexed, so it is treated like unparsable output.
      local decoded = ok and type(data) == 'table'
      if decoded and data.success then
        scrape_result = data
        if type(data.tests) == 'table' and #data.tests > 0 then
          local ctx = problem.create_context(platform, contest_id, problem_id, config)
          local base_name = vim.fn.fnamemodify(ctx.input_file, ':r')
          local cached_test_cases = {}
          for i, test_case in ipairs(data.tests) do
            local input_file = base_name .. '.' .. i .. '.cpin'
            local expected_file = base_name .. '.' .. i .. '.cpout'
            -- Carriage returns are stripped for the files on disk only; the
            -- cache keeps the scraper's text unchanged.
            local input_content = test_case.input:gsub('\r', '')
            local expected_content = test_case.expected:gsub('\r', '')
            vim.fn.writefile(vim.split(input_content, '\n', { plain = true }), input_file)
            vim.fn.writefile(vim.split(expected_content, '\n', { plain = true }), expected_file)
            table.insert(cached_test_cases, {
              index = i,
              input = test_case.input,
              expected = test_case.expected,
            })
          end
          cache.set_test_cases(
            platform,
            contest_id,
            problem_id,
            cached_test_cases,
            data.timeout_ms,
            data.memory_mb
          )
        end
      else
        scrape_result.error = (decoded and data.error) or 'Failed to parse scraper output'
      end
    else
      scrape_result.error = 'Scraper execution failed: ' .. (result.stderr or 'Unknown error')
    end
    results[problem_id] = scrape_result
  end
  return results
 end
 return M
--- a/scrapers/atcoder.py
+++ b/scrapers/atcoder.py
@ -168,70 +168,210 @@ def scrape(url: str) -> list[TestCase]:
 def scrape_contests() -> list[ContestSummary]:
-    contests = []
+    import concurrent.futures
-    max_pages = 15
+    import random
-    for page in range(1, max_pages + 1):
+    def get_max_pages() -> int:
        try:
            headers = {
                "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
            }
-            url = f"https://atcoder.jp/contests/archive?page={page}"
+            response = requests.get(
-            response = requests.get(url, headers=headers, timeout=10)
+                "https://atcoder.jp/contests/archive", headers=headers, timeout=10
            )
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
-            table = soup.find("table", class_="table")
+            pagination = soup.find("ul", class_="pagination")
-            if not table:
+            if not pagination or not isinstance(pagination, Tag):
-                break
+                return 15
-            tbody = table.find("tbody")
+            lis = pagination.find_all("li")
-            if not tbody or not isinstance(tbody, Tag):
+            if lis and isinstance(lis[-1], Tag):
-                break
+                last_li_text = lis[-1].get_text().strip()
                try:
                    return int(last_li_text)
                except ValueError:
                    return 15
            return 15
        except Exception:
            return 15
-            rows = tbody.find_all("tr")
+    def scrape_page_with_retry(page: int, max_retries: int = 3) -> list[ContestSummary]:
-            if not rows:
+        for attempt in range(max_retries):
-                break
+            try:
                headers = {
                    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
                }
                url = f"https://atcoder.jp/contests/archive?page={page}"
                response = requests.get(url, headers=headers, timeout=10)
-            for row in rows:
+                if response.status_code == 429:
-                cells = row.find_all("td")
+                    backoff_time = (2**attempt) + random.uniform(0, 1)
-                if len(cells) < 2:
+                    print(
                        f"Rate limited on page {page}, retrying in {backoff_time:.1f}s",
                        file=sys.stderr,
                    )
                    time.sleep(backoff_time)
                    continue
-                contest_cell = cells[1]
+                response.raise_for_status()
-                link = contest_cell.find("a")
+
-                if not link or not link.get("href"):
+                soup = BeautifulSoup(response.text, "html.parser")
                table = soup.find("table", class_="table")
                if not table:
                    return []
                tbody = table.find("tbody")
                if not tbody or not isinstance(tbody, Tag):
                    return []
                rows = tbody.find_all("tr")
                if not rows:
                    return []
                contests = []
                for row in rows:
                    cells = row.find_all("td")
                    if len(cells) < 2:
                        continue
                    contest_cell = cells[1]
                    link = contest_cell.find("a")
                    if not link or not link.get("href"):
                        continue
                    href = link.get("href")
                    contest_id = href.split("/")[-1]
                    name = link.get_text().strip()
                    try:
                        name = name.encode().decode("unicode_escape")
                    except:
                        pass
                    name = (
                        name.replace("\uff08", "(")
                        .replace("\uff09", ")")
                        .replace("\u3000", " ")
                    )
                    name = re.sub(
                        r"[\uff01-\uff5e]", lambda m: chr(ord(m.group()) - 0xFEE0), name
                    )
                    def generate_display_name_from_id(contest_id: str) -> str:
                        parts = contest_id.replace("-", " ").replace("_", " ")
                        parts = re.sub(
                            r"\b(jsc|JSC)\b",
                            "Japanese Student Championship",
                            parts,
                            flags=re.IGNORECASE,
                        )
                        parts = re.sub(
                            r"\b(wtf|WTF)\b",
                            "World Tour Finals",
                            parts,
                            flags=re.IGNORECASE,
                        )
                        parts = re.sub(
                            r"\b(ahc)(\d+)\b",
                            r"Heuristic Contest \2 (AHC)",
                            parts,
                            flags=re.IGNORECASE,
                        )
                        parts = re.sub(
                            r"\b(arc)(\d+)\b",
                            r"Regular Contest \2 (ARC)",
                            parts,
                            flags=re.IGNORECASE,
                        )
                        parts = re.sub(
                            r"\b(abc)(\d+)\b",
                            r"Beginner Contest \2 (ABC)",
                            parts,
                            flags=re.IGNORECASE,
                        )
                        parts = re.sub(
                            r"\b(agc)(\d+)\b",
                            r"Grand Contest \2 (AGC)",
                            parts,
                            flags=re.IGNORECASE,
                        )
                        return parts.title()
                    english_chars = sum(1 for c in name if c.isascii() and c.isalpha())
                    total_chars = len(re.sub(r"\s+", "", name))
                    if total_chars > 0 and english_chars / total_chars < 0.3:
                        display_name = generate_display_name_from_id(contest_id)
                    else:
                        display_name = name
                        if "AtCoder Beginner Contest" in name:
                            match = re.search(r"AtCoder Beginner Contest (\d+)", name)
                            if match:
                                display_name = (
                                    f"Beginner Contest {match.group(1)} (ABC)"
                                )
                        elif "AtCoder Regular Contest" in name:
                            match = re.search(r"AtCoder Regular Contest (\d+)", name)
                            if match:
                                display_name = f"Regular Contest {match.group(1)} (ARC)"
                        elif "AtCoder Grand Contest" in name:
                            match = re.search(r"AtCoder Grand Contest (\d+)", name)
                            if match:
                                display_name = f"Grand Contest {match.group(1)} (AGC)"
                        elif "AtCoder Heuristic Contest" in name:
                            match = re.search(r"AtCoder Heuristic Contest (\d+)", name)
                            if match:
                                display_name = (
                                    f"Heuristic Contest {match.group(1)} (AHC)"
                                )
                    contests.append(
                        ContestSummary(
                            id=contest_id, name=name, display_name=display_name
                        )
                    )
                return contests
            except requests.exceptions.RequestException as e:
                if response.status_code == 429:
                    continue
-
+                print(
-                href = link.get("href")
+                    f"Failed to scrape page {page} (attempt {attempt + 1}): {e}",
-                contest_id = href.split("/")[-1]
+                    file=sys.stderr,
                name = link.get_text().strip()
                display_name = name
                if "AtCoder Beginner Contest" in name:
                    match = re.search(r"AtCoder Beginner Contest (\d+)", name)
                    if match:
                        display_name = f"Beginner Contest {match.group(1)} (ABC)"
                elif "AtCoder Regular Contest" in name:
                    match = re.search(r"AtCoder Regular Contest (\d+)", name)
                    if match:
                        display_name = f"Regular Contest {match.group(1)} (ARC)"
                elif "AtCoder Grand Contest" in name:
                    match = re.search(r"AtCoder Grand Contest (\d+)", name)
                    if match:
                        display_name = f"Grand Contest {match.group(1)} (AGC)"
                contests.append(
                    ContestSummary(id=contest_id, name=name, display_name=display_name)
                )
                if attempt == max_retries - 1:
                    return []
            except Exception as e:
                print(f"Unexpected error on page {page}: {e}", file=sys.stderr)
                return []
-            time.sleep(0.5)
+        return []
-        except Exception as e:
+    max_pages = get_max_pages()
-            print(f"Failed to scrape page {page}: {e}", file=sys.stderr)
+    page_results = {}
            continue
-    return contests
+    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        future_to_page = {
            executor.submit(scrape_page_with_retry, page): page
            for page in range(1, max_pages + 1)
        }
        for future in concurrent.futures.as_completed(future_to_page):
            page = future_to_page[future]
            page_contests = future.result()
            page_results[page] = page_contests
    # Sort by page number to maintain order
    all_contests = []
    for page in sorted(page_results.keys()):
        all_contests.extend(page_results[page])
    return all_contests
 def main() -> None:
--- a/scrapers/codeforces.py
+++ b/scrapers/codeforces.py
@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 import json
 import re
 import sys
 from dataclasses import asdict
@ -148,8 +149,6 @@ def parse_problem_url(contest_id: str, problem_letter: str) -> str:
 def extract_problem_limits(soup: BeautifulSoup) -> tuple[int, float]:
    import re
    timeout_ms = None
    memory_mb = None
@ -240,22 +239,43 @@ def scrape_contests() -> list[ContestSummary]:
            contest_id = str(contest["id"])
            name = contest["name"]
            # Clean up contest names for display
            display_name = name
            if "Educational Codeforces Round" in name:
                import re
                match = re.search(r"Educational Codeforces Round (\d+)", name)
                if match:
                    display_name = f"Educational Round {match.group(1)}"
            elif "Codeforces Round" in name and "Div" in name:
                match = re.search(r"Codeforces Round (\d+) \(Div\. (\d+)\)", name)
                if match:
                    display_name = f"Round {match.group(1)} (Div. {match.group(2)})"
            elif "Codeforces Global Round" in name:
                match = re.search(r"Codeforces Global Round (\d+)", name)
                if match:
                    display_name = f"Global Round {match.group(1)}"
            elif "Codeforces Round" in name:
                # Handle various Div patterns
                div_match = re.search(r"Codeforces Round (\d+) \(Div\. (\d+)\)", name)
                if div_match:
                    display_name = (
                        f"Round {div_match.group(1)} (Div. {div_match.group(2)})"
                    )
                else:
                    # Handle combined divs like "Div. 1 + Div. 2"
                    combined_match = re.search(
                        r"Codeforces Round (\d+) \(Div\. 1 \+ Div\. 2\)", name
                    )
                    if combined_match:
                        display_name = (
                            f"Round {combined_match.group(1)} (Div. 1 + Div. 2)"
                        )
                    else:
                        # Handle single div like "Div. 1"
                        single_div_match = re.search(
                            r"Codeforces Round (\d+) \(Div\. 1\)", name
                        )
                        if single_div_match:
                            display_name = f"Round {single_div_match.group(1)} (Div. 1)"
                        else:
                            # Fallback: extract just the round number
                            round_match = re.search(r"Codeforces Round (\d+)", name)
                            if round_match:
                                display_name = f"Round {round_match.group(1)}"
            contests.append(
                ContestSummary(id=contest_id, name=name, display_name=display_name)
--- a/scrapers/cses.py
+++ b/scrapers/cses.py
@ -1,8 +1,10 @@
 #!/usr/bin/env python3
 import json
 import random
 import re
 import sys
 import time
 from dataclasses import asdict
 import requests
@ -39,6 +41,38 @@ def denormalize_category_name(category_id: str) -> str:
    return category_map.get(category_id, category_id.replace("_", " ").title())
 def request_with_retry(
    url: str, headers: dict, max_retries: int = 3
 ) -> requests.Response:
    """GET ``url`` with a small politeness delay and retry on failure.

    Every attempt is preceded by a randomized 0.5-0.8s sleep. A 429 response
    or a ``requests`` error triggers an exponential backoff before the next
    attempt.

    Args:
        url: Address to fetch.
        headers: HTTP headers sent with every attempt.
        max_retries: Maximum number of attempts.

    Returns:
        The first response whose status passes ``raise_for_status()``.

    Raises:
        requests.exceptions.RequestException: The error of the final attempt,
            including an ``HTTPError`` when that attempt was rate limited.
        Exception: When ``max_retries`` allows no attempt at all.
    """
    for attempt in range(max_retries):
        is_last_attempt = attempt == max_retries - 1
        try:
            delay = 0.5 + random.uniform(0, 0.3)
            time.sleep(delay)
            response = requests.get(url, headers=headers, timeout=10)
            # Only back off when another attempt will follow. On the final
            # attempt a 429 falls through to raise_for_status() so the caller
            # gets the HTTP error immediately instead of after a useless sleep.
            if response.status_code == 429 and not is_last_attempt:
                backoff = (2**attempt) + random.uniform(0, 1)
                print(f"Rate limited, retrying in {backoff:.1f}s", file=sys.stderr)
                time.sleep(backoff)
                continue
            response.raise_for_status()
            return response
        except requests.exceptions.RequestException as e:
            if is_last_attempt:
                raise
            backoff = 2**attempt
            print(
                f"Request failed (attempt {attempt + 1}), retrying in {backoff}s: {e}",
                file=sys.stderr,
            )
            time.sleep(backoff)
    # Reached only when the loop body never ran (max_retries <= 0).
    raise Exception("All retry attempts failed")
 def scrape_category_problems(category_id: str) -> list[ProblemSummary]:
    category_name = denormalize_category_name(category_id)
@ -48,8 +82,7 @@ def scrape_category_problems(category_id: str) -> list[ProblemSummary]:
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        }
-        response = requests.get(problemset_url, headers=headers, timeout=10)
+        response = request_with_retry(problemset_url, headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
@ -143,10 +176,7 @@ def scrape_categories() -> list[ContestSummary]:
        headers = {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        }
-        response = requests.get(
+        response = request_with_retry("https://cses.fi/problemset/", headers)
            "https://cses.fi/problemset/", headers=headers, timeout=10
        )
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        categories = []
@ -293,8 +323,7 @@ def scrape(url: str) -> list[TestCase]:
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        }
-        response = requests.get(url, headers=headers, timeout=10)
+        response = request_with_retry(url, headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")