diff --git a/lua/cp/ansi.lua b/lua/cp/ansi.lua index c8e2a01..642b624 100644 --- a/lua/cp/ansi.lua +++ b/lua/cp/ansi.lua @@ -194,17 +194,9 @@ function M.setup_highlight_groups() BrightWhite = vim.g.terminal_color_15, } - local missing_color = false - for _, terminal_color in pairs(color_map) do - if terminal_color == nil then - missing_color = true - break - end - end - - if missing_color or #color_map == 0 then + if vim.tbl_count(color_map) < 16 then logger.log( - 'ansi terminal colors (vim.g.terminal_color_*) not configured. . ANSI colors will not display properly. ', + 'ansi terminal colors (vim.g.terminal_color_*) not configured. ANSI colors will not display properly. ', vim.log.levels.WARN ) end diff --git a/lua/cp/diff.lua b/lua/cp/diff.lua index e576b8c..9295c63 100644 --- a/lua/cp/diff.lua +++ b/lua/cp/diff.lua @@ -9,7 +9,6 @@ local M = {} ----Vim's built-in diff backend using diffthis ---@type DiffBackend local vim_backend = { name = 'vim', @@ -18,17 +17,15 @@ local vim_backend = { return { content = actual_lines, - highlights = nil, -- diffthis handles highlighting + highlights = nil, } end, } ----Git word-diff backend for character-level precision ---@type DiffBackend local git_backend = { name = 'git', render = function(expected, actual) - -- Create temporary files for git diff local tmp_expected = vim.fn.tempname() local tmp_actual = vim.fn.tempname() @@ -48,7 +45,6 @@ local git_backend = { local result = vim.system(cmd, { text = true }):wait() - -- Clean up temp files vim.fn.delete(tmp_expected) vim.fn.delete(tmp_actual) @@ -58,25 +54,21 @@ local git_backend = { highlights = {}, } else - -- Parse git diff output to extract content and highlights local diff_content = result.stdout or '' local lines = {} local highlights = {} local line_num = 0 - -- Extract content lines that start with space, +, or - for line in diff_content:gmatch('[^\n]*') do if line:match('^[%s%+%-]') or (not line:match('^[@%-+]') and not line:match('^index') and not line:match('^diff')) then - -- This is content, not metadata local clean_line = line if line:match('^[%+%-]') then - clean_line = line:sub(2) -- Remove +/- prefix + clean_line = line:sub(2) end - -- Parse diff markers in the line local col_pos = 0 local processed_line = '' local i = 1 @@ -97,28 +89,26 @@ local git_backend = { end if next_marker_start then - -- Add text before marker if next_marker_start > i then local before_text = clean_line:sub(i, next_marker_start - 1) processed_line = processed_line .. before_text col_pos = col_pos + #before_text end - -- Extract and add marker content with highlighting local marker_end = (marker_type == 'removed') and removed_end or added_end local marker_text = clean_line:sub(next_marker_start, marker_end) local content_text if marker_type == 'removed' then - content_text = marker_text:sub(3, -3) -- Remove [- and -] + content_text = marker_text:sub(3, -3) table.insert(highlights, { line = line_num, col_start = col_pos, col_end = col_pos + #content_text, highlight_group = 'DiffDelete', }) - else -- added - content_text = marker_text:sub(3, -3) -- Remove {+ and +} + else + content_text = marker_text:sub(3, -3) table.insert(highlights, { line = line_num, col_start = col_pos, @@ -131,7 +121,6 @@ local git_backend = { col_pos = col_pos + #content_text i = marker_end + 1 else - -- No more markers, add rest of line local rest = clean_line:sub(i) processed_line = processed_line .. rest break @@ -152,34 +141,29 @@ local git_backend = { end, } ----Available diff backends ---@type table local backends = { vim = vim_backend, git = git_backend, } ----Get available backend names ---@return string[] function M.get_available_backends() return vim.tbl_keys(backends) end ----Get a diff backend by name ---@param name string ---@return DiffBackend? function M.get_backend(name) return backends[name] end ----Check if git backend is available ---@return boolean function M.is_git_available() local result = vim.system({ 'git', '--version' }, { text = true }):wait() return result.code == 0 end ----Get the best available backend based on config and system availability ---@param preferred_backend? string ---@return DiffBackend function M.get_best_backend(preferred_backend) @@ -193,7 +177,6 @@ function M.get_best_backend(preferred_backend) return backends.vim end ----Render diff using specified backend ---@param expected string ---@param actual string ---@param backend_name? string diff --git a/lua/cp/execute.lua b/lua/cp/execute.lua index a56bc62..1c433d6 100644 --- a/lua/cp/execute.lua +++ b/lua/cp/execute.lua @@ -63,10 +63,6 @@ local function build_command(cmd_template, executable, substitutions) return cmd end -local function ensure_directories() - vim.system({ 'mkdir', '-p', 'build', 'io' }):wait() -end - ---@param language_config table ---@param substitutions table ---@return {code: integer, stdout: string, stderr: string} @@ -252,7 +248,7 @@ function M.run_problem(ctx, contest_config, is_debug) is_debug = { is_debug, 'boolean' }, }) - ensure_directories() + vim.system({ 'mkdir', '-p', 'build', 'io' }):wait() local language = get_language_from_file(ctx.source_file, contest_config) local language_config = contest_config[language] diff --git a/lua/cp/init.lua b/lua/cp/init.lua index 7560f01..36c2faf 100644 --- a/lua/cp/init.lua +++ b/lua/cp/init.lua @@ -45,8 +45,7 @@ local function set_platform(platform) end state.platform = platform - vim.fn.mkdir('build', 'p') - vim.fn.mkdir('io', 'p') + vim.system({ 'mkdir', '-p', 'build', 'io' }):wait() return true end @@ -59,27 +58,31 @@ local function setup_problem(contest_id, problem_id, language) return end - local problem_name = state.platform == 'cses' and contest_id or (contest_id .. (problem_id or '')) + local problem_name = contest_id .. (problem_id or '') logger.log(('setting up problem: %s'):format(problem_name)) local ctx = problem.create_context(state.platform, contest_id, problem_id, config, language) if vim.tbl_contains(config.scrapers, state.platform) then - local metadata_result = scrape.scrape_contest_metadata(state.platform, contest_id) - if not metadata_result.success then - logger.log( - 'failed to load contest metadata: ' .. (metadata_result.error or 'unknown error'), - vim.log.levels.WARN - ) + cache.load() + local existing_contest_data = cache.get_contest_data(state.platform, contest_id) + + if not existing_contest_data then + local metadata_result = scrape.scrape_contest_metadata(state.platform, contest_id) + if not metadata_result.success then + logger.log( + 'failed to load contest metadata: ' .. (metadata_result.error or 'unknown error'), + vim.log.levels.WARN + ) + end end end local cached_test_cases = cache.get_test_cases(state.platform, contest_id, problem_id) if cached_test_cases then state.test_cases = cached_test_cases - end - - if vim.tbl_contains(config.scrapers, state.platform) then + logger.log(('using cached test cases (%d)'):format(#cached_test_cases)) + elseif vim.tbl_contains(config.scrapers, state.platform) then local scrape_result = scrape.scrape_problem(ctx) if not scrape_result.success then @@ -103,6 +106,7 @@ local function setup_problem(contest_id, problem_id, language) end vim.cmd('silent only') + state.run_panel_active = false state.contest_id = contest_id state.problem_id = problem_id @@ -145,6 +149,38 @@ local function setup_problem(contest_id, problem_id, language) logger.log(('switched to problem %s'):format(ctx.problem_name)) end +local function scrape_missing_problems(contest_id, missing_problems) + vim.fn.mkdir('io', 'p') + + logger.log(('scraping %d uncached problems...'):format(#missing_problems)) + + local results = + scrape.scrape_problems_parallel(state.platform, contest_id, missing_problems, config) + + local success_count = 0 + local failed_problems = {} + for problem_id, result in pairs(results) do + if result.success then + success_count = success_count + 1 + else + table.insert(failed_problems, problem_id) + end + end + + if #failed_problems > 0 then + logger.log( + ('scraping complete: %d/%d successful, failed: %s'):format( + success_count, + #missing_problems, + table.concat(failed_problems, ', ') + ), + vim.log.levels.WARN + ) + else + logger.log(('scraping complete: %d/%d successful'):format(success_count, #missing_problems)) + end +end + local function get_current_problem() local filename = vim.fn.expand('%:t:r') if filename == '' then @@ -476,6 +512,7 @@ local function toggle_run_panel(is_debug) update_diff_panes() end + ---@param delta number 1 for next, -1 for prev local function navigate_test_case(delta) local test_state = run.get_run_panel_state() if #test_state.test_cases == 0 then @@ -555,6 +592,60 @@ local function toggle_run_panel(is_debug) logger.log(string.format('test panel opened (%d test cases)', #test_state.test_cases)) end +---@param contest_id string +---@param language? string +local function setup_contest(contest_id, language) + if not state.platform then + logger.log('no platform set', vim.log.levels.ERROR) + return false + end + + if not vim.tbl_contains(config.scrapers, state.platform) then + logger.log('scraping disabled for ' .. state.platform, vim.log.levels.WARN) + return false + end + + logger.log(('setting up contest %s %s'):format(state.platform, contest_id)) + + local metadata_result = scrape.scrape_contest_metadata(state.platform, contest_id) + if not metadata_result.success then + logger.log( + 'failed to load contest metadata: ' .. (metadata_result.error or 'unknown error'), + vim.log.levels.ERROR + ) + return false + end + + local problems = metadata_result.problems + if not problems or #problems == 0 then + logger.log('no problems found in contest', vim.log.levels.ERROR) + return false + end + + logger.log(('found %d problems, checking cache...'):format(#problems)) + + cache.load() + local missing_problems = {} + for _, prob in ipairs(problems) do + local cached_tests = cache.get_test_cases(state.platform, contest_id, prob.id) + if not cached_tests then + table.insert(missing_problems, prob) + end + end + + if #missing_problems > 0 then + logger.log(('scraping %d uncached problems...'):format(#missing_problems)) + scrape_missing_problems(contest_id, missing_problems) + else + logger.log('all problems already cached') + end + + state.contest_id = contest_id + setup_problem(contest_id, problems[1].id, language) + + return true +end + ---@param delta number 1 for next, -1 for prev ---@param language? string local function navigate_problem(delta, language) @@ -574,13 +665,7 @@ local function navigate_problem(delta, language) end local problems = contest_data.problems - local current_problem_id - - if state.platform == 'cses' then - current_problem_id = state.contest_id - else - current_problem_id = state.problem_id - end + local current_problem_id = state.problem_id if not current_problem_id then logger.log('no current problem set', vim.log.levels.ERROR) @@ -610,11 +695,7 @@ local function navigate_problem(delta, language) local new_problem = problems[new_index] - if state.platform == 'cses' then - setup_problem(new_problem.id, nil, language) - else - setup_problem(state.contest_id, new_problem.id, language) - end + setup_problem(state.contest_id, new_problem.id, language) end local function restore_from_current_file() @@ -638,7 +719,7 @@ local function restore_from_current_file() ('Restoring from cached state: %s %s %s'):format( file_state.platform, file_state.contest_id, - file_state.problem_id or 'CSES' + file_state.problem_id or 'N/A' ) ) @@ -649,11 +730,7 @@ local function restore_from_current_file() state.contest_id = file_state.contest_id state.problem_id = file_state.problem_id - if file_state.platform == 'cses' then - setup_problem(file_state.contest_id, nil, file_state.language) - else - setup_problem(file_state.contest_id, file_state.problem_id, file_state.language) - end + setup_problem(file_state.contest_id, file_state.problem_id, file_state.language) return true end @@ -701,20 +778,12 @@ local function parse_command(args) language = language, } elseif #filtered_args == 2 then - if first == 'cses' then - logger.log( - 'CSES requires both category and problem ID. Usage: :CP cses ', - vim.log.levels.ERROR - ) - return { type = 'error' } - else - return { - type = 'contest_setup', - platform = first, - contest = filtered_args[2], - language = language, - } - end + return { + type = 'contest_setup', + platform = first, + contest = filtered_args[2], + language = language, + } elseif #filtered_args == 3 then return { type = 'full_setup', @@ -779,24 +848,7 @@ function M.handle_command(opts) if cmd.type == 'contest_setup' then if set_platform(cmd.platform) then - state.contest_id = cmd.contest - if vim.tbl_contains(config.scrapers, cmd.platform) then - local metadata_result = scrape.scrape_contest_metadata(cmd.platform, cmd.contest) - if not metadata_result.success then - logger.log( - 'failed to load contest metadata: ' .. (metadata_result.error or 'unknown error'), - vim.log.levels.WARN - ) - else - logger.log( - ('loaded %d problems for %s %s'):format( - #metadata_result.problems, - cmd.platform, - cmd.contest - ) - ) - end - end + setup_contest(cmd.contest, cmd.language) end return end @@ -853,11 +905,7 @@ function M.handle_command(opts) end if cmd.type == 'problem_switch' then - if state.platform == 'cses' then - setup_problem(cmd.problem, nil, cmd.language) - else - setup_problem(state.contest_id, cmd.problem, cmd.language) - end + setup_problem(state.contest_id, cmd.problem, cmd.language) return end end diff --git a/lua/cp/scrape.lua b/lua/cp/scrape.lua index d01bbb6..f8a5e31 100644 --- a/lua/cp/scrape.lua +++ b/lua/cp/scrape.lua @@ -14,16 +14,13 @@ local M = {} local cache = require('cp.cache') local logger = require('cp.log') +local problem = require('cp.problem') local function get_plugin_path() local plugin_path = debug.getinfo(1, 'S').source:sub(2) return vim.fn.fnamemodify(plugin_path, ':h:h:h') end -local function ensure_io_directory() - vim.fn.mkdir('io', 'p') -end - local function check_internet_connectivity() local result = vim.system({ 'ping', '-c', '1', '-W', '3', '8.8.8.8' }, { text = true }):wait() return result.code == 0 @@ -143,7 +140,7 @@ function M.scrape_problem(ctx) ctx = { ctx, 'table' }, }) - ensure_io_directory() + vim.fn.mkdir('io', 'p') if vim.fn.filereadable(ctx.input_file) == 1 and vim.fn.filereadable(ctx.expected_file) == 1 then local base_name = vim.fn.fnamemodify(ctx.input_file, ':r') @@ -294,4 +291,113 @@ function M.scrape_problem(ctx) } end +---@param platform string +---@param contest_id string +---@param problems table[] +---@param config table +---@return table[] +function M.scrape_problems_parallel(platform, contest_id, problems, config) + vim.validate({ + platform = { platform, 'string' }, + contest_id = { contest_id, 'string' }, + problems = { problems, 'table' }, + config = { config, 'table' }, + }) + + if not check_internet_connectivity() then + return {} + end + + if not setup_python_env() then + return {} + end + + local plugin_path = get_plugin_path() + local jobs = {} + + for _, prob in ipairs(problems) do + local args = { + 'uv', + 'run', + '--directory', + plugin_path, + '-m', + 'scrapers.' .. platform, + 'tests', + contest_id, + prob.id, + } + + local job = vim.system(args, { + cwd = plugin_path, + text = true, + timeout = 30000, + }) + + jobs[prob.id] = { + job = job, + problem = prob, + } + end + + local results = {} + for problem_id, job_data in pairs(jobs) do + local result = job_data.job:wait() + local scrape_result = { + success = false, + problem_id = problem_id, + error = 'Unknown error', + } + + if result.code == 0 then + local ok, data = pcall(vim.json.decode, result.stdout) + if ok and data.success then + scrape_result = data + + if data.tests and #data.tests > 0 then + local ctx = problem.create_context(platform, contest_id, problem_id, config) + local base_name = vim.fn.fnamemodify(ctx.input_file, ':r') + + for i, test_case in ipairs(data.tests) do + local input_file = base_name .. '.' .. i .. '.cpin' + local expected_file = base_name .. '.' .. i .. '.cpout' + + local input_content = test_case.input:gsub('\r', '') + local expected_content = test_case.expected:gsub('\r', '') + + vim.fn.writefile(vim.split(input_content, '\n', true), input_file) + vim.fn.writefile(vim.split(expected_content, '\n', true), expected_file) + end + + local cached_test_cases = {} + for i, test_case in ipairs(data.tests) do + table.insert(cached_test_cases, { + index = i, + input = test_case.input, + expected = test_case.expected, + }) + end + + cache.set_test_cases( + platform, + contest_id, + problem_id, + cached_test_cases, + data.timeout_ms, + data.memory_mb + ) + end + else + scrape_result.error = ok and data.error or 'Failed to parse scraper output' + end + else + scrape_result.error = 'Scraper execution failed: ' .. (result.stderr or 'Unknown error') + end + + results[problem_id] = scrape_result + end + + return results +end + return M diff --git a/scrapers/atcoder.py b/scrapers/atcoder.py index 02beda8..3dc1d16 100644 --- a/scrapers/atcoder.py +++ b/scrapers/atcoder.py @@ -168,70 +168,209 @@ def scrape(url: str) -> list[TestCase]: def scrape_contests() -> list[ContestSummary]: - contests = [] - max_pages = 15 + import concurrent.futures + import random - for page in range(1, max_pages + 1): + def get_max_pages() -> int: try: headers = { "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" } - url = f"https://atcoder.jp/contests/archive?page={page}" - response = requests.get(url, headers=headers, timeout=10) + response = requests.get( + "https://atcoder.jp/contests/archive", headers=headers, timeout=10 + ) response.raise_for_status() soup = BeautifulSoup(response.text, "html.parser") - table = soup.find("table", class_="table") - if not table: - break + pagination = soup.find("ul", class_="pagination") + if not pagination or not isinstance(pagination, Tag): + return 15 - tbody = table.find("tbody") - if not tbody or not isinstance(tbody, Tag): - break + lis = pagination.find_all("li") + if lis and isinstance(lis[-1], Tag): + last_li_text = lis[-1].get_text().strip() + try: + return int(last_li_text) + except ValueError: + return 15 + return 15 + except Exception: + return 15 - rows = tbody.find_all("tr") - if not rows: - break + def scrape_page_with_retry(page: int, max_retries: int = 3) -> list[ContestSummary]: + for attempt in range(max_retries): + try: + headers = { + "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" + } + url = f"https://atcoder.jp/contests/archive?page={page}" + response = requests.get(url, headers=headers, timeout=10) - for row in rows: - cells = row.find_all("td") - if len(cells) < 2: + if response.status_code == 429: + backoff_time = (2**attempt) + random.uniform(0, 1) + print( + f"Rate limited on page {page}, retrying in {backoff_time:.1f}s", + file=sys.stderr, + ) + time.sleep(backoff_time) continue - contest_cell = cells[1] - link = contest_cell.find("a") - if not link or not link.get("href"): + response.raise_for_status() + + soup = BeautifulSoup(response.text, "html.parser") + table = soup.find("table", class_="table") + if not table: + return [] + + tbody = table.find("tbody") + if not tbody or not isinstance(tbody, Tag): + return [] + + rows = tbody.find_all("tr") + if not rows: + return [] + + contests = [] + for row in rows: + cells = row.find_all("td") + if len(cells) < 2: + continue + + contest_cell = cells[1] + link = contest_cell.find("a") + if not link or not link.get("href"): + continue + + href = link.get("href") + contest_id = href.split("/")[-1] + name = link.get_text().strip() + + try: + name = name.encode().decode("unicode_escape") + except (UnicodeDecodeError, UnicodeEncodeError): + pass + + name = ( + name.replace("\uff08", "(") + .replace("\uff09", ")") + .replace("\u3000", " ") + ) + name = re.sub( + r"[\uff01-\uff5e]", lambda m: chr(ord(m.group()) - 0xFEE0), name + ) + + def generate_display_name_from_id(contest_id: str) -> str: + parts = contest_id.replace("-", " ").replace("_", " ") + + parts = re.sub( + r"\b(jsc|JSC)\b", + "Japanese Student Championship", + parts, + flags=re.IGNORECASE, + ) + parts = re.sub( + r"\b(wtf|WTF)\b", + "World Tour Finals", + parts, + flags=re.IGNORECASE, + ) + parts = re.sub( + r"\b(ahc)(\d+)\b", + r"Heuristic Contest \2 (AHC)", + parts, + flags=re.IGNORECASE, + ) + parts = re.sub( + r"\b(arc)(\d+)\b", + r"Regular Contest \2 (ARC)", + parts, + flags=re.IGNORECASE, + ) + parts = re.sub( + r"\b(abc)(\d+)\b", + r"Beginner Contest \2 (ABC)", + parts, + flags=re.IGNORECASE, + ) + parts = re.sub( + r"\b(agc)(\d+)\b", + r"Grand Contest \2 (AGC)", + parts, + flags=re.IGNORECASE, + ) + + return parts.title() + + english_chars = sum(1 for c in name if c.isascii() and c.isalpha()) + total_chars = len(re.sub(r"\s+", "", name)) + + if total_chars > 0 and english_chars / total_chars < 0.3: + display_name = generate_display_name_from_id(contest_id) + else: + display_name = name + if "AtCoder Beginner Contest" in name: + match = re.search(r"AtCoder Beginner Contest (\d+)", name) + if match: + display_name = ( + f"Beginner Contest {match.group(1)} (ABC)" + ) + elif "AtCoder Regular Contest" in name: + match = re.search(r"AtCoder Regular Contest (\d+)", name) + if match: + display_name = f"Regular Contest {match.group(1)} (ARC)" + elif "AtCoder Grand Contest" in name: + match = re.search(r"AtCoder Grand Contest (\d+)", name) + if match: + display_name = f"Grand Contest {match.group(1)} (AGC)" + elif "AtCoder Heuristic Contest" in name: + match = re.search(r"AtCoder Heuristic Contest (\d+)", name) + if match: + display_name = ( + f"Heuristic Contest {match.group(1)} (AHC)" + ) + + contests.append( + ContestSummary( + id=contest_id, name=name, display_name=display_name + ) + ) + + return contests + + except requests.exceptions.RequestException as e: + if response.status_code == 429: continue - - href = link.get("href") - contest_id = href.split("/")[-1] - name = link.get_text().strip() - - display_name = name - if "AtCoder Beginner Contest" in name: - match = re.search(r"AtCoder Beginner Contest (\d+)", name) - if match: - display_name = f"Beginner Contest {match.group(1)} (ABC)" - elif "AtCoder Regular Contest" in name: - match = re.search(r"AtCoder Regular Contest (\d+)", name) - if match: - display_name = f"Regular Contest {match.group(1)} (ARC)" - elif "AtCoder Grand Contest" in name: - match = re.search(r"AtCoder Grand Contest (\d+)", name) - if match: - display_name = f"Grand Contest {match.group(1)} (AGC)" - - contests.append( - ContestSummary(id=contest_id, name=name, display_name=display_name) + print( + f"Failed to scrape page {page} (attempt {attempt + 1}): {e}", + file=sys.stderr, ) + if attempt == max_retries - 1: + return [] + except Exception as e: + print(f"Unexpected error on page {page}: {e}", file=sys.stderr) + return [] - time.sleep(0.5) + return [] - except Exception as e: - print(f"Failed to scrape page {page}: {e}", file=sys.stderr) - continue + max_pages = get_max_pages() + page_results = {} - return contests + with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor: + future_to_page = { + executor.submit(scrape_page_with_retry, page): page + for page in range(1, max_pages + 1) + } + + for future in concurrent.futures.as_completed(future_to_page): + page = future_to_page[future] + page_contests = future.result() + page_results[page] = page_contests + + all_contests = [] + for page in sorted(page_results.keys()): + all_contests.extend(page_results[page]) + + return all_contests def main() -> None: diff --git a/scrapers/codeforces.py b/scrapers/codeforces.py index b4f6409..0aa7d07 100644 --- a/scrapers/codeforces.py +++ b/scrapers/codeforces.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 import json +import re import sys from dataclasses import asdict @@ -148,8 +149,6 @@ def parse_problem_url(contest_id: str, problem_letter: str) -> str: def extract_problem_limits(soup: BeautifulSoup) -> tuple[int, float]: - import re - timeout_ms = None memory_mb = None @@ -240,28 +239,45 @@ def scrape_contests() -> list[ContestSummary]: contest_id = str(contest["id"]) name = contest["name"] - # Clean up contest names for display display_name = name if "Educational Codeforces Round" in name: - import re - match = re.search(r"Educational Codeforces Round (\d+)", name) if match: display_name = f"Educational Round {match.group(1)}" - elif "Codeforces Round" in name and "Div" in name: - match = re.search(r"Codeforces Round (\d+) \(Div\. (\d+)\)", name) - if match: - display_name = f"Round {match.group(1)} (Div. {match.group(2)})" elif "Codeforces Global Round" in name: match = re.search(r"Codeforces Global Round (\d+)", name) if match: display_name = f"Global Round {match.group(1)}" + elif "Codeforces Round" in name: + div_match = re.search(r"Codeforces Round (\d+) \(Div\. (\d+)\)", name) + if div_match: + display_name = ( + f"Round {div_match.group(1)} (Div. {div_match.group(2)})" + ) + else: + combined_match = re.search( + r"Codeforces Round (\d+) \(Div\. 1 \+ Div\. 2\)", name + ) + if combined_match: + display_name = ( + f"Round {combined_match.group(1)} (Div. 1 + Div. 2)" + ) + else: + single_div_match = re.search( + r"Codeforces Round (\d+) \(Div\. 1\)", name + ) + if single_div_match: + display_name = f"Round {single_div_match.group(1)} (Div. 1)" + else: + round_match = re.search(r"Codeforces Round (\d+)", name) + if round_match: + display_name = f"Round {round_match.group(1)}" contests.append( ContestSummary(id=contest_id, name=name, display_name=display_name) ) - return contests[:100] # Limit to recent 100 contests + return contests[:100] except Exception as e: print(f"Failed to fetch contests: {e}", file=sys.stderr) diff --git a/scrapers/cses.py b/scrapers/cses.py index 8edaef8..b2f1733 100755 --- a/scrapers/cses.py +++ b/scrapers/cses.py @@ -1,8 +1,10 @@ #!/usr/bin/env python3 import json +import random import re import sys +import time from dataclasses import asdict import requests @@ -39,6 +41,38 @@ def denormalize_category_name(category_id: str) -> str: return category_map.get(category_id, category_id.replace("_", " ").title()) +def request_with_retry( + url: str, headers: dict, max_retries: int = 3 +) -> requests.Response: + for attempt in range(max_retries): + try: + delay = 0.5 + random.uniform(0, 0.3) + time.sleep(delay) + + response = requests.get(url, headers=headers, timeout=10) + + if response.status_code == 429: + backoff = (2**attempt) + random.uniform(0, 1) + print(f"Rate limited, retrying in {backoff:.1f}s", file=sys.stderr) + time.sleep(backoff) + continue + + response.raise_for_status() + return response + + except requests.exceptions.RequestException as e: + if attempt == max_retries - 1: + raise + backoff = 2**attempt + print( + f"Request failed (attempt {attempt + 1}), retrying in {backoff}s: {e}", + file=sys.stderr, + ) + time.sleep(backoff) + + raise Exception("All retry attempts failed") + + def scrape_category_problems(category_id: str) -> list[ProblemSummary]: category_name = denormalize_category_name(category_id) @@ -48,8 +82,7 @@ def scrape_category_problems(category_id: str) -> list[ProblemSummary]: "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" } - response = requests.get(problemset_url, headers=headers, timeout=10) - response.raise_for_status() + response = request_with_retry(problemset_url, headers) soup = BeautifulSoup(response.text, "html.parser") @@ -143,10 +176,7 @@ def scrape_categories() -> list[ContestSummary]: headers = { "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" } - response = requests.get( - "https://cses.fi/problemset/", headers=headers, timeout=10 - ) - response.raise_for_status() + response = request_with_retry("https://cses.fi/problemset/", headers) soup = BeautifulSoup(response.text, "html.parser") categories = [] @@ -293,8 +323,7 @@ def scrape(url: str) -> list[TestCase]: "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" } - response = requests.get(url, headers=headers, timeout=10) - response.raise_for_status() + response = request_with_retry(url, headers) soup = BeautifulSoup(response.text, "html.parser") @@ -314,7 +343,7 @@ def main() -> None: if len(sys.argv) < 2: result = MetadataResult( success=False, - error="Usage: cses.py metadata OR cses.py tests OR cses.py contests", + error="Usage: cses.py metadata OR cses.py tests OR cses.py contests", ) print(json.dumps(asdict(result))) sys.exit(1) @@ -345,10 +374,10 @@ def main() -> None: print(json.dumps(asdict(result))) elif mode == "tests": - if len(sys.argv) != 3: + if len(sys.argv) != 4: tests_result = TestsResult( success=False, - error="Usage: cses.py tests ", + error="Usage: cses.py tests ", problem_id="", url="", tests=[], @@ -358,7 +387,7 @@ def main() -> None: print(json.dumps(asdict(tests_result))) sys.exit(1) - problem_input: str = sys.argv[2] + problem_input: str = sys.argv[3] url: str | None = parse_problem_url(problem_input) if not url: @@ -446,7 +475,7 @@ def main() -> None: else: result = MetadataResult( success=False, - error=f"Unknown mode: {mode}. Use 'metadata', 'tests', or 'contests'", + error=f"Unknown mode: {mode}. Use 'metadata ', 'tests ', or 'contests'", ) print(json.dumps(asdict(result))) sys.exit(1) diff --git a/tests/scrapers/test_atcoder.py b/tests/scrapers/test_atcoder.py index 0474c6a..5ff91d9 100644 --- a/tests/scrapers/test_atcoder.py +++ b/tests/scrapers/test_atcoder.py @@ -54,18 +54,22 @@ def test_scrape_network_error(mocker): def test_scrape_contests_success(mocker): def mock_get_side_effect(url, **kwargs): - if "page=1" in url: + if url == "https://atcoder.jp/contests/archive": mock_response = Mock() + mock_response.raise_for_status.return_value = None mock_response.text = """ - - - - - - - - - + +
    +
  • 1
  • +
+ + """ + return mock_response + elif "page=1" in url: + mock_response = Mock() + mock_response.raise_for_status.return_value = None + mock_response.text = """ +
Start TimeContest NameDurationRated Range
@@ -84,9 +88,9 @@ def test_scrape_contests_success(mocker): """ return mock_response else: - # Return empty page for all other pages mock_response = Mock() - mock_response.text = "No table found" + mock_response.raise_for_status.return_value = None + mock_response.text = "" return mock_response mocker.patch("scrapers.atcoder.requests.get", side_effect=mock_get_side_effect)
2025-01-15 21:00:00+0900