From 3b3da9ab9b0334b933a38db8761db26e8d3ea8d5 Mon Sep 17 00:00:00 2001 From: Barrett Ruth Date: Fri, 6 Mar 2026 23:02:59 -0500 Subject: [PATCH] feat(codechef): implement login, submit, and full contest list Problem: CodeChef login/submit used wrong selectors for the Drupal-based site, the contest list only showed future contests, and problem/contest URLs were not set for `:CP open`. Solution: Fix login selectors (`input[name="name"]`, `input[name="pass"]`, `input.cc-login-btn`) with fast 3s failure on bad credentials. Rewrite submit to use MUI Select (`[aria-haspopup="listbox"]`), Ace editor clipboard injection, and `#submit_btn` dispatch with a practice-contest fallback. Paginate `/api/list/contests/past` to collect all 228 past Starters, then concurrently fetch each to expand parent contests into individual division entries (e.g. `START228 (Div. 4)`). Set `url`, `contest_url`, and `standings_url` correctly in `scrape_contest_metadata`. --- scrapers/codechef.py | 278 +++++++++++++++++++++++++++---------------- 1 file changed, 175 insertions(+), 103 deletions(-) diff --git a/scrapers/codechef.py b/scrapers/codechef.py index 614e8c2..6e3115c 100644 --- a/scrapers/codechef.py +++ b/scrapers/codechef.py @@ -23,6 +23,7 @@ from .models import ( BASE_URL = "https://www.codechef.com" API_CONTESTS_ALL = "/api/list/contests/all" +API_CONTESTS_PAST = "/api/list/contests/past" API_CONTEST = "/api/contests/{contest_id}" API_PROBLEM = "/api/contests/{contest_id}/problems/{problem_id}" HEADERS = { @@ -32,17 +33,19 @@ CONNECTIONS = 8 _COOKIE_PATH = Path.home() / ".cache" / "cp-nvim" / "codechef-cookies.json" -_CC_CHECK_LOGIN_JS = """() => { - const d = document.getElementById('__NEXT_DATA__'); - if (d) { - try { - const p = JSON.parse(d.textContent); - if (p?.props?.pageProps?.currentUser?.username) return true; - } catch(e) {} - } - return !!document.querySelector('a[href="/logout"]') || - !!document.querySelector('[class*="user-name"]'); -}""" +_CC_CHECK_LOGIN_JS = "() => !!document.querySelector('a[href*=\"/users/\"]')" + +_CC_LANG_IDS: dict[str, str] = { + "C++": "42", + "PYTH 3": "116", + "JAVA": "10", + "PYPY3": "109", + "GO": "114", + "rust": "93", + "KTLN": "47", + "NODEJS": "56", + "TS": "35", +} async def fetch_json(client: httpx.AsyncClient, path: str) -> dict[str, Any]: @@ -71,21 +74,19 @@ def _login_headless_codechef(credentials: dict[str, str]) -> LoginResult: def check_login(page): nonlocal logged_in - logged_in = page.evaluate(_CC_CHECK_LOGIN_JS) + logged_in = "dashboard" in page.url or page.evaluate(_CC_CHECK_LOGIN_JS) def login_action(page): nonlocal login_error try: - page.locator('input[type="email"], input[name="email"]').first.fill( - credentials.get("username", "") - ) - page.locator('input[type="password"], input[name="password"]').first.fill( - credentials.get("password", "") - ) - page.locator('button[type="submit"]').first.click() - page.wait_for_url( - lambda url: "/login" not in url, timeout=BROWSER_NAV_TIMEOUT - ) + page.locator('input[name="name"]').fill(credentials.get("username", "")) + page.locator('input[name="pass"]').fill(credentials.get("password", "")) + page.locator('input.cc-login-btn').click() + try: + page.wait_for_url(lambda url: "/login" not in url, timeout=3000) + except Exception: + login_error = "Login failed (bad credentials?)" + return except Exception as e: login_error = str(e) @@ -155,21 +156,19 @@ def _submit_headless_codechef( def check_login(page): nonlocal logged_in - logged_in = page.evaluate(_CC_CHECK_LOGIN_JS) + logged_in = "dashboard" in page.url or page.evaluate(_CC_CHECK_LOGIN_JS) def login_action(page): nonlocal login_error try: - page.locator('input[type="email"], input[name="email"]').first.fill( - credentials.get("username", "") - ) - page.locator('input[type="password"], input[name="password"]').first.fill( - credentials.get("password", "") - ) - page.locator('button[type="submit"]').first.click() - page.wait_for_url( - lambda url: "/login" not in url, timeout=BROWSER_NAV_TIMEOUT - ) + page.locator('input[name="name"]').fill(credentials.get("username", "")) + page.locator('input[name="pass"]').fill(credentials.get("password", "")) + page.locator('input.cc-login-btn').click() + try: + page.wait_for_url(lambda url: "/login" not in url, timeout=3000) + except Exception: + login_error = "Login failed (bad credentials?)" + return except Exception as e: login_error = str(e) @@ -179,54 +178,44 @@ def _submit_headless_codechef( needs_relogin = True return try: - selected = False - selects = page.locator("select") - for i in range(selects.count()): - try: - sel = selects.nth(i) - opts = sel.locator("option").all_inner_texts() - match = next( - (o for o in opts if language_id.lower() in o.lower()), None - ) - if match: - sel.select_option(label=match) - selected = True - break - except Exception: - pass + page.wait_for_timeout(2000) - if not selected: - lang_trigger = page.locator( - '[class*="language"] button, [data-testid*="language"] button' - ).first - lang_trigger.click() - page.wait_for_timeout(500) - page.locator( - f'[role="option"]:has-text("{language_id}"), ' - f'li:has-text("{language_id}")' - ).first.click() + page.locator('[aria-haspopup="listbox"]').click() + page.wait_for_selector('[role="option"]', timeout=5000) + page.locator(f'[role="option"][data-value="{language_id}"]').click() + page.wait_for_timeout(2000) + page.locator('.ace_editor').click() + page.keyboard.press('Control+a') + page.wait_for_timeout(200) page.evaluate( """(code) => { - if (typeof monaco !== 'undefined') { - const models = monaco.editor.getModels(); - if (models.length > 0) { models[0].setValue(code); return; } - } - const cm = document.querySelector('.CodeMirror'); - if (cm && cm.CodeMirror) { cm.CodeMirror.setValue(code); return; } - const ta = document.querySelector('textarea'); - if (ta) { ta.value = code; ta.dispatchEvent(new Event('input', {bubbles: true})); } + const textarea = document.querySelector('.ace_text-input'); + const dt = new DataTransfer(); + dt.setData('text/plain', code); + textarea.dispatchEvent(new ClipboardEvent('paste', { + clipboardData: dt, bubbles: true, cancelable: true + })); }""", source_code, ) + page.wait_for_timeout(1000) - page.locator( - 'button[type="submit"]:has-text("Submit"), button:has-text("Submit Code")' - ).first.click() - page.wait_for_url( - lambda url: "/submit/" not in url or "submission" in url, - timeout=BROWSER_NAV_TIMEOUT * 2, + page.evaluate( + "() => document.getElementById('submit_btn').scrollIntoView({block:'center'})" ) + page.wait_for_timeout(200) + page.locator('#submit_btn').dispatch_event('click') + page.wait_for_timeout(3000) + + dialog_text = page.evaluate("""() => { + const d = document.querySelector('[role="dialog"], .swal2-popup'); + return d ? d.textContent.trim() : null; + }""") + if dialog_text and "not available for accepting solutions" in dialog_text: + submit_error = "PRACTICE_FALLBACK" + elif dialog_text: + submit_error = dialog_text except Exception as e: submit_error = str(e) @@ -252,10 +241,12 @@ def _submit_headless_codechef( ) print(json.dumps({"status": "submitting"}), flush=True) - session.fetch( - f"{BASE_URL}/{contest_id}/submit/{problem_id}", - page_action=submit_action, + submit_url = ( + f"{BASE_URL}/submit/{problem_id}" + if contest_id == "PRACTICE" + else f"{BASE_URL}/{contest_id}/submit/{problem_id}" ) + session.fetch(submit_url, page_action=submit_action) try: browser_cookies = session.context.cookies() @@ -275,12 +266,20 @@ def _submit_headless_codechef( _retried=True, ) + if submit_error == "PRACTICE_FALLBACK" and not _retried: + return _submit_headless_codechef( + "PRACTICE", + problem_id, + file_path, + language_id, + credentials, + _retried=True, + ) + if submit_error: return SubmitResult(success=False, error=submit_error) - return SubmitResult( - success=True, error="", submission_id="", verdict="submitted" - ) + return SubmitResult(success=True, error="", submission_id="") except Exception as e: return SubmitResult(success=False, error=str(e)) @@ -296,12 +295,19 @@ class CodeChefScraper(BaseScraper): data = await fetch_json( client, API_CONTEST.format(contest_id=contest_id) ) - if not data.get("problems"): + problems_raw = data.get("problems") + if not problems_raw and isinstance(data.get("child_contests"), dict): + for div in ("div_4", "div_3", "div_2", "div_1"): + child = data["child_contests"].get(div, {}) + child_code = child.get("contest_code") + if child_code: + return await self.scrape_contest_metadata(child_code) + if not problems_raw: return self._metadata_error( f"No problems found for contest {contest_id}" ) problems = [] - for problem_code, problem_data in data["problems"].items(): + for problem_code, problem_data in problems_raw.items(): if problem_data.get("category_name") == "main": problems.append( ProblemSummary( @@ -314,42 +320,101 @@ class CodeChefScraper(BaseScraper): error="", contest_id=contest_id, problems=problems, - url=f"{BASE_URL}/{contest_id}", + url=f"{BASE_URL}/problems/%s", + contest_url=f"{BASE_URL}/{contest_id}", + standings_url=f"{BASE_URL}/{contest_id}/rankings", ) except Exception as e: return self._metadata_error(f"Failed to fetch contest {contest_id}: {e}") async def scrape_contest_list(self) -> ContestListResult: - async with httpx.AsyncClient() as client: + async with httpx.AsyncClient( + limits=httpx.Limits(max_connections=CONNECTIONS) + ) as client: try: data = await fetch_json(client, API_CONTESTS_ALL) except httpx.HTTPStatusError as e: return self._contests_error(f"Failed to fetch contests: {e}") - contests: list[ContestSummary] = [] - seen: set[str] = set() - for c in data.get("future_contests", []) + data.get("past_contests", []): - code = c.get("contest_code", "") - name = c.get("contest_name", code) - if not re.match(r"^START\d+$", code): - continue - if code in seen: - continue - seen.add(code) - start_time: int | None = None - iso = c.get("contest_start_date_iso") - if iso: + + present = data.get("present_contests", []) + future = data.get("future_contests", []) + + async def fetch_past_page(offset: int) -> list[dict[str, Any]]: + r = await client.get( + BASE_URL + API_CONTESTS_PAST, + params={"sort_by": "START", "sorting_order": "desc", "offset": offset}, + headers=HEADERS, + timeout=HTTP_TIMEOUT, + ) + r.raise_for_status() + return r.json().get("contests", []) + + past: list[dict[str, Any]] = [] + offset = 0 + while True: + page = await fetch_past_page(offset) + past.extend(c for c in page if re.match(r"^START\d+", c.get("contest_code", ""))) + if len(page) < 20: + break + offset += 20 + + raw: list[dict[str, Any]] = [] + seen_raw: set[str] = set() + for c in present + future + past: + code = c.get("contest_code", "") + if not code or code in seen_raw: + continue + seen_raw.add(code) + raw.append(c) + + sem = asyncio.Semaphore(CONNECTIONS) + + async def expand(c: dict[str, Any]) -> list[ContestSummary]: + code = c["contest_code"] + name = c.get("contest_name", code) + start_time: int | None = None + iso = c.get("contest_start_date_iso") + if iso: + try: + start_time = int(datetime.fromisoformat(iso).timestamp()) + except Exception: + pass + base_name = re.sub(r"\s*\(.*?\)\s*$", "", name).strip() try: - dt = datetime.fromisoformat(iso) - start_time = int(dt.timestamp()) + async with sem: + detail = await fetch_json(client, API_CONTEST.format(contest_id=code)) + children = detail.get("child_contests") + if children and isinstance(children, dict): + divs: list[ContestSummary] = [] + for div_key in ("div_1", "div_2", "div_3", "div_4"): + child = children.get(div_key) + if not child: + continue + child_code = child.get("contest_code") + div_num = child.get("div", {}).get("div_number", div_key[-1]) + if child_code: + display = f"{base_name} (Div. {div_num})" + divs.append(ContestSummary( + id=child_code, name=display, display_name=display, start_time=start_time + )) + if divs: + return divs except Exception: pass - contests.append( - ContestSummary( - id=code, name=name, display_name=name, start_time=start_time - ) - ) + return [ContestSummary(id=code, name=name, display_name=name, start_time=start_time)] + + results = await asyncio.gather(*[expand(c) for c in raw]) + + contests: list[ContestSummary] = [] + seen: set[str] = set() + for group in results: + for entry in group: + if entry.id not in seen: + seen.add(entry.id) + contests.append(entry) + if not contests: - return self._contests_error("No Starters contests found") + return self._contests_error("No contests found") return ContestListResult(success=True, error="", contests=contests) async def stream_tests_for_category_async(self, category_id: str) -> None: @@ -369,6 +434,13 @@ class CodeChefScraper(BaseScraper): ) return all_problems = contest_data.get("problems", {}) + if not all_problems and isinstance(contest_data.get("child_contests"), dict): + for div in ("div_4", "div_3", "div_2", "div_1"): + child = contest_data["child_contests"].get(div, {}) + child_code = child.get("contest_code") + if child_code: + await self.stream_tests_for_category_async(child_code) + return if not all_problems: print( json.dumps(