fix(scrapers): browser login fast paths and AtCoder submit rewrite
Problem: CF and AtCoder always did a full browser login on every `login` invocation, even with valid cookies. AtCoder submit never persisted cookies, re-logging in on every submit. CF's cookie guard used `X-User-Handle` (no longer set by CF — now `X-User-Sha1`), so cookies were never saved. CF `login_action` was missing `wait_for_selector` for the form that appears after the Cloudflare gate reloads. AtCoder submit injected source via CodeMirror which doesn't exist on AtCoder (it uses ACE editor). Solution: Added cookie fast paths to CF and AtCoder login — emit `checking_login` and return early if the existing session is valid. `checking_login` is only emitted when cookies actually exist; fresh starts go straight to `logging_in`. Fixed CF cookie guard to `X-User-Sha1` and added `wait_for_selector` for the login form. Rewrote AtCoder submit to use `set_input_files` on the real source file path, with `wait_for_function` on `#plain-textarea` to confirm the ACE editor populated before clicking submit.
This commit is contained in:
parent
f5992fa9b5
commit
13438beca7
4 changed files with 727 additions and 128 deletions
|
|
@ -331,6 +331,32 @@ class CodeforcesScraper(BaseScraper):
|
|||
return await asyncio.to_thread(_login_headless_cf, credentials)
|
||||
|
||||
|
||||
def _cf_check_logged_in(page) -> bool:
|
||||
return page.evaluate(
|
||||
"() => Array.from(document.querySelectorAll('a'))"
|
||||
".some(a => a.textContent.includes('Logout'))"
|
||||
)
|
||||
|
||||
|
||||
def _cf_login_action(credentials: dict[str, str]):
|
||||
login_error: str | None = None
|
||||
|
||||
def login_action(page):
|
||||
nonlocal login_error
|
||||
try:
|
||||
page.wait_for_selector('input[name="handleOrEmail"]', timeout=60000)
|
||||
page.fill('input[name="handleOrEmail"]', credentials.get("username", ""))
|
||||
page.fill('input[name="password"]', credentials.get("password", ""))
|
||||
page.locator('#enterForm input[type="submit"]').click()
|
||||
page.wait_for_url(
|
||||
lambda url: "/enter" not in url, timeout=BROWSER_NAV_TIMEOUT
|
||||
)
|
||||
except Exception as e:
|
||||
login_error = str(e)
|
||||
|
||||
return login_action, lambda: login_error
|
||||
|
||||
|
||||
def _login_headless_cf(credentials: dict[str, str]) -> LoginResult:
|
||||
try:
|
||||
from scrapling.fetchers import StealthySession # type: ignore[import-untyped,unresolved-import]
|
||||
|
|
@ -344,33 +370,30 @@ def _login_headless_cf(credentials: dict[str, str]) -> LoginResult:
|
|||
|
||||
_ensure_browser()
|
||||
|
||||
logged_in = False
|
||||
login_error: str | None = None
|
||||
saved_cookies = load_platform_cookies("codeforces") or []
|
||||
|
||||
def check_login(page):
|
||||
nonlocal logged_in
|
||||
logged_in = page.evaluate(
|
||||
"() => Array.from(document.querySelectorAll('a'))"
|
||||
".some(a => a.textContent.includes('Logout'))"
|
||||
)
|
||||
if saved_cookies:
|
||||
print(json.dumps({"status": "checking_login"}), flush=True)
|
||||
logged_in = False
|
||||
|
||||
def check_action(page):
|
||||
nonlocal logged_in
|
||||
logged_in = _cf_check_logged_in(page)
|
||||
|
||||
def login_action(page):
|
||||
nonlocal login_error
|
||||
try:
|
||||
page.fill(
|
||||
'input[name="handleOrEmail"]',
|
||||
credentials.get("username", ""),
|
||||
)
|
||||
page.fill(
|
||||
'input[name="password"]',
|
||||
credentials.get("password", ""),
|
||||
)
|
||||
page.locator('#enterForm input[type="submit"]').click()
|
||||
page.wait_for_url(
|
||||
lambda url: "/enter" not in url, timeout=BROWSER_NAV_TIMEOUT
|
||||
)
|
||||
except Exception as e:
|
||||
login_error = str(e)
|
||||
with StealthySession(
|
||||
headless=True,
|
||||
timeout=BROWSER_SESSION_TIMEOUT,
|
||||
google_search=False,
|
||||
cookies=saved_cookies,
|
||||
) as session:
|
||||
session.fetch(f"{BASE_URL}/", page_action=check_action, solve_cloudflare=True)
|
||||
if logged_in:
|
||||
return LoginResult(success=True, error="")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
login_action, get_error = _cf_login_action(credentials)
|
||||
|
||||
try:
|
||||
with StealthySession(
|
||||
|
|
@ -384,22 +407,23 @@ def _login_headless_cf(credentials: dict[str, str]) -> LoginResult:
|
|||
page_action=login_action,
|
||||
solve_cloudflare=True,
|
||||
)
|
||||
login_error = get_error()
|
||||
if login_error:
|
||||
return LoginResult(success=False, error=f"Login failed: {login_error}")
|
||||
|
||||
session.fetch(
|
||||
f"{BASE_URL}/",
|
||||
page_action=check_login,
|
||||
network_idle=True,
|
||||
)
|
||||
logged_in = False
|
||||
|
||||
def verify_action(page):
|
||||
nonlocal logged_in
|
||||
logged_in = _cf_check_logged_in(page)
|
||||
|
||||
session.fetch(f"{BASE_URL}/", page_action=verify_action, network_idle=True)
|
||||
if not logged_in:
|
||||
return LoginResult(
|
||||
success=False, error="Login failed (bad credentials?)"
|
||||
)
|
||||
return LoginResult(success=False, error="Login failed (bad credentials?)")
|
||||
|
||||
try:
|
||||
browser_cookies = session.context.cookies()
|
||||
if any(c.get("name") == "X-User-Handle" for c in browser_cookies):
|
||||
if any(c.get("name") == "X-User-Sha1" for c in browser_cookies):
|
||||
save_platform_cookies("codeforces", browser_cookies)
|
||||
except Exception:
|
||||
pass
|
||||
|
|
@ -438,35 +462,15 @@ def _submit_headless(
|
|||
if not _retried:
|
||||
saved_cookies = load_platform_cookies("codeforces") or []
|
||||
|
||||
logged_in = bool(saved_cookies) and not _retried
|
||||
login_error: str | None = None
|
||||
logged_in = bool(saved_cookies)
|
||||
submit_error: str | None = None
|
||||
needs_relogin = False
|
||||
|
||||
def check_login(page):
|
||||
nonlocal logged_in
|
||||
logged_in = page.evaluate(
|
||||
"() => Array.from(document.querySelectorAll('a'))"
|
||||
".some(a => a.textContent.includes('Logout'))"
|
||||
)
|
||||
logged_in = _cf_check_logged_in(page)
|
||||
|
||||
def login_action(page):
|
||||
nonlocal login_error
|
||||
try:
|
||||
page.fill(
|
||||
'input[name="handleOrEmail"]',
|
||||
credentials.get("username", ""),
|
||||
)
|
||||
page.fill(
|
||||
'input[name="password"]',
|
||||
credentials.get("password", ""),
|
||||
)
|
||||
page.locator('#enterForm input[type="submit"]').click()
|
||||
page.wait_for_url(
|
||||
lambda url: "/enter" not in url, timeout=BROWSER_NAV_TIMEOUT
|
||||
)
|
||||
except Exception as e:
|
||||
login_error = str(e)
|
||||
_login_action, _get_login_error = _cf_login_action(credentials)
|
||||
|
||||
def submit_action(page):
|
||||
nonlocal submit_error, needs_relogin
|
||||
|
|
@ -511,27 +515,25 @@ def _submit_headless(
|
|||
headless=True,
|
||||
timeout=BROWSER_SESSION_TIMEOUT,
|
||||
google_search=False,
|
||||
cookies=saved_cookies if (saved_cookies and not _retried) else [],
|
||||
cookies=saved_cookies if saved_cookies else [],
|
||||
) as session:
|
||||
if not (saved_cookies and not _retried):
|
||||
if not _retried and saved_cookies:
|
||||
print(json.dumps({"status": "checking_login"}), flush=True)
|
||||
session.fetch(
|
||||
f"{BASE_URL}/",
|
||||
page_action=check_login,
|
||||
network_idle=True,
|
||||
)
|
||||
session.fetch(f"{BASE_URL}/", page_action=check_login, solve_cloudflare=True)
|
||||
|
||||
if not logged_in:
|
||||
print(json.dumps({"status": "logging_in"}), flush=True)
|
||||
session.fetch(
|
||||
f"{BASE_URL}/enter",
|
||||
page_action=login_action,
|
||||
page_action=_login_action,
|
||||
solve_cloudflare=True,
|
||||
)
|
||||
login_error = _get_login_error()
|
||||
if login_error:
|
||||
return SubmitResult(
|
||||
success=False, error=f"Login failed: {login_error}"
|
||||
)
|
||||
logged_in = True
|
||||
|
||||
print(json.dumps({"status": "submitting"}), flush=True)
|
||||
session.fetch(
|
||||
|
|
@ -542,7 +544,7 @@ def _submit_headless(
|
|||
|
||||
try:
|
||||
browser_cookies = session.context.cookies()
|
||||
if any(c.get("name") == "X-User-Handle" for c in browser_cookies):
|
||||
if any(c.get("name") == "X-User-Sha1" for c in browser_cookies):
|
||||
save_platform_cookies("codeforces", browser_cookies)
|
||||
except Exception:
|
||||
pass
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue