From 0b40e0f33e7cf43dc045734493d5f64f9f00d7d6 Mon Sep 17 00:00:00 2001
From: Barrett Ruth <62671086+barrettruth@users.noreply.github.com>
Date: Fri, 6 Mar 2026 13:25:44 -0500
Subject: [PATCH] fix: replace curl_cffi with scrapling in codeforces metadata
 (#334)

## Problem

`codeforces.py` used `curl_cffi` to bypass Cloudflare when fetching
contest problem HTML, making it unavailable in the nix python env and
requiring an extra dependency across `pyproject.toml` and `flake.nix`.

## Solution

Rewrite `_fetch_problems_html` to use scrapling `StealthySession` with
`solve_cloudflare=True`, matching the existing Codeforces submit pattern.
Extend `needs_browser` in `scraper.lua` to route Codeforces `metadata` and
`tests` through the FHS env on NixOS. Remove `curl-cffi` from
`pyproject.toml`, `flake.nix`, and test mocks.
---
 flake.nix              |  2 --
 lua/cp/scraper.lua     |  4 +++-
 pyproject.toml         |  1 -
 scrapers/codeforces.py | 27 +++++++++++++++++++++++----
 tests/conftest.py      | 34 ++++++----------------------------
 5 files changed, 32 insertions(+), 36 deletions(-)

diff --git a/flake.nix b/flake.nix
index 1c416c6..1ccb2d3 100644
--- a/flake.nix
+++ b/flake.nix
@@ -19,7 +19,6 @@
         pkgs.python312.withPackages (ps: [
           ps.backoff
           ps.beautifulsoup4
-          ps.curl-cffi
           ps.httpx
           ps.ndjson
           ps.pydantic
@@ -31,7 +30,6 @@
         pkgs.python312.withPackages (ps: [
           ps.backoff
           ps.beautifulsoup4
-          ps.curl-cffi
           ps.httpx
           ps.ndjson
           ps.pydantic
diff --git a/lua/cp/scraper.lua b/lua/cp/scraper.lua
index 8f9759d..28705df 100644
--- a/lua/cp/scraper.lua
+++ b/lua/cp/scraper.lua
@@ -44,7 +44,9 @@ local function run_scraper(platform, subcommand, args, opts)
     return { success = false, error = msg }
   end
 
-  local needs_browser = subcommand == 'submit' or subcommand == 'login'
+  local needs_browser = subcommand == 'submit'
+    or subcommand == 'login'
+    or (platform == 'codeforces' and (subcommand == 'metadata' or subcommand == 'tests'))
 
   if needs_browser then
     utils.setup_nix_submit_env()
diff --git a/pyproject.toml b/pyproject.toml
index 9ffc00c..e677a57 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,7 +8,6 @@ dependencies = [
     "backoff>=2.2.1",
     "beautifulsoup4>=4.13.5",
     "scrapling[fetchers]>=0.4",
-    "curl-cffi>=0.13.0",
     "httpx>=0.28.1",
     "ndjson>=0.3.1",
     "pydantic>=2.11.10",
diff --git a/scrapers/codeforces.py b/scrapers/codeforces.py
index d2e7083..8ae768e 100644
--- a/scrapers/codeforces.py
+++ b/scrapers/codeforces.py
@@ -7,7 +7,6 @@ from typing import Any
 
 import requests
 from bs4 import BeautifulSoup, Tag
-from curl_cffi import requests as curl_requests
 
 from .base import BaseScraper, extract_precision
 from .models import (
@@ -141,10 +140,30 @@ def _is_interactive(block: Tag) -> bool:
 
 
 def _fetch_problems_html(contest_id: str) -> str:
+    try:
+        from scrapling.fetchers import StealthySession  # type: ignore[import-untyped,unresolved-import]
+    except ImportError:
+        raise RuntimeError("scrapling is required for Codeforces metadata")
+
+    from .atcoder import _ensure_browser
+
+    _ensure_browser()
+
     url = f"{BASE_URL}/contest/{contest_id}/problems"
-    response = curl_requests.get(url, impersonate="chrome", timeout=HTTP_TIMEOUT)
-    response.raise_for_status()
-    return response.text
+    html = ""
+
+    def page_action(page):
+        nonlocal html
+        html = page.content()
+
+    with StealthySession(
+        headless=True,
+        timeout=BROWSER_SESSION_TIMEOUT,
+        google_search=False,
+    ) as session:
+        session.fetch(url, page_action=page_action, solve_cloudflare=True)
+
+    return html
 
 
 def _parse_all_blocks(html: str) -> list[dict[str, Any]]:
diff --git a/tests/conftest.py b/tests/conftest.py
index deb7e3a..b6ff810 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -10,7 +10,6 @@ from typing import Any
 import httpx
 import pytest
 import requests
-from curl_cffi import requests as curl_requests
 
 ROOT = Path(__file__).resolve().parent.parent
 FIX = Path(__file__).resolve().parent / "fixtures"
@@ -136,15 +135,10 @@ def run_scraper_offline(fixture_text):
 
             case "codeforces":
 
-                class MockCurlResponse:
-                    def __init__(self, html: str):
-                        self.text = html
-
-                    def raise_for_status(self):
-                        pass
-
-                def _mock_curl_get(url: str, **kwargs):
-                    return MockCurlResponse(_router_codeforces(url=url))
+                def _mock_fetch_problems_html(cid: str) -> str:
+                    return _router_codeforces(
+                        url=f"https://codeforces.com/contest/{cid}/problems"
+                    )
 
                 def _mock_requests_get(url: str, **kwargs):
                     if "api/contest.list" in url:
@@ -175,7 +169,7 @@ def run_scraper_offline(fixture_text):
                     raise AssertionError(f"Unexpected requests.get call: {url}")
 
                 return {
-                    "curl_requests.get": _mock_curl_get,
+                    "_fetch_problems_html": _mock_fetch_problems_html,
                     "requests.get": _mock_requests_get,
                 }
 
@@ -215,23 +209,8 @@ def run_scraper_offline(fixture_text):
                         return MockResponse(data)
                     raise AssertionError(f"No fixture for CodeChef url={url!r}")
 
-                class MockCodeChefCurlResponse:
-                    def __init__(self, html: str):
-                        self.text = html
-
-                    def raise_for_status(self):
-                        pass
-
-                def _mock_curl_get(url: str, **kwargs):
-                    if "/problems/" in url:
-                        problem_id = url.rstrip("/").split("/")[-1]
-                        html = fixture_text(f"codechef/{problem_id}.html")
-                        return MockCodeChefCurlResponse(html)
-                    raise AssertionError(f"No fixture for CodeChef url={url!r}")
-
                 return {
                     "__offline_get_async": __offline_get_async,
-                    "curl_requests.get": _mock_curl_get,
                 }
 
             case _:
@@ -250,7 +229,7 @@ def run_scraper_offline(fixture_text):
         offline_fetches = _make_offline_fetches(scraper_name)
 
         if scraper_name == "codeforces":
-            curl_requests.get = offline_fetches["curl_requests.get"]
+            ns._fetch_problems_html = offline_fetches["_fetch_problems_html"]
             requests.get = offline_fetches["requests.get"]
         elif scraper_name == "atcoder":
             ns._fetch = offline_fetches["_fetch"]
@@ -259,7 +238,6 @@ def run_scraper_offline(fixture_text):
             httpx.AsyncClient.get = offline_fetches["__offline_fetch_text"]
         elif scraper_name == "codechef":
             httpx.AsyncClient.get = offline_fetches["__offline_get_async"]
-            curl_requests.get = offline_fetches["curl_requests.get"]
 
         scraper_class = getattr(ns, scraper_classes[scraper_name])
         scraper = scraper_class()