cp.nvim/scrapers/clients.py
2025-09-22 20:46:27 -04:00

82 lines
2.3 KiB
Python

import time
import backoff
import requests
from .base import HttpClient, ScraperConfig
class RequestsClient:
    """HTTP client backed by a persistent ``requests.Session``.

    The session carries a browser-like ``User-Agent`` header by default;
    caller-supplied headers are merged on top of (and may override) it.
    """

    def __init__(self, config: ScraperConfig, headers: dict[str, str] | None = None):
        """Create the session and install the default request headers.

        Args:
            config: Scraper settings. ``timeout_seconds`` is read on every
                request; ``rate_limit_delay`` is honoured when present.
            headers: Optional extra headers merged over the defaults.
        """
        self.config = config
        self.session = requests.Session()
        default_headers = {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        }
        if headers:
            default_headers.update(headers)
        self.session.headers.update(default_headers)

    # NOTE: HTTP 429 needs no separate ``on_predicate`` retry layer:
    # ``raise_for_status()`` turns every 4xx/5xx response (429 included) into
    # ``requests.HTTPError``, a ``RequestException`` subclass, so it is retried
    # here and a 429 response object is never returned to an outer predicate.
    @backoff.on_exception(
        backoff.expo,
        requests.RequestException,
        max_tries=3,
        base=2.0,
        jitter=backoff.random_jitter,
    )
    def get(self, url: str, **kwargs) -> requests.Response:
        """GET ``url`` and return the response, retrying on request failures.

        Args:
            url: Address to fetch.
            **kwargs: Forwarded to ``Session.get``. ``timeout`` defaults to
                ``config.timeout_seconds`` when not supplied.

        Returns:
            The successful response.

        Raises:
            requests.RequestException: If the request still fails (including
                an HTTP error status) after the final attempt.
        """
        # pop (not get): leaving "timeout" inside kwargs would pass it twice
        # and raise TypeError for any caller that overrides the timeout.
        timeout = kwargs.pop("timeout", self.config.timeout_seconds)
        response = self.session.get(url, timeout=timeout, **kwargs)
        response.raise_for_status()

        # Optional politeness pause after each successful request.
        delay = getattr(self.config, "rate_limit_delay", 0)
        if delay > 0:
            time.sleep(delay)
        return response

    def close(self) -> None:
        """Close the underlying session."""
        self.session.close()
class CloudScraperClient:
    """HTTP client that issues requests through a ``cloudscraper`` scraper."""

    def __init__(self, config: ScraperConfig):
        """Build the scraper instance.

        Args:
            config: Scraper settings. ``timeout_seconds`` is read on every
                request; ``rate_limit_delay`` is honoured when present.
        """
        # Imported lazily so the module can be loaded without cloudscraper
        # installed as long as this client is never instantiated.
        import cloudscraper

        self.config = config
        self.scraper = cloudscraper.create_scraper()

    # ``requests.HTTPError`` subclasses ``RequestException``, so this single
    # type also covers the error raised by ``raise_for_status()``.
    @backoff.on_exception(
        backoff.expo,
        requests.RequestException,
        max_tries=3,
        base=2.0,
        jitter=backoff.random_jitter,
    )
    def get(self, url: str, **kwargs) -> requests.Response:
        """GET ``url`` and return the response, retrying on request failures.

        Args:
            url: Address to fetch.
            **kwargs: Forwarded to the scraper's ``get``. ``timeout`` defaults
                to ``config.timeout_seconds`` when not supplied.

        Returns:
            The successful response.

        Raises:
            requests.RequestException: If the request still fails (including
                an HTTP error status) after the final attempt.
        """
        # pop (not get): leaving "timeout" inside kwargs would pass it twice
        # and raise TypeError for any caller that overrides the timeout.
        timeout = kwargs.pop("timeout", self.config.timeout_seconds)
        response = self.scraper.get(url, timeout=timeout, **kwargs)
        response.raise_for_status()

        # Optional politeness pause after each successful request.
        delay = getattr(self.config, "rate_limit_delay", 0)
        if delay > 0:
            time.sleep(delay)
        return response

    def close(self) -> None:
        """Close the scraper if it exposes a ``close`` method."""
        if hasattr(self.scraper, "close"):
            self.scraper.close()