82 lines
2.3 KiB
Python
82 lines
2.3 KiB
Python
import time
|
|
|
|
import backoff
|
|
import requests
|
|
|
|
from .base import HttpClient, ScraperConfig
|
|
|
|
|
|
class RequestsClient:
|
|
def __init__(self, config: ScraperConfig, headers: dict[str, str] | None = None):
|
|
self.config = config
|
|
self.session = requests.Session()
|
|
|
|
default_headers = {
|
|
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
|
}
|
|
if headers:
|
|
default_headers.update(headers)
|
|
|
|
self.session.headers.update(default_headers)
|
|
|
|
@backoff.on_exception(
|
|
backoff.expo,
|
|
(requests.RequestException, requests.HTTPError),
|
|
max_tries=3,
|
|
base=2.0,
|
|
jitter=backoff.random_jitter,
|
|
)
|
|
@backoff.on_predicate(
|
|
backoff.expo,
|
|
lambda response: response.status_code == 429,
|
|
max_tries=3,
|
|
base=2.0,
|
|
jitter=backoff.random_jitter,
|
|
)
|
|
def get(self, url: str, **kwargs) -> requests.Response:
|
|
timeout = kwargs.get("timeout", self.config.timeout_seconds)
|
|
response = self.session.get(url, timeout=timeout, **kwargs)
|
|
response.raise_for_status()
|
|
|
|
if (
|
|
hasattr(self.config, "rate_limit_delay")
|
|
and self.config.rate_limit_delay > 0
|
|
):
|
|
time.sleep(self.config.rate_limit_delay)
|
|
|
|
return response
|
|
|
|
def close(self) -> None:
|
|
self.session.close()
|
|
|
|
|
|
class CloudScraperClient:
|
|
def __init__(self, config: ScraperConfig):
|
|
import cloudscraper
|
|
|
|
self.config = config
|
|
self.scraper = cloudscraper.create_scraper()
|
|
|
|
@backoff.on_exception(
|
|
backoff.expo,
|
|
(requests.RequestException, requests.HTTPError),
|
|
max_tries=3,
|
|
base=2.0,
|
|
jitter=backoff.random_jitter,
|
|
)
|
|
def get(self, url: str, **kwargs) -> requests.Response:
|
|
timeout = kwargs.get("timeout", self.config.timeout_seconds)
|
|
response = self.scraper.get(url, timeout=timeout, **kwargs)
|
|
response.raise_for_status()
|
|
|
|
if (
|
|
hasattr(self.config, "rate_limit_delay")
|
|
and self.config.rate_limit_delay > 0
|
|
):
|
|
time.sleep(self.config.rate_limit_delay)
|
|
|
|
return response
|
|
|
|
def close(self) -> None:
|
|
if hasattr(self.scraper, "close"):
|
|
self.scraper.close()
|