or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

browser-automation.mdcli-tools.mdconfiguration.mdcore-types.mdcrawlers.mderror-handling.mdevents.mdfingerprinting.mdhttp-clients.mdindex.mdrequest-management.mdsessions.mdstatistics.mdstorage.md

http-clients.mddocs/

0

# HTTP Clients

1

2

Pluggable HTTP client implementations supporting different libraries and browser impersonation for enhanced anti-detection capabilities. HTTP clients handle the actual network communication while providing consistent interfaces for different underlying implementations.

3

4

## Capabilities

5

6

### Base HTTP Client

7

8

Abstract base class defining the interface for all HTTP client implementations in Crawlee.

9

10

```python { .api }

11

class HttpClient:

12

async def crawl(

13

self,

14

request: Request,

15

*,

16

session: Session | None = None,

17

proxy_info: ProxyInfo | None = None,

18

statistics: Statistics | None = None

19

) -> HttpCrawlingResult:

20

"""

21

Perform HTTP request crawling.

22

23

Args:

24

request: Request to process

25

session: Session for state management

26

proxy_info: Proxy configuration

27

statistics: Statistics collector

28

29

Returns:

30

HttpCrawlingResult with response data

31

"""

32

33

async def send_request(

34

self,

35

url: str,

36

*,

37

method: HttpMethod = "GET",

38

headers: dict[str, str] | None = None,

39

payload: HttpPayload | None = None,

40

**kwargs

41

) -> HttpResponse:

42

"""

43

Send direct HTTP request.

44

45

Args:

46

url: Target URL

47

method: HTTP method

48

headers: Request headers

49

payload: Request body

50

51

Returns:

52

HttpResponse object

53

"""

54

```

55

56

### HTTPX Client

57

58

HTTP client implementation using the httpx library with support for HTTP/2, connection pooling, and async operations.

59

60

```python { .api }

61

class HttpxHttpClient(HttpClient):

62

def __init__(

63

self,

64

*,

65

persist_cookies_per_session: bool = True,

66

additional_http_error_status_codes: set[int] | None = None,

67

ignore_http_error_status_codes: set[int] | None = None,

68

**httpx_kwargs

69

): ...

70

71

@property

72

def client(self) -> httpx.AsyncClient:

73

"""Access underlying httpx client."""

74

```

75

76

### Curl Impersonate Client

77

78

HTTP client using curl-cffi for browser impersonation and advanced anti-detection capabilities.

79

80

```python { .api }

81

class CurlImpersonateHttpClient(HttpClient):

82

def __init__(

83

self,

84

*,

85

persist_cookies_per_session: bool = True,

86

impersonate: str = "chrome",

87

additional_http_error_status_codes: set[int] | None = None,

88

ignore_http_error_status_codes: set[int] | None = None,

89

**curl_cffi_kwargs

90

): ...

91

92

@property

93

def impersonate(self) -> str:

94

"""Browser impersonation target."""

95

```

96

97

### HTTP Response

98

99

Response object containing response data, headers, and metadata from HTTP requests.

100

101

```python { .api }

102

class HttpResponse:

103

def __init__(

104

self,

105

*,

106

url: str,

107

status_code: int,

108

headers: HttpHeaders,

109

content: bytes,

110

encoding: str | None = None

111

): ...

112

113

@property

114

def url(self) -> str:

115

"""Final response URL (after redirects)."""

116

117

@property

118

def status_code(self) -> int:

119

"""HTTP status code."""

120

121

@property

122

def headers(self) -> HttpHeaders:

123

"""Response headers."""

124

125

@property

126

def content(self) -> bytes:

127

"""Raw response content."""

128

129

@property

130

def text(self) -> str:

131

"""Response content as string."""

132

133

@property

134

def encoding(self) -> str | None:

135

"""Character encoding of response."""

136

137

@property

138

def content_type(self) -> str | None:

139

"""MIME type from Content-Type header."""

140

141

def json(self) -> Any:

142

"""

143

Parse response content as JSON.

144

145

Returns:

146

Parsed JSON data

147

148

Raises:

149

JSONDecodeError: If content is not valid JSON

150

"""

151

152

@property

153

def ok(self) -> bool:

154

"""True if status code indicates success (200-299)."""

155

156

def raise_for_status(self) -> None:

157

"""

158

Raise HttpStatusCodeError for bad response status codes.

159

160

Raises:

161

HttpStatusCodeError: For 4xx and 5xx status codes

162

"""

163

```

164

165

### HTTP Crawling Result

166

167

Result object containing both HTTP response data and additional crawling metadata.

168

169

```python { .api }

170

class HttpCrawlingResult:

171

def __init__(

172

self,

173

*,

174

http_response: HttpResponse,

175

encoding: str | None = None

176

): ...

177

178

@property

179

def http_response(self) -> HttpResponse:

180

"""HTTP response object."""

181

182

@property

183

def encoding(self) -> str | None:

184

"""Character encoding override."""

185

```

186

187

## Configuration Options

188

189

### HTTP Client Configuration

190

191

Common configuration options available across HTTP client implementations.

192

193

```python { .api }

194

class HttpClientConfig:

195

persist_cookies_per_session: bool = True

196

additional_http_error_status_codes: set[int] | None = None

197

ignore_http_error_status_codes: set[int] | None = None

198

timeout: float = 30.0

199

max_redirects: int = 10

200

verify_ssl: bool = True

201

proxy_url: str | None = None

202

```

203

204

### Browser Impersonation Options

205

206

Configuration for curl-cffi browser impersonation capabilities.

207

208

```python { .api }

209

ImpersonateTarget = Literal[

210

"chrome",

211

"chrome99",

212

"chrome100",

213

"chrome101",

214

"chrome104",

215

"chrome107",

216

"chrome110",

217

"chrome116",

218

"firefox",

219

"firefox99",

220

"firefox102",

221

"firefox109",

222

"safari",

223

"safari15_3",

224

"safari15_5",

225

"safari17_0",

226

"safari17_2_1"

227

]

228

```

229

230

## Usage Examples

231

232

### Basic HTTP Client Usage

233

234

```python

235

import asyncio

236

from crawlee.http_clients import HttpxHttpClient

237

from crawlee import Request

238

239

async def main():

240

client = HttpxHttpClient()

241

242

# Send direct request

243

response = await client.send_request(

244

'https://api.example.com/data',

245

method='GET',

246

headers={'User-Agent': 'My Bot 1.0'}

247

)

248

249

print(f"Status: {response.status_code}")

250

print(f"Content: {response.text}")

251

252

# Process as JSON

253

if response.content_type == 'application/json':

254

data = response.json()

255

print(f"JSON data: {data}")

256

257

await client.close()

258

259

asyncio.run(main())

260

```

261

262

### Browser Impersonation

263

264

```python

265

import asyncio

266

from crawlee.http_clients import CurlImpersonateHttpClient

267

268

async def main():

269

# Impersonate Chrome browser

270

client = CurlImpersonateHttpClient(

271

impersonate='chrome116'

272

)

273

274

response = await client.send_request('https://example.com')

275

276

print(f"Impersonating: {client.impersonate}")

277

print(f"Response: {response.status_code}")

278

279

# The request's TLS fingerprint and headers mimic Chrome 116.
# Note: User-Agent is a *request* header; it appears in the
# response headers only if the server echoes it back.
print(f"User-Agent: {response.headers.get('user-agent', 'Not set')}")

281

282

await client.close()

283

284

asyncio.run(main())

285

```

286

287

### Custom HTTP Client Configuration

288

289

```python

290

import asyncio

291

import httpx

292

from crawlee.http_clients import HttpxHttpClient

293

294

async def main():

295

# Custom httpx configuration

296

client = HttpxHttpClient(

297

timeout=60.0,

298

verify=False, # Disable SSL verification

299

limits=httpx.Limits(

300

max_keepalive_connections=100,

301

max_connections=200

302

),

303

ignore_http_error_status_codes={404, 503}

304

)

305

306

try:

307

response = await client.send_request('https://example.com/may-not-exist')

308

# Won't raise error for 404 due to ignore_http_error_status_codes

309

print(f"Status: {response.status_code}")

310

except Exception as e:

311

print(f"Request failed: {e}")

312

313

await client.close()

314

315

asyncio.run(main())

316

```

317

318

### Using HTTP Clients with Crawlers

319

320

```python

321

import asyncio

322

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext

323

from crawlee.http_clients import CurlImpersonateHttpClient

324

325

async def main():

326

# Configure crawler with custom HTTP client

327

http_client = CurlImpersonateHttpClient(

328

impersonate='safari17_0',

329

persist_cookies_per_session=True

330

)

331

332

crawler = HttpCrawler(

333

http_client=http_client,

334

max_requests_per_crawl=50

335

)

336

337

@crawler.router.default_handler

338

async def handler(context: HttpCrawlingContext):

339

response = context.response

340

341

print(f"Crawled: {response.url}")

342

print(f"Status: {response.status_code}")

343

print(f"Content-Type: {response.content_type}")

344

345

# Extract data based on content type

346

if response.content_type and 'application/json' in response.content_type:

347

data = response.json()

348

await context.push_data(data)

349

else:

350

# Process HTML or other content

351

data = {

352

'url': response.url,

353

'status': response.status_code,

354

'title': 'Extracted from HTML' # Add your extraction logic

355

}

356

await context.push_data(data)

357

358

await crawler.run(['https://api.example.com/data'])

359

360

asyncio.run(main())

361

```

362

363

### Error Handling

364

365

```python

366

import asyncio

367

from crawlee.http_clients import HttpxHttpClient

368

from crawlee.errors import HttpStatusCodeError

369

370

async def main():

371

client = HttpxHttpClient()

372

373

try:

374

response = await client.send_request('https://httpbin.org/status/500')

375

376

# Check if response is successful

377

if not response.ok:

378

print(f"Request failed with status: {response.status_code}")

379

380

# Or raise exception for bad status

381

response.raise_for_status()

382

383

except HttpStatusCodeError as e:

384

print(f"HTTP error occurred: {e}")

385

print(f"Status code: {e.status_code}")

386

387

except Exception as e:

388

print(f"Other error occurred: {e}")

389

390

finally:

391

await client.close()

392

393

asyncio.run(main())

394

```