# Real-time Monitoring

WebSocket-based job monitoring with automatic fallback to polling for long-running crawl and batch operations.

## Core Monitoring Method

```typescript { .api }
/**
 * Create a watcher for a crawl or batch job
 * @param jobId - Job identifier to monitor
 * @param opts - Watcher configuration options
 * @returns Watcher instance for real-time updates
 */
watcher(jobId: string, opts?: WatcherOptions): Watcher;
```

## Watcher Configuration

```typescript { .api }
interface WatcherOptions {
  // Job type to monitor
  kind?: "crawl" | "batch";

  // Polling interval in seconds (fallback mode)
  pollInterval?: number;

  // Total timeout in seconds
  timeout?: number;
}
```

## Watcher Class

```typescript { .api }
/**
 * EventEmitter-based watcher for real-time job monitoring
 * Automatically handles WebSocket connection with polling fallback
 */
class Watcher extends EventEmitter {
  constructor(http: HttpClient, jobId: string, opts?: WatcherOptions);

  /**
   * Start monitoring the job
   * @returns Promise that resolves when connection is established
   */
  start(): Promise<void>;

  /**
   * Stop monitoring and close connections
   */
  close(): void;

  // Event methods inherited from EventEmitter
  on(event: 'document', listener: (document: Document & { id: string }) => void): this;
  on(event: 'snapshot', listener: (snapshot: CrawlJob | BatchScrapeJob) => void): this;
  on(event: 'done', listener: (result: JobCompletionEvent) => void): this;
  on(event: 'error', listener: (error: JobErrorEvent) => void): this;

  emit(event: 'document', document: Document & { id: string }): boolean;
  emit(event: 'snapshot', snapshot: CrawlJob | BatchScrapeJob): boolean;
  emit(event: 'done', result: JobCompletionEvent): boolean;
  emit(event: 'error', error: JobErrorEvent): boolean;
}
```

## Event Types

```typescript { .api }
// Job completion event
interface JobCompletionEvent {
  status: "completed" | "failed" | "cancelled";
  data: Document[];
  id: string;
}

// Job error event
interface JobErrorEvent {
  status: "failed";
  data: Document[];
  error: string;
  id: string;
}

// Job status snapshot (CrawlJob or BatchScrapeJob)
type JobSnapshot = CrawlJob | BatchScrapeJob;
```

## Usage Examples

### Basic Crawl Monitoring

```typescript
// Start a crawl job
const crawlResponse = await app.startCrawl('https://example.com', {
  limit: 100,
  scrapeOptions: { formats: ['markdown'] }
});

// Create watcher for real-time monitoring
const watcher = app.watcher(crawlResponse.id, {
  kind: 'crawl',
  pollInterval: 2,
  timeout: 300 // 5 minutes
});

// Listen for individual documents
watcher.on('document', (document) => {
  console.log(`New document scraped: ${document.metadata?.sourceURL}`);
  console.log(`Content length: ${document.markdown?.length || 0} characters`);
});

// Listen for job status updates
watcher.on('snapshot', (snapshot) => {
  console.log(`Progress: ${snapshot.completed}/${snapshot.total} - Status: ${snapshot.status}`);
  console.log(`Credits used: ${snapshot.creditsUsed || 0}`);
});

// Listen for job completion
watcher.on('done', (result) => {
  console.log(`Crawl ${result.status}! Total documents: ${result.data.length}`);
  watcher.close();
});

// Listen for errors
watcher.on('error', (error) => {
  console.error(`Crawl failed: ${error.error}`);
  watcher.close();
});

// Start monitoring
await watcher.start();
```

### Batch Job Monitoring

```typescript
const urls = Array.from({ length: 50 }, (_, i) =>
  `https://api.example.com/items/${i + 1}`
);

// Start batch job
const batchResponse = await app.startBatchScrape(urls, {
  options: { formats: ['json'] },
  maxConcurrency: 5
});

// Monitor batch progress
const watcher = app.watcher(batchResponse.id, {
  kind: 'batch',
  pollInterval: 3,
  timeout: 600 // 10 minutes
});

let processedCount = 0;
const results: Document[] = [];

watcher.on('document', (document) => {
  processedCount++;
  results.push(document);

  console.log(`Processed ${processedCount} documents`);

  if (document.metadata?.error) {
    console.log(`Error processing ${document.metadata.sourceURL}: ${document.metadata.error}`);
  }
});

watcher.on('snapshot', (snapshot) => {
  const progress = Math.round((snapshot.completed / snapshot.total) * 100);
  console.log(`Batch progress: ${progress}% (${snapshot.completed}/${snapshot.total})`);

  if (snapshot.creditsUsed) {
    console.log(`Credits used so far: ${snapshot.creditsUsed}`);
  }
});

watcher.on('done', (result) => {
  console.log(`Batch ${result.status}!`);
  console.log(`Total processed: ${results.length}`);

  // Process all results
  const successfulResults = results.filter(doc => !doc.metadata?.error);
  const failedResults = results.filter(doc => doc.metadata?.error);

  console.log(`Successful: ${successfulResults.length}, Failed: ${failedResults.length}`);

  watcher.close();
});

watcher.on('error', (error) => {
  console.error(`Batch monitoring error: ${error.error}`);
  watcher.close();
});

await watcher.start();
```

### Advanced Monitoring with Progress Tracking

```typescript
class CrawlProgressTracker {
  private startTime: number;
  private documentTimes: number[] = [];
  private errors: string[] = [];

  constructor(private watcher: Watcher) {
    this.startTime = Date.now();
    this.setupEventHandlers();
  }

  private setupEventHandlers() {
    this.watcher.on('document', (document) => {
      this.documentTimes.push(Date.now());

      if (document.metadata?.error) {
        this.errors.push(`${document.metadata.sourceURL}: ${document.metadata.error}`);
      }

      this.logProgress(document);
    });

    this.watcher.on('snapshot', (snapshot) => {
      this.logSnapshot(snapshot);
    });

    this.watcher.on('done', (result) => {
      this.logFinalStats(result);
    });
  }

  private logProgress(document: Document) {
    const elapsed = Date.now() - this.startTime;
    const rate = this.documentTimes.length / (elapsed / 1000);

    console.log(`Document ${this.documentTimes.length}: ${document.metadata?.sourceURL}`);
    console.log(`Current rate: ${rate.toFixed(2)} docs/sec`);
  }

  private logSnapshot(snapshot: CrawlJob | BatchScrapeJob) {
    const elapsed = Date.now() - this.startTime;
    const progress = (snapshot.completed / snapshot.total) * 100;
    const eta = snapshot.completed > 0
      ? ((snapshot.total - snapshot.completed) * elapsed / snapshot.completed) / 1000
      : 0;

    console.log(`\n--- Progress Update ---`);
    console.log(`Status: ${snapshot.status}`);
    console.log(`Progress: ${snapshot.completed}/${snapshot.total} (${progress.toFixed(1)}%)`);
    console.log(`Elapsed: ${(elapsed / 1000).toFixed(0)}s`);
    console.log(`ETA: ${eta.toFixed(0)}s`);
    console.log(`Credits: ${snapshot.creditsUsed || 0}`);
    console.log(`Errors: ${this.errors.length}`);
    console.log(`-----------------------\n`);
  }

  private logFinalStats(result: JobCompletionEvent) {
    const totalTime = Date.now() - this.startTime;
    const avgRate = result.data.length / (totalTime / 1000);

    console.log(`\n=== Final Statistics ===`);
    console.log(`Status: ${result.status}`);
    console.log(`Total documents: ${result.data.length}`);
    console.log(`Total time: ${(totalTime / 1000).toFixed(1)}s`);
    console.log(`Average rate: ${avgRate.toFixed(2)} docs/sec`);
    console.log(`Total errors: ${this.errors.length}`);

    if (this.errors.length > 0) {
      console.log(`\nErrors:`);
      this.errors.slice(0, 5).forEach(error => console.log(`- ${error}`));
      if (this.errors.length > 5) {
        console.log(`... and ${this.errors.length - 5} more`);
      }
    }
    console.log(`========================\n`);
  }
}

// Usage
const crawlResponse = await app.startCrawl('https://docs.example.com', {
  limit: 500,
  scrapeOptions: { formats: ['markdown'] }
});

const watcher = app.watcher(crawlResponse.id);
const tracker = new CrawlProgressTracker(watcher);

await watcher.start();
```

### Multiple Job Monitoring

```typescript
class MultiJobMonitor {
  private watchers: Map<string, Watcher> = new Map();
  private jobStats = new Map<string, {
    type: 'crawl' | 'batch';
    started: number;
    completed: number;
    total: number;
    status: string;
  }>();

  async addCrawlJob(url: string, options: any) {
    const response = await app.startCrawl(url, options);
    this.addWatcher(response.id, 'crawl');
    return response.id;
  }

  async addBatchJob(urls: string[], options: any) {
    const response = await app.startBatchScrape(urls, options);
    this.addWatcher(response.id, 'batch');
    return response.id;
  }

  private addWatcher(jobId: string, type: 'crawl' | 'batch') {
    const watcher = app.watcher(jobId, { kind: type });

    this.jobStats.set(jobId, {
      type,
      started: Date.now(),
      completed: 0,
      total: 0,
      status: 'starting'
    });

    watcher.on('snapshot', (snapshot) => {
      const stats = this.jobStats.get(jobId)!;
      stats.completed = snapshot.completed;
      stats.total = snapshot.total;
      stats.status = snapshot.status;

      this.logAllJobs();
    });

    watcher.on('done', (result) => {
      console.log(`Job ${jobId} ${result.status}`);
      this.watchers.delete(jobId);

      if (this.watchers.size === 0) {
        console.log('All jobs completed!');
      }
    });

    watcher.on('error', (error) => {
      console.error(`Job ${jobId} error: ${error.error}`);
      this.watchers.delete(jobId);
    });

    this.watchers.set(jobId, watcher);
    watcher.start();
  }

  private logAllJobs() {
    console.clear();
    console.log('=== Multi-Job Monitor ===');

    for (const [jobId, stats] of this.jobStats) {
      const elapsed = (Date.now() - stats.started) / 1000;
      const progress = stats.total > 0 ? (stats.completed / stats.total * 100) : 0;

      console.log(`${jobId.substring(0, 8)}... (${stats.type}): ${stats.status}`);
      console.log(`  Progress: ${stats.completed}/${stats.total} (${progress.toFixed(1)}%)`);
      console.log(`  Elapsed: ${elapsed.toFixed(0)}s`);
      console.log('');
    }
  }

  closeAll() {
    for (const watcher of this.watchers.values()) {
      watcher.close();
    }
    this.watchers.clear();
  }
}

// Usage
const monitor = new MultiJobMonitor();

// Start multiple jobs concurrently
await Promise.all([
  monitor.addCrawlJob('https://site1.example.com', { limit: 100 }),
  monitor.addCrawlJob('https://site2.example.com', { limit: 150 }),
  monitor.addBatchJob([
    'https://api.example.com/data1',
    'https://api.example.com/data2'
  ], { options: { formats: ['json'] } })
]);

// Jobs will be monitored automatically
// Call monitor.closeAll() when done
```

### Error Recovery and Retry Monitoring

```typescript
class RobustCrawlMonitor {
  private maxRetries = 3;
  private retryCount = 0;

  async startMonitoredCrawl(url: string, options: any) {
    while (this.retryCount < this.maxRetries) {
      try {
        const response = await app.startCrawl(url, options);
        return await this.monitorWithRetry(response.id);
      } catch (error) {
        this.retryCount++;
        console.log(`Attempt ${this.retryCount} failed:`, error);

        if (this.retryCount >= this.maxRetries) {
          throw new Error(`Failed after ${this.maxRetries} attempts`);
        }

        // Wait before retry
        await new Promise(resolve => setTimeout(resolve, 5000 * this.retryCount));
      }
    }
  }

  private async monitorWithRetry(jobId: string): Promise<Document[]> {
    return new Promise((resolve, reject) => {
      const watcher = app.watcher(jobId, {
        timeout: 300,
        pollInterval: 2
      });

      const documents: Document[] = [];
      let lastSnapshot: CrawlJob | null = null;

      watcher.on('document', (document) => {
        documents.push(document);
      });

      watcher.on('snapshot', (snapshot) => {
        lastSnapshot = snapshot as CrawlJob;
        console.log(`Progress: ${snapshot.completed}/${snapshot.total}`);
      });

      watcher.on('done', (result) => {
        if (result.status === 'completed') {
          resolve(documents);
        } else {
          reject(new Error(`Job ${result.status}: ${JSON.stringify(result)}`));
        }
        watcher.close();
      });

      watcher.on('error', (error) => {
        // Check if we got partial results
        if (documents.length > 0) {
          console.log(`Partial success: got ${documents.length} documents before error`);
          resolve(documents);
        } else {
          reject(new Error(error.error));
        }
        watcher.close();
      });

      watcher.start().catch(reject);
    });
  }
}

// Usage
const monitor = new RobustCrawlMonitor();

try {
  const documents = await monitor.startMonitoredCrawl('https://example.com', {
    limit: 200,
    scrapeOptions: { formats: ['markdown'] }
  });

  console.log(`Successfully crawled ${documents.length} documents`);
} catch (error) {
  console.error('Crawl failed completely:', error);
}
```