Crawler Class

The Crawler class provides full programmatic control over the capture process, with event hooks, custom filtering, and advanced configuration.

Overview

import { Crawler } from 'smippo';

const crawler = new Crawler({
  url: 'https://example.com',
  output: './mirror',
  depth: 3,
});

crawler.on('page:complete', ({ url }) => {
  console.log(`Captured: ${url}`);
});

const result = await crawler.start();

Constructor

new Crawler(options: CrawlerOptions)

Required Options

Option   Type     Description
url      string   Starting URL
output   string   Output directory

All Options

interface CrawlerOptions {
  // Required
  url: string;
  output: string;
  
  // Crawling
  depth?: number;                    // Default: 0
  scope?: 'subdomain' | 'domain' | 'tld' | 'all';
  stayInDir?: boolean;
  externalAssets?: boolean;
  
  // Filtering
  include?: string[];
  exclude?: string[];
  mimeInclude?: string[];
  mimeExclude?: string[];
  maxSize?: number;
  minSize?: number;
  
  // Browser
  wait?: 'networkidle' | 'load' | 'domcontentloaded';
  waitTime?: number;
  timeout?: number;
  userAgent?: string;
  viewport?: { width: number; height: number };
  device?: string;
  
  // Network
  proxy?: string;
  cookies?: string;
  headers?: Record<string, string>;
  captureAuth?: boolean;
  
  // Output
  structure?: 'original' | 'flat' | 'domain';
  har?: boolean;
  screenshot?: boolean;
  pdf?: boolean;
  noJs?: boolean;
  inlineCss?: boolean;
  
  // Performance
  concurrency?: number;              // Default: 8
  maxPages?: number;
  maxTime?: number;
  rateLimit?: number;
  
  // Robots
  ignoreRobots?: boolean;
  
  // Cache
  useCache?: boolean;
  
  // Logging
  verbose?: boolean;
  quiet?: boolean;
  logFile?: string;
  debug?: boolean;
}
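
For illustration, here is a sketch that combines several of these option groups. The option names come from the interface above; the values are arbitrary examples, not recommendations or defaults.

import { Crawler } from 'smippo';

// Illustrative combination of crawling, filtering, browser, output, and
// performance options. Values are example choices only.
const crawler = new Crawler({
  url: 'https://example.com',
  output: './mirror',

  // Crawling
  depth: 2,
  scope: 'domain',
  externalAssets: true,

  // Filtering
  exclude: ['*/admin/*'],
  maxSize: 5 * 1024 * 1024,      // skip responses larger than 5 MB (assuming bytes)

  // Browser
  wait: 'networkidle',
  viewport: { width: 1280, height: 800 },

  // Output
  structure: 'domain',
  screenshot: true,

  // Performance
  concurrency: 4,
});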

Methods

start()

Begin the crawl. Returns a promise that resolves with a CrawlerResult when the crawl finishes.

async start(): Promise<CrawlerResult>
const result = await crawler.start();

console.log(result.stats.pagesCapt);   // pages captured
console.log(result.stats.assetsCapt);  // assets saved
console.log(result.stats.errors);      // number of errors
console.log(result.stats.duration);    // duration in milliseconds
console.log(result.stats.totalSize);   // total bytes written

on()

Register event listeners.

on(event: string, callback: (data: any) => void): void
crawler.on('page:complete', (data) => {
  console.log(data.url);
});

emit()

Emit events (typically used internally).

emit(event: string, data: any): void
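
Should you need it directly, for example to exercise a listener in a test, a minimal sketch (assuming emit simply forwards the data object to registered listeners) could look like:

// Hypothetical use: fire an event manually to exercise a listener.
crawler.on('page:complete', ({ url }) => {
  console.log(`Handled: ${url}`);
});

crawler.emit('page:complete', { url: 'https://example.com/about' });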

Events

page:start

Fired when a page capture begins.

crawler.on('page:start', ({ url }) => {
  console.log(`Starting: ${url}`);
});

Event Data:

Property   Type     Description
url        string   Page URL

page:complete

Fired when a page is fully captured.

crawler.on('page:complete', ({ url, localPath, size, linksFound }) => {
  console.log(`Captured: ${url}`);
  console.log(`Saved to: ${localPath}`);
  console.log(`Size: ${size} bytes`);
  console.log(`Found ${linksFound} links`);
});

Event Data:

Property     Type     Description
url          string   Page URL
localPath    string   Local file path
size         number   File size in bytes
linksFound   number   Links discovered

asset:save

Fired when an asset (CSS, image, font, etc.) is saved.

crawler.on('asset:save', ({ url, localPath, size }) => {
  console.log(`Asset: ${url} (${size} bytes)`);
});

Event Data:

Property    Type     Description
url         string   Asset URL
localPath   string   Local file path
size        number   File size in bytes

error

Fired when an error occurs.

crawler.on('error', ({ url, error }) => {
  console.error(`Failed: ${url}`);
  console.error(`Reason: ${error.message}`);
});

Event Data:

Property   Type     Description
url        string   URL that failed
error      Error    Error object

Result Object

interface CrawlerResult {
  stats: {
    pagesCapt: number;      // Total pages captured
    assetsCapt: number;     // Total assets saved
    totalSize: number;      // Total bytes written
    duration: number;       // Time in milliseconds
    errors: number;         // Number of errors
  };
  manifest: {
    version: string;
    created: string;
    updated: string;
    rootUrl: string;
    options: object;
    stats: object;
    pages: Array<{
      url: string;
      localPath: string;
      status: number;
      captured: string;
      size: number;
      title: string;
    }>;
    assets: Array<{
      url: string;
      localPath: string;
      mimeType: string;
      size: number;
    }>;
    errors: Array<{
      url: string;
      error: string;
      time: string;
    }>;
  };
}
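
As a sketch, the result can be summarized after a crawl using only the fields defined above:

const result = await crawler.start();

// Report overall stats.
console.log(`Pages: ${result.stats.pagesCapt}, assets: ${result.stats.assetsCapt}`);
console.log(`Total size: ${(result.stats.totalSize / 1024 / 1024).toFixed(1)} MB`);

// List any failed URLs recorded in the manifest.
for (const { url, error, time } of result.manifest.errors) {
  console.warn(`${time} ${url}: ${error}`);
}

// Find the largest captured page (if any).
const largest = [...result.manifest.pages].sort((a, b) => b.size - a.size)[0];
if (largest) {
  console.log(`Largest page: ${largest.url} (${largest.size} bytes)`);
}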

Advanced Examples

Progress Tracking

import { Crawler } from 'smippo';

const crawler = new Crawler({
  url: 'https://docs.example.com',
  output: './docs',
  depth: 5,
  concurrency: 4,
});

let captured = 0;
let errors = 0;
const startTime = Date.now();

crawler.on('page:complete', ({ url }) => {
  captured++;
  const elapsed = ((Date.now() - startTime) / 1000).toFixed(1);
  console.log(`[${captured}] ${url} (${elapsed}s)`);
});

crawler.on('error', ({ url, error }) => {
  errors++;
  console.error(`[ERROR] ${url}: ${error.message}`);
});

const result = await crawler.start();

console.log('\n--- Summary ---');
console.log(`Captured: ${captured} pages`);
console.log(`Errors: ${errors}`);
console.log(`Duration: ${(result.stats.duration / 1000).toFixed(1)}s`);

Custom Filtering

import { Crawler, createFilter } from 'smippo';

const filter = createFilter({
  baseUrl: 'https://example.com',
  scope: 'domain',
  include: ['*/docs/*', '*/api/*'],
  exclude: ['*/admin/*', '*tracking*'],
  maxSize: 10 * 1024 * 1024,
});

const crawler = new Crawler({
  url: 'https://example.com',
  output: './filtered-site',
  depth: 5,
  include: filter.includePatterns,
  exclude: filter.excludePatterns,
  maxSize: filter.maxSize,
});

await crawler.start();

Batch Capturing

import { Crawler } from 'smippo';

const sites = [
  'https://site1.example.com',
  'https://site2.example.com',
  'https://site3.example.com',
];

async function captureSites(urls) {
  const results = [];
  
  for (const url of urls) {
    const domain = new URL(url).hostname;
    console.log(`\nCapturing ${domain}...`);
    
    const crawler = new Crawler({
      url,
      output: `./mirrors/${domain}`,
      depth: 3,
      scope: 'subdomain',
    });
    
    crawler.on('page:complete', ({ url }) => {
      console.log(`  ${url}`);
    });
    
    const result = await crawler.start();
    results.push({ domain, ...result.stats });
  }
  
  return results;
}

const results = await captureSites(sites);
console.log('\n--- All Results ---');
console.table(results);

With Timeout Handling

import { Crawler } from 'smippo';

async function captureWithTimeout(url, timeoutMs) {
  const crawler = new Crawler({
    url,
    output: './site',
    depth: 5,
    maxTime: timeoutMs,
  });
  
  let timedOut = false;
  const timer = setTimeout(() => {
    timedOut = true;
    console.log('Timeout reached, finishing up...');
  }, timeoutMs);
  
  try {
    const result = await crawler.start();
    clearTimeout(timer);
    
    if (timedOut) {
      console.log('Capture stopped due to timeout');
    }
    
    return result;
  } catch (error) {
    clearTimeout(timer);
    throw error;
  }
}

const result = await captureWithTimeout('https://large-site.com', 5 * 60 * 1000);
