Programmatic API

Use Smippo directly in your Node.js applications for automated captures, custom workflows, and integration with other tools.

Quick Start

import { capture } from 'smippo';

const result = await capture('https://example.com', {
  output: './mirror',
  depth: 2,
});

console.log(`Captured ${result.stats.pagesCapt} pages`);

capture() Function

The simplest way to capture a website programmatically.

Signature

async function capture(url: string, options?: CaptureOptions): Promise<CaptureResult>

Parameters

| Parameter | Type | Description |
| --- | --- | --- |
| `url` | `string` | URL to capture |
| `options` | `CaptureOptions` | Capture options (see below) |

Options

interface CaptureOptions {
  // Core
  output?: string;           // Output directory (default: './site')
  depth?: number;            // Crawl depth (default: 0)
  
  // Scope
  scope?: 'subdomain' | 'domain' | 'tld' | 'all';
  stayInDir?: boolean;
  externalAssets?: boolean;
  
  // Filtering
  include?: string[];        // URL patterns to include
  exclude?: string[];        // URL patterns to exclude
  mimeInclude?: string[];    // MIME types to include
  mimeExclude?: string[];    // MIME types to exclude
  maxSize?: number;          // Max file size in bytes
  minSize?: number;          // Min file size in bytes
  
  // Browser
  wait?: 'networkidle' | 'load' | 'domcontentloaded';
  waitTime?: number;         // Additional wait in ms
  timeout?: number;          // Page timeout in ms
  userAgent?: string;
  viewport?: { width: number; height: number };
  device?: string;           // Device name
  
  // Network
  proxy?: string;
  cookies?: string;          // Path to cookies.json
  headers?: Record<string, string>;
  
  // Output
  structure?: 'original' | 'flat' | 'domain';
  har?: boolean;             // Generate HAR file
  screenshot?: boolean;      // Screenshot each page
  pdf?: boolean;             // PDF each page
  noJs?: boolean;            // Strip JavaScript (static mode)
  
  // Performance
  concurrency?: number;      // Parallel workers (default: 8)
  maxPages?: number;
  maxTime?: number;          // Max time in ms
  rateLimit?: number;        // Delay between requests
  
  // Misc
  ignoreRobots?: boolean;
  useCache?: boolean;
  verbose?: boolean;
  quiet?: boolean;
}

Return Value

interface CaptureResult {
  stats: {
    pagesCapt: number;      // Pages captured
    assetsCapt: number;     // Assets saved
    totalSize: number;      // Total bytes
    duration: number;       // Time in ms
    errors: number;         // Error count
  };
  manifest: object;         // Full manifest data
}

Example

import { capture } from 'smippo';

async function archiveDocs() {
  const result = await capture('https://docs.example.com', {
    output: './docs-archive',
    depth: 5,
    scope: 'subdomain',
    externalAssets: true,
    noJs: true,  // Static mode
    maxPages: 500,
    concurrency: 4,
  });
  
  console.log('Archive complete!');
  console.log(`Pages: ${result.stats.pagesCapt}`);
  console.log(`Size: ${(result.stats.totalSize / 1024 / 1024).toFixed(2)} MB`);
  console.log(`Time: ${(result.stats.duration / 1000).toFixed(1)}s`);
}

archiveDocs();

Crawler Class

For advanced control with events and custom handling.

Basic Usage

import { Crawler } from 'smippo';

const crawler = new Crawler({
  url: 'https://example.com',
  output: './mirror',
  depth: 3,
});

const result = await crawler.start();

Event Handling

import { Crawler } from 'smippo';

const crawler = new Crawler({
  url: 'https://example.com',
  output: './mirror',
  depth: 3,
});

// Page started
crawler.on('page:start', ({ url }) => {
  console.log(`Starting: ${url}`);
});

// Page completed
crawler.on('page:complete', ({ url, size, localPath }) => {
  console.log(`Captured: ${url} (${size} bytes)`);
});

// Asset saved
crawler.on('asset:save', ({ url, localPath, size }) => {
  console.log(`Asset: ${url}`);
});

// Error occurred
crawler.on('error', ({ url, error }) => {
  console.error(`Failed: ${url} - ${error.message}`);
});

const result = await crawler.start();

Events Reference

| Event | Data | Description |
| --- | --- | --- |
| `page:start` | `{ url }` | Page capture starting |
| `page:complete` | `{ url, localPath, size, linksFound }` | Page captured |
| `asset:save` | `{ url, localPath, size }` | Asset saved |
| `error` | `{ url, error }` | Error occurred |

createServer() Function

Start a server programmatically.

Usage

import { createServer } from 'smippo';

const server = await createServer({
  directory: './site',
  port: 8080,
  open: true,  // Open browser
});

console.log(`Server running at ${server.url}`);

// Later: stop the server
await server.close();

Options

interface ServerOptions {
  directory?: string;    // Directory to serve (default: './site')
  port?: number;         // Port (default: 8080, auto-finds if busy)
  host?: string;         // Host (default: '127.0.0.1')
  open?: boolean;        // Open browser
  cors?: boolean;        // Enable CORS (default: true)
  verbose?: boolean;     // Log requests
  quiet?: boolean;       // Minimal output
}

Return Value

interface ServerInfo {
  server: http.Server;   // Node HTTP server
  port: number;          // Actual port used
  host: string;
  url: string;           // Full URL
  close(): Promise<void>; // Stop server
}

Utility Functions

URL Utilities

import { 
  normalizeUrl,
  isInScope,
  isSameOrigin,
  isLikelyPage,
  urlToPath 
} from 'smippo';

// Normalize URL
normalizeUrl('https://Example.com/Page/');
// → 'https://example.com/page'

// Check scope
isInScope('https://docs.example.com/page', {
  baseUrl: 'https://www.example.com',
  scope: 'domain'
});
// → true

// Check if URL looks like a page
isLikelyPage('https://example.com/about');
// → true

isLikelyPage('https://example.com/style.css');
// → false

Manifest Functions

import {
  createManifest,
  readManifest,
  writeManifest,
  manifestExists
} from 'smippo';

// Check if capture exists
if (manifestExists('./site')) {
  const manifest = await readManifest('./site');
  console.log(`Root URL: ${manifest.rootUrl}`);
  console.log(`Pages: ${manifest.stats.pagesCapt}`);
}

Filter Creation

import { createFilter } from 'smippo';

const filter = createFilter({
  baseUrl: 'https://example.com',
  scope: 'domain',
  include: ['*.html', '*.css'],
  exclude: ['*tracking*'],
  maxSize: 5 * 1024 * 1024,  // 5MB
});

filter.shouldFollow('https://example.com/page');  // true
filter.shouldFollow('https://other.com/page');    // false
filter.shouldDownload('text/html', 1000);         // true

Complete Example

import { Crawler, createServer } from 'smippo';

async function captureAndServe(url) {
  console.log(`Capturing ${url}...`);
  
  const crawler = new Crawler({
    url,
    output: './captured-site',
    depth: 3,
    scope: 'subdomain',
    externalAssets: true,
    concurrency: 4,
  });
  
  let pageCount = 0;
  crawler.on('page:complete', ({ url }) => {
    pageCount++;
    console.log(`[${pageCount}] ${url}`);
  });
  
  crawler.on('error', ({ url, error }) => {
    console.error(`Error: ${url} - ${error.message}`);
  });
  
  const result = await crawler.start();
  
  console.log('\nCapture complete!');
  console.log(`Pages: ${result.stats.pagesCapt}`);
  console.log(`Assets: ${result.stats.assetsCapt}`);
  console.log(`Errors: ${result.stats.errors}`);
  
  // Start server
  console.log('\nStarting server...');
  const server = await createServer({
    directory: './captured-site',
    open: true,
  });
  
  console.log(`View at: ${server.url}`);
  console.log('Press Ctrl+C to stop');
  
  // Keep running
  process.on('SIGINT', async () => {
    await server.close();
    process.exit(0);
  });
}

captureAndServe('https://docs.example.com');

Next Steps

Was this page helpful?