Crawl Endpoint

Recursively crawl websites using the TypeScript SDK

Recursively scrape multiple pages from a website, following links to discover and extract content.

Basic Usage

import { SupacrawlerClient, CrawlCreateRequest } from '@supacrawler/js'

const client = new SupacrawlerClient({ apiKey: 'YOUR_API_KEY' })

// Create crawl job
const job = await client.createCrawlJob({
  url: 'https://supabase.com/docs',
  format: CrawlCreateRequest.format.MARKDOWN,
  link_limit: 10,
  depth: 2,
  include_subdomains: false,
  render_js: false
})

console.log('job created:', job.job_id)

Waiting for Completion

// Wait for crawl to complete
const status = await client.waitForCrawl(job.job_id, {
  intervalMs: 3000,    // Check every 3 seconds
  timeoutMs: 600000    // 10 minute timeout
})

console.log('final status:', status.status)

if (status.data?.crawl_data) {
  console.log('crawled pages:', Object.keys(status.data.crawl_data).length)
}
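
A crawl does not always reach 'completed'. The sketch below branches on the status values listed under Response Structure further down ('completed', 'processing', 'failed'); whether waitForCrawl also throws on a failed job can depend on the SDK version, so treat this as illustrative:

// Branch on the final status reported by waitForCrawl
if (status.status === 'completed') {
  console.log('crawl finished successfully')
} else if (status.status === 'failed') {
  console.error('crawl failed for job:', job.job_id)
} else {
  console.log('crawl still in progress:', status.status)
}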

Job Parameters

const job = await client.createCrawlJob({
  url: 'https://docs.example.com',
  format: CrawlCreateRequest.format.MARKDOWN,
  link_limit: 50,           // Max 50 pages
  depth: 3,                 // 3 levels deep
  include_subdomains: false // Stay on the same domain; don't follow subdomains
})

JavaScript Rendering

const job = await client.createCrawlJob({
  url: 'https://spa-site.com',
  format: CrawlCreateRequest.format.MARKDOWN,
  render_js: true,  // Render JS for each page
  link_limit: 20,
  depth: 2
})

Processing Results

const result = await client.waitForCrawl(job.job_id)

// Access crawled data
const pages = result.data?.crawl_data

if (pages) {
  for (const [url, page] of Object.entries(pages)) {
    console.log(`URL: ${url}`)
    console.log(`title: ${page.metadata?.title}`)
    console.log(`content preview: ${page.markdown?.substring(0, 200)}...`)
    console.log('-'.repeat(50))
  }
}
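
The same map can be reshaped for downstream use. Here is a sketch, using only the fields shown under Response Structure below, that skips non-200 pages and concatenates the rest into a single markdown string:

// Keep only pages that returned HTTP 200 (default to 200 if no status_code)
const entries = Object.entries(pages ?? {})
  .filter(([, page]) => (page.metadata?.status_code ?? 200) === 200)

// Join the remaining pages into one markdown document
const combined = entries
  .map(([url, page]) => `# ${page.metadata?.title ?? url}\n\n${page.markdown ?? ''}`)
  .join('\n\n---\n\n')

console.log(`combined ${entries.length} pages, ${combined.length} characters total`)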

Complete Example

import { SupacrawlerClient, CrawlCreateRequest } from '@supacrawler/js'
import { writeFile } from 'node:fs/promises'

async function main() {
  const client = new SupacrawlerClient({
    apiKey: process.env.SUPACRAWLER_API_KEY || 'YOUR_API_KEY'
  })

  try {
    // Create job
    const job = await client.createCrawlJob({
      url: 'https://supabase.com/docs',
      format: CrawlCreateRequest.format.MARKDOWN,
      link_limit: 25,
      depth: 2,
      include_subdomains: false,
      render_js: false
    })

    console.log(`crawl job started: ${job.job_id}`)

    // Wait for completion
    const result = await client.waitForCrawl(job.job_id, {
      intervalMs: 5000,
      timeoutMs: 600000
    })

    console.log(`\n✅ crawl completed!`)
    console.log(`status: ${result.status}`)

    // Save pages
    const pages = result.data?.crawl_data
    if (pages) {
      let i = 1
      for (const [url, page] of Object.entries(pages)) {
        const filename = `page_${i}.md`
        await writeFile(filename, `# ${page.metadata?.title}\n\nURL: ${url}\n\n${page.markdown || ''}`)
        console.log(`saved ${filename}`)
        i++
      }
    }
  } catch (error) {
    console.error('crawl failed:', error)
    process.exit(1)
  }
}

main()
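
The example above writes numbered files. If filenames should reflect the crawled URLs instead, a small helper (purely illustrative, not part of the SDK) can derive a slug from each URL:

// Turn a URL into a filesystem-safe markdown filename, e.g.
// https://supabase.com/docs/guides -> supabase.com-docs-guides.md
function filenameForUrl(url: string): string {
  const { hostname, pathname } = new URL(url)
  const slug = `${hostname}${pathname}`
    .replace(/\/+$/, '')              // drop trailing slashes
    .replace(/[^a-zA-Z0-9.]+/g, '-')  // collapse unsafe characters into '-'
  return `${slug}.md`
}

Inside the save loop, `const filename = filenameForUrl(url)` would replace the `page_${i}.md` counter.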

Response Structure

interface CrawlResponse {
  job_id: string
  status: string  // 'completed', 'processing', 'failed'
  created_at: string
  data?: {
    crawl_data: {
      [url: string]: {
        markdown?: string
        html?: string
        metadata?: {
          title?: string
          description?: string
          status_code?: number
          source_url?: string
        }
      }
    }
  }
}
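
Because `status` is a plain string and `data` is optional, a small type guard keeps the happy path free of optional chaining. This is a sketch against the shape shown above, not an SDK export; the library's actual exported types may differ:

// Narrow a response to the completed case before reading crawl_data
function hasCrawlData(
  result: CrawlResponse
): result is CrawlResponse & { data: NonNullable<CrawlResponse['data']> } {
  return result.status === 'completed' && !!result.data?.crawl_data
}

// Usage with the result returned by waitForCrawl
if (hasCrawlData(result)) {
  console.log('crawled pages:', Object.keys(result.data.crawl_data).length)
}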

Error Handling

try {
  const job = await client.createCrawlJob({
    url: 'https://example.com',
    format: CrawlCreateRequest.format.MARKDOWN,
    link_limit: 10
  })

  const result = await client.waitForCrawl(job.job_id, {
    timeoutMs: 300000
  })

  if (result.status === 'completed') {
    console.log('✅ crawl completed successfully')
  }
} catch (error) {
  console.error('crawl error:', error)
}
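
Transient failures (network hiccups, the timeout elapsing) surface here as thrown errors, so a simple retry wrapper can be layered on top. The helper below is a sketch with arbitrary retry settings, not an SDK feature:

// Retry an async operation a few times with a fixed delay between attempts
async function withRetry<T>(fn: () => Promise<T>, attempts = 3, delayMs = 5000): Promise<T> {
  let lastError: unknown
  for (let i = 0; i < attempts; i++) {
    try {
      return await fn()
    } catch (error) {
      lastError = error
      if (i < attempts - 1) {
        console.warn(`attempt ${i + 1} failed, retrying in ${delayMs}ms`)
        await new Promise((resolve) => setTimeout(resolve, delayMs))
      }
    }
  }
  throw lastError
}

// Usage: retry the whole create-and-wait flow
const result = await withRetry(async () => {
  const job = await client.createCrawlJob({
    url: 'https://example.com',
    format: CrawlCreateRequest.format.MARKDOWN,
    link_limit: 10
  })
  return client.waitForCrawl(job.job_id, { timeoutMs: 300000 })
})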
