# Crawl Endpoint

Recursively crawl websites using the TypeScript SDK.
Recursively scrape multiple pages from a website, following links to discover and extract content.
## Basic Usage

```typescript
import { SupacrawlerClient, CrawlCreateRequest } from '@supacrawler/js'

const client = new SupacrawlerClient({ apiKey: 'YOUR_API_KEY' })

// Create crawl job
const job = await client.createCrawlJob({
  url: 'https://supabase.com/docs',
  format: CrawlCreateRequest.format.MARKDOWN,
  link_limit: 10,
  depth: 2,
  include_subdomains: false,
  render_js: false
})

console.log('job created:', job.job_id)
```
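The returned `job_id` is the handle for everything that follows. If the crawl may outlive the current process, one option is to persist it and resume waiting later. A minimal sketch (the file path is illustrative; everything else uses only the calls shown in this guide):

```typescript
import { readFile, writeFile } from 'node:fs/promises'

// Persist the job id so a later run (or another process) can pick the crawl back up.
await writeFile('last-crawl-job.txt', job.job_id)

// ...later, possibly in a different process with its own SupacrawlerClient:
const jobId = (await readFile('last-crawl-job.txt', 'utf8')).trim()
const status = await client.waitForCrawl(jobId)
```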
## Waiting for Completion

```typescript
// Wait for crawl to complete
const status = await client.waitForCrawl(job.job_id, {
  intervalMs: 3000,  // Check every 3 seconds
  timeoutMs: 600000  // 10 minute timeout
})

console.log('final status:', status.status)

if (status.data?.crawl_data) {
  console.log('crawled pages:', Object.keys(status.data.crawl_data).length)
}
```
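Since `waitForCrawl` polls until the crawl finishes (or the timeout elapses), long crawls produce no output while it is pending. One option for feedback in the meantime, purely a convenience sketch rather than an SDK feature, is to log a heartbeat alongside the wait:

```typescript
// Print a heartbeat every 15 seconds while waitForCrawl is pending.
const heartbeat = setInterval(() => console.log('still crawling...'), 15_000)
try {
  const status = await client.waitForCrawl(job.job_id, {
    intervalMs: 3000,
    timeoutMs: 600000
  })
  console.log('final status:', status.status)
} finally {
  clearInterval(heartbeat)
}
```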
## Job Parameters

### Link Limits and Depth

```typescript
const job = await client.createCrawlJob({
  url: 'https://docs.example.com',
  format: CrawlCreateRequest.format.MARKDOWN,
  link_limit: 50,            // Max 50 pages
  depth: 3,                  // 3 levels deep
  include_subdomains: false  // Same subdomain only
})
```
### JavaScript Rendering

```typescript
const job = await client.createCrawlJob({
  url: 'https://spa-site.com',
  format: CrawlCreateRequest.format.MARKDOWN,
  render_js: true,  // Render JS for each page
  link_limit: 20,
  depth: 2
})
```
## Processing Results

```typescript
const result = await client.waitForCrawl(job.job_id)

// Access crawled data
const pages = result.data?.crawl_data
if (pages) {
  for (const [url, page] of Object.entries(pages)) {
    console.log(`URL: ${url}`)
    console.log(`title: ${page.metadata?.title}`)
    console.log(`content preview: ${page.markdown?.substring(0, 200)}...`)
    console.log('-'.repeat(50))
  }
}
```
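A common follow-up is to merge every crawled page into a single document, for example to feed a search index. Here is a minimal sketch that reuses `client` and `job` from the snippets above and relies only on the `crawl_data` shape shown in this guide; the output filename is illustrative:

```typescript
import { writeFile } from 'node:fs/promises'

const result = await client.waitForCrawl(job.job_id)
const pages = result.data?.crawl_data ?? {}

// Join every page into one markdown document, using the page title
// (falling back to the URL) as a section heading.
const combined = Object.entries(pages)
  .map(([url, page]) => `## ${page.metadata?.title ?? url}\n\n<${url}>\n\n${page.markdown ?? ''}`)
  .join('\n\n---\n\n')

await writeFile('crawl.md', combined)
console.log(`wrote ${Object.keys(pages).length} pages to crawl.md`)
```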
## Complete Example

```typescript
import { SupacrawlerClient, CrawlCreateRequest } from '@supacrawler/js'

async function main() {
  const client = new SupacrawlerClient({
    apiKey: process.env.SUPACRAWLER_API_KEY || 'YOUR_API_KEY'
  })

  try {
    // Create job
    const job = await client.createCrawlJob({
      url: 'https://supabase.com/docs',
      format: CrawlCreateRequest.format.MARKDOWN,
      link_limit: 25,
      depth: 2,
      include_subdomains: false,
      render_js: false
    })
    console.log(`crawl job started: ${job.job_id}`)

    // Wait for completion
    const result = await client.waitForCrawl(job.job_id, {
      intervalMs: 5000,
      timeoutMs: 600000
    })
    console.log(`\n✅ crawl completed!`)
    console.log(`status: ${result.status}`)

    // Save pages
    const pages = result.data?.crawl_data
    if (pages) {
      let i = 1
      for (const [url, page] of Object.entries(pages)) {
        const filename = `page_${i}.md`
        await Bun.write(filename, `# ${page.metadata?.title}\n\nURL: ${url}\n\n${page.markdown || ''}`)
        console.log(`saved ${filename}`)
        i++
      }
    }
  } catch (error) {
    console.error('crawl failed:', error)
    process.exit(1)
  }
}

main()
```
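The example writes files with `Bun.write`, which requires the Bun runtime. If you run the script with Node instead, the equivalent inside the loop would use `fs/promises`; the rest of the example is unchanged:

```typescript
import { writeFile } from 'node:fs/promises'

// Node replacement for the Bun.write call in the loop above.
await writeFile(filename, `# ${page.metadata?.title}\n\nURL: ${url}\n\n${page.markdown || ''}`)
```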
## Response Structure

```typescript
interface CrawlResponse {
  job_id: string
  status: string  // 'completed', 'processing', 'failed'
  created_at: string
  data?: {
    crawl_data: {
      [url: string]: {
        markdown?: string
        html?: string
        metadata?: {
          title?: string
          description?: string
          status_code?: number
          source_url?: string
        }
      }
    }
  }
}
```
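The interface above makes it easy to write small helpers around the response. For example, a hypothetical `successfulPages` helper (our name, not part of the SDK) that keeps only pages whose `status_code` indicates success:

```typescript
// Hypothetical helper built on the CrawlResponse shape above:
// return only the pages that reported an HTTP 2xx status code.
function successfulPages(response: CrawlResponse) {
  const pages = response.data?.crawl_data ?? {}
  return Object.fromEntries(
    Object.entries(pages).filter(([, page]) => {
      const code = page.metadata?.status_code
      return code !== undefined && code >= 200 && code < 300
    })
  )
}
```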
## Error Handling

```typescript
try {
  const job = await client.createCrawlJob({
    url: 'https://example.com',
    format: CrawlCreateRequest.format.MARKDOWN,
    link_limit: 10
  })

  const result = await client.waitForCrawl(job.job_id, {
    timeoutMs: 300000
  })

  if (result.status === 'completed') {
    console.log('✅ crawl completed successfully')
  }
} catch (error) {
  console.error('crawl error:', error)
}
```
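A crawl can also finish with `status: 'failed'` rather than throwing, so it is worth branching on the status values listed in the response structure above. Whether `waitForCrawl` rejects or resolves when `timeoutMs` elapses is an assumption about the SDK here, so this sketch (reusing `job` from the snippet above) handles both paths:

```typescript
try {
  const result = await client.waitForCrawl(job.job_id, { timeoutMs: 300000 })

  switch (result.status) {
    case 'completed':
      console.log('✅ crawl completed successfully')
      break
    case 'failed':
      // The job finished but the server reported a failure; no crawl_data is expected.
      console.error('❌ crawl failed on the server side')
      break
    default:
      // Still 'processing': the wait likely returned before the job finished.
      console.warn(`crawl still in status '${result.status}'`)
  }
} catch (error) {
  // Network errors, invalid API keys, or (possibly) an elapsed timeout end up here.
  console.error('crawl error:', error)
}
```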
## Next Steps
- Scrape Endpoint - Single page extraction
- Screenshots Endpoint - Visual captures
- Watch Endpoint - Monitor changes