
Crawl

Create and manage asynchronous crawling jobs to extract content from multiple pages at scale. Jobs run in the background, so you can process entire websites without waiting on a single synchronous response.

Quick Example

cURL

curl -X POST https://api.supacrawler.com/api/v1/crawl \
  -H "Authorization: Bearer YOUR_API_KEY" \
  -H "Content-Type: application/json" \
  -d '{
    "url": "https://example.com/docs",
    "depth": 2,
    "link_limit": 10
  }'

Python

from supacrawler import SupacrawlerClient
import os

client = SupacrawlerClient(api_key=os.environ.get('SUPACRAWLER_API_KEY'))

# Start an asynchronous crawl job
job = client.create_crawl_job(
    url="https://example.com/docs",
    depth=2,
    link_limit=10
)

# Poll until the job finishes, checking every 3 seconds
final = client.wait_for_crawl(job.job_id, interval_seconds=3.0)
print(f"Crawled {len(final.data)} pages")

JavaScript

import { SupacrawlerClient } from '@supacrawler/js'

const client = new SupacrawlerClient({ apiKey: process.env.SUPACRAWLER_API_KEY })

// Start an asynchronous crawl job
const job = await client.createCrawlJob({
  url: 'https://example.com/docs',
  depth: 2,
  link_limit: 10
})

// Poll until the job finishes
const status = await client.waitForCrawl(job.job_id)
console.log('Pages crawled:', Object.keys(status.data?.crawl_data || {}).length)

Job Created

{
  "success": true,
  "job_id": "550e8400-e29b-41d4-a716-446655440000"
}

Create Crawl Job

Endpoint

POST /v1/crawl

Create an asynchronous crawling job to extract content from multiple pages.

Parameters

Prop          Type
url           string
depth         integer
link_limit    integer
patterns      array of strings

Request

cURL

curl -X POST https://api.supacrawler.com/api/v1/crawl \
  -H "Authorization: Bearer YOUR_API_KEY" \
  -H "Content-Type: application/json" \
  -d '{
    "url": "https://example.com",
    "depth": 2,
    "link_limit": 10,
    "patterns": ["/docs/*"]
  }'

Python

import requests

response = requests.post(
    "https://api.supacrawler.com/api/v1/crawl",
    headers={
        "Authorization": "Bearer YOUR_API_KEY",
        "Content-Type": "application/json"
    },
    json={
        "url": "https://example.com",
        "depth": 2,
        "link_limit": 10,
        "patterns": ["/docs/*"]
    }
)
print(response.json())

JavaScript

const response = await fetch(
  'https://api.supacrawler.com/api/v1/crawl',
  {
    method: 'POST',
    headers: {
      'Authorization': 'Bearer YOUR_API_KEY',
      'Content-Type': 'application/json'
    },
    body: JSON.stringify({
      url: 'https://example.com',
      depth: 2,
      link_limit: 10,
      patterns: ['/docs/*']
    })
  }
);

const result = await response.json();
console.log(result);

Response

{
  "success": true,
  "job_id": "550e8400-e29b-41d4-a716-446655440000"
}

Get Job Status

Endpoint

GET /v1/crawl/{jobId}

Poll this endpoint to check job status and retrieve results when completed.

Parameters

Prop     Type
jobId    string

Request

cURL

curl https://api.supacrawler.com/api/v1/crawl/550e8400-e29b-41d4-a716-446655440000 \
  -H "Authorization: Bearer YOUR_API_KEY"

Python

import requests

job_id = "550e8400-e29b-41d4-a716-446655440000"
response = requests.get(
    f"https://api.supacrawler.com/api/v1/crawl/{job_id}",
    headers={"Authorization": "Bearer YOUR_API_KEY"}
)
print(response.json())

JavaScript

const jobId = '550e8400-e29b-41d4-a716-446655440000';
const response = await fetch(
  `https://api.supacrawler.com/api/v1/crawl/${jobId}`,
  { headers: { 'Authorization': 'Bearer YOUR_API_KEY' } }
);

const result = await response.json();
console.log(result);

Response

{
  "job_id": "550e8400-e29b-41d4-a716-446655440000",
  "status": "completed",
  "data": {
    "url": "https://example.com",
    "crawl_data": {
      "https://example.com": {
        "markdown": "# Example Domain...",
        "links": ["https://example.com/about"],
        "metadata": {
          "title": "Example Domain",
          "status_code": 200
        }
      }
    },
    "statistics": {
      "total_pages": 2,
      "successful_pages": 2,
      "failed_pages": 0
    }
  }
}
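
Because jobs run asynchronously, a client typically polls this endpoint until the job reaches a terminal status, then reads the per-page results out of crawl_data. Below is a minimal polling sketch in Python using requests; it assumes "completed" is the terminal success status (as in the response above) and treats "failed" as an assumed terminal error value not confirmed by this page.

import time
import requests

API_KEY = "YOUR_API_KEY"
JOB_ID = "550e8400-e29b-41d4-a716-446655440000"

# Poll until the job reaches a terminal status ("failed" is assumed, not documented above)
while True:
    response = requests.get(
        f"https://api.supacrawler.com/api/v1/crawl/{JOB_ID}",
        headers={"Authorization": f"Bearer {API_KEY}"},
    )
    result = response.json()
    if result.get("status") in ("completed", "failed"):
        break
    time.sleep(3)  # pause between polls

# On success, crawl_data maps each crawled URL to its extracted content
if result["status"] == "completed":
    for url, page in result["data"]["crawl_data"].items():
        print(url, "->", page["metadata"]["title"])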

Response Model

Prop               Type
job_id             string
status             string
data               object
data.url           string
data.crawl_data    object (keyed by page URL)
data.statistics    object
