Crawl Endpoint

Recursively crawl websites using the Python SDK

Basic Usage

from supacrawler import SupacrawlerClient

client = SupacrawlerClient(api_key="YOUR_API_KEY")

# Create a crawl job
job = client.create_crawl_job(
    url="https://supabase.com/docs",
    format="markdown",
    link_limit=3,
    depth=1
)

print(f"job created: {job.job_id}")

Waiting for Completion

# Poll until completion
crawl_output = client.wait_for_crawl(
    job.job_id,
    interval_seconds=3.0,
    timeout_seconds=300.0
)

# Access results
print(f"status: {crawl_output.status}")
print(f"pages crawled: {len(crawl_output.data.crawl_data)}")

Job Parameters

URL and Format

job = client.create_crawl_job(
    url="https://example.com",
    format="markdown",  # or "html", "links"
)

Depth and Limits

job = client.create_crawl_job(
    url="https://docs.example.com",
    format="markdown",
    link_limit=50,            # Crawl at most 50 pages
    depth=2,                  # Follow links up to 2 levels deep
    include_subdomains=False  # Do not crawl other subdomains
)

JavaScript Rendering

job = client.create_crawl_job(
    url="https://spa-site.com",
    format="markdown",
    render_js=True,  # Render JavaScript for each page
    link_limit=10,
    depth=1
)

Processing Results

Access Crawled Data

# Get crawl results
crawl_output = client.wait_for_crawl(job.job_id)

# Iterate through all pages
for page in crawl_output.data.crawl_data:
    print(f"URL: {page.metadata.source_url}")
    print(f"Title: {page.metadata.title}")
    print(f"Content: {page.markdown[:200]}...")  # First 200 chars
    print("-" * 50)

Extract Specific Page

# Get the first page
crawl_data = crawl_output.data.crawl_data
first_page = crawl_data[0]

# Display content
print(first_page.markdown)
print(first_page.metadata.to_json())

Filter Pages

# Find pages containing specific keywords
keyword = "authentication"
auth_pages = [
    page for page in crawl_output.data.crawl_data
    if keyword in page.markdown.lower()
]

print(f"found {len(auth_pages)} pages about {keyword}")

Advanced Options

Include Subdomains

job = client.create_crawl_job(
    url="https://example.com",
    format="markdown",
    include_subdomains=True,  # Crawl blog.example.com, docs.example.com, etc.
    link_limit=100,
    depth=2
)

URL Patterns

# Start the crawl from the section of the site you care about
job = client.create_crawl_job(
    url="https://example.com/blog",
    format="markdown",
    link_limit=20,
    depth=2,
    # Additional filtering can be done after retrieval
)

# Wait for the crawl, then filter results by URL pattern
crawl_output = client.wait_for_crawl(job.job_id)
blog_posts = [
    page for page in crawl_output.data.crawl_data
    if "/blog/post/" in page.metadata.source_url
]
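
For more flexible matching than a plain substring check, you can filter with a regular expression. A minimal sketch using Python's standard re module against the same source_url field; the pattern shown is only an example.

import re

# Example pattern for post URLs like /blog/post/<slug> (adjust to your site)
post_pattern = re.compile(r"/blog/post/[\w-]+")

blog_posts = [
    page for page in crawl_output.data.crawl_data
    if post_pattern.search(page.metadata.source_url)
]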

Complete Example

import os
from dotenv import load_dotenv
from supacrawler import SupacrawlerClient

load_dotenv()

client = SupacrawlerClient(api_key=os.environ.get("SUPACRAWLER_API_KEY"))

# Create comprehensive crawl job
job = client.create_crawl_job(
    url="https://supabase.com/docs",
    format="markdown",
    link_limit=25,
    depth=2,
    include_subdomains=False,
    render_js=False
)

print(f"crawl job started: {job.job_id}")

# Wait for completion with progress updates
try:
    result = client.wait_for_crawl(
        job.job_id,
        interval_seconds=5.0,
        timeout_seconds=600.0
    )
    
    print(f"\n✅ crawl completed!")
    print(f"pages crawled: {len(result.data.crawl_data)}")
    
    # Save all pages to files
    for i, page in enumerate(result.data.crawl_data):
        filename = f"page_{i+1}.md"
        with open(filename, "w") as f:
            f.write(f"# {page.metadata.title}\n\n")
            f.write(f"URL: {page.metadata.source_url}\n\n")
            f.write(page.markdown)
        print(f"saved {filename}")
        
except Exception as e:
    print(f"crawl failed: {e}")

Job Status

Check Job Status

# Create job
job = client.create_crawl_job(
    url="https://example.com",
    format="markdown",
    link_limit=10
)

# Check status without blocking
status = client.get_crawl_status(job.job_id)
print(f"status: {status.status}")
print(f"progress: {status.progress}")

Response Structure

# Crawl output structure
result = client.wait_for_crawl(job.job_id)

# Job information
print(result.job_id)           # Job ID
print(result.status)           # "completed", "processing", "failed"
print(result.created_at)       # Creation timestamp

# Data
crawl_data = result.data.crawl_data  # List of scraped pages

# Each page contains:
for page in crawl_data:
    print(page.markdown)                    # Markdown content
    print(page.html)                        # HTML content (if requested)
    print(page.metadata.source_url)         # Page URL
    print(page.metadata.title)              # Page title
    print(page.metadata.description)        # Meta description
    print(page.metadata.status_code)        # HTTP status
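
A quick way to sanity-check a crawl is to tally HTTP status codes across all pages, which surfaces broken links or error pages at a glance. A minimal sketch using only the metadata fields listed above:

from collections import Counter

status_counts = Counter(page.metadata.status_code for page in crawl_data)
print(status_counts)  # e.g. Counter({200: 24, 404: 1})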

Error Handling

from supacrawler import SupacrawlerClient

client = SupacrawlerClient(api_key="YOUR_API_KEY")

try:
    job = client.create_crawl_job(
        url="https://example.com",
        format="markdown",
        link_limit=10,
        depth=2
    )
    
    result = client.wait_for_crawl(
        job.job_id,
        timeout_seconds=300.0
    )
    
    if result.status == "completed":
        print(f"✅ crawled {len(result.data.crawl_data)} pages")
    else:
        print(f"⚠️ crawl status: {result.status}")
        
except TimeoutError:
    print("crawl took too long")
except Exception as e:
    print(f"error: {e}")

Use Cases

Documentation Crawler

# Crawl entire documentation site
job = client.create_crawl_job(
    url="https://docs.example.com",
    format="markdown",
    link_limit=200,
    depth=3,
    include_subdomains=False
)

result = client.wait_for_crawl(job.job_id)

# Build search index
docs_index = [
    {
        "url": page.metadata.source_url,
        "title": page.metadata.title,
        "content": page.markdown
    }
    for page in result.data.crawl_data
]
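
To reuse the index without re-crawling, you can persist it to disk. A minimal sketch that writes the list above to a JSON file (the filename is just an example):

import json

with open("docs_index.json", "w", encoding="utf-8") as f:
    json.dump(docs_index, f, ensure_ascii=False, indent=2)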

Blog Archive

# Crawl blog posts
job = client.create_crawl_job(
    url="https://example.com/blog",
    format="markdown",
    link_limit=100,
    depth=2
)

result = client.wait_for_crawl(job.job_id)

# Extract blog posts
blog_posts = [
    {
        "title": page.metadata.title,
        "url": page.metadata.source_url,
        "content": page.markdown,
        "date": page.metadata.modified_time
    }
    for page in result.data.crawl_data
    if "/blog/" in page.metadata.source_url
]
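
To order the archive from newest to oldest, you can sort on the modified_time field. A minimal sketch, assuming modified_time is an ISO 8601 timestamp string (posts without one sort last):

sorted_posts = sorted(
    blog_posts,
    key=lambda post: post["date"] or "",
    reverse=True
)

for post in sorted_posts[:5]:
    print(post["date"], post["title"])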
