Crawl Endpoint
Recursively crawl websites using the Python SDK
Basic Usage
from supacrawler import SupacrawlerClient
client = SupacrawlerClient(api_key="YOUR_API_KEY")
# Create a crawl job
job = client.create_crawl_job(
    url="https://supabase.com/docs",
    format="markdown",
    link_limit=3,
    depth=1
)
print(f"job created: {job.job_id}")
Waiting for Completion
# Poll until completion
crawl_output = client.wait_for_crawl(
    job.job_id,
    interval_seconds=3.0,
    timeout_seconds=300.0
)
# Access results
print(f"status: {crawl_output.status}")
print(f"pages crawled: {len(crawl_output.data.crawl_data)}")
Job Parameters
URL and Format
job = client.create_crawl_job(
    url="https://example.com",
    format="markdown",  # or "html", "links"
)
Link Limits and Depth
job = client.create_crawl_job(
    url="https://docs.example.com",
    format="markdown",
    link_limit=50,            # Maximum 50 pages
    depth=2,                  # Crawl 2 levels deep
    include_subdomains=False  # Do not follow links to subdomains
)
JavaScript Rendering
job = client.create_crawl_job(
    url="https://spa-site.com",
    format="markdown",
    render_js=True,  # Render JavaScript for each page
    link_limit=10,
    depth=1
)
Processing Results
Access Crawled Data
# Get crawl results
crawl_output = client.wait_for_crawl(job.job_id)
# Iterate through all pages
for page in crawl_output.data.crawl_data:
    print(f"URL: {page.metadata.source_url}")
    print(f"Title: {page.metadata.title}")
    print(f"Content: {page.markdown[:200]}...")  # First 200 chars
    print("-" * 50)
Extract Specific Page
# Get the first page
crawl_data = crawl_output.data.crawl_data
first_page = crawl_data[0]
# Display content
print(first_page.markdown)
print(first_page.metadata.to_json())
Filter Pages
# Find pages containing specific keywords
keyword = "authentication"
auth_pages = [
    page for page in crawl_output.data.crawl_data
    if keyword in page.markdown.lower()
]
print(f"found {len(auth_pages)} pages about {keyword}")
Advanced Options
Include Subdomains
job = client.create_crawl_job(
    url="https://example.com",
    format="markdown",
    include_subdomains=True,  # Crawl blog.example.com, docs.example.com, etc.
    link_limit=100,
    depth=2
)
URL Patterns
# Only crawl specific URL patterns
job = client.create_crawl_job(
    url="https://example.com/blog",
    format="markdown",
    link_limit=20,
    depth=2,
    # Additional filtering can be done after retrieval
)
# Wait for the job, then filter results by URL pattern
result = client.wait_for_crawl(job.job_id)
blog_posts = [
    page for page in result.data.crawl_data
    if "/blog/post/" in page.metadata.source_url
]
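For more flexible matching than a plain substring check, a regular expression over each page's source URL works the same way. A minimal sketch, assuming the result object retrieved above; the URL pattern itself is only an example.
import re

# Example pattern for dated posts such as /blog/post/2024/05/some-slug (hypothetical URL scheme)
pattern = re.compile(r"/blog/post/\d{4}/\d{2}/[\w-]+$")

dated_posts = [
    page for page in result.data.crawl_data
    if pattern.search(page.metadata.source_url or "")
]
print(f"matched {len(dated_posts)} pages")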
Complete Example
import os
from dotenv import load_dotenv
from supacrawler import SupacrawlerClient
load_dotenv()
client = SupacrawlerClient(api_key=os.environ.get("SUPACRAWLER_API_KEY"))
# Create comprehensive crawl job
job = client.create_crawl_job(
    url="https://supabase.com/docs",
    format="markdown",
    link_limit=25,
    depth=2,
    include_subdomains=False,
    render_js=False
)
print(f"crawl job started: {job.job_id}")
# Wait for completion with progress updates
try:
    result = client.wait_for_crawl(
        job.job_id,
        interval_seconds=5.0,
        timeout_seconds=600.0
    )
    print("\n✅ crawl completed!")
    print(f"pages crawled: {len(result.data.crawl_data)}")

    # Save all pages to files
    for i, page in enumerate(result.data.crawl_data):
        filename = f"page_{i + 1}.md"
        with open(filename, "w") as f:
            f.write(f"# {page.metadata.title}\n\n")
            f.write(f"URL: {page.metadata.source_url}\n\n")
            f.write(page.markdown)
        print(f"saved {filename}")
except Exception as e:
    print(f"crawl failed: {e}")
Job Status
Check Job Status
# Create job
job = client.create_crawl_job(
    url="https://example.com",
    format="markdown",
    link_limit=10
)
# Check status without blocking
status = client.get_crawl_status(job.job_id)
print(f"status: {status.status}")
print(f"progress: {status.progress}")
Response Structure
# Crawl output structure
result = client.wait_for_crawl(job.job_id)
# Job information
print(result.job_id) # Job ID
print(result.status) # "completed", "processing", "failed"
print(result.created_at) # Creation timestamp
# Data
crawl_data = result.data.crawl_data # List of scraped pages
# Each page contains:
for page in crawl_data:
    print(page.markdown)              # Markdown content
    print(page.html)                  # HTML content (if requested)
    print(page.metadata.source_url)   # Page URL
    print(page.metadata.title)        # Page title
    print(page.metadata.description)  # Meta description
    print(page.metadata.status_code)  # HTTP status
Error Handling
from supacrawler import SupacrawlerClient
client = SupacrawlerClient(api_key="YOUR_API_KEY")
try:
    job = client.create_crawl_job(
        url="https://example.com",
        format="markdown",
        link_limit=10,
        depth=2
    )
    result = client.wait_for_crawl(
        job.job_id,
        timeout_seconds=300.0
    )

    if result.status == "completed":
        print(f"✅ crawled {len(result.data.crawl_data)} pages")
    else:
        print(f"⚠️ crawl status: {result.status}")
except TimeoutError:
    print("crawl took too long")
except Exception as e:
    print(f"error: {e}")
Use Cases
Documentation Crawler
# Crawl entire documentation site
job = client.create_crawl_job(
    url="https://docs.example.com",
    format="markdown",
    link_limit=200,
    depth=3,
    include_subdomains=False
)
result = client.wait_for_crawl(job.job_id)
# Build search index
docs_index = [
    {
        "url": page.metadata.source_url,
        "title": page.metadata.title,
        "content": page.markdown
    }
    for page in result.data.crawl_data
]
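With the index in memory you can already run a naive keyword search over it; a real deployment would likely use a full-text or vector search engine instead. A minimal sketch, assuming the docs_index list built above.
def search_docs(index, query):
    """Naive substring search over title and content (sketch only)."""
    query = query.lower()
    return [
        doc for doc in index
        if query in (doc["title"] or "").lower() or query in doc["content"].lower()
    ]

for doc in search_docs(docs_index, "authentication"):
    print(doc["url"], "-", doc["title"])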
Blog Archive
# Crawl blog posts
job = client.create_crawl_job(
    url="https://example.com/blog",
    format="markdown",
    link_limit=100,
    depth=2
)
result = client.wait_for_crawl(job.job_id)
# Extract blog posts
blog_posts = [
    {
        "title": page.metadata.title,
        "url": page.metadata.source_url,
        "content": page.markdown,
        "date": page.metadata.modified_time
    }
    for page in result.data.crawl_data
    if "/blog/" in page.metadata.source_url
]
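To keep the archive around, the extracted posts can be written straight to a JSON file with the standard library. A minimal sketch, assuming blog_posts as built above; the filename is illustrative, and default=str covers any datetime fields such as modified_time.
import json

with open("blog_archive.json", "w") as f:
    json.dump(blog_posts, f, indent=2, default=str)  # default=str serializes datetime values
print(f"saved {len(blog_posts)} posts to blog_archive.json")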
Next Steps
- Parse Endpoint - Extract structured data with AI
- Screenshots Endpoint - Capture visual snapshots
- Watch Endpoint - Monitor for changes