Parse Endpoint
AI-powered data extraction using the Python SDK
Basic Usage
from supacrawler import SupacrawlerClient
client = SupacrawlerClient(api_key="YOUR_API_KEY")
# Parse with natural language prompt
response = client.parse("""
Parse blog articles from https://supacrawler.com/blog and return JSON with:
- Title
- Published date
- Summary (2 sentences)
- Tags
- Reading time
""")
print(f"job ID: {response.job_id}")
Waiting for Results
# Wait for parsing to complete
result = client.wait_for_parse(response.job_id)
# Access parsed data
print(result.data) # Structured data based on your prompt
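wait_for_parse polls until the job finishes. It also accepts interval_seconds and timeout_seconds if you want to control how often it polls and how long it waits (the same parameters used in the Complete Example below):
# Poll every 2 seconds and give up after 2 minutes
result = client.wait_for_parse(
    response.job_id,
    interval_seconds=2.0,
    timeout_seconds=120.0
)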
Structured Extraction
With JSON Schema
# Define expected structure
schema = {
    "type": "object",
    "properties": {
        "title": {"type": "string"},
        "content": {"type": "string"},
        "links": {
            "type": "array",
            "items": {"type": "string"}
        }
    },
    "required": ["title", "content"]
}
response = client.parse(
    "Extract the page title, main content, and any links from https://httpbin.org/html",
    schema=schema,
    output_format="json"
)
result = client.wait_for_parse(response.job_id)
print(result.data)
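If you want to confirm locally that the returned data really conforms to your schema, one option is the third-party jsonschema package (not part of the SDK); a minimal sketch:
from jsonschema import validate, ValidationError
try:
    validate(instance=result.data, schema=schema)
    print("data matches schema")
except ValidationError as e:
    print(f"schema mismatch: {e.message}")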
Product Information
# Extract product details
response = client.parse("""
From https://example.com/product/123, extract:
- Product name
- Price (USD)
- Available sizes
- Color options
- Customer rating
- In stock status
""")
result = client.wait_for_parse(response.job_id)
product_data = result.data
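Because this request uses a free-form prompt rather than a schema, the exact keys in the result depend on how the model names the fields; the keys below are assumptions, so read them defensively:
# Key names are assumptions based on the prompt above
name = product_data.get("product_name") or product_data.get("name")
price = product_data.get("price")
in_stock = product_data.get("in_stock")
print(f"{name}: {price} (in stock: {in_stock})")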
Output Formats
JSON (Default)
response = client.parse(
    "Extract contact information from https://example.com/contact",
    output_format="json"
)
result = client.wait_for_parse(response.job_id)
import json
print(json.dumps(result.data, indent=2))
CSV Format
response = client.parse(
    "Extract pricing table from https://example.com/pricing",
    output_format="csv"
)
result = client.wait_for_parse(response.job_id)
print(result.data) # CSV formatted data
# Save to file
with open("pricing.csv", "w") as f:
    f.write(result.data)
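If you prefer a DataFrame, the same CSV string loads directly with pandas (an optional dependency, not required by the SDK):
import io
import pandas as pd
df = pd.read_csv(io.StringIO(result.data))
print(df.head())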
Markdown Format
response = client.parse(
    "Summarize the documentation from https://example.com/docs",
    output_format="markdown"
)
result = client.wait_for_parse(response.job_id)
print(result.data) # Markdown formatted summary
Advanced Use Cases
Multi-Page Extraction
# Parse automatically decides whether to crawl multiple pages
response = client.parse("""
From https://example.com/team, extract all team members with:
- Name
- Role
- Bio
- Social links
Note: Team members may be on multiple pages
""")
result = client.wait_for_parse(response.job_id)
team_data = result.data
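Since the prompt does not pin down a schema, the shape of team_data can vary (a bare list, or an object wrapping a list); the keys below are assumptions:
# Key names are assumptions based on the prompt above
members = team_data if isinstance(team_data, list) else team_data.get("members", [])
for member in members:
    print(f"{member.get('name')} - {member.get('role')}")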
Table Extraction
# Extract tables from pages
response = client.parse(
    "Extract all pricing tiers from https://example.com/pricing as a table",
    output_format="csv"
)
result = client.wait_for_parse(response.job_id)
# Process CSV data
import csv
import io
reader = csv.DictReader(io.StringIO(result.data))
# Column names depend on the table that was extracted; 'name' and 'price' are examples
for row in reader:
    print(f"tier: {row['name']}, price: {row['price']}")
News Articles
schema = {
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "title": {"type": "string"},
            "author": {"type": "string"},
            "date": {"type": "string"},
            "summary": {"type": "string"},
            "url": {"type": "string"}
        }
    }
}
response = client.parse(
    "Extract all news articles from https://example.com/news with title, author, date, and summary",
    schema=schema,
    output_format="json"
)
result = client.wait_for_parse(response.job_id)
articles = result.data
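Because the schema fixes the shape to an array of objects, the articles can be iterated directly:
for article in articles:
    print(f"{article.get('date', 'n/a')} - {article.get('title')} by {article.get('author')}")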
Complete Example
import os
import json
from dotenv import load_dotenv
from supacrawler import SupacrawlerClient
load_dotenv()
client = SupacrawlerClient(api_key=os.environ.get("SUPACRAWLER_API_KEY"))
# Define extraction schema
product_schema = {
    "type": "object",
    "properties": {
        "name": {"type": "string"},
        "price": {"type": "number"},
        "currency": {"type": "string"},
        "description": {"type": "string"},
        "features": {
            "type": "array",
            "items": {"type": "string"}
        },
        "availability": {"type": "boolean"}
    },
    "required": ["name", "price"]
}
# Parse with custom prompt
prompt = """
Extract product information from https://example.com/product/laptop-x1
Include: name, price, description, key features, and availability
"""
try:
    response = client.parse(
        prompt,
        schema=product_schema,
        output_format="json"
    )
    print(f"parsing job started: {response.job_id}")
    # Wait for completion
    result = client.wait_for_parse(
        response.job_id,
        interval_seconds=2.0,
        timeout_seconds=120.0
    )
    # Process results
    if result.data:
        product = result.data
        print("\n✅ product extracted:")
        print(json.dumps(product, indent=2))
        # Save to file
        with open("product_data.json", "w") as f:
            json.dump(product, f, indent=2)
    else:
        print("no data extracted")
except Exception as e:
    print(f"parse failed: {e}")
Error Handling
try:
    response = client.parse(
        "Extract data from https://example.com",
        output_format="json"
    )
    result = client.wait_for_parse(response.job_id)
    if result.data:
        print("✅ parsing successful")
        print(result.data)
    else:
        print("⚠️ no data returned")
except TimeoutError:
    print("parsing took too long")
except Exception as e:
    print(f"error: {e}")
Best Practices
Clear Prompts
# ✅ Good: Specific and clear
prompt = """
From https://example.com/pricing, extract:
1. Plan name (string)
2. Monthly price in USD (number)
3. Features list (array of strings)
4. Has free trial (boolean)
"""
# ❌ Bad: Too vague
prompt = "Get pricing info from example.com"
Structured Schemas
# ✅ Good: Well-defined schema
schema = {
    "type": "object",
    "properties": {
        "items": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "name": {"type": "string"},
                    "price": {"type": "number"}
                },
                "required": ["name", "price"]
            }
        }
    }
}
# ❌ Bad: Too loose
schema = {"type": "object"}
Handle Timeouts
# Set appropriate timeout based on complexity
simple_parse = client.wait_for_parse(job_id, timeout_seconds=60.0)
# Longer timeout for complex multi-page extraction
complex_parse = client.wait_for_parse(job_id, timeout_seconds=300.0)
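If a job occasionally outlives the timeout, one simple pattern is to catch the TimeoutError and retry the wait with a longer limit (a sketch; it assumes the job is still running server-side when you retry):
try:
    result = client.wait_for_parse(response.job_id, timeout_seconds=60.0)
except TimeoutError:
    # Job may still be running; wait longer before giving up
    result = client.wait_for_parse(response.job_id, timeout_seconds=300.0)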
Next Steps
- Scrape Endpoint - Basic content extraction
- Crawl Endpoint - Multi-page scraping
- Watch Endpoint - Monitor for changes