
Parse Endpoint

AI-powered data extraction using the Python SDK

Basic Usage

from supacrawler import SupacrawlerClient

client = SupacrawlerClient(api_key="YOUR_API_KEY")

# Parse with natural language prompt
response = client.parse("""
Parse blog articles from https://supacrawler.com/blog and return JSON with:
  - Title
  - Published date
  - Summary (2 sentences)
  - Tags
  - Reading time
""")

print(f"job ID: {response.job_id}")

Waiting for Results

# Wait for parsing to complete
result = client.wait_for_parse(response.job_id)

# Access parsed data
print(result.data)  # Structured data based on your prompt
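The shape of result.data mirrors your prompt: typically a list of objects for multi-item extractions like the blog example above, or a single object otherwise. A minimal sketch for iterating it defensively (the key names here are illustrative assumptions; the actual keys follow your prompt's wording):

# Iterate parsed articles; key names depend on your prompt
articles = result.data if isinstance(result.data, list) else [result.data]
for article in articles:
    print(article.get("title"), "-", article.get("tags"))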

Structured Extraction

With JSON Schema

# Define expected structure
schema = {
    "type": "object",
    "properties": {
        "title": {"type": "string"},
        "content": {"type": "string"},
        "links": {
            "type": "array",
            "items": {"type": "string"}
        }
    },
    "required": ["title", "content"]
}

response = client.parse(
    "Extract the page title, main content, and any links from https://httpbin.org/html",
    schema=schema,
    output_format="json"
)

result = client.wait_for_parse(response.job_id)
print(result.data)
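Because the schema is a standard JSON Schema document, you can also validate the returned payload locally. A sketch using the third-party jsonschema package (an extra dependency, not part of the SDK):

# Validate the parsed data against the same schema (pip install jsonschema)
from jsonschema import ValidationError, validate

try:
    validate(instance=result.data, schema=schema)
    print("data matches the schema")
except ValidationError as e:
    print(f"schema mismatch: {e.message}")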

Product Information

# Extract product details
response = client.parse("""
From https://example.com/product/123, extract:
- Product name
- Price (USD)
- Available sizes
- Color options
- Customer rating
- In stock status
""")

result = client.wait_for_parse(response.job_id)
product_data = result.data

Output Formats

JSON (Default)

response = client.parse(
    "Extract contact information from https://example.com/contact",
    output_format="json"
)

import json

result = client.wait_for_parse(response.job_id)
print(json.dumps(result.data, indent=2))

CSV Format

response = client.parse(
    "Extract pricing table from https://example.com/pricing",
    output_format="csv"
)

result = client.wait_for_parse(response.job_id)
print(result.data)  # CSV formatted data

# Save to file
with open("pricing.csv", "w") as f:
    f.write(result.data)
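If you prefer a DataFrame for analysis, the CSV string loads directly with pandas (an optional dependency, not required by the SDK):

# Load the CSV output into a DataFrame (pip install pandas)
import io
import pandas as pd

df = pd.read_csv(io.StringIO(result.data))
print(df.head())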

Markdown Format

response = client.parse(
    "Summarize the documentation from https://example.com/docs",
    output_format="markdown"
)

result = client.wait_for_parse(response.job_id)
print(result.data)  # Markdown formatted summary
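As with CSV, the markdown string can be written straight to a file:

# Save the summary as a markdown file
with open("docs_summary.md", "w") as f:
    f.write(result.data)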

Advanced Use Cases

Multi-Page Extraction

# Parse automatically decides whether to crawl multiple pages
response = client.parse("""
From https://example.com/team, extract all team members with:
- Name
- Role
- Bio
- Social links
Note: Team members may be on multiple pages
""")

result = client.wait_for_parse(response.job_id)
team_data = result.data
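One way to post-process the team list, assuming it comes back as a list of dicts with the fields requested in the prompt (the key names are assumptions):

# Group team members by role; keys mirror the prompt's fields
from collections import defaultdict

by_role = defaultdict(list)
for member in team_data:
    by_role[member.get("role", "unknown")].append(member.get("name"))

for role, names in by_role.items():
    print(f"{role}: {', '.join(names)}")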

Table Extraction

# Extract tables from pages
response = client.parse(
    "Extract all pricing tiers from https://example.com/pricing as a table",
    output_format="csv"
)

result = client.wait_for_parse(response.job_id)

# Process CSV data
import csv
import io

# Column names come from the extracted table's headers;
# "name" and "price" here are illustrative
reader = csv.DictReader(io.StringIO(result.data))
for row in reader:
    print(f"tier: {row['name']}, price: {row['price']}")

News Articles

schema = {
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "title": {"type": "string"},
            "author": {"type": "string"},
            "date": {"type": "string"},
            "summary": {"type": "string"},
            "url": {"type": "string"}
        }
    }
}

response = client.parse(
    "Extract all news articles from https://example.com/news with title, author, date, and summary",
    schema=schema,
    output_format="json"
)

result = client.wait_for_parse(response.job_id)
articles = result.data
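The schema's date field is a plain string, so sorting requires parsing it first. A sketch assuming ISO 8601 date strings (adjust the parsing to whatever format the site actually uses):

# Sort articles newest-first; assumes ISO 8601 dates
from datetime import datetime

articles.sort(
    key=lambda a: datetime.fromisoformat(a.get("date", "1970-01-01")),
    reverse=True,
)
for article in articles[:5]:
    print(article.get("date"), article.get("title"))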

Complete Example

import os
import json
from dotenv import load_dotenv
from supacrawler import SupacrawlerClient

load_dotenv()

client = SupacrawlerClient(api_key=os.environ.get("SUPACRAWLER_API_KEY"))

# Define extraction schema
product_schema = {
    "type": "object",
    "properties": {
        "name": {"type": "string"},
        "price": {"type": "number"},
        "currency": {"type": "string"},
        "description": {"type": "string"},
        "features": {
            "type": "array",
            "items": {"type": "string"}
        },
        "availability": {"type": "boolean"}
    },
    "required": ["name", "price"]
}

# Parse with custom prompt
prompt = """
Extract product information from https://example.com/product/laptop-x1
Include: name, price, description, key features, and availability
"""

try:
    response = client.parse(
        prompt,
        schema=product_schema,
        output_format="json"
    )
    
    print(f"parsing job started: {response.job_id}")
    
    # Wait for completion
    result = client.wait_for_parse(
        response.job_id,
        interval_seconds=2.0,
        timeout_seconds=120.0
    )
    
    # Process results
    if result.data:
        product = result.data
        print(f"\nβœ… product extracted:")
        print(json.dumps(product, indent=2))
        
        # Save to file
        with open("product_data.json", "w") as f:
            json.dump(product, f, indent=2)
    else:
        print("no data extracted")
        
except Exception as e:
    print(f"parse failed: {e}")

Error Handling

try:
    response = client.parse(
        "Extract data from https://example.com",
        output_format="json"
    )
    
    result = client.wait_for_parse(response.job_id)
    
    if result.data:
        print("βœ… parsing successful")
        print(result.data)
    else:
        print("⚠️ no data returned")
        
except TimeoutError:
    print("parsing took too long")
except Exception as e:
    print(f"error: {e}")

Best Practices

Clear Prompts

# βœ… Good: Specific and clear
prompt = """
From https://example.com/pricing, extract:
1. Plan name (string)
2. Monthly price in USD (number)
3. Features list (array of strings)
4. Has free trial (boolean)
"""

# ❌ Bad: Too vague
prompt = "Get pricing info from example.com"

Structured Schemas

# βœ… Good: Well-defined schema
schema = {
    "type": "object",
    "properties": {
        "items": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "name": {"type": "string"},
                    "price": {"type": "number"}
                },
                "required": ["name", "price"]
            }
        }
    }
}

# ❌ Bad: Too loose
schema = {"type": "object"}

Handle Timeouts

# Set appropriate timeout based on complexity
simple_parse = client.wait_for_parse(job_id, timeout_seconds=60.0)

# Longer timeout for complex multi-page extraction
complex_parse = client.wait_for_parse(job_id, timeout_seconds=300.0)
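If the first wait expires, you can try waiting on the same job again with a longer timeout rather than starting over (this assumes wait_for_parse raises TimeoutError while the job continues running server-side, as in the error-handling example above):

try:
    result = client.wait_for_parse(job_id, timeout_seconds=60.0)
except TimeoutError:
    # Wait again with more headroom instead of resubmitting the job
    result = client.wait_for_parse(job_id, timeout_seconds=300.0)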
