Supacrawler Docs

Supabase Vector

Production-ready Postgres vector store with pgvector for semantic search. Build semantic search with Supabase and Supacrawler in minutes.

Supabase Vector (pgvector)

Use Supabase's pgvector extension for production-ready vector storage and semantic search.

Setup

Enable pgvector Extension

-- Enable the pgvector extension (provides the VECTOR column type and
-- the distance operators such as <=> used in the queries below)
CREATE EXTENSION IF NOT EXISTS vector;

Create Table

-- Documents table: raw page text, arbitrary JSON metadata, and the
-- text's embedding. VECTOR(1536) matches the output dimension of
-- OpenAI's text-embedding-3-small model used in the examples below.
CREATE TABLE documents (
  id BIGSERIAL PRIMARY KEY,
  content TEXT,
  metadata JSONB,
  embedding VECTOR(1536)
);

-- Create HNSW index for fast (approximate) similarity search.
-- vector_cosine_ops pairs the index with the <=> cosine-distance
-- operator, so ORDER BY embedding <=> query can use this index.
CREATE INDEX ON documents
USING hnsw (embedding vector_cosine_ops);

Complete Example

import os
from supabase import create_client, Client
from supacrawler import SupacrawlerClient
import openai

# --- Configuration: all secrets come from the environment ---
SUPABASE_URL = os.environ['SUPABASE_URL']
SUPABASE_KEY = os.environ['SUPABASE_SERVICE_KEY']
OPENAI_API_KEY = os.environ['OPENAI_API_KEY']
SUPACRAWLER_API_KEY = os.environ['SUPACRAWLER_API_KEY']

# --- Client setup (these names are reused by the batch example below) ---
supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)
crawler = SupacrawlerClient(api_key=SUPACRAWLER_API_KEY)
openai.api_key = OPENAI_API_KEY

# --- Step 1: Scrape a single page as markdown ---
page = crawler.scrape('https://example.com/docs', format='markdown')

# --- Step 2: Embed the scraped content ---
embed_response = openai.embeddings.create(
    model='text-embedding-3-small',
    input=page.content
)
page_embedding = embed_response.data[0].embedding

# --- Step 3: Persist content, metadata, and embedding in Supabase ---
insert_result = (
    supabase.table('documents')
    .insert({
        'content': page.content,
        'metadata': {
            'url': page.url,
            'title': page.title
        },
        'embedding': page_embedding
    })
    .execute()
)
print(f"Stored document: {insert_result.data[0]['id']}")

# --- Step 4: Semantic search — embed the query the same way ---
query = "How do I configure authentication?"
query_vector = openai.embeddings.create(
    model='text-embedding-3-small',
    input=query
).data[0].embedding

# match_documents is the SQL function defined further down (cosine similarity)
matches = supabase.rpc('match_documents', {
    'query_embedding': query_vector,
    'match_count': 5
}).execute()

for doc in matches.data:
    print(f"Similarity: {doc['similarity']:.3f}")
    print(f"Title: {doc['metadata']['title']}")
    print(f"Content: {doc['content'][:200]}...")
    print("---")

Create Match Function

-- Cosine-similarity search over documents.embedding.
-- Returns the match_count nearest rows; similarity = 1 - cosine distance,
-- where <=> is pgvector's cosine-distance operator (so 1.0 = identical
-- direction). The ORDER BY ... <=> ... form lets Postgres use the HNSW
-- index created above.
CREATE OR REPLACE FUNCTION match_documents(
  query_embedding VECTOR(1536),
  match_count INT DEFAULT 5
)
RETURNS TABLE (
  id BIGINT,
  content TEXT,
  metadata JSONB,
  similarity FLOAT
)
LANGUAGE plpgsql
STABLE  -- read-only: tells the planner the function has no side effects
AS $$
BEGIN
  RETURN QUERY
  SELECT
    documents.id,
    documents.content,
    documents.metadata,
    1 - (documents.embedding <=> query_embedding) AS similarity
  FROM documents
  ORDER BY documents.embedding <=> query_embedding
  LIMIT match_count;
END;
$$;

Batch Processing

# Crawl multiple pages starting from the seed URL (depth-limited crawl)
job = crawler.create_crawl_job(
    url='https://example.com/docs',
    depth=2,
    link_limit=50
)
final = crawler.wait_for_crawl(job.job_id)

# Batch embed and store: one embedding call per crawled page
documents = []
for url, page in final.data.crawl_data.items():
    markdown = getattr(page, 'markdown', None)
    if not markdown:
        continue  # skip pages without markdown content

    response = openai.embeddings.create(
        model='text-embedding-3-small',
        input=markdown
    )

    # page.metadata may be absent OR present-but-None; getattr chains
    # handle both (the original hasattr check crashed on metadata=None)
    metadata = getattr(page, 'metadata', None)
    documents.append({
        'content': markdown,
        'metadata': {
            'url': url,
            'title': getattr(metadata, 'title', None)
        },
        'embedding': response.data[0].embedding
    })

# Batch insert in a single round-trip; skip entirely if nothing was crawled
if documents:
    result = supabase.table('documents').insert(documents).execute()
    print(f"Stored {len(result.data)} documents")
else:
    print("Stored 0 documents")

Performance Tips

  1. Use HNSW indexes for approximate nearest-neighbor search — typically orders of magnitude faster than a sequential scan on large tables
  2. Batch operations to reduce network overhead
  3. Match the embedding dimension to your table definition (VECTOR(1536) for text-embedding-3-small)
  4. Cache embeddings to avoid recomputing
  5. Use RLS policies for multi-tenant applications

Resources

Was this page helpful?