Supabase Vector
Production-ready Postgres vector store with pgvector for semantic search. Build semantic search with Supabase and Supacrawler in minutes.
Supabase Vector (pgvector)
Use Supabase's pgvector extension for production-ready vector storage and semantic search.
Setup
Enable pgvector Extension
-- Enable the pgvector extension (provides the VECTOR type and the
-- distance operators used for similarity search).
CREATE EXTENSION IF NOT EXISTS vector;
Create Table
-- Documents table: raw page text, source metadata (URL/title), and the
-- page's embedding. VECTOR(1536) matches the output dimension of OpenAI's
-- text-embedding-3-small model used below.
CREATE TABLE documents (
id BIGSERIAL PRIMARY KEY,
content TEXT,
metadata JSONB,
embedding VECTOR(1536)
);
-- HNSW index for approximate nearest-neighbor search.
-- vector_cosine_ops pairs with the <=> (cosine distance) operator used by
-- the match_documents function, so that ORDER BY can use this index.
CREATE INDEX ON documents
USING hnsw (embedding vector_cosine_ops);
Complete Example
# End-to-end example: scrape one page with Supacrawler, embed it with OpenAI,
# store it in Supabase (pgvector), then answer a query via semantic search.
import os
from supabase import create_client, Client
from supacrawler import SupacrawlerClient
import openai

# Configuration — all secrets come from the environment (raises KeyError if unset).
SUPABASE_URL = os.environ['SUPABASE_URL']
SUPABASE_KEY = os.environ['SUPABASE_SERVICE_KEY']  # service-role key bypasses RLS; keep server-side only
OPENAI_API_KEY = os.environ['OPENAI_API_KEY']
SUPACRAWLER_API_KEY = os.environ['SUPACRAWLER_API_KEY']

# Initialize clients
supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)
crawler = SupacrawlerClient(api_key=SUPACRAWLER_API_KEY)
openai.api_key = OPENAI_API_KEY

# Step 1: Scrape content as markdown.
# NOTE(review): assumes the scrape result exposes .content/.url/.title —
# confirm against the Supacrawler SDK version in use (the batch example
# below reads .markdown instead).
result = crawler.scrape('https://example.com/docs', format='markdown')

# Step 2: Generate the embedding. The model's 1536-dim output must match
# the VECTOR(1536) column defined in the schema.
response = openai.embeddings.create(
    model='text-embedding-3-small',
    input=result.content
)
embedding = response.data[0].embedding  # list of floats; supabase-py serializes it for the VECTOR column

# Step 3: Store in Supabase
data = supabase.table('documents').insert({
    'content': result.content,
    'metadata': {
        'url': result.url,
        'title': result.title
    },
    'embedding': embedding
}).execute()
print(f"Stored document: {data.data[0]['id']}")

# Step 4: Semantic search — the query must be embedded with the SAME model
# as the stored documents, or distances are meaningless.
query = "How do I configure authentication?"
query_response = openai.embeddings.create(
    model='text-embedding-3-small',
    input=query
)
query_embedding = query_response.data[0].embedding

# Search with pgvector by calling the match_documents SQL function via RPC.
results = supabase.rpc('match_documents', {
    'query_embedding': query_embedding,
    'match_count': 5
}).execute()

for doc in results.data:
    print(f"Similarity: {doc['similarity']:.3f}")  # 1.0 = identical direction; lower = less similar
    print(f"Title: {doc['metadata']['title']}")
    print(f"Content: {doc['content'][:200]}...")   # first 200 chars as a preview
    print("---")
Create Match Function
-- match_documents: return the match_count rows whose embeddings are closest
-- to query_embedding by cosine distance. similarity = 1 - distance, so
-- higher similarity means a better match.
CREATE OR REPLACE FUNCTION match_documents(
query_embedding VECTOR(1536),
match_count INT DEFAULT 5
)
RETURNS TABLE (
id BIGINT,
content TEXT,
metadata JSONB,
similarity FLOAT
)
LANGUAGE plpgsql
AS $$
BEGIN
RETURN QUERY
SELECT
documents.id,
documents.content,
documents.metadata,
-- <=> is pgvector's cosine-distance operator (0 = identical direction).
1 - (documents.embedding <=> query_embedding) AS similarity
FROM documents
-- Ordering by the raw <=> expression lets the HNSW index drive the scan.
ORDER BY documents.embedding <=> query_embedding
LIMIT match_count;
END;
$$;
Batch Processing
# Batch processing: crawl a docs site, embed every page that yielded
# markdown, and insert all rows in a single request.

# Crawl multiple pages (depth-limited crawl, capped at 50 links).
job = crawler.create_crawl_job(
    url='https://example.com/docs',
    depth=2,
    link_limit=50
)
final = crawler.wait_for_crawl(job.job_id)  # blocks until the crawl finishes

# Batch embed and collect rows for insertion.
documents = []
for url, page in final.data.crawl_data.items():
    markdown = getattr(page, 'markdown', None)
    if not markdown:
        continue  # skip pages with no extracted content

    # Generate embedding.
    # NOTE(review): pages longer than the model's input limit (~8191 tokens
    # for text-embedding-3-small) will raise here — chunk long pages first.
    response = openai.embeddings.create(
        model='text-embedding-3-small',
        input=markdown
    )

    # BUGFIX: the original used `page.metadata.title if hasattr(page, 'metadata')`,
    # which raises AttributeError when metadata is present but None or has no
    # .title attribute; getattr with a default handles all three cases.
    metadata = getattr(page, 'metadata', None)
    documents.append({
        'content': markdown,
        'metadata': {
            'url': url,
            'title': getattr(metadata, 'title', None)
        },
        'embedding': response.data[0].embedding
    })

# Batch insert: one network round trip instead of one per document.
# Guard against an empty crawl — inserting an empty list is an API error.
if documents:
    result = supabase.table('documents').insert(documents).execute()
    print(f"Stored {len(result.data)} documents")
else:
    print("Stored 0 documents")
Performance Tips
- Use HNSW indexes for approximate nearest-neighbor search — often orders of magnitude faster than an exact sequential scan on large tables, at a small cost in recall
- Batch operations to reduce network overhead
- Choose appropriate dimensions (1536 for text-embedding-3-small)
- Cache embeddings to avoid recomputing
- Use RLS policies for multi-tenant applications
Resources
Was this page helpful?