LlamaIndex + Supabase
Build a RAG pipeline with Supacrawler for content ingestion, LlamaIndex for indexing, and Supabase pgvector for storage.
Prerequisites
Enable the pgvector extension:
- Supabase: Database → Extensions → enable pgvector
- Self-hosted: run CREATE EXTENSION IF NOT EXISTS vector; (see the sketch after this list)
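For self-hosted Postgres you can also run this step from Python. A minimal sketch, assuming psycopg2 is available and DATABASE_URL points at your database:
import os
import psycopg2

# Enable pgvector (no-op if already installed) and print the installed version
conn = psycopg2.connect(os.environ['DATABASE_URL'])
conn.autocommit = True
with conn.cursor() as cur:
    cur.execute('CREATE EXTENSION IF NOT EXISTS vector;')
    cur.execute("SELECT extversion FROM pg_extension WHERE extname = 'vector';")
    print('pgvector version:', cur.fetchone()[0])
conn.close()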
Install Dependencies
pip install -U supacrawler-py llama-index-core \
  llama-index-embeddings-openai llama-index-vector-stores-postgres
Complete Example
import os
from supacrawler import SupacrawlerClient
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.vector_stores.postgres import PGVectorStore
from llama_index.core import Document, VectorStoreIndex, StorageContext
from sqlalchemy import make_url
# Configuration
DB_URL = os.environ['DATABASE_URL']
OPENAI_API_KEY = os.environ['OPENAI_API_KEY']
SUPACRAWLER_API_KEY = os.environ['SUPACRAWLER_API_KEY']
# Step 1: Scrape content
crawler = SupacrawlerClient(api_key=SUPACRAWLER_API_KEY)
result = crawler.scrape('https://example.com', format='markdown')
docs = [
    Document(
        text=result.content,
        metadata={
            'url': result.url,
            'title': result.title
        }
    )
]
# Step 2: Setup embeddings and vector store
embed_model = OpenAIEmbedding(
    model='text-embedding-3-small',
    api_key=OPENAI_API_KEY
)
url = make_url(DB_URL)
store = PGVectorStore.from_params(
    database=url.database,
    host=url.host,
    password=url.password,
    port=url.port,
    user=url.username,
    table_name='llama_docs',
    embed_dim=1536  # dimension of text-embedding-3-small
)
# Step 3: Create index
ctx = StorageContext.from_defaults(vector_store=store)
index = VectorStoreIndex.from_documents(
    docs,
    storage_context=ctx,
    embed_model=embed_model
)
# Step 4: Query
query_engine = index.as_query_engine()
response = query_engine.query('What is this page about?')
print(response)
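The response also exposes the retrieved chunks, which is handy for checking retrieval quality or showing citations. A small sketch (attribute names follow llama-index's NodeWithScore objects):
# Inspect which stored chunks were used to answer the query
for source in response.source_nodes:
    print(source.score, source.node.metadata.get('url'))
    print(source.node.get_content()[:200])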
Crawling Multiple Pages
# Crawl instead of single scrape
job = crawler.create_crawl_job(
    url='https://example.com/docs',
    depth=2,
    link_limit=20
)
final = crawler.wait_for_crawl(job.job_id)
# Convert crawl data to documents
docs = []
for url, page in final.data.crawl_data.items():
    if hasattr(page, 'markdown') and page.markdown:
        docs.append(
            Document(
                text=page.markdown,
                metadata={
                    'url': url,
                    'title': page.metadata.title if getattr(page, 'metadata', None) else None
                }
            )
        )
# Index as before
index = VectorStoreIndex.from_documents(
    docs,
    storage_context=ctx,
    embed_model=embed_model
)
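Once the crawl has been embedded, you do not need to re-ingest on every run: reconnect to the same Postgres table and rebuild the index directly from the vector store. A minimal sketch using VectorStoreIndex.from_vector_store with the store defined earlier:
# Rebuild the index from vectors already stored in Postgres (no re-embedding)
index = VectorStoreIndex.from_vector_store(
    vector_store=store,
    embed_model=embed_model
)
query_engine = index.as_query_engine()
print(query_engine.query('Summarize the docs section.'))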
Production Optimization
Create Indexes
-- PGVectorStore stores rows in a table prefixed with data_ (here: data_llama_docs)
-- HNSW index for better query performance
CREATE INDEX ON data_llama_docs
USING hnsw (embedding vector_cosine_ops);
-- IVFFlat alternative
CREATE INDEX ON data_llama_docs
USING ivfflat (embedding vector_cosine_ops)
WITH (lists = 100);
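Recent versions of llama-index-vector-stores-postgres can also create the HNSW index for you when the table is set up, via hnsw_kwargs. A sketch using the library's documented option names (verify against your installed version):
# Ask PGVectorStore to create the HNSW index at table-setup time
store = PGVectorStore.from_params(
    database=url.database,
    host=url.host,
    password=url.password,
    port=url.port,
    user=url.username,
    table_name='llama_docs',
    embed_dim=1536,
    hnsw_kwargs={
        'hnsw_m': 16,
        'hnsw_ef_construction': 64,
        'hnsw_ef_search': 40,
        'hnsw_dist_method': 'vector_cosine_ops',
    },
)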
Chunking Strategy
from llama_index.core.node_parser import SentenceSplitter
# Add text splitter
splitter = SentenceSplitter(chunk_size=1024, chunk_overlap=200)
# Parse into nodes
nodes = splitter.get_nodes_from_documents(docs)
# Index nodes
index = VectorStoreIndex(
    nodes,
    storage_context=ctx,
    embed_model=embed_model
)
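At query time you can control how many chunks are retrieved per question, or skip LLM synthesis entirely and work with the raw chunks. A short sketch using standard llama-index retriever parameters:
# Retrieve more chunks of context per question
query_engine = index.as_query_engine(similarity_top_k=5)
print(query_engine.query('How do I configure the crawler?'))

# Or fetch the matching chunks directly, without LLM synthesis
retriever = index.as_retriever(similarity_top_k=5)
for node in retriever.retrieve('crawler configuration'):
    print(node.score, node.node.metadata.get('url'))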