This guide builds a complete retrieval-augmented generation (RAG) pipeline step by step. By the end you will have working code that loads documents, splits and embeds them, stores vectors, retrieves relevant context, and generates grounded answers using Claude.
Prerequisites
- Python 3.10+
- Anthropic API key (for generation)
- Voyage AI API key (for embeddings) — or use the local alternative at the end
pip install anthropic voyageai numpy
Architecture Overview
Documents → Chunking → Embedding → Vector Store
↓
User Query → Query Embedding → Similarity Search → Top-k Chunks
↓
Context + Query → Claude → Answer
Step 1: Load and Chunk Documents
import re
from dataclasses import dataclass
from typing import List

@dataclass
class Chunk:
    text: str
    source: str
    chunk_index: int

def load_text(file_path: str) -> str:
    with open(file_path, "r", encoding="utf-8") as f:
        return f.read()

def chunk_text(text: str, source: str, chunk_size: int = 500, overlap: int = 50) -> List[Chunk]:
    """Split text into sentence-aligned chunks of roughly chunk_size words,
    carrying roughly overlap words forward into the next chunk."""
    sentences = re.split(r'(?<=[.!?])\s+', text)
    chunks, current_chunk, current_length, chunk_index = [], [], 0, 0
    for sentence in sentences:
        sentence_length = len(sentence.split())
        if current_length + sentence_length > chunk_size and current_chunk:
            chunks.append(Chunk(" ".join(current_chunk), source, chunk_index))
            chunk_index += 1
            # Keep the last few sentences (up to `overlap` words) for overlap
            overlap_sentences, overlap_length = [], 0
            for s in reversed(current_chunk):
                s_len = len(s.split())
                if overlap_length + s_len <= overlap:
                    overlap_sentences.insert(0, s)
                    overlap_length += s_len
                else:
                    break
            current_chunk, current_length = overlap_sentences, overlap_length
        current_chunk.append(sentence)
        current_length += sentence_length
    if current_chunk:
        chunks.append(Chunk(" ".join(current_chunk), source, chunk_index))
    return chunks
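Before wiring in embeddings, it can help to sanity-check the chunker on an inline string. The passage, file name, and parameters below are purely illustrative:

# Illustrative sanity check: repetitive sample text, small chunk size
sample = "RAG combines retrieval with generation. " * 200  # about 1,000 words
chunks = chunk_text(sample, source="sample.txt", chunk_size=100, overlap=10)
print(f"{len(chunks)} chunks; first chunk starts with: {chunks[0].text[:60]}...")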
Step 2: Embed the Chunks
import voyageai

voyage_client = voyageai.Client()  # reads VOYAGE_API_KEY from environment

def embed_chunks(chunks: List[Chunk]) -> List[List[float]]:
    texts = [chunk.text for chunk in chunks]
    result = voyage_client.embed(texts, model="voyage-3", input_type="document")
    return result.embeddings

def embed_query(query: str) -> List[float]:
    result = voyage_client.embed([query], model="voyage-3", input_type="query")
    return result.embeddings[0]
The input_type parameter matters — Voyage AI distinguishes document embeddings (indexing time) from query embeddings (retrieval time). Using the correct type improves retrieval quality.
Step 3: Build a Simple Vector Store
import numpy as np
from dataclasses import dataclass, field
from typing import Tuple

@dataclass
class VectorStore:
    chunks: List[Chunk] = field(default_factory=list)
    embeddings: np.ndarray = field(default_factory=lambda: np.array([]))

    def add(self, chunks: List[Chunk], embeddings: List[List[float]]) -> None:
        self.chunks.extend(chunks)
        new_emb = np.array(embeddings)
        self.embeddings = new_emb if len(self.embeddings) == 0 else np.vstack([self.embeddings, new_emb])

    def search(self, query_embedding: List[float], top_k: int = 5) -> List[Tuple[Chunk, float]]:
        if len(self.embeddings) == 0:
            return []
        # Cosine similarity: normalize both sides, then take dot products
        query_vec = np.array(query_embedding)
        query_norm = query_vec / np.linalg.norm(query_vec)
        doc_norms = self.embeddings / np.linalg.norm(self.embeddings, axis=1, keepdims=True)
        similarities = doc_norms @ query_norm
        top_indices = np.argsort(similarities)[::-1][:top_k]
        return [(self.chunks[i], float(similarities[i])) for i in top_indices]
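A quick self-contained check of the store with tiny made-up vectors. The 3-dimensional "embeddings" here are purely illustrative; real embeddings have hundreds or thousands of dimensions:

# Illustrative only: toy 3-dimensional vectors standing in for real embeddings
demo = VectorStore()
demo.add(
    [Chunk("cats purr", "demo.txt", 0), Chunk("stocks fell", "demo.txt", 1)],
    [[0.9, 0.1, 0.0], [0.0, 0.2, 0.9]],
)
for chunk, score in demo.search([1.0, 0.0, 0.0], top_k=2):
    print(f"{score:.3f}  {chunk.text}")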
Step 4: Index Your Documents
import os

store = VectorStore()

def index_document(file_path: str) -> None:
    print(f"Indexing {file_path}...")
    text = load_text(file_path)
    chunks = chunk_text(text, source=os.path.basename(file_path))
    embeddings = embed_chunks(chunks)
    store.add(chunks, embeddings)
    print(f"  {len(chunks)} chunks indexed")

index_document("your_document.txt")
Step 5: Generate Answers with Claude
import anthropic

anthropic_client = anthropic.Anthropic()  # reads ANTHROPIC_API_KEY from environment

SYSTEM_PROMPT = """You are a precise question-answering assistant.
Answer questions based strictly on the provided context.
If the context does not contain enough information, say so clearly.
Do not speculate beyond what the context supports."""

def answer_question(question: str, top_k: int = 4) -> dict:
    query_embedding = embed_query(question)
    results = store.search(query_embedding, top_k=top_k)
    if not results:
        return {"answer": "No relevant documents found.", "sources": []}
    # Number each retrieved chunk so the model can refer back to its sources
    context_parts = []
    for i, (chunk, score) in enumerate(results):
        context_parts.append(f"[{i+1}] Source: {chunk.source} (score: {score:.3f})\n{chunk.text}")
    context = "\n\n---\n\n".join(context_parts)
    message = anthropic_client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=1024,
        system=SYSTEM_PROMPT,
        messages=[{"role": "user", "content": f"Context:\n\n{context}\n\n---\n\nQuestion: {question}"}],
    )
    return {
        "answer": message.content[0].text,
        "sources": [{"source": c.source, "score": s} for c, s in results],
        "tokens": {"input": message.usage.input_tokens, "output": message.usage.output_tokens},
    }
Putting It All Together
def main():
    index_document("example.txt")
    print("\nRAG system ready. Enter questions (or 'quit' to exit):")
    while True:
        question = input("\n> ").strip()
        if question.lower() in ("quit", "exit", "q"):
            break
        if not question:
            continue
        result = answer_question(question)
        print(f"\n{result['answer']}")
        for s in result["sources"]:
            print(f"  {s['source']} (score: {s['score']:.3f})")

if __name__ == "__main__":
    main()
Alternative: Local Embeddings
If you prefer not to use an external embedding API:
pip install sentence-transformers
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("BAAI/bge-large-en-v1.5")

def embed_chunks_local(chunks):
    texts = [chunk.text for chunk in chunks]
    return model.encode(texts, normalize_embeddings=True).tolist()

def embed_query_local(query: str):
    # BGE v1.5 models expect this instruction prefix on queries (not on documents)
    prefixed = f"Represent this sentence for searching relevant passages: {query}"
    return model.encode(prefixed, normalize_embeddings=True).tolist()
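Both functions are drop-in replacements for the Voyage versions above. One minimal way to switch the pipeline over is sketched below; embeddings from different models are not comparable, so you must rebuild the index after switching:

# Rebind the embedding functions and rebuild the index from scratch
embed_chunks = embed_chunks_local
embed_query = embed_query_local

store = VectorStore()          # fresh store: old Voyage vectors can't be mixed in
index_document("example.txt")  # same file used in main() above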
Common Problems and Fixes
Low retrieval quality: Your chunk size may be too large (chunks contain too many unrelated topics) or too small (chunks lack sufficient context). Try 300–600 words per chunk with 10–20% overlap as a starting point.
The model invents information not in the documents: Your system prompt is not strong enough. Explicitly instruct the model not to use prior knowledge. Also check that relevant chunks are actually being retrieved.
Context window exceeded: Reduce top_k or chunk size. A rough check: top_k × average_chunk_tokens should be well below your model's context limit.
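To put a number on that check, here is a rough sketch. The 1.3 tokens-per-word ratio is a heuristic assumption for English text, not an exact count:

# Heuristic estimate of how many tokens the retrieved context will consume
def estimate_context_tokens(results) -> int:
    words = sum(len(chunk.text.split()) for chunk, _ in results)
    return int(words * 1.3)  # assumption: roughly 1.3 tokens per English word

results = store.search(embed_query("example question"), top_k=4)
print(estimate_context_tokens(results), "estimated context tokens")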
What to Build Next
In rough order of impact:
- Hybrid search — add BM25 alongside vector search and combine the rankings with Reciprocal Rank Fusion (see the sketch after this list).
- Reranking — add a cross-encoder reranker to re-score retrieved chunks before generation.
- Metadata filtering — filter by document type, date, section before vector search.
- Persistent storage — move from in-memory to a real vector database (Chroma is easy to start with).
- Evaluation — build a question-answer evaluation set and measure retrieval recall and answer quality.
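As a starting point for the hybrid-search item, here is a minimal Reciprocal Rank Fusion sketch. It assumes the rank_bm25 package (pip install rank-bm25) and reuses store, embed_query, and numpy from the pipeline above; k=60 is the constant commonly used for RRF.

from rank_bm25 import BM25Okapi

def reciprocal_rank_fusion(rankings, k: int = 60):
    # rankings: list of index lists, best first; RRF score = sum of 1 / (k + rank)
    scores = {}
    for ranking in rankings:
        for rank, idx in enumerate(ranking, start=1):
            scores[idx] = scores.get(idx, 0.0) + 1.0 / (k + rank)
    return sorted(scores, key=scores.get, reverse=True)

def hybrid_search(query: str, top_k: int = 5):
    # Lexical ranking: BM25 over whitespace-tokenized chunk text
    bm25 = BM25Okapi([c.text.lower().split() for c in store.chunks])
    bm25_order = list(np.argsort(bm25.get_scores(query.lower().split()))[::-1])

    # Semantic ranking: cosine similarity against the existing embeddings
    q = np.array(embed_query(query))
    doc_norms = store.embeddings / np.linalg.norm(store.embeddings, axis=1, keepdims=True)
    vec_order = list(np.argsort(doc_norms @ (q / np.linalg.norm(q)))[::-1])

    fused = reciprocal_rank_fusion([bm25_order, vec_order])
    return [store.chunks[i] for i in fused[:top_k]]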