This guide builds a complete retrieval-augmented generation (RAG) pipeline step by step. By the end you will have working code that loads documents, splits and embeds them, stores vectors, retrieves relevant context, and generates grounded answers using Claude.
Prerequisites
- Python 3.10+
- Anthropic API key (for generation)
- Voyage AI API key (for embeddings) — or use the local alternative at the end
pip install anthropic voyageai numpy
Architecture Overview
Documents → Chunking → Embedding → Vector Store
↓
User Query → Query Embedding → Similarity Search → Top-k Chunks
↓
Context + Query → Claude → Answer
Step 1: Load and Chunk Documents
import re
from dataclasses import dataclass
from typing import List

@dataclass
class Chunk:
    text: str
    source: str
    chunk_index: int

def load_text(file_path: str) -> str:
    with open(file_path, "r", encoding="utf-8") as f:
        return f.read()

def chunk_text(text: str, source: str, chunk_size: int = 500, overlap: int = 50) -> List[Chunk]:
    """Split text into sentence-aligned chunks of roughly chunk_size words,
    carrying roughly overlap words forward into the next chunk."""
    sentences = re.split(r'(?<=[.!?])\s+', text)
    chunks, current_chunk, current_length, chunk_index = [], [], 0, 0
    for sentence in sentences:
        sentence_length = len(sentence.split())
        if current_length + sentence_length > chunk_size and current_chunk:
            chunks.append(Chunk(" ".join(current_chunk), source, chunk_index))
            chunk_index += 1
            # Keep the last few sentences (up to `overlap` words) for overlap
            overlap_sentences, overlap_length = [], 0
            for s in reversed(current_chunk):
                s_len = len(s.split())
                if overlap_length + s_len <= overlap:
                    overlap_sentences.insert(0, s)
                    overlap_length += s_len
                else:
                    break
            current_chunk, current_length = overlap_sentences, overlap_length
        current_chunk.append(sentence)
        current_length += sentence_length
    if current_chunk:
        chunks.append(Chunk(" ".join(current_chunk), source, chunk_index))
    return chunks
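Before wiring in embeddings, it can help to sanity-check the chunker on an inline string. The passage, file name, and parameters below are purely illustrative:

# Illustrative sanity check: repetitive sample text, small chunk size
sample = "RAG combines retrieval with generation. " * 200  # about 1,000 words
chunks = chunk_text(sample, source="sample.txt", chunk_size=100, overlap=10)
print(f"{len(chunks)} chunks; first chunk starts with: {chunks[0].text[:60]}...")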
Step 2: Embed the Chunks
import voyageai

voyage_client = voyageai.Client()  # reads VOYAGE_API_KEY from environment

def embed_chunks(chunks: List[Chunk]) -> List[List[float]]:
    texts = [chunk.text for chunk in chunks]
    result = voyage_client.embed(texts, model="voyage-3", input_type="document")
    return result.embeddings

def embed_query(query: str) -> List[float]:
    result = voyage_client.embed([query], model="voyage-3", input_type="query")
    return result.embeddings[0]
The input_type parameter matters — Voyage AI distinguishes document embeddings (indexing time) from query embeddings (retrieval time). Using the correct type improves retrieval quality.
Step 3: Build a Simple Vector Store
import numpy as np
from dataclasses import dataclass, field
from typing import Tuple

@dataclass
class VectorStore:
    chunks: List[Chunk] = field(default_factory=list)
    embeddings: np.ndarray = field(default_factory=lambda: np.array([]))

    def add(self, chunks: List[Chunk], embeddings: List[List[float]]) -> None:
        self.chunks.extend(chunks)
        new_emb = np.array(embeddings)
        self.embeddings = new_emb if len(self.embeddings) == 0 else np.vstack([self.embeddings, new_emb])

    def search(self, query_embedding: List[float], top_k: int = 5) -> List[Tuple[Chunk, float]]:
        if len(self.embeddings) == 0:
            return []
        # Cosine similarity: normalize both sides, then take dot products
        query_vec = np.array(query_embedding)
        query_norm = query_vec / np.linalg.norm(query_vec)
        doc_norms = self.embeddings / np.linalg.norm(self.embeddings, axis=1, keepdims=True)
        similarities = doc_norms @ query_norm
        top_indices = np.argsort(similarities)[::-1][:top_k]
        return [(self.chunks[i], float(similarities[i])) for i in top_indices]
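A quick self-contained check of the store with tiny made-up vectors. The 3-dimensional "embeddings" here are purely illustrative; real embeddings have hundreds or thousands of dimensions:

# Illustrative only: toy 3-dimensional vectors standing in for real embeddings
demo = VectorStore()
demo.add(
    [Chunk("cats purr", "demo.txt", 0), Chunk("stocks fell", "demo.txt", 1)],
    [[0.9, 0.1, 0.0], [0.0, 0.2, 0.9]],
)
for chunk, score in demo.search([1.0, 0.0, 0.0], top_k=2):
    print(f"{score:.3f}  {chunk.text}")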
Step 4: Index Your Documents
import os

store = VectorStore()

def index_document(file_path: str) -> None:
    print(f"Indexing {file_path}...")
    text = load_text(file_path)
    chunks = chunk_text(text, source=os.path.basename(file_path))
    embeddings = embed_chunks(chunks)
    store.add(chunks, embeddings)
    print(f"  {len(chunks)} chunks indexed")

index_document("your_document.txt")
Step 5: Generate Answers with Claude
import anthropic

anthropic_client = anthropic.Anthropic()  # reads ANTHROPIC_API_KEY from environment

SYSTEM_PROMPT = """You are a precise question-answering assistant.
Answer questions based strictly on the provided context.
If the context does not contain enough information, say so clearly.
Do not speculate beyond what the context supports."""

def answer_question(question: str, top_k: int = 4) -> dict:
    query_embedding = embed_query(question)
    results = store.search(query_embedding, top_k=top_k)
    if not results:
        return {"answer": "No relevant documents found.", "sources": []}
    # Number each retrieved chunk so the model can refer back to its sources
    context_parts = []
    for i, (chunk, score) in enumerate(results):
        context_parts.append(f"[{i+1}] Source: {chunk.source} (score: {score:.3f})\n{chunk.text}")
    context = "\n\n---\n\n".join(context_parts)
    message = anthropic_client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=1024,
        system=SYSTEM_PROMPT,
        messages=[{"role": "user", "content": f"Context:\n\n{context}\n\n---\n\nQuestion: {question}"}],
    )
    return {
        "answer": message.content[0].text,
        "sources": [{"source": c.source, "score": s} for c, s in results],
        "tokens": {"input": message.usage.input_tokens, "output": message.usage.output_tokens},
    }
Putting It All Together
def main():
    index_document("example.txt")
    print("\nRAG system ready. Enter questions (or 'quit' to exit):")
    while True:
        question = input("\n> ").strip()
        if question.lower() in ("quit", "exit", "q"):
            break
        if not question:
            continue
        result = answer_question(question)
        print(f"\n{result['answer']}")
        for s in result["sources"]:
            print(f"  {s['source']} (score: {s['score']:.3f})")

if __name__ == "__main__":
    main()
Alternative: Local Embeddings
If you prefer not to use an external embedding API:
pip install sentence-transformers
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("BAAI/bge-large-en-v1.5")

def embed_chunks_local(chunks):
    texts = [chunk.text for chunk in chunks]
    return model.encode(texts, normalize_embeddings=True).tolist()

def embed_query_local(query: str):
    # BGE v1.5 models expect this instruction prefix on queries (not on documents)
    prefixed = f"Represent this sentence for searching relevant passages: {query}"
    return model.encode(prefixed, normalize_embeddings=True).tolist()
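Both functions are drop-in replacements for the Voyage versions above. One minimal way to switch the pipeline over is sketched below; embeddings from different models are not comparable, so you must rebuild the index after switching:

# Rebind the embedding functions and rebuild the index from scratch
embed_chunks = embed_chunks_local
embed_query = embed_query_local

store = VectorStore()          # fresh store: old Voyage vectors can't be mixed in
index_document("example.txt")  # same file used in main() above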
Common Problems and Fixes
Low retrieval quality: Your chunk size may be too large (chunks contain too many unrelated topics) or too small (chunks lack sufficient context). Try 300–600 words per chunk with 10–20% overlap as a starting point.
The model invents information not in the documents: Your system prompt is not strong enough. Explicitly instruct the model not to use prior knowledge. Also check that relevant chunks are actually being retrieved.
Context window exceeded: Reduce top_k or chunk size. A rough check: top_k × average_chunk_tokens should be well below your model's context limit.
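To put a number on that check, here is a rough sketch. The 1.3 tokens-per-word ratio is a heuristic assumption for English text, not an exact count:

# Heuristic estimate of how many tokens the retrieved context will consume
def estimate_context_tokens(results) -> int:
    words = sum(len(chunk.text.split()) for chunk, _ in results)
    return int(words * 1.3)  # assumption: roughly 1.3 tokens per English word

results = store.search(embed_query("example question"), top_k=4)
print(estimate_context_tokens(results), "estimated context tokens")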
What to Build Next
In rough order of impact:
- Hybrid search — add BM25 alongside vector search and combine the rankings with Reciprocal Rank Fusion (see the sketch after this list).
- Reranking — add a cross-encoder reranker to re-score retrieved chunks before generation.
- Metadata filtering — filter by document type, date, section before vector search.
- Persistent storage — move from in-memory to a real vector database (Chroma is easy to start with).
- Evaluation — build a question-answer evaluation set and measure retrieval recall and answer quality.
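As a starting point for the hybrid-search item, here is a minimal Reciprocal Rank Fusion sketch. It assumes the rank_bm25 package (pip install rank-bm25) and reuses store, embed_query, and numpy from the pipeline above; k=60 is the constant commonly used for RRF.

from rank_bm25 import BM25Okapi

def reciprocal_rank_fusion(rankings, k: int = 60):
    # rankings: list of index lists, best first; RRF score = sum of 1 / (k + rank)
    scores = {}
    for ranking in rankings:
        for rank, idx in enumerate(ranking, start=1):
            scores[idx] = scores.get(idx, 0.0) + 1.0 / (k + rank)
    return sorted(scores, key=scores.get, reverse=True)

def hybrid_search(query: str, top_k: int = 5):
    # Lexical ranking: BM25 over whitespace-tokenized chunk text
    bm25 = BM25Okapi([c.text.lower().split() for c in store.chunks])
    bm25_order = list(np.argsort(bm25.get_scores(query.lower().split()))[::-1])

    # Semantic ranking: cosine similarity against the existing embeddings
    q = np.array(embed_query(query))
    doc_norms = store.embeddings / np.linalg.norm(store.embeddings, axis=1, keepdims=True)
    vec_order = list(np.argsort(doc_norms @ (q / np.linalg.norm(q)))[::-1])

    fused = reciprocal_rank_fusion([bm25_order, vec_order])
    return [store.chunks[i] for i in fused[:top_k]]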