ChromaDB Integration

ChromaDB is our vector database for semantic food search. Let's set it up and create our retriever.

ChromaDB Basics

ChromaDB organizes data into collections (like tables). Each item in a collection has:

ID - Unique identifier
Document - The text content
Embedding - Vector representation (auto-generated or provided)
Metadata - Key-value pairs for filtering

Step 1: Create the Retriever

Create app/rag/retriever.py:

app/rag/retriever.py
import json
import chromadb
from chromadb.config import Settings

from app.core.config import settings

class FoodRetriever:
    def __init__(self):
        """Open the ChromaDB HTTP client and bind the "foods" collection."""
        # Host and port come from the application settings object.
        http_client = chromadb.HttpClient(
            host=settings.chroma_host,
            port=settings.chroma_port,
        )
        self.client = http_client

        # The collection is created on first use and reused afterwards.
        collection_info = {"description": "Indian vegetarian food database"}
        self.collection = http_client.get_or_create_collection(
            name="foods",
            metadata=collection_info,
        )
    
    def add_foods(self, foods: list[dict]) -> int:
        """Upsert food items into the collection.

        Each food contributes its id, an embeddable text document built by
        ``_create_document``, and a flat metadata dict used for filtering.

        Args:
            foods: Food records. Each must provide "id", "name", "cuisine",
                "spice_level" and "meal_type"; the allergen list and the
                dietary flags are optional and fall back to defaults.

        Returns:
            The number of items sent to the collection (0 for empty input).

        Raises:
            KeyError: If a food record lacks one of the required keys.
        """
        # Nothing to send: skip the round trip. ChromaDB validates that an
        # upsert batch has at least one id, so an empty call would fail.
        if not foods:
            return 0

        ids = []
        documents = []
        metadatas = []

        for food in foods:
            ids.append(food["id"])

            # Rich text document: this is what gets embedded.
            documents.append(self._create_document(food))

            # List-valued fields are stored as JSON strings in the metadata;
            # search() decodes them again when post-filtering.
            metadatas.append({
                "name": food["name"],
                "cuisine": food["cuisine"],
                "spice_level": food["spice_level"],
                "meal_type": json.dumps(food["meal_type"]),
                "allergens": json.dumps(food.get("allergens", [])),
                "is_vegetarian": food.get("is_vegetarian", True),
                "is_vegan": food.get("is_vegan", False),
                "is_high_protein": food.get("is_high_protein", False),
                "is_low_carb": food.get("is_low_carb", False),
            })

        # Upsert (insert or update) so re-ingesting the same ids is safe.
        self.collection.upsert(
            ids=ids,
            documents=documents,
            metadatas=metadatas
        )

        return len(ids)
    
    def _create_document(self, food: dict) -> str:
        """Create a rich text representation for embedding."""
        parts = [
            f"{food['name']}: {food['description']}",
            f"Cuisine: {food['cuisine']}",
            f"Region: {food.get('region', 'India')}",
            f"Meal types: {', '.join(food['meal_type'])}",
            f"Spice level: {food['spice_level']}",
            f"Ingredients: {', '.join(food.get('ingredients', []))}",
        ]
        
        # Add health tags
        if food.get("is_high_protein"):
            parts.append("High protein dish")
        if food.get("is_low_carb"):
            parts.append("Low carb option")
        
        return "\n".join(parts)

Key Decisions

Why create a rich document?

The document is what gets embedded. More context = better semantic matching:

# Bad: Just the name
"Masala Dosa"

# Good: Rich context
"""
Masala Dosa: A crispy, savory crepe made from fermented rice batter
Cuisine: south_indian
Region: Karnataka
Meal types: breakfast, dinner
Spice level: medium
Ingredients: rice, urad dal, potato, onion, spices
"""

Why store metadata separately?

Metadata enables exact filtering that doesn't rely on embeddings:

# "Find vegan dishes" - metadata filter is exact
where={"is_vegan": True}

# "Find something spicy" - relies on embedding similarity
query_texts=["spicy food"]

Step 2: Add Search Method

Continue in app/rag/retriever.py:

app/rag/retriever.py (continued)
    def search(
        self,
        query: str,
        cuisine: str = None,
        spice_level: str = None,
        meal_type: str = None,
        exclude_allergens: list[str] = None,
        dietary_type: str = None,
        top_k: int = 5
    ) -> list[dict]:
        """Search for foods matching the query and filters."""
        
        # Build where clause
        where_conditions = []
        
        if cuisine:
            where_conditions.append({"cuisine": cuisine})
        
        if spice_level:
            where_conditions.append({"spice_level": spice_level})
        
        if dietary_type == "vegan":
            where_conditions.append({"is_vegan": True})
        
        # Build the where clause
        where = None
        if len(where_conditions) == 1:
            where = where_conditions[0]
        elif len(where_conditions) > 1:
            where = {"$and": where_conditions}
        
        # Execute query
        results = self.collection.query(
            query_texts=[query],
            n_results=top_k * 2,  # Get extra, we'll filter more
            where=where
        )
        
        # Post-filter for allergens (ChromaDB doesn't support $nin on arrays well)
        foods = []
        for i, id in enumerate(results["ids"][0]):
            metadata = results["metadatas"][0][i]
            
            # Check allergens
            if exclude_allergens:
                food_allergens = json.loads(metadata.get("allergens", "[]"))
                if any(a in food_allergens for a in exclude_allergens):
                    continue
            
            foods.append({
                "id": id,
                "name": metadata["name"],
                "cuisine": metadata["cuisine"],
                "spice_level": metadata["spice_level"],
                "document": results["documents"][0][i],
                "score": results["distances"][0][i] if results["distances"] else None
            })
            
            if len(foods) >= top_k:
                break
        
        return foods
    
    def get_by_name(self, name: str) -> dict | None:
        """Get a specific food by name."""
        results = self.collection.get(
            where={"name": name},
            limit=1
        )
        
        if not results["ids"]:
            return None
        
        return {
            "id": results["ids"][0],
            "name": results["metadatas"][0]["name"],
            "document": results["documents"][0],
            **results["metadatas"][0]
        }
    
    def get_count(self) -> int:
        """Return how many items the collection currently holds."""
        total = self.collection.count()
        return total


# Singleton instance shared by the application.
# NOTE: this runs at import time, so importing this module calls
# FoodRetriever.__init__, which connects to the ChromaDB server and gets or
# creates the "foods" collection.
retriever = FoodRetriever()

Step 3: Understanding the Search Flow

User query: "high protein breakfast"

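Step 1: Build filters
       Filters come only from search() arguments (cuisine, spice_level,
       dietary_type="vegan"), never from the query text.
       No filter arguments are passed here, so where = None
       ("high protein" is matched semantically, via the "High protein dish"
       line that _create_document adds to the embedded text)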

Step 2: Query ChromaDB
       query_texts=["high protein breakfast"]
       
       ChromaDB internally:
       1. Embeds "high protein breakfast" → [0.23, 0.45, ...]
       2. Finds similar document embeddings
       3. Applies metadata filter
       4. Returns ranked results

Step 3: Post-filter allergens
       Remove any dishes with user's allergens

Step 4: Return formatted results
       ("score" is the ChromaDB distance, so lower means a closer match)
       [
         {"name": "Pesarattu", "score": 0.87, ...},
         {"name": "Sprouts Salad", "score": 0.92, ...}
       ]

Step 4: Export the Retriever

Create app/rag/__init__.py:

app/rag/__init__.py
# Re-export the retriever class and the shared module-level instance so
# callers can write `from app.rag import retriever`.
from .retriever import retriever, FoodRetriever

# Public names of the app.rag package.
__all__ = ["retriever", "FoodRetriever"]

Step 5: Create the Admin Ingest Endpoint

Update app/api/admin.py:

app/api/admin.py
import json
from pathlib import Path
from fastapi import APIRouter

from app.rag import retriever

# Router for the admin endpoints defined in this module; every route
# registered on it carries the "admin" tag.
router = APIRouter(tags=["admin"])

@router.post("/admin/ingest")
async def ingest_data():
    """Load the food data from data/foods.json into ChromaDB.

    Returns:
        A dict with a human-readable "message" stating how many foods were
        ingested and "total_in_db", the collection size after the ingest.
    """
    # The data directory is located three directory levels above this file.
    data_path = Path(__file__).parent.parent.parent / "data" / "foods.json"

    # Decode explicitly as UTF-8: without an encoding argument, open() uses
    # the platform default, which can garble or reject non-ASCII text.
    with open(data_path, encoding="utf-8") as f:
        foods = json.load(f)

    # Add to ChromaDB
    count = retriever.add_foods(foods)

    return {
        "message": f"Ingested {count} foods",
        "total_in_db": retriever.get_count()
    }

@router.get("/admin/stats")
async def get_stats():
    """Report how many foods are currently stored in the collection."""
    foods_count = retriever.get_count()
    stats = {"foods_count": foods_count}
    return stats

Testing the Retriever

After starting ChromaDB and ingesting data:

# Ingest food data
curl -X POST http://localhost:8080/api/admin/ingest

# Test search (once chat endpoint is built)
curl -X POST http://localhost:8080/api/chat \
  -H "Content-Type: application/json" \
  -d '{"message": "I want something spicy for breakfast"}'

ChromaDB Query Options

# Basic query: ask for 5 results for the query text
results = collection.query(
    query_texts=["breakfast"],
    n_results=5
)

# With metadata filter: only items whose "cuisine" metadata is "south_indian"
results = collection.query(
    query_texts=["breakfast"],
    where={"cuisine": "south_indian"}
)

# Multiple conditions (AND): every listed condition must match
results = collection.query(
    query_texts=["breakfast"],
    where={
        "$and": [
            {"cuisine": "south_indian"},
            {"spice_level": "mild"}
        ]
    }
)

# Multiple conditions (OR): at least one listed condition must match
results = collection.query(
    query_texts=["breakfast"],
    where={
        "$or": [
            {"cuisine": "south_indian"},
            {"cuisine": "north_indian"}
        ]
    }
)

# Choose which fields are included in the results
results = collection.query(
    query_texts=["breakfast"],
    include=["documents", "metadatas", "distances"]
)

Next, let's create the OpenAI client for generating responses.

ChromaDB Basics

Step 1: Create the Retriever

Key Decisions

Step 2: Add Search Method

Step 3: Understanding the Search Flow

Step 4: Export the Retriever

Step 5: Create the Admin Ingest Endpoint

Testing the Retriever

ChromaDB Query Options