
Generative AI has shifted from research curiosity to production reality in under two years. Whether it's a context-aware chatbot, a document Q&A system, or an intelligent search bar, the pattern is the same: large language models (LLMs) connected to your data through a retrieval pipeline. In this guide I'll walk through exactly how I added a RAG-powered AI assistant to a React + FastAPI project using LangChain for orchestration, OpenAI for the LLM, and Pinecone for vector search.
Retrieval-Augmented Generation (RAG) is the backbone. Instead of relying solely on the LLM's training data, we: (1) embed and index our documents in a vector store, (2) retrieve the chunks most relevant to each query, and (3) pass those chunks to the LLM as context so it generates a grounded answer.
We'll build the backend in Python with FastAPI and LangChain, and the frontend in React with TypeScript. Install the core dependencies:
# Backend
pip install fastapi uvicorn langchain langchain-openai \
langchain-pinecone pinecone-client python-dotenv
# Frontend
npm create vite@latest ai-app -- --template react-ts
npm install

Before the chatbot can answer anything, you need to embed your knowledge base. LangChain's RecursiveCharacterTextSplitter chunks documents, and OpenAIEmbeddings converts them to vectors stored in Pinecone.
# ingest.py
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
import os

# Load every PDF under ./docs (recursively) into LangChain Documents.
pdf_loader = DirectoryLoader("./docs", glob="**/*.pdf", loader_cls=PyPDFLoader)
raw_docs = pdf_loader.load()

# 1000-char chunks with 200-char overlap so answers aren't cut mid-context.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
doc_chunks = text_splitter.split_documents(raw_docs)

embedder = OpenAIEmbeddings(model="text-embedding-3-small")

# Embed each chunk and upsert into the Pinecone index named by $PINECONE_INDEX.
PineconeVectorStore.from_documents(
    documents=doc_chunks,
    embedding=embedder,
    index_name=os.environ["PINECONE_INDEX"],
)
print(f"Ingested {len(doc_chunks)} chunks into Pinecone")
LangChain Expression Language (LCEL) lets you compose chains with the pipe | operator. Here's the full retrieval + generation chain:
# chain.py
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
import os

# Must match the embedding model used at ingestion time, or query vectors
# won't live in the same space as the stored document vectors.
embedder = OpenAIEmbeddings(model="text-embedding-3-small")
vector_store = PineconeVectorStore(
    index_name=os.environ["PINECONE_INDEX"], embedding=embedder
)
# Fetch the 5 nearest chunks per query.
retriever = vector_store.as_retriever(search_kwargs={"k": 5})

SYSTEM_PROMPT = """You are a helpful assistant. Answer the user's question
using ONLY the context below. If the answer is not in the context, say
"I don't have enough information to answer that."

Context:
{context}"""

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", SYSTEM_PROMPT),
        ("human", "{question}"),
    ]
)

llm = ChatOpenAI(model="gpt-4o", temperature=0, streaming=True)


def format_docs(docs):
    """Join retrieved document chunks into a single context string."""
    return "\n\n".join(doc.page_content for doc in docs)


# LCEL pipeline: retrieve -> format -> prompt -> LLM -> plain string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)
Streaming the LLM output token-by-token gives a much better UX than waiting for the full response. FastAPI's StreamingResponse pairs perfectly with LangChain's async stream API:
# main.py
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse
from pydantic import BaseModel

from chain import rag_chain

app = FastAPI()

# Allow the Vite dev server origin to call this API during development.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["http://localhost:5173"],
    allow_methods=["*"],
    allow_headers=["*"],
)


class ChatRequest(BaseModel):
    # The user's question, forwarded verbatim to the RAG chain.
    question: str


@app.post("/chat")
async def chat(req: ChatRequest):
    """Stream the RAG answer back to the client as Server-Sent Events."""

    async def sse_events():
        # NOTE(review): a token containing "\n" would break this naive SSE
        # framing — confirm chunks are newline-free or escape them.
        async for token in rag_chain.astream(req.question):
            yield f"data: {token}\n\n"
        yield "data: [DONE]\n\n"

    return StreamingResponse(sse_events(), media_type="text/event-stream")
On the frontend, we consume the SSE stream with fetch and a streaming ReadableStream reader (EventSource only supports GET, and our endpoint is a POST), progressively appending tokens to the UI state:
1// useChatStream.ts
2import { useState } from "react";
3
4export function useChatStream() {
5 const [answer, setAnswer] = useState("");
6 const [loading, setLoading] = useState(false);
7
8 const ask = async (question: string) => {
9 setAnswer("");
10 setLoading(true);
11
12 const res = await fetch("http://localhost:8000/chat", {
13 method: "POST",
14 headers: { "Content-Type": "application/json" },
15 body: JSON.stringify({ question }),
16 });
17
18 const reader = res.body!.getReader();
19 const decoder = new TextDecoder();
20
21 while (true) {
22 const { done, value } = await reader.read();
23 if (done) break;
24 const text = decoder.decode(value);
25 const lines = text.split("\n").filter(l => l.startsWith("data: "));
26 for (const line of lines) {
27 const token = line.replace("data: ", "");
28 if (token === "[DONE]") { setLoading(false); return; }
29 setAnswer(prev => prev + token);
30 }
31 }
32 setLoading(false);
33 };
34
35 return { answer, loading, ask };
36}1// ChatWidget.tsx
2import { useState } from "react";
3import { useChatStream } from "./useChatStream";
4
5export default function ChatWidget() {
6 const [input, setInput] = useState("");
7 const { answer, loading, ask } = useChatStream();
8
9 return (
10 <div className="flex flex-col gap-4 p-6 bg-gray-900 rounded-2xl">
11 <div className="min-h-32 text-slate-200 leading-7 whitespace-pre-wrap">
12 {answer || (loading ? "Thinking…" : "Ask me anything!")}
13 </div>
14 <div className="flex gap-2">
15 <input
16 className="flex-1 bg-white/10 rounded-lg px-4 py-2 text-white outline-none"
17 value={input}
18 onChange={e => setInput(e.target.value)}
19 onKeyDown={e => e.key === "Enter" && ask(input)}
20 placeholder="Type your question…"
21 />
22 <button
23 onClick={() => ask(input)}
24 disabled={loading}
25 className="bg-green-500 hover:bg-green-400 text-black font-semibold px-5 py-2 rounded-lg disabled:opacity-50"
26 >
27 Send
28 </button>
29 </div>
30 </div>
31 );
A single-turn Q&A is useful, but a multi-turn conversation requires keeping history. LangChain's RunnableWithMessageHistory wraps any chain with session-scoped memory:
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_community.chat_message_histories import ChatMessageHistory

# In-memory session store — fine for a demo, but lost on every restart.
store: dict[str, ChatMessageHistory] = {}


def get_session_history(session_id: str) -> ChatMessageHistory:
    """Return the message history for *session_id*, creating it on first use."""
    history = store.get(session_id)
    if history is None:
        history = store[session_id] = ChatMessageHistory()
    return history


# Wrap the RAG chain so each call reads/writes its session's history.
chain_with_history = RunnableWithMessageHistory(
    rag_chain,
    get_session_history,
    input_messages_key="question",
    history_messages_key="chat_history",
)
For production, replace ChatMessageHistory with a Redis-backed store so conversations persist across server restarts.
text-embedding-3-small costs 5× less than text-embedding-ada-002 with >5% better benchmark scores. CacheBackedEmbeddings wraps any embedder and stores results in a local SQLite store, so re-ingestion becomes near-instant.

The RAG pattern democratises AI for domain-specific use cases without expensive fine-tuning. LangChain handles the orchestration complexity so you can focus on your product logic, Pinecone gives you sub-100ms semantic search at scale, and streaming SSE keeps the UX snappy. The entire stack — from ingestion script to streaming chat widget — can be production-ready in a weekend.
If you want to go further, look into LangGraph for multi-agent workflows, or swap Pinecone for pgvector if you already run Postgres.
