Conversational RAG (Multi-Turn)¶
Single-turn RAG breaks on follow-up questions.
User asks: "What is SSO?" — works fine. User then asks: "Which plans include it?" — the word "it" means nothing to the retriever.
The retriever sees "Which plans include it?" and searches for chunks about "it" — which matches nothing useful.
Why single-turn fails¶
Single-turn RAG treats each query independently:
turn 1: embed("What is SSO?") → retrieves SSO chunks ✓
turn 2: embed("Which plans include it?") → retrieves nothing ✗
The pronoun "it" has no meaning without conversation history.
Three strategies¶
| Strategy | How it works | Trade-off |
|---|---|---|
| Full history | Pass all prior Q&A turns into the prompt | Context grows without bound; expensive |
| Sliding window | Keep the last N turns | Simple; can lose early context |
| Query rewriting | Rewrite each follow-up into a standalone query | Small overhead per turn; best recall |
Recommendation: query rewriting. It keeps retrieval independent and doesn't bloat your LLM context.
Install¶
uv pip install openai psycopg
ConversationBuffer dataclass¶
from __future__ import annotations
from dataclasses import dataclass, field
@dataclass
class Turn:
    """One message in the conversation transcript."""

    role: str # "user" or "assistant"
    content: str  # raw message text as sent/received
@dataclass
class ConversationBuffer:
    """Ordered multi-turn chat history.

    Thin wrapper around a list of Turn records with helpers for the
    OpenAI message format, a rough token estimate, and windowing.
    """

    turns: list[Turn] = field(default_factory=list)

    def add(self, role: str, content: str) -> None:
        """Append one turn; role is "user" or "assistant"."""
        self.turns.append(Turn(role=role, content=content))

    def to_openai_messages(self) -> list[dict]:
        """Render the history in OpenAI chat-completions message format."""
        return [{"role": t.role, "content": t.content} for t in self.turns]

    def token_count(self) -> int:
        """Rough estimate: 1 token ≈ 4 characters."""
        total = sum(len(t.content) for t in self.turns)
        return total // 4

    def last_n(self, n: int) -> "ConversationBuffer":
        """Return a buffer with only the last n turns.

        Fix: the original used `self.turns[-n:]` unconditionally, and
        `[-0:]` slices the WHOLE list — so last_n(0) returned every turn.
        n <= 0 now yields an empty buffer.
        """
        buf = ConversationBuffer()
        buf.turns = self.turns[-n:] if n > 0 else []
        return buf
Rewrite follow-up queries¶
This is the key function. It takes the chat history and the latest user message, then produces a standalone search query:
from openai import OpenAI

# Module-level client shared by the rewrite calls below.
client = OpenAI()
# System prompt for the query-rewriting model. It must return ONLY the
# rewritten query so the result can be fed straight into retrieval.
REWRITE_SYSTEM = """You are a search query optimizer for a RAG system.
Given a conversation history and the latest user message, rewrite the user message
into a standalone search query that can be understood without the conversation history.
Rules:
- Replace all pronouns and references ("it", "that", "they") with explicit nouns.
- Keep the query concise (under 20 words).
- Return ONLY the rewritten query — no explanation, no quotes."""
def rewrite_query_with_history(history: ConversationBuffer, latest_query: str) -> str:
    """Produce a standalone search query from chat history + latest message.

    Returns `latest_query` unchanged when there is no history (nothing to
    resolve) or when the model produces an empty rewrite (safe fallback).
    """
    if not history.turns:
        # No history — no rewriting needed
        return latest_query
    # Only recent turns are needed to resolve pronouns; cap the window at 6.
    history_text = "\n".join(
        f"{t.role.upper()}: {t.content}" for t in history.last_n(6).turns
    )
    resp = client.chat.completions.create(
        model="gpt-4o-mini",
        temperature=0,  # deterministic rewrites
        messages=[
            {"role": "system", "content": REWRITE_SYSTEM},
            {
                "role": "user",
                "content": (
                    f"Conversation history:\n{history_text}\n\n"
                    f"Latest user message: {latest_query}\n\n"
                    "Rewritten standalone query:"
                ),
            },
        ],
    )
    # Fix: message.content is Optional in the SDK — the original called
    # .strip() on it directly and would crash on None. Guard with `or ""`,
    # matching the handling already used in chat().
    rewritten = (resp.choices[0].message.content or "").strip()
    return rewritten or latest_query
# Example
buf = ConversationBuffer()
buf.add("user", "What is SSO?")
buf.add("assistant", "SSO stands for Single Sign-On...")
# The follow-up says "it" — the rewriter resolves the pronoun from the buffer.
standalone = rewrite_query_with_history(buf, "Which plans include it?")
print(standalone)
# → "Which pricing plans include Single Sign-On (SSO)?"
End-to-end chat loop¶
from openai import OpenAI
from rag.retrieve import retrieve # your existing retrieve() function

# Client used for answer generation in chat() below.
client = OpenAI()
# Answer-model system prompt. The citation template uses {source} and {id}
# placeholders the model fills from the [chunk_id=... source="..."] headers
# emitted by format_context(). (The original contained a leaked "(unknown)"
# placeholder where {source} belongs.)
SYSTEM_PROMPT = """You are a helpful assistant. Answer using ONLY the provided context.
If the context does not contain the answer, say "I don't know based on the provided context."
Cite sources as [source: {source}#chunk:{id}]."""
def format_context(chunks: list[dict]) -> str:
    """Render retrieved chunks as a delimited context block for the prompt.

    Each chunk dict must carry "id", "source", and "content" keys.
    """
    entries = [
        f'\n[chunk_id={c["id"]} source="{c["source"]}"]\n{c["content"]}'
        for c in chunks
    ]
    return "\n".join(["--- BEGIN CONTEXT ---", *entries, "\n--- END CONTEXT ---"])
def chat(history: ConversationBuffer, user_message: str, *, k: int = 8) -> str:
    """Answer one user turn: rewrite → retrieve → generate → record.

    Retrieval runs on a standalone rewrite of the question, while the raw
    `user_message` is what gets stored in `history`, so the transcript
    reflects what the user actually said.
    """
    # Resolve pronouns against the history, then fetch fresh context for it.
    standalone = rewrite_query_with_history(history, user_message)
    context = format_context(retrieve(standalone, k=k))

    # Prompt layout: system, prior turns, then the new question + context.
    msgs = [{"role": "system", "content": SYSTEM_PROMPT}]
    msgs += history.to_openai_messages()
    msgs.append({
        "role": "user",
        "content": f"Question:\n{user_message}\n\nContext:\n{context}",
    })

    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        temperature=0,
        messages=msgs,
    )
    reply = completion.choices[0].message.content or ""

    # Record the original message, not the rewritten query.
    history.add("user", user_message)
    history.add("assistant", reply)
    return reply
# Run the chat loop
buf = ConversationBuffer()
# Loop until the user submits an empty line.
while user_input := input("You: ").strip():
    response = chat(buf, user_input)
    print(f"Assistant: {response}\n")
What NOT to put in history¶
| Don't add | Why |
|---|---|
| Retrieved chunks | Bloats context; re-retrieve fresh each turn |
| System prompt | It's already in messages[0]; duplicating it wastes tokens |
| Rewritten queries | The history should reflect what the user said, not your internal rewrites |
Persist sessions in Postgres¶
For production, persist conversations so users can resume sessions:
-- One row per chat message; a session is the set of rows sharing
-- session_id, replayed in created_at order.
CREATE TABLE conversations (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
session_id TEXT NOT NULL,
role TEXT NOT NULL CHECK (role IN ('user', 'assistant')),
content TEXT NOT NULL,
created_at TIMESTAMPTZ NOT NULL DEFAULT now()
);
-- Matches load_history's WHERE session_id = ... ORDER BY created_at scan.
CREATE INDEX ON conversations (session_id, created_at);
import psycopg
import uuid
def load_history(conn: psycopg.Connection, session_id: str) -> ConversationBuffer:
    """Rebuild a ConversationBuffer from one session's persisted turns."""
    buf = ConversationBuffer()
    cursor = conn.execute(
        "SELECT role, content FROM conversations "
        "WHERE session_id = %s ORDER BY created_at",
        (session_id,),
    )
    # Stream rows straight off the cursor rather than materializing fetchall().
    for role, content in cursor:
        buf.add(role, content)
    return buf
def save_turn(
    conn: psycopg.Connection,
    session_id: str,
    role: str,
    content: str,
) -> None:
    """Insert one turn into `conversations` and commit immediately."""
    insert_sql = "INSERT INTO conversations (session_id, role, content) VALUES (%s, %s, %s)"
    conn.execute(insert_sql, (session_id, role, content))
    conn.commit()
Next steps¶
- Improve what retrieval finds: Retrieval Strategies
- Expose the chat loop as an HTTP API: Serving RAG as an API