Skip to content

Conversational RAG (Multi-Turn)

Single-turn RAG breaks on follow-up questions.

User asks: "What is SSO?" — works fine. User then asks: "Which plans include it?" — the word "it" means nothing to the retriever.

The retriever sees "Which plans include it?" and searches for chunks about "it" — which matches nothing useful.


Why single-turn fails

Single-turn RAG treats each query independently:

turn 1: embed("What is SSO?")          → retrieves SSO chunks  ✓
turn 2: embed("Which plans include it?") → retrieves irrelevant chunks ✗

The pronoun "it" has no meaning without conversation history.


Three strategies

Strategy How it works Trade-off
Full history Pass all prior Q&A turns into the prompt Context grows unboundedly; expensive
Sliding window Keep the last N turns Simple; can lose early context
Query rewriting Rewrite each follow-up into a standalone query Small overhead per turn; best recall

Recommendation: query rewriting. It keeps retrieval independent and doesn't bloat your LLM context.


Install

uv pip install openai psycopg

ConversationBuffer dataclass

from __future__ import annotations

from dataclasses import dataclass, field


@dataclass
class Turn:
    """A single message in a conversation."""

    role: str   # "user" or "assistant"
    content: str  # the raw message text for this turn


@dataclass
class ConversationBuffer:
    turns: list[Turn] = field(default_factory=list)

    def add(self, role: str, content: str) -> None:
        self.turns.append(Turn(role=role, content=content))

    def to_openai_messages(self) -> list[dict]:
        return [{"role": t.role, "content": t.content} for t in self.turns]

    def token_count(self) -> int:
        """Rough estimate: 1 token ≈ 4 characters."""
        total = sum(len(t.content) for t in self.turns)
        return total // 4

    def last_n(self, n: int) -> "ConversationBuffer":
        """Return a buffer with only the last n turns."""
        buf = ConversationBuffer()
        buf.turns = self.turns[-n:]
        return buf

Rewrite follow-up queries

This is the key function. It takes the chat history and the latest user message, then produces a standalone search query:

from openai import OpenAI

client = OpenAI()

REWRITE_SYSTEM = """You are a search query optimizer for a RAG system.
Given a conversation history and the latest user message, rewrite the user message
into a standalone search query that can be understood without the conversation history.

Rules:
- Replace all pronouns and references ("it", "that", "they") with explicit nouns.
- Keep the query concise (under 20 words).
- Return ONLY the rewritten query — no explanation, no quotes."""


def rewrite_query_with_history(history: ConversationBuffer, latest_query: str) -> str:
    """Produce a standalone search query from chat history + latest message."""
    # First message of a session has no references to resolve — skip the LLM call.
    if not history.turns:
        return latest_query

    # Only the most recent 6 turns are needed to resolve pronouns/references.
    recent = history.last_n(6).turns
    history_text = "\n".join(f"{turn.role.upper()}: {turn.content}" for turn in recent)

    user_prompt = (
        f"Conversation history:\n{history_text}\n\n"
        f"Latest user message: {latest_query}\n\n"
        "Rewritten standalone query:"
    )
    # temperature=0 keeps the rewrite deterministic for identical inputs.
    resp = client.chat.completions.create(
        model="gpt-4o-mini",
        temperature=0,
        messages=[
            {"role": "system", "content": REWRITE_SYSTEM},
            {"role": "user", "content": user_prompt},
        ],
    )
    return resp.choices[0].message.content.strip()


# Example: seed a buffer with one Q&A exchange, then ask a follow-up.
buf = ConversationBuffer()
buf.add("user", "What is SSO?")
buf.add("assistant", "SSO stands for Single Sign-On...")

# The pronoun "it" is resolved against the buffered turns above.
standalone = rewrite_query_with_history(buf, "Which plans include it?")
print(standalone)
# → "Which pricing plans include Single Sign-On (SSO)?"

End-to-end chat loop

from openai import OpenAI
from rag.retrieve import retrieve  # your existing retrieve() function

client = OpenAI()

# Answering prompt: ground every answer in retrieved context and cite chunks.
# Fix: the citation template contained leaked placeholder text "(unknown)"
# where the source-name placeholder belongs; format_context() supplies a
# `source` attribute per chunk, so the model should cite {source}#chunk:{id}.
SYSTEM_PROMPT = """You are a helpful assistant. Answer using ONLY the provided context.
If the context does not contain the answer, say "I don't know based on the provided context."
Cite sources as [source: {source}#chunk:{id}]."""


def format_context(chunks: list[dict]) -> str:
    """Render retrieved chunks as a delimited context section for the prompt.

    Each chunk dict must carry "id", "source", and "content" keys.
    """
    lines = ["--- BEGIN CONTEXT ---"]
    lines.extend(
        f'\n[chunk_id={chunk["id"]} source="{chunk["source"]}"]\n{chunk["content"]}'
        for chunk in chunks
    )
    lines.append("\n--- END CONTEXT ---")
    return "\n".join(lines)


def chat(history: ConversationBuffer, user_message: str, *, k: int = 8) -> str:
    """Answer one user turn: rewrite → retrieve → generate → record history."""
    # Resolve pronouns/references so retrieval works on follow-up questions.
    search_query = rewrite_query_with_history(history, user_message)

    # Retrieve fresh context every turn; chunks are never stored in history.
    context = format_context(retrieve(search_query, k=k))

    # The new user turn carries the ORIGINAL message plus the retrieved context.
    user_turn = {
        "role": "user",
        "content": f"Question:\n{user_message}\n\nContext:\n{context}",
    }
    messages = (
        [{"role": "system", "content": SYSTEM_PROMPT}]
        + history.to_openai_messages()
        + [user_turn]
    )

    resp = client.chat.completions.create(
        model="gpt-4o-mini",
        temperature=0,
        messages=messages,
    )
    answer = resp.choices[0].message.content or ""

    # History reflects what the user actually typed, not the rewritten query.
    history.add("user", user_message)
    history.add("assistant", answer)

    return answer


# Run the chat loop: an empty line (or plain whitespace) ends the session.
buf = ConversationBuffer()

while user_input := input("You: ").strip():
    response = chat(buf, user_input)
    print(f"Assistant: {response}\n")

What NOT to put in history

Don't add Why
Retrieved chunks Bloats context; re-retrieve fresh each turn
System prompt It's already in messages[0]; duplicating it wastes tokens
Rewritten queries The history should reflect what the user said, not your internal rewrites

Persist sessions in Postgres

For production, persist conversations so users can resume sessions:

-- One row per message; a session is the time-ordered set of rows
-- sharing a session_id.
CREATE TABLE conversations (
    id          UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    session_id  TEXT NOT NULL,
    role        TEXT NOT NULL CHECK (role IN ('user', 'assistant')),
    content     TEXT NOT NULL,
    created_at  TIMESTAMPTZ NOT NULL DEFAULT now()
);

-- Supports the load_history query: filter by session, order by time.
CREATE INDEX ON conversations (session_id, created_at);
import psycopg
import uuid


def load_history(conn: psycopg.Connection, session_id: str) -> ConversationBuffer:
    """Rebuild a session's ConversationBuffer from the conversations table.

    Turns come back in insertion order (created_at ascending).
    """
    cursor = conn.execute(
        "SELECT role, content FROM conversations "
        "WHERE session_id = %s ORDER BY created_at",
        (session_id,),
    )
    buf = ConversationBuffer()
    for role, content in cursor.fetchall():
        buf.add(role, content)
    return buf


def save_turn(
    conn: psycopg.Connection,
    session_id: str,
    role: str,
    content: str,
) -> None:
    """Persist a single turn, committing immediately so it survives restarts."""
    insert_sql = (
        "INSERT INTO conversations (session_id, role, content) VALUES (%s, %s, %s)"
    )
    conn.execute(insert_sql, (session_id, role, content))
    conn.commit()

Next steps