"Which 5 customers spent the most last quarter?" is a question that takes a data analyst 10 minutes to answer — schema lookup, SQL, query, export. Or it takes an AI agent 15 seconds.
This post builds a data analyst agent that turns plain English questions into SQL queries, runs them, explains the results, and generates charts. No SQL knowledge required for the end user. The stack is Python, Claude, and whatever database you're already using — MySQL, PostgreSQL, or SQLite.
Architecture
User question
→ Claude: inspect schema (get_schema tool)
→ Claude: generate and run SQL (run_query tool)
→ Claude: explain results in plain English
→ Claude: generate chart if requested (create_chart tool)
→ User sees: answer + chart
Claude drives the whole thing. It inspects the schema first (so it knows what tables and columns exist), writes SQL against that schema, runs it, and then explains the output. The tools are simple Python functions — no LLM magic in the tools themselves.
Tool design
The three tools the agent gets:
tools = [
{
"name": "get_schema",
"description": (
"Get the database schema — all tables, columns, data types, and relationships. "
"Always call this first before writing any SQL query. "
"Returns a structured description of every table available."
),
"input_schema": {
"type": "object",
"properties": {},
},
},
{
"name": "run_query",
"description": (
"Execute a read-only SQL SELECT query and return results as a JSON array. "
"Only SELECT statements are allowed — never use INSERT, UPDATE, DELETE, DROP, or DDL. "
"Limit results to 500 rows maximum. "
"Returns: list of row dicts, row count, and column names."
),
"input_schema": {
"type": "object",
"properties": {
"sql": {
"type": "string",
"description": "A valid SELECT SQL query. Always add LIMIT if result could be large.",
}
},
"required": ["sql"],
},
},
{
"name": "create_chart",
"description": (
"Create a visualization from query results. "
"Call this after run_query when the user asks for a chart or when a chart would clarify the data. "
"Saves the chart as a PNG and returns the file path."
),
"input_schema": {
"type": "object",
"properties": {
"chart_type": {
"type": "string",
"enum": ["bar", "line", "pie", "scatter"],
},
"data": {
"type": "array",
"description": "Array of row dicts from run_query result",
},
"title": {"type": "string"},
"x_column": {"type": "string", "description": "Column name for X axis"},
"y_column": {"type": "string", "description": "Column name for Y axis (numeric)"},
},
"required": ["chart_type", "data", "title", "x_column", "y_column"],
},
},
]
Tool implementations
import sqlite3
import json
import sqlparse
import matplotlib
matplotlib.use("Agg") # Non-interactive backend
import matplotlib.pyplot as plt
import os
import time
DATABASE_PATH = "your_database.db" # or use MySQL/PostgreSQL connection below
def get_db_connection():
"""SQLite example. Replace with pymysql/psycopg2 for MySQL/PostgreSQL."""
conn = sqlite3.connect(DATABASE_PATH)
conn.row_factory = sqlite3.Row
return conn
def get_schema() -> dict:
conn = get_db_connection()
try:
cursor = conn.cursor()
# Get all tables
cursor.execute("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name")
tables = [row[0] for row in cursor.fetchall()]
schema = {}
for table in tables:
cursor.execute(f"PRAGMA table_info({table})")
columns = [
{
"name": row[1],
"type": row[2],
"not_null": bool(row[3]),
"primary_key": bool(row[5]),
}
for row in cursor.fetchall()
]
# Get row count for context
cursor.execute(f"SELECT COUNT(*) FROM {table}")
row_count = cursor.fetchone()[0]
schema[table] = {"columns": columns, "row_count": row_count}
return {"tables": schema, "database_type": "sqlite"}
finally:
conn.close()
def is_safe_sql(sql: str) -> tuple[bool, str]:
"""Block non-SELECT statements using sqlparse."""
parsed = sqlparse.parse(sql.strip())
if not parsed:
return False, "Empty query"
statement = parsed[0]
statement_type = statement.get_type()
if statement_type != "SELECT":
return False, f"Only SELECT queries are allowed. Got: {statement_type}"
# Block any dangerous keywords even inside SELECT
sql_upper = sql.upper()
dangerous = ["INSERT", "UPDATE", "DELETE", "DROP", "CREATE", "ALTER", "TRUNCATE", "EXEC", "EXECUTE"]
for keyword in dangerous:
if keyword in sql_upper:
return False, f"Query contains forbidden keyword: {keyword}"
return True, "OK"
def run_query(sql: str) -> dict:
safe, reason = is_safe_sql(sql)
if not safe:
return {
"error": reason,
"suggestion": "Rewrite as a SELECT query without data-modifying keywords."
}
# Enforce row limit
if "LIMIT" not in sql.upper():
sql = sql.rstrip(";") + " LIMIT 500"
conn = get_db_connection()
try:
cursor = conn.cursor()
cursor.execute(sql)
rows = cursor.fetchall()
if not rows:
return {"rows": [], "row_count": 0, "columns": []}
columns = [description[0] for description in cursor.description]
data = [dict(zip(columns, row)) for row in rows]
return {
"rows": data,
"row_count": len(data),
"columns": columns,
}
except Exception as e:
return {
"error": str(e),
"suggestion": "Check the table and column names against the schema."
}
finally:
conn.close()
def create_chart(chart_type: str, data: list, title: str, x_column: str, y_column: str) -> dict:
if not data:
return {"error": "No data to chart"}
try:
x_values = [str(row[x_column]) for row in data]
y_values = [float(row[y_column]) for row in data]
except (KeyError, ValueError) as e:
return {"error": f"Column error: {e}. Available columns: {list(data[0].keys())}"}
fig, ax = plt.subplots(figsize=(10, 6))
if chart_type == "bar":
ax.bar(x_values, y_values)
plt.xticks(rotation=45, ha="right")
elif chart_type == "line":
ax.plot(x_values, y_values, marker="o")
plt.xticks(rotation=45, ha="right")
elif chart_type == "pie":
ax.pie(y_values, labels=x_values, autopct="%1.1f%%")
elif chart_type == "scatter":
ax.scatter(x_values, y_values)
ax.set_title(title)
ax.set_xlabel(x_column)
ax.set_ylabel(y_column)
plt.tight_layout()
output_path = f"/tmp/chart_{int(time.time())}.png"
plt.savefig(output_path, dpi=150, bbox_inches="tight")
plt.close()
return {"chart_path": output_path, "chart_type": chart_type, "data_points": len(data)}
The agent loop
import anthropic
client = anthropic.Anthropic()
SYSTEM_PROMPT = """You are a data analyst assistant. You have tools to inspect a database schema and run SQL queries.
When answering questions:
1. Always call get_schema first to understand what data is available
2. Write precise SQL based on what you see in the schema — don't guess column names
3. After running a query, explain the results in plain English — not just the numbers
4. Offer to create a chart when results are comparative or trend-based
5. If a question is ambiguous (e.g. "last quarter" could mean different things), ask for clarification before running SQL
Safety: Only run SELECT queries. Never attempt to modify data."""
tool_map = {
"get_schema": lambda args: get_schema(),
"run_query": lambda args: run_query(args["sql"]),
"create_chart": lambda args: create_chart(
args["chart_type"], args["data"], args["title"],
args["x_column"], args["y_column"]
),
}
def run_analyst_agent(question: str) -> dict:
messages = [{"role": "user", "content": question}]
chart_paths = []
for iteration in range(8): # max 8 tool rounds
response = client.messages.create(
model="claude-sonnet-4-6",
max_tokens=2000,
system=SYSTEM_PROMPT,
tools=tools,
messages=messages,
)
if response.stop_reason == "end_turn":
answer = next(
(block.text for block in response.content if hasattr(block, "text")), ""
)
return {"answer": answer, "charts": chart_paths}
tool_results = []
for block in response.content:
if block.type == "tool_use":
result = tool_map[block.name](block.input)
# Track chart files
if "chart_path" in result:
chart_paths.append(result["chart_path"])
tool_results.append({
"type": "tool_result",
"tool_use_id": block.id,
"content": json.dumps(result),
})
messages.append({"role": "assistant", "content": response.content})
messages.append({"role": "user", "content": tool_results})
return {"answer": "Could not complete the analysis in the allowed steps.", "charts": []}
# Example usage
result = run_analyst_agent("Which 5 customers spent the most last quarter? Show me a bar chart.")
print(result["answer"])
# → "The top 5 customers by spend last quarter were: Rajan Enterprises (₹4.2L), ..."
# chart saved to /tmp/chart_XXXXX.png
Multi-step analysis: comparing periods
The agent handles multi-step questions naturally. "Compare this quarter vs last quarter" generates two queries, compares the results, and explains the delta:
User: "How did our revenue this quarter compare to Q4 2025? Which product category drove the difference?"
Agent calls:
1. get_schema() → sees orders, products, categories tables
2. run_query("SELECT SUM(amount) FROM orders WHERE created_at >= '2026-01-01' AND created_at < '2026-04-01'")
3. run_query("SELECT SUM(amount) FROM orders WHERE created_at >= '2025-10-01' AND created_at < '2025-12-31'")
4. run_query("SELECT c.name, SUM(o.amount) as revenue, ... GROUP BY c.name ORDER BY delta DESC LIMIT 10")
5. create_chart(chart_type="bar", ...)
The prompting for data analysis post has more on structuring complex analytical questions.
MySQL/PostgreSQL connection
Replace the SQLite connection with:
import pymysql # pip install pymysql
def get_db_connection():
return pymysql.connect(
host="localhost",
user="readonly_user", # Use a read-only DB user — not root
password=os.environ["DB_PASSWORD"],
database="your_db",
cursorclass=pymysql.cursors.DictCursor,
)
# For PostgreSQL:
import psycopg2
from psycopg2.extras import RealDictCursor
def get_db_connection():
return psycopg2.connect(
os.environ["DATABASE_URL"],
cursor_factory=RealDictCursor,
)
Always use a read-only database user for AI agents. Even with SQL validation, the safest defense is a user that can't write. Create one:
-- PostgreSQL
CREATE USER analyst_bot WITH PASSWORD 'your_password';
GRANT CONNECT ON DATABASE yourdb TO analyst_bot;
GRANT USAGE ON SCHEMA public TO analyst_bot;
GRANT SELECT ON ALL TABLES IN SCHEMA public TO analyst_bot;
-- MySQL
CREATE USER 'analyst_bot'@'%' IDENTIFIED BY 'your_password';
GRANT SELECT ON yourdb.* TO 'analyst_bot'@'%';
Tally CSV export (India-specific)
If you're working with Tally data, export to CSV and load into SQLite:
import pandas as pd
import sqlite3
def load_tally_exports(csv_files: dict, db_path: str):
"""Load Tally CSV exports into SQLite for the agent to query."""
conn = sqlite3.connect(db_path)
for table_name, csv_path in csv_files.items():
df = pd.read_csv(csv_path, encoding="utf-8-sig") # Tally exports use BOM
# Clean column names
df.columns = [c.strip().lower().replace(" ", "_") for c in df.columns]
df.to_sql(table_name, conn, if_exists="replace", index=False)
print(f"Loaded {len(df)} rows into {table_name}")
conn.close()
load_tally_exports({
"sales": "tally_sales_export.csv",
"purchases": "tally_purchase_export.csv",
"ledger": "tally_ledger_export.csv",
}, "tally_data.db")
Then run the agent against the SQLite file. The schema introspection works the same regardless of data source.
Streamlit UI
50 lines to turn this into a web app for non-technical users:
import streamlit as st
from PIL import Image
st.set_page_config(page_title="Data Analyst", page_icon="📊", layout="wide")
st.title("Ask your database")
if "messages" not in st.session_state:
st.session_state.messages = []
# Chat history
for message in st.session_state.messages:
with st.chat_message(message["role"]):
st.write(message["content"])
if message.get("charts"):
for chart_path in message["charts"]:
st.image(Image.open(chart_path))
# Input
if question := st.chat_input("Ask a question about your data..."):
st.session_state.messages.append({"role": "user", "content": question})
with st.chat_message("user"):
st.write(question)
with st.chat_message("assistant"):
with st.spinner("Analyzing..."):
result = run_analyst_agent(question)
st.write(result["answer"])
for chart_path in result.get("charts", []):
st.image(Image.open(chart_path))
st.session_state.messages.append({
"role": "assistant",
"content": result["answer"],
"charts": result.get("charts", []),
})
Run with streamlit run app.py. Share the URL with your team — they can ask questions without knowing SQL. The structured outputs post covers how to extend this with typed query results for downstream processing.
What to watch out for
Schema size matters. If you have 200 tables, the schema dump will be huge. Solve this by only exposing the tables the agent needs — either filter in get_schema or maintain a separate "analyst schema" view with the relevant tables.
Ambiguous date ranges. "Last month", "this quarter", "year to date" mean different things depending on your business calendar. Add a current_date and fiscal_year_start note to your system prompt, or have the agent explicitly confirm the date range before running.
Float vs integer for money. Store amounts in paise (integers) in the database. When displaying results, the agent will format them as ₹ amounts — that's fine. But the SQL should work with integers to avoid floating-point comparison errors.
Cost. Each analytical question uses Sonnet (for quality SQL generation) and typically makes 3–5 tool calls. Budget ~₹1.50–3 per complex question. For a team of 10 analysts each asking 20 questions/day, that's ~₹900–1,800/day. Use Haiku for simpler single-query questions.



