Hands-on evals for agentic applications
AIE Europe 2026-04-08
These slides: slides.com/seldo/ship-real-agents
1. Notebook
2. Claude key
3. Phoenix cloud
Everybody vibes until they can't any more
Code eval: score: 1 · label: "valid"
LLM judge: score: 0 · label: "incorrect"
explanation: "The response fails to include..."
label: "incorrect"
explanation: "The response fails to include a budget
breakdown, which is a core requirement. The agent
provides destination info and local recommendations
but omits all cost estimates, making the plan
incomplete for a user who asked specifically
about budget travel to Tokyo."
pip install claude-agent-sdk \
  openinference-instrumentation-claude-agent-sdk \
  arize-phoenix anthropic
import os  # bug fix: os.environ is used below but os was never imported

from google.colab import userdata  # Colab secrets helper (Colab-only)

# NOTE(review): userdata is imported but unused — in a real notebook,
# prefer userdata.get("ANTHROPIC_API_KEY") over hard-coded placeholders.
os.environ["ANTHROPIC_API_KEY"] = "sk-ant-XXXX"
os.environ["PHOENIX_API_KEY"] = "YYYY"
# Bug fix: this assignment was split across two lines by the slide
# layout, which is a SyntaxError — rejoined into one statement.
os.environ["PHOENIX_COLLECTOR_ENDPOINT"] = (
    "https://app.phoenix.arize.com/s/yourusername/"
)
from phoenix.otel import register

# Register an OpenTelemetry tracer provider pointed at Phoenix.
# auto_instrument=True activates installed openinference instrumentors
# (incl. the Claude Agent SDK one) so agent calls are traced automatically.
register(
    project_name="aie-claude-financial-agent",
    auto_instrument=True
)
# Bug fix: the import was wrapped across two lines without parentheses
# (SyntaxError as pasted) — use a parenthesized import list.
from claude_agent_sdk import (
    ClaudeSDKClient,
    ClaudeAgentOptions,
    AssistantMessage,
    TextBlock,
)

# Agent configuration: small/fast model, web search only, and
# auto-accept edit permissions so the run needs no human approval.
options = ClaudeAgentOptions(
    model="claude-haiku-4-5-20251001",
    allowed_tools=["WebSearch"],
    permission_mode="acceptEdits",
)
# Bug fix: both string literals were broken across lines by the slide
# layout (SyntaxError) — rebuilt via implicit string concatenation.
# {tickers}/{focus} are filled by .format() in financial_report.
RESEARCH_PROMPT = (
    "Research {tickers}. Focus on: {focus}. "
    "Use web search to find current financial data."
)
WRITE_PROMPT = (
    "Now write a concise financial report "
    "based on your research above."
)
async def financial_report(tickers, focus):
    """Two-turn agent session: research the tickers, then write a report.

    Args:
        tickers: ticker symbol(s) string, e.g. "TSLA" or "AAPL, MSFT".
        focus: free-text research focus inserted into the prompt.

    Returns:
        The assistant's final report text (concatenated TextBlocks).
    """
    async with ClaudeSDKClient(options=options) as client:
        # Turn 1: research. Bug fix: .format(...) was a bare Ellipsis
        # placeholder — pass the actual template fields.
        await client.query(RESEARCH_PROMPT.format(tickers=tickers, focus=focus))
        async for message in client.receive_response():
            pass  # drain the research turn; web searches run as tool calls

        # Turn 2: write-up, reusing the session's research context.
        # Bug fix: `report` was never assigned in the original fragment.
        report = ""
        await client.query(WRITE_PROMPT)
        async for message in client.receive_response():
            if isinstance(message, AssistantMessage):
                for block in message.content:
                    if isinstance(block, TextBlock):
                        report += block.text
        return report
# Wrap one agent run in a manual parent span so the whole session shows
# up as a single trace in Phoenix.
# NOTE(review): `tracer`, `tickers`, and `focus` are assumed to be in
# scope from an earlier cell — confirm before running this fragment.
with tracer.start_as_current_span(
    "financial_report",
    attributes={
        # Stored as the span's input so span-level evals can read it later.
        "input.value": f"Research: {tickers}\nFocus: {focus}",
    },
) as span:
    # Top-level await: valid inside a notebook cell.
    result = await financial_report(
        "TSLA",
        "financial performance and growth outlook"
    )
    print(result)
Here's one I made earlier
# A small batch of representative queries used to generate traces for
# evaluation — mix of single-ticker, multi-ticker, and varied focuses.
test_queries = [
    {"tickers": "AAPL", "focus": "revenue growth"},
    {"tickers": "NVDA", "focus": "AI chip demand"},
    {"tickers": "AMZN", "focus": "AWS performance"},
    {"tickers": "GOOGL", "focus": "advertising revenue"},
    {"tickers": "MSFT", "focus": "cloud computing segment"},
    {"tickers": "META", "focus": "metaverse investments"},
    {"tickers": "TSLA", "focus": "vehicle deliveries"},
    {"tickers": "RIVN", "focus": "financial health"},
    {"tickers": "AAPL, MSFT", "focus": "comparative analysis"},
    {"tickers": "NVDA", "focus": "competitive landscape"},
    {"tickers": "KO", "focus": "dividend yield"},
    {"tickers": "AMZN", "focus": "profitability trends"},
]
# Manual triage: skim each trace in Phoenix and bucket what you see.
# Bug fix: the bare `...` entry is a SyntaxError inside a dict literal —
# replaced with a comment marking where further entries go.
trace_categories = {
    "TSLA performance": "looks good",
    "NVDA competitive": "possible hallucination",
    "AAPL vs MSFT": "reasoning gap",
    "RIVN financial health": "looks good",
    # ... one entry per trace reviewed
}
Category counts:
looks good 5 █████
possible hallucination 3 ███
reasoning gap 2 ██
unverifiable data 1 █
missing recommendation 1 █
Frequency x Severity = Priority
from phoenix.client import Client

px_client = Client()
# Pull all recorded spans for the project as a DataFrame.
spans_df = px_client.spans.get_spans_dataframe(
    project_name="aie-claude-financial-agent"
)
# Root spans (no parent) are the top-level agent invocations.
parent_spans = spans_df[
    spans_df["parent_id"].isna()
]
# Bug fix: rename(inplace=True) on a boolean-mask slice raises pandas'
# SettingWithCopyWarning and may silently not stick — assign the
# renamed copy instead. Evaluators expect "input"/"output" columns.
parent_spans = parent_spans.rename(columns={
    "attributes.input.value": "input",
    "attributes.output.value": "output"
})
from phoenix.evals import create_evaluator

@create_evaluator(name="mentions_ticker", kind="code")
def mentions_ticker(input, output):
    """Code eval: pass iff every likely ticker in the query appears in the output.

    Returns a dict with "label", "score", and (on failure) "explanation",
    matching the shape Phoenix expects from code evaluators.
    """
    # Candidate tickers: runs of 1-5 capital letters in the query.
    tickers = re.findall(r"\b([A-Z]{1,5})\b", input)
    # Bug fix: the original stopword tuple ended in a literal Ellipsis
    # placeholder — replaced with a concrete set (O(1) membership).
    stopwords = {"AI", "US", "CEO", "AWS", "USD", "GDP", "IPO", "ETF", "YOY"}
    likely_tickers = [t for t in tickers
                      if len(t) >= 2 and t not in stopwords]
    missing = [t for t in likely_tickers
               if t not in output.upper()]
    if not missing:
        return {"label": "pass", "score": 1}
    return {"label": "fail", "score": 0,
            "explanation": f"Missing: {', '.join(missing)}"}
from phoenix.evals import LLM
from phoenix.evals.metrics import CorrectnessEvaluator

# The judge: a stronger model than the agent under test (haiku).
llm = LLM(model="claude-sonnet-4-6", provider="anthropic")
correctness_eval = CorrectnessEvaluator(llm=llm)

from phoenix.evals import evaluate_dataframe
from phoenix.trace import suppress_tracing

# suppress_tracing keeps the judge's own LLM calls out of the traces.
with suppress_tracing():
    results_df = evaluate_dataframe(
        dataframe=parent_spans,
        evaluators=[correctness_eval]
    )

from phoenix.evals.utils import to_annotation_dataframe

# Convert eval results to span annotations and log them back to
# Phoenix so each trace shows its scores in the UI.
evaluations = to_annotation_dataframe(dataframe=results_df)
Client().spans.log_span_annotations_dataframe(
    dataframe=evaluations
)
# Child spans (tool calls, intermediate steps) — candidates for
# step-level evals.
# NOTE(review): child_spans is computed but never used; the
# faithfulness eval below runs on parent_spans — confirm whether
# dataframe=child_spans was intended.
child_spans = spans_df[spans_df["parent_id"].notna()]

from phoenix.evals.metrics import FaithfulnessEvaluator

# Faithfulness: does the output stay grounded in the retrieved context?
faithfulness_eval = FaithfulnessEvaluator(llm=llm)
with suppress_tracing():
    faith_results = evaluate_dataframe(
        dataframe=parent_spans,
        evaluators=[faithfulness_eval]
    )
"You are an expert financial analyst evaluator.
Your task is to judge whether a financial report
provides actionable investment guidance,
not just raw data."
Example — ACTIONABLE:
"Based on NVDA's 122% YoY revenue growth driven by
data center demand, strong forward P/E of 35x relative
to sector median of 22x, and expanding margins, NVDA
presents a compelling growth position. Key risk:
concentration in AI training chips (~70% of revenue).
Recommendation: accumulate on pullbacks below $800."
Example — NOT ACTIONABLE:
"NVDA is a major player in the semiconductor industry.
The company has seen significant growth in recent years
driven by AI demand. NVDA's stock has performed well.
Investors should consider various factors when making
investment decisions."
# Bug fix: ClassificationEvaluator was used below but never imported.
from phoenix.evals import ClassificationEvaluator

# Custom LLM-judge prompt; {input}/{output} are filled per DataFrame row.
actionability_template = """
You are an expert financial analyst evaluator...
ACTIONABLE — [criteria]
NOT ACTIONABLE — [criteria]
[examples]
[BEGIN DATA]
User query: {input}
Financial Report: {output}
[END DATA]
Is this report ACTIONABLE or NOT ACTIONABLE?
"""

# Binary classification eval: label -> score mapping via `choices`.
actionability_eval = ClassificationEvaluator(
    name="actionability",
    prompt_template=actionability_template,
    llm=llm,
    choices={"actionable": 1.0, "not actionable": 0.0},
)

# Run the judge over the root spans (judge calls kept out of tracing).
with suppress_tracing():
    action_results_df = evaluate_dataframe(
        dataframe=parent_spans,
        evaluators=[actionability_eval]
    )

# Log the results back onto the spans in Phoenix.
action_evaluations = to_annotation_dataframe(
    dataframe=action_results_df
)
Client().spans.log_span_annotations_dataframe(
    dataframe=action_evaluations
)
|  | Predicted positive | Predicted negative |
|---|---|---|
| Actual positive | True positive ✅ | False negative (miss) ❌ |
| Actual negative | False positive ❌ | True negative ✅ |
Precision = True positives / (true positive + false positive)
Recall = True positives / (true positive + false negative)
Precision: when the judge says "fail," is it right?
Recall: of all real fails, how many does it catch?
Prioritize recall — catching defects matters more
than occasional false positives
# Iteration: tightened prompts targeting the failure categories the
# evals surfaced (missing data -> research prompt; "not actionable"
# reports -> write prompt).
IMPROVED_RESEARCH_PROMPT = """Research {tickers}.
Focus on: {focus}.
You MUST include:
- Specific financial ratios (P/E, P/B, debt-to-equity)
- News from the last 6 months
- Current stock price or recent performance data
- Competitive context and market positioning"""

IMPROVED_WRITE_PROMPT = """Write a concise financial report.
The report MUST be actionable. Specifically:
- Include explicit buy/sell/hold recommendations
- Identify concrete risks with supporting data
- Include forward-looking analysis
- Provide context for WHY each recommendation is made"""
dataset = Client().datasets.get_dataset(
dataset="aie-financial-agent-fails"
)
async def my_task(example):
return await improved_financial_report(
tickers, focus
)
experiment = await async_client.experiments.run_experiment(
dataset=dataset,
task=my_task,
evaluators=evaluators
)
Find failures → Read explanations → Fix the prompt → Run experiment → Repeat
🦋 @seldo.com on BlueSky
arize.com/docs/phoenix
These slides: slides.com/seldo/ship-real-agents