Add evals to your app using Pytest or Vitest/Jest
This tutorial shows how to use LangSmith's testing integrations for Python and JS/TS to write evals for your LLM application. We will create and test a ReAct agent that answers questions about financial data for publicly traded stocks. The tutorial uses LangGraph for agent orchestration, OpenAI's GPT-4o models, Tavily for search, E2B's code interpreter, and Polygon for stock data, but it can be adapted to other frameworks, models, and tools with minor modifications. Tavily, E2B, and Polygon are all free to sign up for.
Setup
Installation
First, install the packages required for making the agent:
- Python
- TypeScript
pip install -U langgraph langchain-openai langchain-community e2b-code-interpreter
yarn add @langchain/openai @langchain/community @langchain/langgraph @langchain/core @e2b/code-interpreter @polygon.io/client-js openai zod
Next, install the testing framework:
- Pytest
- Vitest
- Jest
# Make sure you have langsmith>=0.3
pip install -U "langsmith[pytest]"
yarn add -D langsmith vitest
yarn add -D langsmith jest
Environment Variables
Set the following environment variables:
export OPENAI_API_KEY=<YOUR_OPENAI_API_KEY>
export TAVILY_API_KEY=<YOUR_TAVILY_API_KEY>
export E2B_API_KEY=<YOUR_E2B_API_KEY>
export POLYGON_API_KEY=<YOUR_POLYGON_API_KEY>
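To record test results and traces in LangSmith, also set your LangSmith credentials:

export LANGSMITH_API_KEY=<YOUR_LANGSMITH_API_KEY>
export LANGSMITH_TRACING=true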
Create your app
To define our ReAct agent, we will use LangGraph/LangGraph.js for the orchestration and LangChain for the LLM and tools.
Define tools
First, let's define the tools our agent will use. There will be three:
- A search tool using Tavily
- A code interpreter tool using E2B
- A stock information tool using Polygon
Search Tool
We will use Tavily for the search tool.
- Python
- TypeScript
from langchain_community.tools import TavilySearchResults

search_tool = TavilySearchResults(
    max_results=5,
    include_raw_content=True,
)
import { TavilySearchResults } from "@langchain/community/tools/tavily_search";

const searchTool = new TavilySearchResults({
  maxResults: 5,
});
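If you want to sanity-check the tool before wiring it into the agent, you can invoke it directly; a minimal Python sketch (requires your TAVILY_API_KEY, and the query string is just an example):

# Returns up to 5 search results, including raw page content
results = search_tool.invoke({"query": "latest NVDA analyst coverage"})
print(results)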
Code Interpreter Tool
We will use E2B for the code interpreter tool.
- Python
- TypeScript
from e2b_code_interpreter import Sandbox

def code_tool(code: str) -> str:
    """Execute python code and return the result."""
    sbx = Sandbox()
    execution = sbx.run_code(code)
    if execution.error:
        return f"Error: {execution.error}"
    return f"Results: {execution.results}, Logs: {execution.logs}"
import { Sandbox } from "@e2b/code-interpreter";
import { tool } from "@langchain/core/tools";
import { z } from "zod";

const codeTool = tool(
  async (input): Promise<string> => {
    const sbx = await Sandbox.create();
    const execution = await sbx.runCode(input.code);
    if (execution.error) {
      return `Error: ${execution.error}`;
    }
    return `Results: ${execution.results}, Logs: ${execution.logs}`;
  },
  {
    // Named "code_tool" to match the Python version, so the tests below can assert on the same tool name
    name: "code_tool",
    description: "Execute python code and return the result.",
    schema: z.object({
      code: z.string().describe("The python code to execute"),
    }),
  }
);
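As a quick smoke test, you can call the code tool directly before handing it to the agent (this spins up a real E2B sandbox, so it needs your E2B_API_KEY); in Python:

# Runs the snippet in a fresh sandbox; stdout shows up in the logs
print(code_tool("print(1 + 1)"))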
Stock Information Tool
We will use Polygon to get real-time data about stocks.
- Python
- TypeScript
from langchain_community.tools.polygon.aggregates import PolygonAggregates
from langchain_community.utilities.polygon import PolygonAPIWrapper
from typing_extensions import Annotated, Literal, TypedDict

class TickerToolInput(TypedDict):
    """Input format for the ticker tool.

    The tool will pull data in aggregate blocks (timespan_multiplier * timespan)
    from the from_date to the to_date.
    """

    ticker: Annotated[str, ..., "The ticker symbol of the stock"]
    timespan: Annotated[
        Literal["minute", "hour", "day", "week", "month", "quarter", "year"],
        ...,
        "The size of the time window.",
    ]
    timespan_multiplier: Annotated[int, ..., "The multiplier for the time window"]
    from_date: Annotated[str, ..., "The date to start pulling data from, YYYY-MM-DD format - ONLY include the year, month, and day"]
    to_date: Annotated[str, ..., "The date to stop pulling data, YYYY-MM-DD format - ONLY include the year, month, and day"]

api_wrapper = PolygonAPIWrapper()
polygon_aggregate = PolygonAggregates(api_wrapper=api_wrapper)

def ticker_tool(query: TickerToolInput) -> str:
    """Pull data for the ticker."""
    return polygon_aggregate.invoke(query)
import { restClient } from "@polygon.io/client-js";
import { tool } from "@langchain/core/tools";
import { z } from "zod";

const TickerToolInputSchema = z.object({
  ticker: z.string().describe("The ticker symbol of the stock"),
  timespan: z
    .enum(["minute", "hour", "day", "week", "month", "quarter", "year"])
    .describe("The size of the time window."),
  timespan_multiplier: z.number().describe("The multiplier for the time window"),
  from_date: z
    .string()
    .describe("The date to start pulling data from, YYYY-MM-DD format - ONLY include the year, month, and day"),
  to_date: z
    .string()
    .describe("The date to stop pulling data, YYYY-MM-DD format - ONLY include the year, month, and day"),
});

const rest = restClient(process.env.POLYGON_API_KEY);

const tickerTool = tool(
  async (query): Promise<string> => {
    const parsed = TickerToolInputSchema.parse(query);
    const result = await rest.stocks.aggregates(
      parsed.ticker,
      parsed.timespan_multiplier,
      parsed.timespan,
      parsed.from_date,
      parsed.to_date
    );
    return JSON.stringify(result);
  },
  {
    name: "ticker",
    description: "Pull data for the ticker",
    schema: TickerToolInputSchema,
  }
);
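The ticker tool can likewise be exercised on its own; a Python sketch with illustrative dates (the date range is an assumption, not from the tutorial):

# Pull daily AAPL aggregates for one week (sample dates, illustrative only)
print(ticker_tool({
    "ticker": "AAPL",
    "timespan": "day",
    "timespan_multiplier": 1,
    "from_date": "2024-01-02",
    "to_date": "2024-01-09",
}))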
Define agent
Now that we have defined all of our tools, we can use LangGraph's create_react_agent / createReactAgent to create our agent.
- Python
- TypeScript
from langchain_openai import ChatOpenAI
from langgraph.prebuilt import create_react_agent
from typing_extensions import Annotated, Optional, TypedDict

class AgentOutputFormat(TypedDict):
    numeric_answer: Annotated[Optional[float], ..., "The numeric answer, if the user asked for one"]
    text_answer: Annotated[Optional[str], ..., "The text answer, if the user asked for one"]
    reasoning: Annotated[str, ..., "The reasoning behind the answer"]

tools = [code_tool, search_tool, ticker_tool]

agent = create_react_agent(
    model=ChatOpenAI(model="gpt-4o-mini"),
    tools=tools,
    response_format=AgentOutputFormat,
    state_modifier="You are a financial expert. Respond to the user's query accurately",
)
import { z } from "zod";
import { ChatOpenAI } from "@langchain/openai";
import { createReactAgent } from "@langchain/langgraph/prebuilt";

const AgentOutputFormatSchema = z.object({
  numeric_answer: z.number().optional().describe("The numeric answer, if the user asked for one"),
  text_answer: z.string().optional().describe("The text answer, if the user asked for one"),
  reasoning: z.string().describe("The reasoning behind the answer"),
});

const tools = [codeTool, searchTool, tickerTool];

const agent = createReactAgent({
  llm: new ChatOpenAI({ model: "gpt-4o" }),
  tools: tools,
  responseFormat: AgentOutputFormatSchema,
  stateModifier: "You are a financial expert. Respond to the user's query accurately",
});

export default agent;
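Before writing tests, it can help to invoke the agent once by hand and inspect the output shape the tests will rely on; a quick Python sketch:

# One-off manual run; structured_response matches AgentOutputFormat
result = agent.invoke({"messages": [("user", "What is the price of AAPL?")]})
print(result["structured_response"])  # numeric_answer / text_answer / reasoning
print(len(result["messages"]))        # how many steps the agent took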
Write tests
Now that we have defined our agent, let's write a few tests to ensure basic functionality. In this tutorial we will test whether the agent's tool calling works, whether the agent knows to ignore irrelevant questions, and whether it can answer complex questions that require using all of the tools.
Setup test file
First, set up a test file and add the needed imports at the top of the file.
- Pytest
- Vitest
- Jest
Name your test file test_evals.py
from agent import agent  # import from wherever your agent is defined
import pytest
import json
from langsmith import testing
from langchain_openai import ChatOpenAI  # used by the LLM-as-a-judge test below
Name your test file agent.vitest.eval.ts
import { expect } from "vitest";
import * as ls from "langsmith/vitest";
import { AIMessage } from "@langchain/core/messages"; // used by the tool-calling tests below
import { ChatOpenAI } from "@langchain/openai"; // used by the LLM-as-a-judge test below
import agent from "../../src/agent"; // import from wherever your agent is defined
Name your test file agent.jest.eval.ts
import { expect } from "@jest/globals";
import * as ls from "langsmith/jest";
import { AIMessage } from "@langchain/core/messages"; // used by the tool-calling tests below
import { ChatOpenAI } from "@langchain/openai"; // used by the LLM-as-a-judge test below
import agent from "../../src/agent"; // import from wherever your agent is defined
Test 1: Ignore irrelevant questions
The first test will be a simple check that the agent does not use tools on irrelevant queries.
- Pytest
- Vitest
- Jest
@pytest.mark.langsmith
@pytest.mark.parametrize(  # Can still use all normal pytest markers
    "query",
    [
        "Hello!",
        "How are you doing?",
    ],
)
def test_no_tools_on_unrelated_query(query):
    testing.log_inputs({"query": query})
    # Test that the agent does not use tools on unrelated queries
    result = agent.invoke({"messages": [("user", query)]})
    testing.log_outputs({"result": result})
    testing.log_reference_outputs({"numMessages": 2})
    # Check that the flow was HUMAN -> AI FINAL RESPONSE (no tools called)
    assert len(result["messages"]) == 2
Remember to place each individual test inside a describe block:

ls.describe("Agent Tests", () => {
  // PLACE TESTS HERE
});
ls.test.each([
  { inputs: { query: "Hello!" }, referenceOutputs: { numMessages: 2 } },
  { inputs: { query: "How are you doing?" }, referenceOutputs: { numMessages: 2 } },
])(
  "should not use tools on unrelated query: %s",
  async ({ inputs: { query }, referenceOutputs: { numMessages } }) => {
    const result = await agent.invoke({ messages: [{ role: "user", content: query }] });
    ls.logOutputs(result);
    // Check that the flow was HUMAN -> AI FINAL RESPONSE (no tools called)
    expect(result.messages).toHaveLength(numMessages);
  }
);
Remember to place each individual test inside a describe block:

ls.describe("Agent Tests", () => {
  // PLACE TESTS HERE
});
ls.test.each([
  { inputs: { query: "Hello!" }, referenceOutputs: { numMessages: 2 } },
  { inputs: { query: "How are you doing?" }, referenceOutputs: { numMessages: 2 } },
])(
  "should not use tools on unrelated query: %s",
  async ({ inputs: { query }, referenceOutputs: { numMessages } }) => {
    const result = await agent.invoke({ messages: [{ role: "user", content: query }] });
    ls.logOutputs(result);
    // Check that the flow was HUMAN -> AI FINAL RESPONSE (no tools called)
    expect(result.messages).toHaveLength(numMessages);
  }
);
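Counting messages is a proxy for "no tools were called". If you prefer, you can assert that directly: the AI reply's tool_calls list should be empty. A sketch for the Pytest test body (assuming a recent langchain-core, which normalizes tool calls onto the message):

# The second message is the AI's final reply; it should carry no tool calls
ai_message = result["messages"][1]
assert not ai_message.tool_calls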
Test 2: Tool calling
For tool calling, we are going to verify that the agent calls the correct tool with the correct parameters.
- Pytest
- Vitest
- Jest
@pytest.mark.langsmith
def test_searches_for_correct_ticker():
    user_query = "What is the price of Apple?"
    testing.log_inputs({"user_query": user_query})
    # Test that the agent searches for the correct ticker
    result = agent.invoke({"messages": [("user", user_query)]})
    testing.log_outputs({"result": result})
    testing.log_reference_outputs({"ticker": "AAPL", "numMessages": 4})
    # The agent should have made a single tool call to the ticker tool
    ticker = json.loads(
        result["messages"][1].additional_kwargs["tool_calls"][0]["function"]["arguments"]
    )["query"]["ticker"]
    # Check that the right ticker was queried
    assert ticker == "AAPL"
    # Check that the flow was HUMAN -> AI -> TOOL -> AI FINAL RESPONSE
    assert len(result["messages"]) == 4
Remember to place each individual test inside a describe block:

ls.describe("Agent Tests", () => {
  // PLACE TESTS HERE
});
ls.test(
  "should search for correct ticker",
  {
    inputs: { query: "What is the price of Aple?" },
    referenceOutputs: { numMessages: 4 },
  },
  async ({ inputs: { query }, referenceOutputs: { numMessages } }) => {
    const result = await agent.invoke({
      messages: [{ role: "user", content: query }],
    });
    ls.logOutputs(result);
    // The agent should have made a single tool call to the ticker tool
    const toolCalls = (result.messages[1] as AIMessage).tool_calls ?? [];
    const tickerQuery = toolCalls[0].args.ticker;
    // Check that the right ticker was queried
    expect(tickerQuery).toBe("AAPL");
    // Check that the flow was HUMAN -> AI -> TOOL -> AI FINAL RESPONSE
    expect(result.messages).toHaveLength(numMessages);
  }
);
Remember to place each individual test inside a describe block:

ls.describe("Agent Tests", () => {
  // PLACE TESTS HERE
});
ls.test(
  "should search for correct ticker",
  {
    inputs: { query: "What is the price of Aple?" },
    referenceOutputs: { numMessages: 4 },
  },
  async ({ inputs: { query }, referenceOutputs: { numMessages } }) => {
    const result = await agent.invoke({
      messages: [{ role: "user", content: query }],
    });
    ls.logOutputs(result);
    // The agent should have made a single tool call to the ticker tool
    const toolCalls = (result.messages[1] as AIMessage).tool_calls ?? [];
    const tickerQuery = toolCalls[0].args.ticker;
    // Check that the right ticker was queried
    expect(tickerQuery).toBe("AAPL");
    // Check that the flow was HUMAN -> AI -> TOOL -> AI FINAL RESPONSE
    expect(result.messages).toHaveLength(numMessages);
  }
);
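The Python version digs the ticker out of additional_kwargs, which is OpenAI-specific. On recent langchain-core versions, the standardized tool_calls attribute on the AI message carries the same information already parsed; a sketch of the equivalent check inside the Pytest test:

# Provider-agnostic alternative to parsing additional_kwargs by hand
ai_message = result["messages"][1]  # the AI message that issued the tool call
ticker = ai_message.tool_calls[0]["args"]["query"]["ticker"]
assert ticker == "AAPL"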
Test 3: Complex question answering
For a complex question, we will check that the agent uses the code tool for its calculations, and we will also log the total number of steps taken so we can track over time how the agent's performance improves.
- Pytest
- Vitest
- Jest
@pytest.mark.langsmith
def test_executes_code_when_needed():
    user_query = "What was the average return rate for FAANG stock in 2024?"
    testing.log_inputs({"user_query": user_query})
    # Test that the agent executes code when needed
    result = agent.invoke({"messages": [("user", user_query)]})
    testing.log_outputs({"result": result})
    testing.log_reference_outputs({"answer": 53})
    # Grab all the tool calls made by the LLM
    tool_calls = [
        tc["function"]["name"]
        for m in result["messages"]
        if m.additional_kwargs and "tool_calls" in m.additional_kwargs
        for tc in m.additional_kwargs["tool_calls"]
    ]
    # This will log the number of steps taken by the LLM, which we can track over time to measure performance
    testing.log_feedback(key="num_steps", score=len(result["messages"]) - 1)
    # Assert that the code tool was used
    assert "code_tool" in tool_calls
    # Assert that the answer is within 1 of the expected value
    assert abs(result["structured_response"]["numeric_answer"] - 53) <= 1
Remember to place each individual test inside a describe block:

ls.describe("Agent Tests", () => {
  // PLACE TESTS HERE
});
ls.test(
  "should execute code when needed",
  {
    inputs: { query: "What was the average return rate for FAANG stock in 2024?" },
    referenceOutputs: { answer: 53 },
  },
  async ({ inputs: { query }, referenceOutputs: { answer } }) => {
    const result = await agent.invoke({
      messages: [{ role: "user", content: query }],
    });
    ls.logOutputs(result);
    // Grab all the tool calls made by the LLM
    const toolCalls = result.messages
      .filter((m) => (m as AIMessage).tool_calls)
      .flatMap((m) => (m as AIMessage).tool_calls?.map((tc) => tc.name));
    // This will log the number of steps taken by the LLM, which we can track over time to measure performance
    ls.logFeedback({
      key: "num_steps",
      score: result.messages.length - 1, // The first message is the user message
    });
    // Assert that the tool calls include the "code_tool" function
    expect(toolCalls).toContain("code_tool");
    // Assert that the answer is within 1 of the expected answer
    expect(Math.abs((result.structuredResponse.numeric_answer ?? 0) - answer)).toBeLessThanOrEqual(1);
  }
);
Remember to place each individual test inside a describe block:

ls.describe("Agent Tests", () => {
  // PLACE TESTS HERE
});
ls.test(
  "should execute code when needed",
  {
    inputs: { query: "What was the average return rate for FAANG stock in 2024?" },
    referenceOutputs: { answer: 53 },
  },
  async ({ inputs: { query }, referenceOutputs: { answer } }) => {
    const result = await agent.invoke({
      messages: [{ role: "user", content: query }],
    });
    ls.logOutputs(result);
    // Grab all the tool calls made by the LLM
    const toolCalls = result.messages
      .filter((m) => (m as AIMessage).tool_calls)
      .flatMap((m) => (m as AIMessage).tool_calls?.map((tc) => tc.name));
    // This will log the number of steps taken by the LLM, which we can track over time to measure performance
    ls.logFeedback({
      key: "num_steps",
      score: result.messages.length - 1, // The first message is the user message
    });
    // Assert that the tool calls include the "code_tool" function
    expect(toolCalls).toContain("code_tool");
    // Assert that the answer is within 1 of the expected answer
    expect(Math.abs((result.structuredResponse.numeric_answer ?? 0) - answer)).toBeLessThanOrEqual(1);
  }
);
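As a side note for the Pytest version, the tolerance assertion can be written equivalently with pytest.approx, which reads a bit more clearly:

# Equivalent to: abs(result["structured_response"]["numeric_answer"] - 53) <= 1
assert result["structured_response"]["numeric_answer"] == pytest.approx(53, abs=1)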
Test 4: LLM-as-a-judge
We are going to ensure that the agent's answer is grounded in the search results by running an LLM-as-a-judge evaluation. In order to trace the LLM-as-a-judge call separately from our agent, we will use the LangSmith-provided trace_feedback context manager in Python and the wrapEvaluator function in JS/TS.
- Pytest
- Vitest
- Jest
judge_llm = ChatOpenAI(model="gpt-4o")

@pytest.mark.langsmith
def test_grounded_in_the_source():
    # Test that the agent is grounded in the source
    user_query = "How did Nvidia stock do in 2024 according to analysts?"
    testing.log_inputs({"user_query": user_query})
    result = agent.invoke({"messages": [("user", user_query)]})
    testing.log_outputs({"result": result})
    # Grab all the search calls made by the LLM
    tavily_tool_message_indices = []
    for i, m in enumerate(result["messages"]):
        if (
            m.additional_kwargs
            and "tool_calls" in m.additional_kwargs
            and m.additional_kwargs["tool_calls"][0]["function"]["name"] == "tavily_search_results_json"
        ):
            tavily_tool_message_indices.append(i + 1)
    documents = "\n".join(
        [m.content for i, m in enumerate(result["messages"]) if i in tavily_tool_message_indices]
    )
    with testing.trace_feedback():
        # Instructions for the LLM judge
        instructions = (
            "Return 1 if the ANSWER is grounded in the DOCUMENTS\n"
            "Return 0 if the ANSWER is not grounded in the DOCUMENTS"
        )
        # Run the judge LLM
        grade = judge_llm.invoke([
            {"role": "system", "content": instructions},
            {"role": "user", "content": f"ANSWER: {result['structured_response']['text_answer']}\nDOCUMENTS: {documents}"},
        ])
        score = int(grade.content)
        testing.log_feedback(key="groundedness", score=score)
    assert score
// THIS CODE GOES OUTSIDE THE DESCRIBE BLOCK - IT IS JUST A HELPER FUNCTION
const judgeLLM = new ChatOpenAI({ model: "gpt-4o" });

const groundedEvaluator = async (params: {
  answer: string;
  referenceDocuments: string;
}) => {
  // Instructions for the LLM judge
  const instructions = [
    "Return 1 if the ANSWER is grounded in the DOCUMENTS",
    "Return 0 if the ANSWER is not grounded in the DOCUMENTS",
  ].join("\n");
  // Run the judge LLM
  const grade = await judgeLLM.invoke([
    { role: "system", content: instructions },
    { role: "user", content: `ANSWER: ${params.answer}\nDOCUMENTS: ${params.referenceDocuments}` },
  ]);
  const score = parseInt(grade.content.toString());
  return { key: "groundedness", score };
};

// THIS CODE GOES INSIDE THE DESCRIBE BLOCK
ls.test(
  "grounded in the source",
  {
    inputs: { query: "How did Nvidia stock do in 2024 according to analysts?" },
  },
  async ({ inputs: { query } }) => {
    const result = await agent.invoke({
      messages: [{ role: "user", content: query }],
    });
    const wrappedEvaluator = ls.wrapEvaluator(groundedEvaluator);
    await wrappedEvaluator({
      answer: result.structuredResponse.text_answer ?? "",
      referenceDocuments: result.structuredResponse.reasoning,
    });
    ls.logOutputs(result);
  }
);
// THIS CODE GOES OUTSIDE THE DESCRIBE BLOCK - IT IS JUST A HELPER FUNCTION
const judgeLLM = new ChatOpenAI({ model: "gpt-4o" });

const groundedEvaluator = async (params: {
  answer: string;
  referenceDocuments: string;
}) => {
  const instructions = [
    "Return 1 if the ANSWER is grounded in the DOCUMENTS",
    "Return 0 if the ANSWER is not grounded in the DOCUMENTS",
  ].join("\n");
  const grade = await judgeLLM.invoke([
    { role: "system", content: instructions },
    { role: "user", content: `ANSWER: ${params.answer}\nDOCUMENTS: ${params.referenceDocuments}` },
  ]);
  const score = parseInt(grade.content.toString());
  return { key: "groundedness", score };
};

// THIS CODE GOES INSIDE THE DESCRIBE BLOCK
ls.test(
  "grounded in the source",
  {
    inputs: { query: "How did Nvidia stock do in 2024 according to analysts?" },
  },
  async ({ inputs: { query } }) => {
    const result = await agent.invoke({
      messages: [{ role: "user", content: query }],
    });
    const wrappedEvaluator = ls.wrapEvaluator(groundedEvaluator);
    await wrappedEvaluator({
      answer: result.structuredResponse.text_answer ?? "",
      referenceDocuments: result.structuredResponse.reasoning,
    });
    ls.logOutputs(result);
  }
);
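One caveat: int(grade.content) assumes the judge replies with a bare 0 or 1. If you want to harden the judge, one option (our suggestion, not part of the original tutorial) is to constrain it with structured output, e.g. in the Pytest version:

from pydantic import BaseModel, Field

class Grade(BaseModel):
    """Binary groundedness grade returned by the judge."""

    score: int = Field(description="1 if the ANSWER is grounded in the DOCUMENTS, else 0")

# Drop-in replacement for the judge call inside trace_feedback()
grade = judge_llm.with_structured_output(Grade).invoke([
    {"role": "system", "content": instructions},
    {"role": "user", "content": f"ANSWER: {result['structured_response']['text_answer']}\nDOCUMENTS: {documents}"},
])
score = grade.score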
Run tests
Once you have set up your config files (if you are using Vitest or Jest), you can run your tests using the following commands:
Config files for Vitest/Jest
- Vitest
- Jest
import { defineConfig } from "vitest/config";

export default defineConfig({
  test: {
    include: ["**/*.eval.?(c|m)[jt]s"],
    reporters: ["langsmith/vitest/reporter"],
    setupFiles: ["dotenv/config"],
  },
});
require("dotenv").config();

module.exports = {
  preset: "ts-jest",
  testEnvironment: "node",
  testMatch: ["<rootDir>/tests/jest/**/*.jest.eval.ts"],
  testPathIgnorePatterns: ["<rootDir>/tests/vitest/.*.vitest.eval.ts$"],
  reporters: ["langsmith/jest/reporter"],
};
- Pytest
- Vitest
- Jest
pytest --output='ls' tests
yarn vitest --config ls.vitest.config.ts
yarn jest --config ls.jest.config.ts
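While iterating, you can scope a run to a single test with each framework's usual name filters, for example:

pytest --output='ls' tests -k "ticker"
yarn vitest --config ls.vitest.config.ts -t "ticker"
yarn jest --config ls.jest.config.ts -t "ticker"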
Reference code
Remember to also add the config files for Vitest and Jest to your project.
Agent
Agent code
- Pytest
- TypeScript
from langchain_openai import ChatOpenAI
from langgraph.prebuilt import create_react_agent
from langchain_community.tools import TavilySearchResults
from e2b_code_interpreter import Sandbox
from langchain_community.tools.polygon.aggregates import PolygonAggregates
from langchain_community.utilities.polygon import PolygonAPIWrapper
from typing_extensions import Annotated, Literal, Optional, TypedDict

def code_tool(code: str) -> str:
    """Execute python code and return the result."""
    sbx = Sandbox()
    execution = sbx.run_code(code)
    if execution.error:
        return f"Error: {execution.error}"
    return f"Results: {execution.results}, Logs: {execution.logs}"

class TickerToolInput(TypedDict):
    """Input format for the ticker tool.

    The tool will pull data in aggregate blocks (timespan_multiplier * timespan)
    from the from_date to the to_date.
    """

    ticker: Annotated[str, ..., "The ticker symbol of the stock"]
    timespan: Annotated[
        Literal["minute", "hour", "day", "week", "month", "quarter", "year"],
        ...,
        "The size of the time window.",
    ]
    timespan_multiplier: Annotated[int, ..., "The multiplier for the time window"]
    from_date: Annotated[str, ..., "The date to start pulling data from, YYYY-MM-DD format - ONLY include the year, month, and day"]
    to_date: Annotated[str, ..., "The date to stop pulling data, YYYY-MM-DD format - ONLY include the year, month, and day"]

api_wrapper = PolygonAPIWrapper()
polygon_aggregate = PolygonAggregates(api_wrapper=api_wrapper)

def ticker_tool(query: TickerToolInput) -> str:
    """Pull data for the ticker."""
    return polygon_aggregate.invoke(query)

search_tool = TavilySearchResults(
    max_results=5,
    include_raw_content=True,
)

class AgentOutputFormat(TypedDict):
    numeric_answer: Annotated[Optional[float], ..., "The numeric answer, if the user asked for one"]
    text_answer: Annotated[Optional[str], ..., "The text answer, if the user asked for one"]
    reasoning: Annotated[str, ..., "The reasoning behind the answer"]

tools = [code_tool, search_tool, ticker_tool]

agent = create_react_agent(
    model=ChatOpenAI(model="gpt-4o-mini"),
    tools=tools,
    response_format=AgentOutputFormat,
    state_modifier="You are a financial expert. Respond to the user's query accurately",
)
import { ChatOpenAI } from "@langchain/openai";
import { createReactAgent } from "@langchain/langgraph/prebuilt";
import { TavilySearchResults } from "@langchain/community/tools/tavily_search";
import { Sandbox } from "@e2b/code-interpreter";
import { restClient } from "@polygon.io/client-js";
import { tool } from "@langchain/core/tools";
import { z } from "zod";

const codeTool = tool(
  async (input): Promise<string> => {
    const sbx = await Sandbox.create();
    const execution = await sbx.runCode(input.code);
    if (execution.error) {
      return `Error: ${execution.error}`;
    }
    return `Results: ${execution.results}, Logs: ${execution.logs}`;
  },
  {
    // Named "code_tool" to match the Python version and the test assertions
    name: "code_tool",
    description: "Execute python code and return the result.",
    schema: z.object({
      code: z.string().describe("The python code to execute"),
    }),
  }
);

const TickerToolInputSchema = z.object({
  ticker: z.string().describe("The ticker symbol of the stock"),
  timespan: z
    .enum(["minute", "hour", "day", "week", "month", "quarter", "year"])
    .describe("The size of the time window."),
  timespan_multiplier: z.number().describe("The multiplier for the time window"),
  from_date: z
    .string()
    .describe("The date to start pulling data from, YYYY-MM-DD format - ONLY include the year, month, and day"),
  to_date: z
    .string()
    .describe("The date to stop pulling data, YYYY-MM-DD format - ONLY include the year, month, and day"),
});

const rest = restClient(process.env.POLYGON_API_KEY);

const tickerTool = tool(
  async (query): Promise<string> => {
    const parsed = TickerToolInputSchema.parse(query);
    const result = await rest.stocks.aggregates(
      parsed.ticker,
      parsed.timespan_multiplier,
      parsed.timespan,
      parsed.from_date,
      parsed.to_date
    );
    return JSON.stringify(result);
  },
  {
    name: "ticker",
    description: "Pull data for the ticker",
    schema: TickerToolInputSchema,
  }
);

const searchTool = new TavilySearchResults({
  maxResults: 5,
});

const AgentOutputFormatSchema = z.object({
  numeric_answer: z.number().optional().describe("The numeric answer, if the user asked for one"),
  text_answer: z.string().optional().describe("The text answer, if the user asked for one"),
  reasoning: z.string().describe("The reasoning behind the answer"),
});

const tools = [codeTool, searchTool, tickerTool];

const agent = createReactAgent({
  llm: new ChatOpenAI({ model: "gpt-4o" }),
  tools: tools,
  responseFormat: AgentOutputFormatSchema,
  stateModifier: "You are a financial expert. Respond to the user's query accurately",
});

export default agent;
Tests
Test code
- Pytest
- Vitest
- Jest
from agent import agent
import pytest
import json
from langsmith import testing
from langchain_openai import ChatOpenAI

@pytest.mark.langsmith
@pytest.mark.parametrize(
    "query",
    [
        "Hello!",
        "How are you doing?",
    ],
)
def test_no_tools_on_unrelated_query(query):
    # Test that the agent does not use tools on unrelated queries
    result = agent.invoke({"messages": [("user", query)]})
    # first human message and then the AI response
    assert len(result["messages"]) == 2

@pytest.mark.langsmith
def test_searches_for_correct_ticker():
    # Test that the agent searches for the correct ticker
    result = agent.invoke({"messages": [("user", "What is the price of Aple?")]})
    ticker = json.loads(
        result["messages"][1].additional_kwargs["tool_calls"][0]["function"]["arguments"]
    )["query"]["ticker"]
    assert ticker == "AAPL"
    assert len(result["messages"]) == 4

@pytest.mark.langsmith
def test_executes_code_when_needed():
    # Test that the agent executes code when needed
    result = agent.invoke({"messages": [("user", "What was the average return rate for FAANG stock in 2024?")]})
    tool_calls = [
        tc["function"]["name"]
        for m in result["messages"]
        if m.additional_kwargs and "tool_calls" in m.additional_kwargs
        for tc in m.additional_kwargs["tool_calls"]
    ]
    testing.log_feedback(key="num_steps", score=len(result["messages"]) - 1)
    assert "code_tool" in tool_calls
    assert abs(result["structured_response"]["numeric_answer"] - 53) <= 1

judge_llm = ChatOpenAI(model="gpt-4o")

@pytest.mark.langsmith
def test_grounded_in_the_source():
    # Test that the agent is grounded in the source
    result = agent.invoke({"messages": [("user", "How did Nvidia stock do in 2024 according to analysts?")]})
    tavily_tool_message_indices = []
    for i, m in enumerate(result["messages"]):
        if (
            m.additional_kwargs
            and "tool_calls" in m.additional_kwargs
            and m.additional_kwargs["tool_calls"][0]["function"]["name"] == "tavily_search_results_json"
        ):
            tavily_tool_message_indices.append(i + 1)
    documents = "\n".join(
        [m.content for i, m in enumerate(result["messages"]) if i in tavily_tool_message_indices]
    )
    with testing.trace_feedback():
        instructions = (
            "Return 1 if the ANSWER is grounded in the DOCUMENTS\n"
            "Return 0 if the ANSWER is not grounded in the DOCUMENTS"
        )
        grade = judge_llm.invoke([
            {"role": "system", "content": instructions},
            {"role": "user", "content": f"ANSWER: {result['structured_response']['text_answer']}\nDOCUMENTS: {documents}"},
        ])
        score = int(grade.content)
        testing.log_feedback(key="groundedness", score=score)
    assert score
import { expect } from "vitest";
import * as ls from "langsmith/vitest";
import agent from "../../src/agent";
import { AIMessage, ToolMessage } from "@langchain/core/messages";
import { ChatOpenAI } from "@langchain/openai";

const judgeLLM = new ChatOpenAI({ model: "gpt-4o" });

const groundedEvaluator = async (params: {
  answer: string;
  referenceDocuments: string;
}) => {
  const instructions = [
    "Return 1 if the ANSWER is grounded in the DOCUMENTS",
    "Return 0 if the ANSWER is not grounded in the DOCUMENTS",
  ].join("\n");
  const grade = await judgeLLM.invoke([
    { role: "system", content: instructions },
    { role: "user", content: `ANSWER: ${params.answer}\nDOCUMENTS: ${params.referenceDocuments}` },
  ]);
  const score = parseInt(grade.content.toString());
  return { key: "groundedness", score };
};

ls.describe("Agent Tests", () => {
  ls.test.each([
    { inputs: { query: "Hello!" }, referenceOutputs: { numMessages: 2 } },
    { inputs: { query: "How are you doing?" }, referenceOutputs: { numMessages: 2 } },
  ])(
    "should not use tools on unrelated query: %s",
    async ({ inputs: { query }, referenceOutputs: { numMessages } }) => {
      const result = await agent.invoke({
        messages: [{ role: "user", content: query }],
      });
      ls.logOutputs(result);
      expect(result.messages).toHaveLength(numMessages);
    }
  );

  ls.test(
    "should search for correct ticker",
    {
      inputs: { query: "What is the price of Aple?" },
      referenceOutputs: { numMessages: 4 },
    },
    async ({ inputs: { query }, referenceOutputs: { numMessages } }) => {
      const result = await agent.invoke({
        messages: [{ role: "user", content: query }],
      });
      const toolCalls = (result.messages[1] as AIMessage).tool_calls ?? [];
      const tickerQuery = toolCalls[0].args.ticker;
      ls.logOutputs(result);
      expect(tickerQuery).toBe("AAPL");
      expect(result.messages).toHaveLength(numMessages);
    }
  );

  ls.test(
    "should execute code when needed",
    {
      inputs: { query: "What was the average return rate for FAANG stock in 2024?" },
      referenceOutputs: { answer: 53 },
    },
    async ({ inputs: { query }, referenceOutputs: { answer } }) => {
      const result = await agent.invoke({
        messages: [{ role: "user", content: query }],
      });
      const toolCalls = result.messages
        .filter((m) => (m as AIMessage).tool_calls)
        .flatMap((m) => (m as AIMessage).tool_calls?.map((tc) => tc.name));
      ls.logFeedback({
        key: "num_steps",
        score: result.messages.length - 1,
      });
      ls.logOutputs(result);
      expect(toolCalls).toContain("code_tool");
      expect(Math.abs((result.structuredResponse.numeric_answer ?? 0) - answer)).toBeLessThanOrEqual(1);
    }
  );

  ls.test(
    "grounded in the source",
    {
      inputs: { query: "How did Nvidia stock do in 2024?" },
      referenceOutputs: {},
    },
    async ({ inputs: { query } }) => {
      const result = await agent.invoke({
        messages: [{ role: "user", content: query }],
      });
      const referenceDocuments = result.messages
        .filter((m): m is ToolMessage => m.name?.includes("tavily_search_results_json") ?? false)
        .map((m) => m.content)
        .join("\n");
      const wrappedEvaluator = ls.wrapEvaluator(groundedEvaluator);
      await wrappedEvaluator({
        answer: result.structuredResponse.text_answer ?? "",
        referenceDocuments: referenceDocuments,
      });
      ls.logOutputs(result);
    }
  );
});
import { expect } from "@jest/globals";
import * as ls from "langsmith/jest";
import agent from "../../src/agent";
import { AIMessage } from "@langchain/core/messages";
import { ChatOpenAI } from "@langchain/openai";

const judgeLLM = new ChatOpenAI({ model: "gpt-4o" });

const groundedEvaluator = async (params: {
  answer: string;
  referenceDocuments: string;
}) => {
  const instructions = [
    "Return 1 if the ANSWER is grounded in the DOCUMENTS",
    "Return 0 if the ANSWER is not grounded in the DOCUMENTS",
  ].join("\n");
  const grade = await judgeLLM.invoke([
    { role: "system", content: instructions },
    { role: "user", content: `ANSWER: ${params.answer}\nDOCUMENTS: ${params.referenceDocuments}` },
  ]);
  const score = parseInt(grade.content.toString());
  return { key: "groundedness", score };
};

ls.describe("Agent Tests", () => {
  ls.test.each([
    { inputs: { query: "Hello!" }, referenceOutputs: { numMessages: 2 } },
    { inputs: { query: "How are you doing?" }, referenceOutputs: { numMessages: 2 } },
  ])(
    "should not use tools on unrelated query: %s",
    async ({ inputs: { query }, referenceOutputs: { numMessages } }) => {
      const result = await agent.invoke({
        messages: [{ role: "user", content: query }],
      });
      ls.logOutputs(result);
      expect(result.messages).toHaveLength(numMessages);
    }
  );

  ls.test(
    "should search for correct ticker",
    {
      inputs: { query: "What is the price of Aple?" },
      referenceOutputs: { numMessages: 4 },
    },
    async ({ inputs: { query }, referenceOutputs: { numMessages } }) => {
      const result = await agent.invoke({
        messages: [{ role: "user", content: query }],
      });
      const toolCalls = (result.messages[1] as AIMessage).tool_calls ?? [];
      const tickerQuery = toolCalls[0].args.ticker;
      ls.logOutputs(result);
      expect(tickerQuery).toBe("AAPL");
      expect(result.messages).toHaveLength(numMessages);
    }
  );

  ls.test(
    "should execute code when needed",
    {
      inputs: { query: "What was the average return rate for FAANG stock in 2024?" },
      referenceOutputs: { answer: 53 },
    },
    async ({ inputs: { query }, referenceOutputs: { answer } }) => {
      const result = await agent.invoke({
        messages: [{ role: "user", content: query }],
      });
      const toolCalls = result.messages
        .filter((m) => (m as AIMessage).tool_calls)
        .flatMap((m) => (m as AIMessage).tool_calls?.map((tc) => tc.name));
      ls.logFeedback({
        key: "num_steps",
        score: result.messages.length - 1,
      });
      ls.logOutputs(result);
      expect(toolCalls).toContain("code_tool");
      expect(Math.abs((result.structuredResponse.numeric_answer ?? 0) - answer)).toBeLessThanOrEqual(1);
    }
  );

  ls.test(
    "grounded in the source",
    {
      inputs: { query: "How did Nvidia stock do in 2024 according to analysts?" },
      referenceOutputs: {},
    },
    async ({ inputs: { query } }) => {
      const result = await agent.invoke({
        messages: [{ role: "user", content: query }],
      });
      const wrappedEvaluator = ls.wrapEvaluator(groundedEvaluator);
      await wrappedEvaluator({
        answer: result.structuredResponse.text_answer ?? "",
        referenceDocuments: result.structuredResponse.reasoning,
      });
      ls.logOutputs(result);
    }
  );
});