Files
llm-promptfoo/test/evaluator.test.ts
2023-08-08 11:43:42 -07:00

558 lines
18 KiB
TypeScript

import { evaluate } from '../src/evaluator.js';
import type { ApiProvider, TestSuite, Prompt } from '../src/types.js';
jest.mock('node-fetch', () => jest.fn());
jest.mock('../src/esm.js');
const mockApiProvider: ApiProvider = {
id: jest.fn().mockReturnValue('test-provider'),
callApi: jest.fn().mockResolvedValue({
output: 'Test output',
tokenUsage: { total: 10, prompt: 5, completion: 5, cached: 0 },
}),
};
const mockGradingApiProviderPasses: ApiProvider = {
id: jest.fn().mockReturnValue('test-grading-provider'),
callApi: jest.fn().mockResolvedValue({
output: JSON.stringify({ pass: true, reason: 'Test grading output' }),
tokenUsage: { total: 10, prompt: 5, completion: 5, cached: 0 },
}),
};
const mockGradingApiProviderFails: ApiProvider = {
id: jest.fn().mockReturnValue('test-grading-provider'),
callApi: jest.fn().mockResolvedValue({
output: JSON.stringify({ pass: false, reason: 'Grading failed reason' }),
tokenUsage: { total: 10, prompt: 5, completion: 5, cached: 0 },
}),
};
function toPrompt(text: string): Prompt {
return { raw: text, display: text };
}
describe('evaluator', () => {
afterEach(() => {
jest.clearAllMocks();
});
test('evaluate with vars', async () => {
const testSuite: TestSuite = {
providers: [mockApiProvider],
prompts: [toPrompt('Test prompt {{ var1 }} {{ var2 }}')],
tests: [
{
vars: { var1: 'value1', var2: 'value2' },
},
],
};
const summary = await evaluate(testSuite, {});
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
expect(summary.stats.successes).toBe(1);
expect(summary.stats.failures).toBe(0);
expect(summary.stats.tokenUsage).toEqual({ total: 10, prompt: 5, completion: 5, cached: 0 });
expect(summary.results[0].prompt.raw).toBe('Test prompt value1 value2');
expect(summary.results[0].prompt.display).toBe('Test prompt {{ var1 }} {{ var2 }}');
expect(summary.results[0].response?.output).toBe('Test output');
});
test('evaluate with vars - no escaping', async () => {
const testSuite: TestSuite = {
providers: [mockApiProvider],
prompts: [toPrompt('Test prompt {{ var1 }} {{ var2 }}')],
tests: [
{
vars: { var1: '1 < 2', var2: 'he said "hello world"...' },
},
],
};
const summary = await evaluate(testSuite, {});
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
expect(summary.stats.successes).toBe(1);
expect(summary.stats.failures).toBe(0);
expect(summary.stats.tokenUsage).toEqual({ total: 10, prompt: 5, completion: 5, cached: 0 });
expect(summary.results[0].prompt.raw).toBe('Test prompt 1 < 2 he said "hello world"...');
expect(summary.results[0].prompt.display).toBe('Test prompt {{ var1 }} {{ var2 }}');
expect(summary.results[0].response?.output).toBe('Test output');
});
test('evaluate with vars as object', async () => {
const testSuite: TestSuite = {
providers: [mockApiProvider],
prompts: [toPrompt('Test prompt {{ var1.prop1 }} {{ var2 }}')],
tests: [
{
vars: { var1: { prop1: 'value1' }, var2: 'value2' },
},
],
};
const summary = await evaluate(testSuite, {});
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
expect(summary.stats.successes).toBe(1);
expect(summary.stats.failures).toBe(0);
expect(summary.stats.tokenUsage).toEqual({ total: 10, prompt: 5, completion: 5, cached: 0 });
expect(summary.results[0].prompt.raw).toBe('Test prompt value1 value2');
expect(summary.results[0].prompt.display).toBe('Test prompt {{ var1.prop1 }} {{ var2 }}');
expect(summary.results[0].response?.output).toBe('Test output');
});
test('evaluate with named prompt', async () => {
const testSuite: TestSuite = {
providers: [mockApiProvider],
prompts: [{ raw: 'Test prompt {{ var1 }} {{ var2 }}', display: 'test display name' }],
tests: [
{
vars: { var1: 'value1', var2: 'value2' },
},
],
};
const summary = await evaluate(testSuite, {});
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
expect(summary.stats.successes).toBe(1);
expect(summary.stats.failures).toBe(0);
expect(summary.stats.tokenUsage).toEqual({ total: 10, prompt: 5, completion: 5, cached: 0 });
expect(summary.results[0].prompt.raw).toBe('Test prompt value1 value2');
expect(summary.results[0].prompt.display).toBe('test display name');
expect(summary.results[0].response?.output).toBe('Test output');
});
test('evaluate with multiple vars', async () => {
const testSuite: TestSuite = {
providers: [mockApiProvider],
prompts: [toPrompt('Test prompt {{ var1 }} {{ var2 }}')],
tests: [
{
vars: { var1: ['value1', 'value3'], var2: ['value2', 'value4'] },
},
],
};
const summary = await evaluate(testSuite, {});
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(4);
expect(summary.stats.successes).toBe(4);
expect(summary.stats.failures).toBe(0);
expect(summary.stats.tokenUsage).toEqual({ total: 40, prompt: 20, completion: 20, cached: 0 });
expect(summary.results[0].prompt.raw).toBe('Test prompt value1 value2');
expect(summary.results[0].prompt.display).toBe('Test prompt {{ var1 }} {{ var2 }}');
expect(summary.results[0].response?.output).toBe('Test output');
});
test('evaluate with multiple providers', async () => {
const testSuite: TestSuite = {
providers: [mockApiProvider, mockApiProvider],
prompts: [toPrompt('Test prompt {{ var1 }} {{ var2 }}')],
tests: [
{
vars: { var1: 'value1', var2: 'value2' },
},
],
};
const summary = await evaluate(testSuite, {});
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(2);
expect(summary.stats.successes).toBe(2);
expect(summary.stats.failures).toBe(0);
expect(summary.stats.tokenUsage).toEqual({ total: 20, prompt: 10, completion: 10, cached: 0 });
expect(summary.results[0].prompt.raw).toBe('Test prompt value1 value2');
expect(summary.results[0].prompt.display).toBe(
'[test-provider] Test prompt {{ var1 }} {{ var2 }}',
);
expect(summary.results[0].response?.output).toBe('Test output');
});
test('evaluate without tests', async () => {
const testSuite: TestSuite = {
providers: [mockApiProvider],
prompts: [toPrompt('Test prompt')],
};
const summary = await evaluate(testSuite, {});
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
expect(summary.stats.successes).toBe(1);
expect(summary.stats.failures).toBe(0);
expect(summary.stats.tokenUsage).toEqual({ total: 10, prompt: 5, completion: 5, cached: 0 });
expect(summary.results[0].prompt.raw).toBe('Test prompt');
expect(summary.results[0].prompt.display).toBe('Test prompt');
expect(summary.results[0].response?.output).toBe('Test output');
});
test('evaluate without tests with multiple providers', async () => {
const testSuite: TestSuite = {
providers: [mockApiProvider, mockApiProvider, mockApiProvider],
prompts: [toPrompt('Test prompt')],
};
const summary = await evaluate(testSuite, {});
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(3);
expect(summary.stats.successes).toBe(3);
expect(summary.stats.failures).toBe(0);
expect(summary.stats.tokenUsage).toEqual({ total: 30, prompt: 15, completion: 15, cached: 0 });
expect(summary.results[0].prompt.raw).toBe('Test prompt');
expect(summary.results[0].prompt.display).toBe('[test-provider] Test prompt');
expect(summary.results[0].response?.output).toBe('Test output');
});
test('evaluate with expected value matching output', async () => {
const testSuite: TestSuite = {
providers: [mockApiProvider],
prompts: [toPrompt('Test prompt')],
tests: [
{
assert: [
{
type: 'equals',
value: 'Test output',
},
],
},
],
};
const summary = await evaluate(testSuite, {});
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
expect(summary.stats.successes).toBe(1);
expect(summary.stats.failures).toBe(0);
expect(summary.results[0].success).toBe(true);
expect(summary.results[0].response?.output).toBe('Test output');
});
test('evaluate with expected value not matching output', async () => {
const testSuite: TestSuite = {
providers: [mockApiProvider],
prompts: [toPrompt('Test prompt')],
tests: [
{
assert: [
{
type: 'equals',
value: 'Different output',
},
],
},
],
};
const summary = await evaluate(testSuite, {});
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
expect(summary.stats.successes).toBe(0);
expect(summary.stats.failures).toBe(1);
expect(summary.results[0].success).toBe(false);
expect(summary.results[0].response?.output).toBe('Test output');
});
test('evaluate with fn: expected value', async () => {
const testSuite: TestSuite = {
providers: [mockApiProvider],
prompts: [toPrompt('Test prompt')],
tests: [
{
assert: [
{
type: 'javascript',
value: 'output === "Test output";',
},
],
},
],
};
const summary = await evaluate(testSuite, {});
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
expect(summary.stats.successes).toBe(1);
expect(summary.stats.failures).toBe(0);
expect(summary.results[0].success).toBe(true);
expect(summary.results[0].response?.output).toBe('Test output');
});
test('evaluate with fn: expected value not matching output', async () => {
const testSuite: TestSuite = {
providers: [mockApiProvider],
prompts: [toPrompt('Test prompt')],
tests: [
{
assert: [
{
type: 'javascript',
value: 'output === "Different output";',
},
],
},
],
};
const summary = await evaluate(testSuite, {});
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
expect(summary.stats.successes).toBe(0);
expect(summary.stats.failures).toBe(1);
expect(summary.results[0].success).toBe(false);
expect(summary.results[0].response?.output).toBe('Test output');
});
test('evaluate with grading expected value', async () => {
const testSuite: TestSuite = {
providers: [mockApiProvider],
prompts: [toPrompt('Test prompt')],
tests: [
{
assert: [
{
type: 'llm-rubric',
value: 'output is a test output',
},
],
},
],
defaultTest: {
options: {
provider: mockGradingApiProviderPasses,
},
},
};
const summary = await evaluate(testSuite, {});
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
expect(summary.stats.successes).toBe(1);
expect(summary.stats.failures).toBe(0);
expect(summary.results[0].success).toBe(true);
expect(summary.results[0].response?.output).toBe('Test output');
});
test('evaluate with grading expected value does not pass', async () => {
const testSuite: TestSuite = {
providers: [mockApiProvider],
prompts: [toPrompt('Test prompt')],
tests: [
{
assert: [
{
type: 'llm-rubric',
value: 'output is a test output',
},
],
},
],
defaultTest: {
options: {
provider: mockGradingApiProviderFails,
},
},
};
const summary = await evaluate(testSuite, {});
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
expect(summary.stats.successes).toBe(0);
expect(summary.stats.failures).toBe(1);
expect(summary.results[0].success).toBe(false);
expect(summary.results[0].response?.output).toBe('Test output');
});
test('evaluate with postprocess option - default test', async () => {
const testSuite: TestSuite = {
providers: [mockApiProvider],
prompts: [toPrompt('Test prompt')],
defaultTest: {
options: {
postprocess: 'output + " postprocessed"',
},
},
};
const summary = await evaluate(testSuite, {});
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
expect(summary.stats.successes).toBe(1);
expect(summary.stats.failures).toBe(0);
expect(summary.results[0].response?.output).toBe('Test output postprocessed');
});
test('evaluate with postprocess option - single test', async () => {
const testSuite: TestSuite = {
providers: [mockApiProvider],
prompts: [toPrompt('Test prompt')],
tests: [
{
assert: [
{
type: 'equals',
value: 'Test output postprocessed',
},
],
options: {
postprocess: 'output + " postprocessed"',
},
},
],
};
const summary = await evaluate(testSuite, {});
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
expect(summary.stats.successes).toBe(1);
expect(summary.stats.failures).toBe(0);
expect(summary.results[0].response?.output).toBe('Test output postprocessed');
});
test('evaluate with providerPromptMap', async () => {
const testSuite: TestSuite = {
providers: [mockApiProvider],
prompts: [toPrompt('Test prompt 1'), toPrompt('Test prompt 2')],
providerPromptMap: {
'test-provider': ['Test prompt 1'],
},
tests: [
{
vars: { var1: 'value1', var2: 'value2' },
},
],
};
const summary = await evaluate(testSuite, {});
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
expect(summary.stats.successes).toBe(1);
expect(summary.stats.failures).toBe(0);
expect(summary.stats.tokenUsage).toEqual({ total: 10, prompt: 5, completion: 5, cached: 0 });
expect(summary.results[0].prompt.raw).toBe('Test prompt 1');
expect(summary.results[0].prompt.display).toBe('Test prompt 1');
expect(summary.results[0].response?.output).toBe('Test output');
});
test('evaluate with scenarios', async () => {
const mockApiProvider: ApiProvider = {
id: jest.fn().mockReturnValue('test-provider'),
callApi: jest
.fn()
.mockResolvedValueOnce({
output: 'Hola mundo',
tokenUsage: { total: 10, prompt: 5, completion: 5, cached: 0 },
})
.mockResolvedValueOnce({
output: 'Bonjour le monde',
tokenUsage: { total: 10, prompt: 5, completion: 5, cached: 0 },
}),
};
const testSuite: TestSuite = {
providers: [mockApiProvider],
prompts: [toPrompt('Test prompt {{ language }}')],
scenarios: [
{
config: [
{
vars: {
language: 'Spanish',
expectedHelloWorld: 'Hola mundo',
},
},
{
vars: {
language: 'French',
expectedHelloWorld: 'Bonjour le monde',
},
},
],
tests: [
{
assert: [
{
type: 'equals',
value: '{{expectedHelloWorld}}',
},
],
},
],
},
],
};
const summary = await evaluate(testSuite, {});
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(2);
expect(summary.stats.successes).toBe(2);
expect(summary.stats.failures).toBe(0);
expect(summary.results[0].response?.output).toBe('Hola mundo');
expect(summary.results[1].response?.output).toBe('Bonjour le monde');
});
test('evaluate with scenarios and multiple vars', async () => {
const mockApiProvider: ApiProvider = {
id: jest.fn().mockReturnValue('test-provider'),
callApi: jest
.fn()
.mockResolvedValueOnce({
output: 'Spanish Hola',
tokenUsage: { total: 10, prompt: 5, completion: 5, cached: 0 },
})
.mockResolvedValueOnce({
output: 'Spanish Bonjour',
tokenUsage: { total: 10, prompt: 5, completion: 5, cached: 0 },
})
.mockResolvedValueOnce({
output: 'French Hola',
tokenUsage: { total: 10, prompt: 5, completion: 5, cached: 0 },
})
.mockResolvedValueOnce({
output: 'French Bonjour',
tokenUsage: { total: 10, prompt: 5, completion: 5, cached: 0 },
}),
};
const testSuite: TestSuite = {
providers: [mockApiProvider],
prompts: [toPrompt('Test prompt {{ language }} {{ greeting }}')],
scenarios: [
{
config: [
{
vars: {
language: ['Spanish', 'French'],
greeting: ['Hola', 'Bonjour'],
},
},
],
tests: [
{
assert: [
{
type: 'equals',
value: '{{language}} {{greeting}}',
},
],
},
],
},
],
};
const summary = await evaluate(testSuite, {});
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(4);
expect(summary.stats.successes).toBe(4);
expect(summary.stats.failures).toBe(0);
expect(summary.results[0].response?.output).toBe('Spanish Hola');
expect(summary.results[1].response?.output).toBe('Spanish Bonjour');
expect(summary.results[2].response?.output).toBe('French Hola');
expect(summary.results[3].response?.output).toBe('French Bonjour');
});
});