mirror of
https://github.com/promptfoo/promptfoo.git
synced 2023-08-15 01:10:51 +03:00
558 lines
18 KiB
TypeScript
558 lines
18 KiB
TypeScript
import { evaluate } from '../src/evaluator.js';
|
|
|
|
import type { ApiProvider, TestSuite, Prompt } from '../src/types.js';
|
|
|
|
jest.mock('node-fetch', () => jest.fn());
|
|
|
|
jest.mock('../src/esm.js');
|
|
|
|
const mockApiProvider: ApiProvider = {
|
|
id: jest.fn().mockReturnValue('test-provider'),
|
|
callApi: jest.fn().mockResolvedValue({
|
|
output: 'Test output',
|
|
tokenUsage: { total: 10, prompt: 5, completion: 5, cached: 0 },
|
|
}),
|
|
};
|
|
|
|
const mockGradingApiProviderPasses: ApiProvider = {
|
|
id: jest.fn().mockReturnValue('test-grading-provider'),
|
|
callApi: jest.fn().mockResolvedValue({
|
|
output: JSON.stringify({ pass: true, reason: 'Test grading output' }),
|
|
tokenUsage: { total: 10, prompt: 5, completion: 5, cached: 0 },
|
|
}),
|
|
};
|
|
|
|
const mockGradingApiProviderFails: ApiProvider = {
|
|
id: jest.fn().mockReturnValue('test-grading-provider'),
|
|
callApi: jest.fn().mockResolvedValue({
|
|
output: JSON.stringify({ pass: false, reason: 'Grading failed reason' }),
|
|
tokenUsage: { total: 10, prompt: 5, completion: 5, cached: 0 },
|
|
}),
|
|
};
|
|
|
|
function toPrompt(text: string): Prompt {
|
|
return { raw: text, display: text };
|
|
}
|
|
|
|
describe('evaluator', () => {
|
|
afterEach(() => {
|
|
jest.clearAllMocks();
|
|
});
|
|
|
|
test('evaluate with vars', async () => {
|
|
const testSuite: TestSuite = {
|
|
providers: [mockApiProvider],
|
|
prompts: [toPrompt('Test prompt {{ var1 }} {{ var2 }}')],
|
|
tests: [
|
|
{
|
|
vars: { var1: 'value1', var2: 'value2' },
|
|
},
|
|
],
|
|
};
|
|
|
|
const summary = await evaluate(testSuite, {});
|
|
|
|
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
|
|
expect(summary.stats.successes).toBe(1);
|
|
expect(summary.stats.failures).toBe(0);
|
|
expect(summary.stats.tokenUsage).toEqual({ total: 10, prompt: 5, completion: 5, cached: 0 });
|
|
expect(summary.results[0].prompt.raw).toBe('Test prompt value1 value2');
|
|
expect(summary.results[0].prompt.display).toBe('Test prompt {{ var1 }} {{ var2 }}');
|
|
expect(summary.results[0].response?.output).toBe('Test output');
|
|
});
|
|
|
|
test('evaluate with vars - no escaping', async () => {
|
|
const testSuite: TestSuite = {
|
|
providers: [mockApiProvider],
|
|
prompts: [toPrompt('Test prompt {{ var1 }} {{ var2 }}')],
|
|
tests: [
|
|
{
|
|
vars: { var1: '1 < 2', var2: 'he said "hello world"...' },
|
|
},
|
|
],
|
|
};
|
|
|
|
const summary = await evaluate(testSuite, {});
|
|
|
|
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
|
|
expect(summary.stats.successes).toBe(1);
|
|
expect(summary.stats.failures).toBe(0);
|
|
expect(summary.stats.tokenUsage).toEqual({ total: 10, prompt: 5, completion: 5, cached: 0 });
|
|
expect(summary.results[0].prompt.raw).toBe('Test prompt 1 < 2 he said "hello world"...');
|
|
expect(summary.results[0].prompt.display).toBe('Test prompt {{ var1 }} {{ var2 }}');
|
|
expect(summary.results[0].response?.output).toBe('Test output');
|
|
});
|
|
|
|
test('evaluate with vars as object', async () => {
|
|
const testSuite: TestSuite = {
|
|
providers: [mockApiProvider],
|
|
prompts: [toPrompt('Test prompt {{ var1.prop1 }} {{ var2 }}')],
|
|
tests: [
|
|
{
|
|
vars: { var1: { prop1: 'value1' }, var2: 'value2' },
|
|
},
|
|
],
|
|
};
|
|
|
|
const summary = await evaluate(testSuite, {});
|
|
|
|
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
|
|
expect(summary.stats.successes).toBe(1);
|
|
expect(summary.stats.failures).toBe(0);
|
|
expect(summary.stats.tokenUsage).toEqual({ total: 10, prompt: 5, completion: 5, cached: 0 });
|
|
expect(summary.results[0].prompt.raw).toBe('Test prompt value1 value2');
|
|
expect(summary.results[0].prompt.display).toBe('Test prompt {{ var1.prop1 }} {{ var2 }}');
|
|
expect(summary.results[0].response?.output).toBe('Test output');
|
|
});
|
|
|
|
test('evaluate with named prompt', async () => {
|
|
const testSuite: TestSuite = {
|
|
providers: [mockApiProvider],
|
|
prompts: [{ raw: 'Test prompt {{ var1 }} {{ var2 }}', display: 'test display name' }],
|
|
tests: [
|
|
{
|
|
vars: { var1: 'value1', var2: 'value2' },
|
|
},
|
|
],
|
|
};
|
|
|
|
const summary = await evaluate(testSuite, {});
|
|
|
|
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
|
|
expect(summary.stats.successes).toBe(1);
|
|
expect(summary.stats.failures).toBe(0);
|
|
expect(summary.stats.tokenUsage).toEqual({ total: 10, prompt: 5, completion: 5, cached: 0 });
|
|
expect(summary.results[0].prompt.raw).toBe('Test prompt value1 value2');
|
|
expect(summary.results[0].prompt.display).toBe('test display name');
|
|
expect(summary.results[0].response?.output).toBe('Test output');
|
|
});
|
|
|
|
test('evaluate with multiple vars', async () => {
|
|
const testSuite: TestSuite = {
|
|
providers: [mockApiProvider],
|
|
prompts: [toPrompt('Test prompt {{ var1 }} {{ var2 }}')],
|
|
tests: [
|
|
{
|
|
vars: { var1: ['value1', 'value3'], var2: ['value2', 'value4'] },
|
|
},
|
|
],
|
|
};
|
|
|
|
const summary = await evaluate(testSuite, {});
|
|
|
|
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(4);
|
|
expect(summary.stats.successes).toBe(4);
|
|
expect(summary.stats.failures).toBe(0);
|
|
expect(summary.stats.tokenUsage).toEqual({ total: 40, prompt: 20, completion: 20, cached: 0 });
|
|
expect(summary.results[0].prompt.raw).toBe('Test prompt value1 value2');
|
|
expect(summary.results[0].prompt.display).toBe('Test prompt {{ var1 }} {{ var2 }}');
|
|
expect(summary.results[0].response?.output).toBe('Test output');
|
|
});
|
|
|
|
test('evaluate with multiple providers', async () => {
|
|
const testSuite: TestSuite = {
|
|
providers: [mockApiProvider, mockApiProvider],
|
|
prompts: [toPrompt('Test prompt {{ var1 }} {{ var2 }}')],
|
|
tests: [
|
|
{
|
|
vars: { var1: 'value1', var2: 'value2' },
|
|
},
|
|
],
|
|
};
|
|
|
|
const summary = await evaluate(testSuite, {});
|
|
|
|
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(2);
|
|
expect(summary.stats.successes).toBe(2);
|
|
expect(summary.stats.failures).toBe(0);
|
|
expect(summary.stats.tokenUsage).toEqual({ total: 20, prompt: 10, completion: 10, cached: 0 });
|
|
expect(summary.results[0].prompt.raw).toBe('Test prompt value1 value2');
|
|
expect(summary.results[0].prompt.display).toBe(
|
|
'[test-provider] Test prompt {{ var1 }} {{ var2 }}',
|
|
);
|
|
expect(summary.results[0].response?.output).toBe('Test output');
|
|
});
|
|
|
|
test('evaluate without tests', async () => {
|
|
const testSuite: TestSuite = {
|
|
providers: [mockApiProvider],
|
|
prompts: [toPrompt('Test prompt')],
|
|
};
|
|
|
|
const summary = await evaluate(testSuite, {});
|
|
|
|
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
|
|
expect(summary.stats.successes).toBe(1);
|
|
expect(summary.stats.failures).toBe(0);
|
|
expect(summary.stats.tokenUsage).toEqual({ total: 10, prompt: 5, completion: 5, cached: 0 });
|
|
expect(summary.results[0].prompt.raw).toBe('Test prompt');
|
|
expect(summary.results[0].prompt.display).toBe('Test prompt');
|
|
expect(summary.results[0].response?.output).toBe('Test output');
|
|
});
|
|
|
|
test('evaluate without tests with multiple providers', async () => {
|
|
const testSuite: TestSuite = {
|
|
providers: [mockApiProvider, mockApiProvider, mockApiProvider],
|
|
prompts: [toPrompt('Test prompt')],
|
|
};
|
|
|
|
const summary = await evaluate(testSuite, {});
|
|
|
|
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(3);
|
|
expect(summary.stats.successes).toBe(3);
|
|
expect(summary.stats.failures).toBe(0);
|
|
expect(summary.stats.tokenUsage).toEqual({ total: 30, prompt: 15, completion: 15, cached: 0 });
|
|
expect(summary.results[0].prompt.raw).toBe('Test prompt');
|
|
expect(summary.results[0].prompt.display).toBe('[test-provider] Test prompt');
|
|
expect(summary.results[0].response?.output).toBe('Test output');
|
|
});
|
|
|
|
test('evaluate with expected value matching output', async () => {
|
|
const testSuite: TestSuite = {
|
|
providers: [mockApiProvider],
|
|
prompts: [toPrompt('Test prompt')],
|
|
tests: [
|
|
{
|
|
assert: [
|
|
{
|
|
type: 'equals',
|
|
value: 'Test output',
|
|
},
|
|
],
|
|
},
|
|
],
|
|
};
|
|
|
|
const summary = await evaluate(testSuite, {});
|
|
|
|
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
|
|
expect(summary.stats.successes).toBe(1);
|
|
expect(summary.stats.failures).toBe(0);
|
|
expect(summary.results[0].success).toBe(true);
|
|
expect(summary.results[0].response?.output).toBe('Test output');
|
|
});
|
|
|
|
test('evaluate with expected value not matching output', async () => {
|
|
const testSuite: TestSuite = {
|
|
providers: [mockApiProvider],
|
|
prompts: [toPrompt('Test prompt')],
|
|
tests: [
|
|
{
|
|
assert: [
|
|
{
|
|
type: 'equals',
|
|
value: 'Different output',
|
|
},
|
|
],
|
|
},
|
|
],
|
|
};
|
|
|
|
const summary = await evaluate(testSuite, {});
|
|
|
|
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
|
|
expect(summary.stats.successes).toBe(0);
|
|
expect(summary.stats.failures).toBe(1);
|
|
expect(summary.results[0].success).toBe(false);
|
|
expect(summary.results[0].response?.output).toBe('Test output');
|
|
});
|
|
|
|
test('evaluate with fn: expected value', async () => {
|
|
const testSuite: TestSuite = {
|
|
providers: [mockApiProvider],
|
|
prompts: [toPrompt('Test prompt')],
|
|
tests: [
|
|
{
|
|
assert: [
|
|
{
|
|
type: 'javascript',
|
|
value: 'output === "Test output";',
|
|
},
|
|
],
|
|
},
|
|
],
|
|
};
|
|
|
|
const summary = await evaluate(testSuite, {});
|
|
|
|
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
|
|
expect(summary.stats.successes).toBe(1);
|
|
expect(summary.stats.failures).toBe(0);
|
|
expect(summary.results[0].success).toBe(true);
|
|
expect(summary.results[0].response?.output).toBe('Test output');
|
|
});
|
|
|
|
test('evaluate with fn: expected value not matching output', async () => {
|
|
const testSuite: TestSuite = {
|
|
providers: [mockApiProvider],
|
|
prompts: [toPrompt('Test prompt')],
|
|
tests: [
|
|
{
|
|
assert: [
|
|
{
|
|
type: 'javascript',
|
|
value: 'output === "Different output";',
|
|
},
|
|
],
|
|
},
|
|
],
|
|
};
|
|
|
|
const summary = await evaluate(testSuite, {});
|
|
|
|
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
|
|
expect(summary.stats.successes).toBe(0);
|
|
expect(summary.stats.failures).toBe(1);
|
|
expect(summary.results[0].success).toBe(false);
|
|
expect(summary.results[0].response?.output).toBe('Test output');
|
|
});
|
|
|
|
test('evaluate with grading expected value', async () => {
|
|
const testSuite: TestSuite = {
|
|
providers: [mockApiProvider],
|
|
prompts: [toPrompt('Test prompt')],
|
|
tests: [
|
|
{
|
|
assert: [
|
|
{
|
|
type: 'llm-rubric',
|
|
value: 'output is a test output',
|
|
},
|
|
],
|
|
},
|
|
],
|
|
defaultTest: {
|
|
options: {
|
|
provider: mockGradingApiProviderPasses,
|
|
},
|
|
},
|
|
};
|
|
|
|
const summary = await evaluate(testSuite, {});
|
|
|
|
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
|
|
expect(summary.stats.successes).toBe(1);
|
|
expect(summary.stats.failures).toBe(0);
|
|
expect(summary.results[0].success).toBe(true);
|
|
expect(summary.results[0].response?.output).toBe('Test output');
|
|
});
|
|
|
|
test('evaluate with grading expected value does not pass', async () => {
|
|
const testSuite: TestSuite = {
|
|
providers: [mockApiProvider],
|
|
prompts: [toPrompt('Test prompt')],
|
|
tests: [
|
|
{
|
|
assert: [
|
|
{
|
|
type: 'llm-rubric',
|
|
value: 'output is a test output',
|
|
},
|
|
],
|
|
},
|
|
],
|
|
defaultTest: {
|
|
options: {
|
|
provider: mockGradingApiProviderFails,
|
|
},
|
|
},
|
|
};
|
|
|
|
const summary = await evaluate(testSuite, {});
|
|
|
|
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
|
|
expect(summary.stats.successes).toBe(0);
|
|
expect(summary.stats.failures).toBe(1);
|
|
expect(summary.results[0].success).toBe(false);
|
|
expect(summary.results[0].response?.output).toBe('Test output');
|
|
});
|
|
|
|
test('evaluate with postprocess option - default test', async () => {
|
|
const testSuite: TestSuite = {
|
|
providers: [mockApiProvider],
|
|
prompts: [toPrompt('Test prompt')],
|
|
defaultTest: {
|
|
options: {
|
|
postprocess: 'output + " postprocessed"',
|
|
},
|
|
},
|
|
};
|
|
|
|
const summary = await evaluate(testSuite, {});
|
|
|
|
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
|
|
expect(summary.stats.successes).toBe(1);
|
|
expect(summary.stats.failures).toBe(0);
|
|
expect(summary.results[0].response?.output).toBe('Test output postprocessed');
|
|
});
|
|
|
|
test('evaluate with postprocess option - single test', async () => {
|
|
const testSuite: TestSuite = {
|
|
providers: [mockApiProvider],
|
|
prompts: [toPrompt('Test prompt')],
|
|
tests: [
|
|
{
|
|
assert: [
|
|
{
|
|
type: 'equals',
|
|
value: 'Test output postprocessed',
|
|
},
|
|
],
|
|
options: {
|
|
postprocess: 'output + " postprocessed"',
|
|
},
|
|
},
|
|
],
|
|
};
|
|
|
|
const summary = await evaluate(testSuite, {});
|
|
|
|
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
|
|
expect(summary.stats.successes).toBe(1);
|
|
expect(summary.stats.failures).toBe(0);
|
|
expect(summary.results[0].response?.output).toBe('Test output postprocessed');
|
|
});
|
|
|
|
test('evaluate with providerPromptMap', async () => {
|
|
const testSuite: TestSuite = {
|
|
providers: [mockApiProvider],
|
|
prompts: [toPrompt('Test prompt 1'), toPrompt('Test prompt 2')],
|
|
providerPromptMap: {
|
|
'test-provider': ['Test prompt 1'],
|
|
},
|
|
tests: [
|
|
{
|
|
vars: { var1: 'value1', var2: 'value2' },
|
|
},
|
|
],
|
|
};
|
|
|
|
const summary = await evaluate(testSuite, {});
|
|
|
|
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
|
|
expect(summary.stats.successes).toBe(1);
|
|
expect(summary.stats.failures).toBe(0);
|
|
expect(summary.stats.tokenUsage).toEqual({ total: 10, prompt: 5, completion: 5, cached: 0 });
|
|
expect(summary.results[0].prompt.raw).toBe('Test prompt 1');
|
|
expect(summary.results[0].prompt.display).toBe('Test prompt 1');
|
|
expect(summary.results[0].response?.output).toBe('Test output');
|
|
});
|
|
|
|
test('evaluate with scenarios', async () => {
|
|
const mockApiProvider: ApiProvider = {
|
|
id: jest.fn().mockReturnValue('test-provider'),
|
|
callApi: jest
|
|
.fn()
|
|
.mockResolvedValueOnce({
|
|
output: 'Hola mundo',
|
|
tokenUsage: { total: 10, prompt: 5, completion: 5, cached: 0 },
|
|
})
|
|
.mockResolvedValueOnce({
|
|
output: 'Bonjour le monde',
|
|
tokenUsage: { total: 10, prompt: 5, completion: 5, cached: 0 },
|
|
}),
|
|
};
|
|
|
|
const testSuite: TestSuite = {
|
|
providers: [mockApiProvider],
|
|
prompts: [toPrompt('Test prompt {{ language }}')],
|
|
scenarios: [
|
|
{
|
|
config: [
|
|
{
|
|
vars: {
|
|
language: 'Spanish',
|
|
expectedHelloWorld: 'Hola mundo',
|
|
},
|
|
},
|
|
{
|
|
vars: {
|
|
language: 'French',
|
|
expectedHelloWorld: 'Bonjour le monde',
|
|
},
|
|
},
|
|
],
|
|
tests: [
|
|
{
|
|
assert: [
|
|
{
|
|
type: 'equals',
|
|
value: '{{expectedHelloWorld}}',
|
|
},
|
|
],
|
|
},
|
|
],
|
|
},
|
|
],
|
|
};
|
|
|
|
const summary = await evaluate(testSuite, {});
|
|
|
|
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(2);
|
|
expect(summary.stats.successes).toBe(2);
|
|
expect(summary.stats.failures).toBe(0);
|
|
expect(summary.results[0].response?.output).toBe('Hola mundo');
|
|
expect(summary.results[1].response?.output).toBe('Bonjour le monde');
|
|
});
|
|
|
|
test('evaluate with scenarios and multiple vars', async () => {
|
|
const mockApiProvider: ApiProvider = {
|
|
id: jest.fn().mockReturnValue('test-provider'),
|
|
callApi: jest
|
|
.fn()
|
|
.mockResolvedValueOnce({
|
|
output: 'Spanish Hola',
|
|
tokenUsage: { total: 10, prompt: 5, completion: 5, cached: 0 },
|
|
})
|
|
.mockResolvedValueOnce({
|
|
output: 'Spanish Bonjour',
|
|
tokenUsage: { total: 10, prompt: 5, completion: 5, cached: 0 },
|
|
})
|
|
.mockResolvedValueOnce({
|
|
output: 'French Hola',
|
|
tokenUsage: { total: 10, prompt: 5, completion: 5, cached: 0 },
|
|
})
|
|
.mockResolvedValueOnce({
|
|
output: 'French Bonjour',
|
|
tokenUsage: { total: 10, prompt: 5, completion: 5, cached: 0 },
|
|
}),
|
|
};
|
|
const testSuite: TestSuite = {
|
|
providers: [mockApiProvider],
|
|
prompts: [toPrompt('Test prompt {{ language }} {{ greeting }}')],
|
|
scenarios: [
|
|
{
|
|
config: [
|
|
{
|
|
vars: {
|
|
language: ['Spanish', 'French'],
|
|
greeting: ['Hola', 'Bonjour'],
|
|
},
|
|
},
|
|
],
|
|
tests: [
|
|
{
|
|
assert: [
|
|
{
|
|
type: 'equals',
|
|
value: '{{language}} {{greeting}}',
|
|
},
|
|
],
|
|
},
|
|
],
|
|
},
|
|
],
|
|
};
|
|
|
|
const summary = await evaluate(testSuite, {});
|
|
|
|
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(4);
|
|
expect(summary.stats.successes).toBe(4);
|
|
expect(summary.stats.failures).toBe(0);
|
|
expect(summary.results[0].response?.output).toBe('Spanish Hola');
|
|
expect(summary.results[1].response?.output).toBe('Spanish Bonjour');
|
|
expect(summary.results[2].response?.output).toBe('French Hola');
|
|
expect(summary.results[3].response?.output).toBe('French Bonjour');
|
|
});
|
|
});
|