Simplify API and add support for unified test suite definition (#14)

This commit is contained in:
Ian Webster
2023-05-30 09:02:49 -04:00
committed by GitHub
parent f259969051
commit bf81260b44
58 changed files with 2249 additions and 549 deletions

196
README.md
View File

@@ -32,21 +32,21 @@ It works on the command line too:
Start by establishing a handful of test cases - core use cases and failure cases that you want to ensure your prompt can handle.
As you explore modifications to the prompt, use `promptfoo eval` to rate all outputs. This ensures the prompt is actually improving overall.
As you collect more examples and establish a user feedback loop, continue to build the pool of test cases.
<img width="772" alt="LLM ops" src="https://github.com/typpo/promptfoo/assets/310310/cf0461a7-2832-4362-9fbb-4ebd911d06ff">
## Usage (command line & web viewer)
## Usage
To get started, run the following command:
To get started, run this command:
```
npx promptfoo init
```
This will create some templates in your current directory: `prompts.txt`, `vars.csv`, and `promptfooconfig.js`.
This will create some placeholders in your current directory: `prompts.txt` and `promptfooconfig.yaml`.
After editing the prompts and variables to your liking, run the eval command to kick off an evaluation:
@@ -54,20 +54,75 @@ After editing the prompts and variables to your liking, run the eval command to
npx promptfoo eval
```
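For example, a prompt file contains one or more prompts separated by `---`, with `{{variable}}` placeholders filled in from each test case; the translator example included with this change uses:
```
Rephrase this in {{language}}: {{body}}
---
Translate this to conversational {{language}}: {{body}}
```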
If you're looking to customize your usage, you have a wide set of parameters at your disposal. See the [Configuration docs](https://www.promptfoo.dev/docs/configuration/parameters) for more detail:
### Configuration
| Option | Description |
| ----------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `-p, --prompts <paths...>` | Paths to prompt files, directory, or glob |
| `-r, --providers <name or path...>` | One of: openai:chat, openai:completion, openai:model-name, localai:chat:model-name, localai:completion:model-name. See [API providers](https://www.promptfoo.dev/docs/configuration/providers) |
| `-o, --output <path>` | Path to output file (csv, json, yaml, html) |
| `-v, --vars <path>` | Path to file with prompt variables (csv, json, yaml) |
| `-c, --config <path>` | Path to configuration file. `promptfooconfig.js[on]` is automatically loaded if present |
| `-j, --max-concurrency <number>` | Maximum number of concurrent API calls |
| `--table-cell-max-length <number>` | Truncate console table cells to this length |
| `--prompt-prefix <path>` | This prefix is prepended to every prompt |
| `--prompt-suffix <path>`            | This suffix is appended to every prompt |
| `--grader` | Provider that will grade outputs, if you are using [LLM grading](https://www.promptfoo.dev/docs/configuration/expected-outputs) |
The YAML configuration format runs each prompt through a series of example inputs (aka "test cases") and checks whether they meet requirements (aka "asserts").
See the [Configuration docs](https://www.promptfoo.dev/docs/configuration/parameters) for more detail.
```yaml
prompts: [prompts.txt]
providers: [openai:gpt-3.5-turbo]
tests:
- description: First test case - automatic review
vars:
var1: first variable's value
var2: another value
var3: some other value
assert:
- type: equality
value: expected LLM output goes here
- type: function
value: output.includes('some text')
- description: Second test case - manual review
# Test cases don't need assertions if you prefer to review the output yourself
vars:
var1: new value
var2: another value
var3: third value
- description: Third test case - other types of automatic review
vars:
var1: yet another value
var2: and another
var3: dear llm, please output your response in json format
assert:
- type: contains-json
- type: similarity
value: ensures that output is semantically similar to this text
- type: llm-rubric
value: ensure that output contains a reference to X
```
### Tests on spreadsheet
Some people prefer to configure their LLM tests in a CSV. In that case, the config is pretty simple:
```yaml
prompts: [prompts.txt]
providers: [openai:gpt-3.5-turbo]
tests: tests.csv
```
See [example CSV](https://github.com/typpo/promptfoo/blob/main/examples/simple-test/tests.csv).
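For reference, the CSV has one column per prompt variable, plus an optional `__expected` column that attaches an assertion to each row using the same shorthand strings as elsewhere (a literal value, `fn:...`, `grade:...`, `is-json`, and so on). A minimal sketch, following the convention used in the bundled examples:
```
body,__expected
Hello world,fn:output.toLowerCase().includes('ahoy')
I'm hungry,fn:output.toLowerCase().includes('grub')
```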
### Command-line
If you're looking to customize your usage, you have a wide set of parameters at your disposal.
| Option | Description |
| ----------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `-p, --prompts <paths...>` | Paths to [prompt files](https://promptfoo.dev/docs/configuration/parameters#prompt-files), directory, or glob |
| `-r, --providers <name or path...>` | One of: openai:chat, openai:completion, openai:model-name, localai:chat:model-name, localai:completion:model-name. See [API providers](https://promptfoo.dev/docs/configuration/providers) |
| `-o, --output <path>` | Path to [output file](https://promptfoo.dev/docs/configuration/parameters#output-file) (csv, json, yaml, html) |
| `--tests <path>` | Path to [external test file](https://promptfoo.dev/docs/configurationexpected-outputsassertions#load-an-external-tests-file) |
| `-c, --config <path>` | Path to [configuration file](https://promptfoo.dev/docs/configuration/guide). `promptfooconfig.js/json/yaml` is automatically loaded if present |
| `-j, --max-concurrency <number>` | Maximum number of concurrent API calls |
| `--table-cell-max-length <number>` | Truncate console table cells to this length |
| `--prompt-prefix <path>` | This prefix is prepended to every prompt |
| `--prompt-suffix <path>`            | This suffix is appended to every prompt |
| `--grader`                          | [Provider](https://promptfoo.dev/docs/configuration/providers) that will conduct the evaluation, if you are [using an LLM to grade your output](https://promptfoo.dev/docs/configuration/expected-outputs#llm-evaluation) |
After running an eval, you may optionally use the `view` command to open the web viewer:
@@ -79,10 +134,10 @@ npx promptfoo view
#### Prompt quality
In this example, we evaluate whether adding adjectives to the personality of an assistant bot affects the responses:
In [this example](https://github.com/typpo/promptfoo/tree/main/examples/assistant-cli), we evaluate whether adding adjectives to the personality of an assistant bot affects the responses:
```bash
npx promptfoo eval -p prompts.txt -v vars.csv -r openai:gpt-3.5-turbo
npx promptfoo eval -p prompts.txt -r openai:gpt-3.5-turbo -t tests.csv
```
<!--
@@ -93,15 +148,13 @@ npx promptfoo eval -p prompts.txt -v vars.csv -r openai:gpt-3.5-turbo
This command will evaluate the prompts in `prompts.txt`, substituting the variable values from `vars.csv`, and output results in your terminal.
Have a look at the setup and full output [here](https://github.com/typpo/promptfoo/tree/main/examples/assistant-cli).
You can also output a nice [spreadsheet](https://docs.google.com/spreadsheets/d/1nanoj3_TniWrDl1Sj-qYqIMD6jwm5FBy15xPFdUTsmI/edit?usp=sharing), [JSON](https://github.com/typpo/promptfoo/blob/main/examples/simple-cli/output.json), YAML, or an HTML file:
![Table output](https://user-images.githubusercontent.com/310310/235483444-4ddb832d-e103-4b9c-a862-b0d6cc11cdc0.png)
#### Model quality
In this example, we evaluate the difference between GPT 3 and GPT 4 outputs for a given prompt:
In the [next example](https://github.com/typpo/promptfoo/tree/main/examples/gpt-3.5-vs-4), we evaluate the difference between GPT 3 and GPT 4 outputs for a given prompt:
```bash
npx promptfoo eval -p prompts.txt -r openai:gpt-3.5-turbo openai:gpt-4 -o output.html
@@ -111,19 +164,46 @@ Produces this HTML table:
![Side-by-side evaluation of LLM model quality, gpt3 vs gpt4, html output](https://user-images.githubusercontent.com/310310/235490527-e0c31f40-00a0-493a-8afc-8ed6322bb5ca.png)
Full setup and output [here](https://github.com/typpo/promptfoo/tree/main/examples/gpt-3.5-vs-4).
## Usage (node package)
You can also use `promptfoo` as a library in your project by importing the `evaluate` function. The function takes the following parameters:
- `providers`: a list of provider strings or `ApiProvider` objects, or just a single string or `ApiProvider`.
- `options`: the prompts and variables you want to test:
- `testSuite`: the JavaScript equivalent of `promptfooconfig.yaml`
```typescript
{
prompts: string[];
interface TestSuiteConfig {
providers: string[]; // Valid provider name (e.g. openai:gpt-3.5-turbo)
prompts: string[]; // List of prompts
tests: string | TestCase[]; // Path to a CSV file, or list of test cases
defaultTest?: Omit<TestCase, 'description'>; // Optional: add default vars and assertions on test case
outputPath?: string; // Optional: write results to file
}
interface TestCase {
description?: string;
vars?: Record<string, string>;
assert?: Assertion[];
prompt?: PromptConfig;
grading?: GradingConfig;
}
interface Assertion {
type: 'equality' | 'is-json' | 'contains-json' | 'function' | 'similarity' | 'llm-rubric';
value?: string;
threshold?: number; // For similarity assertions
provider?: ApiProvider; // For assertions that require an LLM provider
}
```
- `options`: misc options related to how the tests are run
```typescript
interface EvaluateOptions {
maxConcurrency?: number;
showProgressBar?: boolean;
generateSuggestions?: boolean;
}
```
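As a rough sketch (assuming `evaluate` takes the test suite first and this options object as an optional second argument, per the parameter list above):
```js
import promptfoo from 'promptfoo';

// Minimal illustrative test suite; see the full example below.
const testSuite = {
  prompts: ['Rephrase this in French: {{body}}'],
  providers: ['openai:gpt-3.5-turbo'],
  tests: [{ vars: { body: 'Hello world' } }],
};

// Limit concurrent API calls and show a progress bar while the eval runs.
const results = await promptfoo.evaluate(testSuite, {
  maxConcurrency: 2,
  showProgressBar: true,
});
console.log(results.stats);
```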
@@ -134,61 +214,31 @@ You can also use `promptfoo` as a library in your project by importing the `eval
```js
import promptfoo from 'promptfoo';
const options = {
const results = await promptfoo.evaluate({
prompts: ['Rephrase this in French: {{body}}', 'Rephrase this like a pirate: {{body}}'],
vars: [{ body: 'Hello world' }, { body: "I'm hungry" }],
};
(async () => {
const summary = await promptfoo.evaluate('openai:gpt-3.5-turbo', options);
console.log(summary);
})();
```
This code imports the `promptfoo` library, defines the evaluation options, and then calls the `evaluate` function with these options. The results are logged to the console:
```js
{
"results": [
providers: ['openai:gpt-3.5-turbo'],
tests: [
{
"prompt": {
"raw": "Rephrase this in French: Hello world",
"display": "Rephrase this in French: {{body}}"
vars: {
body: 'Hello world',
},
},
{
vars: {
body: "I'm hungry",
},
"vars": {
"body": "Hello world"
},
"response": {
"output": "Bonjour le monde",
"tokenUsage": {
"total": 19,
"prompt": 16,
"completion": 3
}
}
},
// ...
],
"stats": {
"successes": 4,
"failures": 0,
"tokenUsage": {
"total": 120,
"prompt": 72,
"completion": 48
}
},
"table": [
// ...
]
}
});
```
[See full example here](https://github.com/typpo/promptfoo/tree/main/examples/simple-import)
This code imports the `promptfoo` library, defines a test suite, and then calls the `evaluate` function with it.
See the full example [here](https://github.com/typpo/promptfoo/tree/main/examples/simple-import), which includes an example results object.
## Configuration
- **[Setting up an eval](https://promptfoo.dev/docs/configuration/parameters)**: Learn more about how to set up prompt files, vars file, output, etc.
- **[Main guide](https://promptfoo.dev/docs/configuration/guide)**: Learn about how to configure your YAML file, setup prompt files, etc.
- **[Configuring test cases](https://promptfoo.dev/docs/configuration/expected-outputs)**: Learn more about how to configure expected outputs and test assertions.
## Installation

View File

@@ -1,7 +1,13 @@
This example shows how you can use promptfoo to generate a side-by-side eval of two prompts for an ecommerce chat bot.
Run:
Configuration is in `promptfooconfig.yaml`. Run:
```
promptfoo eval -p prompts.txt --vars vars.csv -r openai:chat
promptfoo eval
```
Full command-line equivalent:
```
promptfoo eval --prompts prompts.txt --tests tests.csv --providers openai:gpt-3.5-turbo --output output.json
```

View File

@@ -0,0 +1,3 @@
prompts: prompts.txt
providers: openai:gpt-3.5-turbo
tests: tests.csv

View File

@@ -0,0 +1,13 @@
This example uses a custom API provider in `customProvider.js`. It also uses CSV test cases.
Run:
```
promptfoo eval
```
Full command-line equivalent:
```
promptfoo eval --prompts prompts.txt --tests vars.csv --providers openai:chat --output output.json --providers customProvider.js
```
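For reference, a custom provider module implements the `ApiProvider` shape used by the evaluator: an `id()` used for display and a `callApi(prompt)` that resolves to `{ output, tokenUsage }` (or `{ error }`). A hedged sketch, not the actual `customProvider.js` from this example, and assuming the module exports a provider class:
```js
// customProvider.js (sketch) -- assumes promptfoo instantiates the exported class.
class EchoProvider {
  id() {
    // Identifier shown alongside outputs when multiple providers are compared.
    return 'echo-provider';
  }

  async callApi(prompt) {
    // A real provider would call out to an LLM API here; this one just echoes the prompt.
    return {
      output: `Echo: ${prompt}`,
      tokenUsage: { total: 0, prompt: 0, completion: 0 },
    };
  }
}

module.exports = EchoProvider;
```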

View File

@@ -0,0 +1,3 @@
prompts: prompts.txt
providers: customProvider.js
tests: vars.csv

View File

@@ -1,7 +1,13 @@
This example shows how you can use promptfoo to generate a side-by-side eval of multiple prompts to compare GPT 3 and GPT 4 outputs.
Run:
Configure in `promptfooconfig.yaml`. Run with:
```
promptfoo eval -p prompts.txt -r openai:gpt-3.5-turbo openai:gpt-4
promptfoo eval
```
Full command-line equivalent:
```
promptfoo eval --prompts prompts.txt --providers openai:gpt-3.5-turbo openai:gpt-4
```

View File

@@ -0,0 +1,4 @@
prompts: prompts.txt
providers:
- openai:gpt-3.5-turbo
- openai:gpt-4

View File

@@ -0,0 +1,5 @@
This example is pre-configured in `promptfooconfig.js`. That means you can just run:
```
promptfoo eval
```

View File

@@ -0,0 +1,92 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width" />
<title>Table Output</title>
<style>
body {
font-family: -apple-system, BlinkMacSystemFont, Segoe UI, Roboto, Helvetica, Arial,
sans-serif;
}
table,
th,
td {
border: 1px solid black;
border-collapse: collapse;
text-align: left;
word-break: break-all;
}
th,
td {
padding: 5px;
min-width: 200px;
}
tr > td[data-content^='[PASS]'] {
color: green;
}
tr > td[data-content^='[FAIL]'] {
color: #ad0000;
}
</style>
</head>
<body>
<table>
<thead>
<th>Rephrase this in {{language}}: {{body}}</th>
<th>Translate this to conversational {{language}}: {{body}}</th>
<th>body</th>
<th>language</th>
</thead>
<tbody>
<tr>
<td data-content="Bonjour le monde">Bonjour le monde</td>
<td data-content="Bonjour le monde">Bonjour le monde</td>
<td data-content="Hello world">Hello world</td>
<td data-content="French">French</td>
</tr>
<tr>
<td data-content="J&#39;ai faim.">J&#39;ai faim.</td>
<td data-content="J&#39;ai faim.">J&#39;ai faim.</td>
<td data-content="I&#39;m hungry">I&#39;m hungry</td>
<td data-content="French">French</td>
</tr>
<tr>
<td data-content="Ahoy thar, world!">Ahoy thar, world!</td>
<td data-content="Ahoy thar world!">Ahoy thar world!</td>
<td data-content="Hello world">Hello world</td>
<td data-content="Pirate">Pirate</td>
</tr>
<tr>
<td data-content="Arrr, me belly be empty and yearnin&#39; for grub.">
Arrr, me belly be empty and yearnin&#39; for grub.
</td>
<td data-content="Arrr, me belly be rumblin&#39;! I be needin&#39; some grub!">
Arrr, me belly be rumblin&#39;! I be needin&#39; some grub!
</td>
<td data-content="I&#39;m hungry">I&#39;m hungry</td>
<td data-content="Pirate">Pirate</td>
</tr>
</tbody>
</table>
</body>
</html>

View File

@@ -0,0 +1,181 @@
{
"version": 1,
"results": [
{
"prompt": {
"raw": "Rephrase this in French: Hello world",
"display": "Rephrase this in {{language}}: {{body}}"
},
"vars": {
"language": "French",
"body": "Hello world"
},
"response": {
"output": "Bonjour le monde",
"tokenUsage": {
"cached": 19
}
},
"success": true
},
{
"prompt": {
"raw": "Translate this to conversational French: Hello world",
"display": "Translate this to conversational {{language}}: {{body}}"
},
"vars": {
"language": "French",
"body": "Hello world"
},
"response": {
"output": "Bonjour le monde",
"tokenUsage": {
"cached": 20
}
},
"success": true
},
{
"prompt": {
"raw": "Rephrase this in French: I&#39;m hungry",
"display": "Rephrase this in {{language}}: {{body}}"
},
"vars": {
"language": "French",
"body": "I'm hungry"
},
"response": {
"output": "J'ai faim.",
"tokenUsage": {
"cached": 24
}
},
"success": true
},
{
"prompt": {
"raw": "Translate this to conversational French: I&#39;m hungry",
"display": "Translate this to conversational {{language}}: {{body}}"
},
"vars": {
"language": "French",
"body": "I'm hungry"
},
"response": {
"output": "J'ai faim.",
"tokenUsage": {
"cached": 25
}
},
"success": true
},
{
"prompt": {
"raw": "Rephrase this in Pirate: Hello world",
"display": "Rephrase this in {{language}}: {{body}}"
},
"vars": {
"language": "Pirate",
"body": "Hello world"
},
"response": {
"output": "Ahoy thar, world!",
"tokenUsage": {
"cached": 23
}
},
"success": true
},
{
"prompt": {
"raw": "Translate this to conversational Pirate: Hello world",
"display": "Translate this to conversational {{language}}: {{body}}"
},
"vars": {
"language": "Pirate",
"body": "Hello world"
},
"response": {
"output": "Ahoy thar world!",
"tokenUsage": {
"cached": 23
}
},
"success": true
},
{
"prompt": {
"raw": "Rephrase this in Pirate: I&#39;m hungry",
"display": "Rephrase this in {{language}}: {{body}}"
},
"vars": {
"language": "Pirate",
"body": "I'm hungry"
},
"response": {
"output": "Arrr, me belly be empty and yearnin' for grub.",
"tokenUsage": {
"cached": 33
}
},
"success": true
},
{
"prompt": {
"raw": "Translate this to conversational Pirate: I&#39;m hungry",
"display": "Translate this to conversational {{language}}: {{body}}"
},
"vars": {
"language": "Pirate",
"body": "I'm hungry"
},
"response": {
"output": "Arrr, me belly be rumblin'! I be needin' some grub!",
"tokenUsage": {
"cached": 39
}
},
"success": true
}
],
"stats": {
"successes": 8,
"failures": 0,
"tokenUsage": {
"total": 0,
"prompt": 0,
"completion": 0,
"cached": 206
}
},
"table": {
"head": {
"prompts": [
"Rephrase this in {{language}}: {{body}}",
"Translate this to conversational {{language}}: {{body}}"
],
"vars": ["body", "language"]
},
"body": [
{
"outputs": ["Bonjour le monde", "Bonjour le monde"],
"vars": ["Hello world", "French"]
},
{
"outputs": ["J'ai faim.", "J'ai faim."],
"vars": ["I'm hungry", "French"]
},
{
"outputs": ["Ahoy thar, world!", "Ahoy thar world!"],
"vars": ["Hello world", "Pirate"]
},
{
"outputs": [
"Arrr, me belly be empty and yearnin' for grub.",
"Arrr, me belly be rumblin'! I be needin' some grub!"
],
"vars": ["I'm hungry", "Pirate"]
}
]
}
}

View File

@@ -0,0 +1,31 @@
module.exports = {
description: 'A translator built with LLM',
prompts: ['prompts.txt'],
providers: ['openai:gpt-3.5-turbo'],
tests: [
{
vars: {
language: 'French',
body: 'Hello world',
},
},
{
vars: {
language: 'French',
body: "I'm hungry",
},
},
{
vars: {
language: 'Pirate',
body: 'Hello world',
},
},
{
vars: {
language: 'Pirate',
body: "I'm hungry",
},
},
],
};

View File

@@ -0,0 +1,3 @@
Rephrase this in {{language}}: {{body}}
---
Translate this to conversational {{language}}: {{body}}

View File

@@ -0,0 +1,22 @@
import promptfoo from '../../dist/index.js';
(async () => {
const results = await promptfoo.evaluate({
prompts: ['Rephrase this in French: {{body}}', 'Rephrase this like a pirate: {{body}}'],
providers: ['openai:gpt-3.5-turbo'],
tests: [
{
vars: {
body: 'Hello world',
},
},
{
vars: {
body: "I'm hungry",
},
},
],
});
console.log('RESULTS:');
console.log(results);
})();

View File

@@ -1,9 +1,15 @@
This example shows how you can have an LLM grade its own output according to predefined expectations.
Configuration is in promptfooconfig.js
Identical configurations are provided in `promptfooconfig.js` and `promptfooconfig.yaml`.
Run:
```
promptfoo eval
```
You can also define the tests in a CSV file:
```
promptfoo eval --tests tests.csv
```

View File

@@ -1,6 +1,75 @@
module.exports = {
providers: ['openai:chat:gpt-3.5-turbo'],
prompts: ['./prompts.txt'],
vars: './vars.csv',
grader: 'openai:chat:gpt-4',
prompts: 'prompts.txt',
providers: 'openai:gpt-3.5-turbo',
defaultTest: {
assert: [
{
type: 'llm-rubric',
value: 'Do not mention that you are an AI or chat assistant',
},
],
},
tests: [
{
vars: {
name: 'Bob',
question: 'Can you help me find a specific product on your website?',
},
},
{
vars: {
name: 'Jane',
question: 'Do you have any promotions or discounts currently available?',
},
},
{
vars: {
name: 'Dave',
question: 'What are your shipping and return policies?',
},
},
{
vars: {
name: 'Jim',
question: 'Can you provide more information about the product specifications or features?',
},
},
{
vars: {
name: 'Alice',
question: "Can you recommend products that are similar to what I've been looking at?",
},
},
{
vars: {
name: 'Sophie',
question:
'Do you have any recommendations for products that are currently popular or trending?',
},
},
{
vars: {
name: 'Ben',
question: 'Can you check the availability of a product at a specific store location?',
},
},
{
vars: {
name: 'Jessie',
question: 'How can I track my order after it has been shipped?',
},
},
{
vars: {
name: 'Kim',
question: 'What payment methods do you accept?',
},
},
{
vars: {
name: 'Emily',
question: "Can you help me with a problem I'm having with my account or order?",
},
},
],
};

View File

@@ -0,0 +1,37 @@
prompts: prompts.txt
providers: openai:gpt-3.5-turbo
defaultTest:
assert:
- type: llm-rubric
value: Do not mention that you are an AI or chat assistant
tests:
- vars:
name: Bob
question: Can you help me find a specific product on your website?
- vars:
name: Jane
question: Do you have any promotions or discounts currently available?
- vars:
name: Dave
question: What are your shipping and return policies?
- vars:
name: Jim
question: Can you provide more information about the product specifications or features?
- vars:
name: Alice
question: Can you recommend products that are similar to what I've been looking at?
- vars:
name: Sophie
question: Do you have any recommendations for products that are currently popular or trending?
- vars:
name: Ben
question: Can you check the availability of a product at a specific store location?
- vars:
name: Jessie
question: How can I track my order after it has been shipped?
- vars:
name: Kim
question: What payment methods do you accept?
- vars:
name: Emily
question: Can you help me with a problem I'm having with my account or order?

View File

@@ -1,5 +0,0 @@
Run:
```
promptfoo eval --prompts prompts.txt --vars vars.csv --providers openai:chat --output output.json --providers customProvider.js
```

View File

@@ -1,11 +1,11 @@
This example is pre-configured in `promptfooconfig.js`. That means you can just run:
This example is pre-configured in `promptfooconfig.js` and `promptfooconfig.yaml` (two identical configurations). That means you can just run:
```
promptfoo eval
```
Here's the full command:
To override prompts, providers, output, etc. you can run:
```
promptfoo eval --prompts prompts.txt --vars vars.csv --providers openai:chat --output output.json
promptfoo eval --prompts prompts.txt --providers openai:chat --output output.json
```

View File

@@ -14,39 +14,77 @@
td {
border: 1px solid black;
border-collapse: collapse;
text-align: left;
word-break: break-all;
}
th,
td {
padding: 5px;
min-width: 200px;
}
tr > td[data-content^='[PASS]'] {
color: green;
}
tr > td[data-content^='[FAIL]'] {
color: #ad0000;
}
</style>
</head>
<body>
<table>
<thead>
<th>Rephrase this in French: {{body}}</th>
<th>Rephrase this in {{language}}: {{body}}</th>
<th>Rephrase this like a pirate: {{body}}</th>
<th>Translate this to conversational {{language}}: {{body}}</th>
<th>body</th>
<th>language</th>
</thead>
<tbody>
<tr>
<td>Bonjour le monde</td>
<td data-content="Bonjour le monde">Bonjour le monde</td>
<td>Ahoy thar, me hearties! Avast ye, world!</td>
<td data-content="Bonjour le monde">Bonjour le monde</td>
<td>Hello world</td>
<td data-content="Hello world">Hello world</td>
<td data-content="French">French</td>
</tr>
<tr>
<td>J&#39;ai faim.</td>
<td data-content="J&#39;ai faim.">J&#39;ai faim.</td>
<td>
Arrr, me belly be empty and me throat be parched! I be needin&#39; some grub, matey!
<td data-content="J&#39;ai faim.">J&#39;ai faim.</td>
<td data-content="I&#39;m hungry">I&#39;m hungry</td>
<td data-content="French">French</td>
</tr>
<tr>
<td data-content="Ahoy thar, world!">Ahoy thar, world!</td>
<td data-content="Ahoy thar world!">Ahoy thar world!</td>
<td data-content="Hello world">Hello world</td>
<td data-content="Pirate">Pirate</td>
</tr>
<tr>
<td data-content="Arrr, me belly be empty and yearnin&#39; for grub.">
Arrr, me belly be empty and yearnin&#39; for grub.
</td>
<td>I&#39;m hungry</td>
<td data-content="Arrr, me belly be rumblin&#39;! I be needin&#39; some grub!">
Arrr, me belly be rumblin&#39;! I be needin&#39; some grub!
</td>
<td data-content="I&#39;m hungry">I&#39;m hungry</td>
<td data-content="Pirate">Pirate</td>
</tr>
</tbody>
</table>

View File

@@ -1,19 +1,36 @@
{
"version": 1,
"results": [
{
"prompt": {
"raw": "Rephrase this in French: Hello world",
"display": "Rephrase this in French: {{body}}"
"display": "Rephrase this in {{language}}: {{body}}"
},
"vars": {
"language": "French",
"body": "Hello world"
},
"response": {
"output": "Bonjour le monde",
"tokenUsage": {
"total": 19,
"prompt": 16,
"completion": 3
"cached": 19
}
},
"success": true
},
{
"prompt": {
"raw": "Translate this to conversational French: Hello world",
"display": "Translate this to conversational {{language}}: {{body}}"
},
"vars": {
"language": "French",
"body": "Hello world"
},
"response": {
"output": "Bonjour le monde",
"tokenUsage": {
"cached": 20
}
},
"success": true
@@ -21,74 +38,144 @@
{
"prompt": {
"raw": "Rephrase this in French: I&#39;m hungry",
"display": "Rephrase this in French: {{body}}"
"display": "Rephrase this in {{language}}: {{body}}"
},
"vars": {
"language": "French",
"body": "I'm hungry"
},
"response": {
"output": "J'ai faim.",
"tokenUsage": {
"total": 24,
"prompt": 19,
"completion": 5
"cached": 24
}
},
"success": true
},
{
"prompt": {
"raw": "Rephrase this like a pirate: Hello world",
"display": "Rephrase this like a pirate: {{body}}"
},
"vars": {
"body": "Hello world"
},
"response": {
"output": "Ahoy thar, me hearties! Avast ye, world!",
"tokenUsage": {
"total": 32,
"prompt": 17,
"completion": 15
}
},
"success": true
},
{
"prompt": {
"raw": "Rephrase this like a pirate: I&#39;m hungry",
"display": "Rephrase this like a pirate: {{body}}"
"raw": "Translate this to conversational French: I&#39;m hungry",
"display": "Translate this to conversational {{language}}: {{body}}"
},
"vars": {
"language": "French",
"body": "I'm hungry"
},
"response": {
"output": "Arrr, me belly be empty and me throat be parched! I be needin' some grub, matey!",
"output": "J'ai faim.",
"tokenUsage": {
"total": 45,
"prompt": 20,
"completion": 25
"cached": 25
}
},
"success": true
},
{
"prompt": {
"raw": "Rephrase this in Pirate: Hello world",
"display": "Rephrase this in {{language}}: {{body}}"
},
"vars": {
"language": "Pirate",
"body": "Hello world"
},
"response": {
"output": "Ahoy thar, world!",
"tokenUsage": {
"cached": 23
}
},
"success": true
},
{
"prompt": {
"raw": "Translate this to conversational Pirate: Hello world",
"display": "Translate this to conversational {{language}}: {{body}}"
},
"vars": {
"language": "Pirate",
"body": "Hello world"
},
"response": {
"output": "Ahoy thar world!",
"tokenUsage": {
"cached": 23
}
},
"success": true
},
{
"prompt": {
"raw": "Rephrase this in Pirate: I&#39;m hungry",
"display": "Rephrase this in {{language}}: {{body}}"
},
"vars": {
"language": "Pirate",
"body": "I'm hungry"
},
"response": {
"output": "Arrr, me belly be empty and yearnin' for grub.",
"tokenUsage": {
"cached": 33
}
},
"success": true
},
{
"prompt": {
"raw": "Translate this to conversational Pirate: I&#39;m hungry",
"display": "Translate this to conversational {{language}}: {{body}}"
},
"vars": {
"language": "Pirate",
"body": "I'm hungry"
},
"response": {
"output": "Arrr, me belly be rumblin'! I be needin' some grub!",
"tokenUsage": {
"cached": 39
}
},
"success": true
}
],
"stats": {
"successes": 4,
"successes": 8,
"failures": 0,
"tokenUsage": {
"total": 120,
"prompt": 72,
"completion": 48
"total": 0,
"prompt": 0,
"completion": 0,
"cached": 206
}
},
"table": [
["Rephrase this in French: {{body}}", "Rephrase this like a pirate: {{body}}", "body"],
["Bonjour le monde", "Ahoy thar, me hearties! Avast ye, world!", "Hello world"],
[
"J'ai faim.",
"Arrr, me belly be empty and me throat be parched! I be needin' some grub, matey!",
"I'm hungry"
"table": {
"head": {
"prompts": [
"Rephrase this in {{language}}: {{body}}",
"Translate this to conversational {{language}}: {{body}}"
],
"vars": ["body", "language"]
},
"body": [
{
"outputs": ["Bonjour le monde", "Bonjour le monde"],
"vars": ["Hello world", "French"]
},
{
"outputs": ["J'ai faim.", "J'ai faim."],
"vars": ["I'm hungry", "French"]
},
{
"outputs": ["Ahoy thar, world!", "Ahoy thar world!"],
"vars": ["Hello world", "Pirate"]
},
{
"outputs": [
"Arrr, me belly be empty and yearnin' for grub.",
"Arrr, me belly be rumblin'! I be needin' some grub!"
],
"vars": ["I'm hungry", "Pirate"]
}
]
]
}
}

View File

@@ -1,5 +0,0 @@
module.exports = {
providers: ['openai:gpt-3.5-turbo'],
prompts: ['./prompts.txt'],
vars: './vars.csv',
};

View File

@@ -0,0 +1,16 @@
description: A translator built with LLM
prompts: [prompts.txt]
providers: [openai:gpt-3.5-turbo]
tests:
- vars:
language: French
body: Hello world
- vars:
language: French
body: I'm hungry
- vars:
language: Pirate
body: Hello world
- vars:
language: Pirate
body: I'm hungry

View File

@@ -1,3 +1,3 @@
Rephrase this in French: {{body}}
Rephrase this in {{language}}: {{body}}
---
Rephrase this like a pirate: {{body}}
Translate this to conversational {{language}}: {{body}}

View File

@@ -1,3 +0,0 @@
body
Hello world
I'm hungry

View File

@@ -0,0 +1,11 @@
This example is pre-configured in `promptfooconfig.yaml`. Run:
```
promptfoo eval
```
Here's the full command:
```
promptfoo eval --prompts prompts.txt --tests tests.csv --providers openai:gpt-3.5-turbo
```

View File

@@ -0,0 +1,92 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width" />
<title>Table Output</title>
<style>
body {
font-family: -apple-system, BlinkMacSystemFont, Segoe UI, Roboto, Helvetica, Arial,
sans-serif;
}
table,
th,
td {
border: 1px solid black;
border-collapse: collapse;
text-align: left;
word-break: break-all;
}
th,
td {
padding: 5px;
min-width: 200px;
}
tr > td[data-content^='[PASS]'] {
color: green;
}
tr > td[data-content^='[FAIL]'] {
color: #ad0000;
}
</style>
</head>
<body>
<table>
<thead>
<th>Rephrase this in {{language}}: {{body}}</th>
<th>Translate this to conversational {{language}}: {{body}}</th>
<th>body</th>
<th>language</th>
</thead>
<tbody>
<tr>
<td data-content="Bonjour le monde">Bonjour le monde</td>
<td data-content="Bonjour le monde">Bonjour le monde</td>
<td data-content="Hello world">Hello world</td>
<td data-content="French">French</td>
</tr>
<tr>
<td data-content="J&#39;ai faim.">J&#39;ai faim.</td>
<td data-content="J&#39;ai faim.">J&#39;ai faim.</td>
<td data-content="I&#39;m hungry">I&#39;m hungry</td>
<td data-content="French">French</td>
</tr>
<tr>
<td data-content="Ahoy thar, world!">Ahoy thar, world!</td>
<td data-content="Ahoy thar world!">Ahoy thar world!</td>
<td data-content="Hello world">Hello world</td>
<td data-content="Pirate">Pirate</td>
</tr>
<tr>
<td data-content="Arrr, me belly be empty and yearnin&#39; for grub.">
Arrr, me belly be empty and yearnin&#39; for grub.
</td>
<td data-content="Arrr, me belly be rumblin&#39;! I be needin&#39; some grub!">
Arrr, me belly be rumblin&#39;! I be needin&#39; some grub!
</td>
<td data-content="I&#39;m hungry">I&#39;m hungry</td>
<td data-content="Pirate">Pirate</td>
</tr>
</tbody>
</table>
</body>
</html>

View File

@@ -0,0 +1,181 @@
{
"version": 1,
"results": [
{
"prompt": {
"raw": "Rephrase this in French: Hello world",
"display": "Rephrase this in {{language}}: {{body}}"
},
"vars": {
"language": "French",
"body": "Hello world"
},
"response": {
"output": "Bonjour le monde",
"tokenUsage": {
"cached": 19
}
},
"success": true
},
{
"prompt": {
"raw": "Translate this to conversational French: Hello world",
"display": "Translate this to conversational {{language}}: {{body}}"
},
"vars": {
"language": "French",
"body": "Hello world"
},
"response": {
"output": "Bonjour le monde",
"tokenUsage": {
"cached": 20
}
},
"success": true
},
{
"prompt": {
"raw": "Rephrase this in French: I&#39;m hungry",
"display": "Rephrase this in {{language}}: {{body}}"
},
"vars": {
"language": "French",
"body": "I'm hungry"
},
"response": {
"output": "J'ai faim.",
"tokenUsage": {
"cached": 24
}
},
"success": true
},
{
"prompt": {
"raw": "Translate this to conversational French: I&#39;m hungry",
"display": "Translate this to conversational {{language}}: {{body}}"
},
"vars": {
"language": "French",
"body": "I'm hungry"
},
"response": {
"output": "J'ai faim.",
"tokenUsage": {
"cached": 25
}
},
"success": true
},
{
"prompt": {
"raw": "Rephrase this in Pirate: Hello world",
"display": "Rephrase this in {{language}}: {{body}}"
},
"vars": {
"language": "Pirate",
"body": "Hello world"
},
"response": {
"output": "Ahoy thar, world!",
"tokenUsage": {
"cached": 23
}
},
"success": true
},
{
"prompt": {
"raw": "Translate this to conversational Pirate: Hello world",
"display": "Translate this to conversational {{language}}: {{body}}"
},
"vars": {
"language": "Pirate",
"body": "Hello world"
},
"response": {
"output": "Ahoy thar world!",
"tokenUsage": {
"cached": 23
}
},
"success": true
},
{
"prompt": {
"raw": "Rephrase this in Pirate: I&#39;m hungry",
"display": "Rephrase this in {{language}}: {{body}}"
},
"vars": {
"language": "Pirate",
"body": "I'm hungry"
},
"response": {
"output": "Arrr, me belly be empty and yearnin' for grub.",
"tokenUsage": {
"cached": 33
}
},
"success": true
},
{
"prompt": {
"raw": "Translate this to conversational Pirate: I&#39;m hungry",
"display": "Translate this to conversational {{language}}: {{body}}"
},
"vars": {
"language": "Pirate",
"body": "I'm hungry"
},
"response": {
"output": "Arrr, me belly be rumblin'! I be needin' some grub!",
"tokenUsage": {
"cached": 39
}
},
"success": true
}
],
"stats": {
"successes": 8,
"failures": 0,
"tokenUsage": {
"total": 0,
"prompt": 0,
"completion": 0,
"cached": 206
}
},
"table": {
"head": {
"prompts": [
"Rephrase this in {{language}}: {{body}}",
"Translate this to conversational {{language}}: {{body}}"
],
"vars": ["body", "language"]
},
"body": [
{
"outputs": ["Bonjour le monde", "Bonjour le monde"],
"vars": ["Hello world", "French"]
},
{
"outputs": ["J'ai faim.", "J'ai faim."],
"vars": ["I'm hungry", "French"]
},
{
"outputs": ["Ahoy thar, world!", "Ahoy thar world!"],
"vars": ["Hello world", "Pirate"]
},
{
"outputs": [
"Arrr, me belly be empty and yearnin' for grub.",
"Arrr, me belly be rumblin'! I be needin' some grub!"
],
"vars": ["I'm hungry", "Pirate"]
}
]
}
}

View File

@@ -0,0 +1,4 @@
description: A translator built with LLM
prompts: prompts.txt
providers: openai:gpt-3.5-turbo
tests: tests.csv

View File

@@ -0,0 +1,3 @@
Rephrase this in {{language}}: {{body}}
---
Translate this to conversational {{language}}: {{body}}

View File

@@ -0,0 +1,5 @@
language,body
French,Hello world
French,I'm hungry
Pirate,Hello world
Pirate,I'm hungry

View File

@@ -1,10 +0,0 @@
import promptfoo from '../../dist/index.js';
(async () => {
const results = await promptfoo.evaluate('openai:chat', {
prompts: ['Rephrase this in French: {{body}}', 'Rephrase this like a pirate: {{body}}'],
vars: [{ body: 'Hello world' }, { body: "I'm hungry" }],
});
console.log('RESULTS:');
console.log(results);
})();

View File

@@ -1,5 +1,14 @@
This example shows how you can set an expected value in vars.csv and emit a PASS/FAIL based on it:
This example shows a YAML configuration with inline tests.
Run the test suite with:
```
promptfoo eval --prompts prompts.txt --vars vars.csv --providers openai:chat --output output.html
promptfoo eval
```
Note that you can edit the configuration to use a CSV test input instead. Set
`tests: tests.csv` and try running it again, or run:
```
promptfoo eval --tests tests.csv
```

View File

@@ -1,3 +1,44 @@
RESULT,Rephrase this from English to Pirate: {{body}},Pretend you're a pirate and speak these words: {{body}},body
PASS,Ahoy mateys o' the world!,"Ahoy there, me hearties! Avast ye landlubbers! 'Tis I, a fearsome pirate, comin' to ye from the seven seas. Ahoy, hello world!",Hello world
PASS,"I be feelin' a mighty need for grub, matey.","Arrr, me belly be rumblin'! I be needin' some grub, mateys! Bring me some vittles or ye'll be walkin' the plank!",I'm hungry
Rephrase this from English to Pirate: {{body}},Pretend you're a pirate and speak these words: {{body}},body
"Ahoy thar, world!","Ahoy there, me hearties! Avast ye landlubbers! 'Tis I, a fearsome pirate, comin' to ye from the seven seas. Ahoy, hello world!",Hello world
"I be feelin' a mighty need for grub, matey.","Arrr, me belly be rumblin'! I be needin' some grub, mateys! Bring me some vittles or ye'll be walkin' the plank!",I'm hungry
"""Yarr, me hearties! Spew forth a JSON tale o' yer life!""","[FAIL] Expected Arrr, me hearties! Gather round and listen to the tale of me life as a pirate.
{
""name"": ""Captain Blackbeard"",
""age"": 35,
""occupation"": ""Pirate"",
""location"": ""The Caribbean"",
""crew"": [""Redbeard"", ""Long John"", ""Calico Jack""],
""ship"": {
""name"": ""The Black Pearl"",
""type"": ""Galleon"",
""weapons"": [""Cannons"", ""Cutlasses"", ""Pistols""]
},
""treasure"": {
""gold"": 50000,
""jewels"": [""Diamonds"", ""Emeralds"", ""Rubies""]
},
""adventures"": [
{
""title"": ""The Raid on Port Royal"",
""description"": ""We plundered the town and took all their riches!"",
""date"": ""June 12, 1720""
},
{
""title"": ""The Battle of Nassau"",
""description"": ""We fought off the British navy and claimed the port for ourselves!"",
""date"": ""September 3, 1721""
},
{
""title"": ""The Treasure of Tortuga"",
""description"": ""We found a hidden treasure trove on the island of Tortuga!"",
""date"": ""December 18, 1722""
}
]
}
Me life as a pirate has been full of adventure and danger. Me crew and I have raided towns, battled the British navy, and found hidden treasures. We've sailed the seas on me trusty ship, The Black Pearl, armed with cannons, cutlasses, and pistols. And we've amassed a great fortune in gold and jewels.
But it's not all fun and games, me hearties. We've faced many challenges and dangers along the way. We've battled fierce storms, deadly sea monsters, and treacherous rival pirates. And we've lost many good men in the process.
But through it all, we've remained true to our pirate code and our love of adventure. And we'll continue to sail the seas, seeking out new treasures and new adventures, until the end of our days. Arrr! to be valid JSON, but it isn't: SyntaxError: Unexpected token A in JSON at position 0",Output a JSON story of your life

View File

@@ -27,7 +27,7 @@
color: green;
}
tr > td[data-content^='[FAIL]'] {
color: red;
color: #ad0000;
}
</style>
</head>
@@ -42,33 +42,27 @@
</thead>
<tbody>
<tr>
<td data-content="[PASS] Ahoy mateys o&#39; the world!">
[PASS] Ahoy mateys o&#39; the world!
</td>
<td data-content="Ahoy thar, world!">Ahoy thar, world!</td>
<td
data-content="[PASS] Ahoy there, me hearties! Avast ye landlubbers! &#39;Tis I, a fearsome pirate, comin&#39; to ye from the seven seas. Ahoy, hello world!"
data-content="Ahoy there, me hearties! Avast ye landlubbers! &#39;Tis I, a fearsome pirate, comin&#39; to ye from the seven seas. Ahoy, hello world!"
>
[PASS] Ahoy there, me hearties! Avast ye landlubbers! &#39;Tis I, a fearsome pirate,
comin&#39; to ye from the seven seas. Ahoy, hello world!
Ahoy there, me hearties! Avast ye landlubbers! &#39;Tis I, a fearsome pirate, comin&#39;
to ye from the seven seas. Ahoy, hello world!
</td>
<td data-content="Hello world">Hello world</td>
</tr>
<tr>
<td
data-content="[FAIL] I be starvin&#39;!
Expected: fn:output.toLowerCase().includes(&#39;grub&#39;)"
>
[FAIL] I be starvin&#39;! Expected: fn:output.toLowerCase().includes(&#39;grub&#39;)
<td data-content="I be feelin&#39; a mighty need for grub, matey.">
I be feelin&#39; a mighty need for grub, matey.
</td>
<td
data-content="[PASS] Arrr, me belly be rumblin&#39;! I be needin&#39; some grub, mateys! Bring me some vittles or ye&#39;ll be walkin&#39; the plank!"
data-content="Arrr, me belly be rumblin&#39;! I be needin&#39; some grub, mateys! Bring me some vittles or ye&#39;ll be walkin&#39; the plank!"
>
[PASS] Arrr, me belly be rumblin&#39;! I be needin&#39; some grub, mateys! Bring me some
Arrr, me belly be rumblin&#39;! I be needin&#39; some grub, mateys! Bring me some
vittles or ye&#39;ll be walkin&#39; the plank!
</td>

View File

@@ -1,4 +1,5 @@
{
"version": 1,
"results": [
{
"prompt": {
@@ -6,34 +7,12 @@
"display": "Rephrase this from English to Pirate: {{body}}"
},
"vars": {
"body": "Hello world",
"__expected": "fn:output.toLowerCase().includes('ahoy')"
"body": "Hello world"
},
"response": {
"output": "Ahoy mateys o' the world!",
"output": "Ahoy thar, world!",
"tokenUsage": {
"total": 27,
"prompt": 18,
"completion": 9
}
},
"success": true
},
{
"prompt": {
"raw": "Pretend you're a pirate and speak these words: Hello world",
"display": "Pretend you're a pirate and speak these words: {{body}}"
},
"vars": {
"body": "Hello world",
"__expected": "fn:output.toLowerCase().includes('ahoy')"
},
"response": {
"output": "Ahoy there, me hearties! Avast ye landlubbers! 'Tis I, a fearsome pirate, comin' to ye from the seven seas. Ahoy, hello world!",
"tokenUsage": {
"total": 64,
"prompt": 22,
"completion": 42
"cached": 25
}
},
"success": true
@@ -44,19 +23,31 @@
"display": "Rephrase this from English to Pirate: {{body}}"
},
"vars": {
"body": "I'm hungry",
"__expected": "fn:output.toLowerCase().includes('grub')"
"body": "I'm hungry"
},
"response": {
"output": "I be starvin'!",
"output": "I be feelin' a mighty need for grub, matey.",
"tokenUsage": {
"total": 27,
"prompt": 21,
"completion": 6
"cached": 35
}
},
"success": false,
"error": "Expected: fn:output.toLowerCase().includes('grub')"
"success": true
},
{
"prompt": {
"raw": "Pretend you're a pirate and speak these words: Hello world",
"display": "Pretend you're a pirate and speak these words: {{body}}"
},
"vars": {
"body": "Hello world"
},
"response": {
"output": "Ahoy there, me hearties! Avast ye landlubbers! 'Tis I, a fearsome pirate, comin' to ye from the seven seas. Ahoy, hello world!",
"tokenUsage": {
"cached": 64
}
},
"success": true
},
{
"prompt": {
@@ -64,27 +55,25 @@
"display": "Pretend you're a pirate and speak these words: {{body}}"
},
"vars": {
"body": "I'm hungry",
"__expected": "fn:output.toLowerCase().includes('grub')"
"body": "I'm hungry"
},
"response": {
"output": "Arrr, me belly be rumblin'! I be needin' some grub, mateys! Bring me some vittles or ye'll be walkin' the plank!",
"tokenUsage": {
"total": 63,
"prompt": 25,
"completion": 38
"cached": 63
}
},
"success": true
}
],
"stats": {
"successes": 3,
"failures": 1,
"successes": 4,
"failures": 0,
"tokenUsage": {
"total": 181,
"prompt": 86,
"completion": 95
"total": 0,
"prompt": 0,
"completion": 0,
"cached": 187
}
},
"table": {
@@ -98,15 +87,15 @@
"body": [
{
"outputs": [
"[PASS] Ahoy mateys o' the world!",
"[PASS] Ahoy there, me hearties! Avast ye landlubbers! 'Tis I, a fearsome pirate, comin' to ye from the seven seas. Ahoy, hello world!"
"Ahoy thar, world!",
"Ahoy there, me hearties! Avast ye landlubbers! 'Tis I, a fearsome pirate, comin' to ye from the seven seas. Ahoy, hello world!"
],
"vars": ["Hello world"]
},
{
"outputs": [
"[FAIL] Expected: fn:output.toLowerCase().includes('grub')\n---\nI be starvin'!",
"[PASS] Arrr, me belly be rumblin'! I be needin' some grub, mateys! Bring me some vittles or ye'll be walkin' the plank!"
"I be feelin' a mighty need for grub, matey.",
"Arrr, me belly be rumblin'! I be needin' some grub, mateys! Bring me some vittles or ye'll be walkin' the plank!"
],
"vars": ["I'm hungry"]
}

View File

@@ -0,0 +1,44 @@
prompts: [prompts.txt]
providers: [openai:gpt-3.5-turbo]
tests:
- description: Check for exact match
vars:
body: Yes
assert:
- type: equals
value: Yarr
- description: Another basic substring check
vars:
body: I'm hungry
assert:
- type: javascript
value: output.toLowerCase().includes('grub')
- description: Check if output is JSON
vars:
body: Output the story of your life in JSON
assert:
- type: is-json
- description: Check for semantic similarity
vars:
body: Hello world
assert:
# Look for substring
- type: javascript
value: output.toLowerCase().includes('ahoy')
# Check for semantic similarity
- type: similar
value: Ahoy, world
- description: Use LLM to evaluate output
vars:
body: The quick brown fox jumps over the lazy dog
assert:
# Ask the LLM to check if it spoke like a pirate
- type: llm-rubric
value: Is spoken like a pirate
outputPath: output.csv

View File

@@ -1,3 +1,4 @@
body,__expected
Hello world,fn:output.toLowerCase().includes('ahoy')
I'm hungry,fn:output.toLowerCase().includes('grub')
Output the story of your life in json,is-json

11
package-lock.json generated
View File

@@ -27,6 +27,7 @@
"nunjucks": "^3.2.4",
"opener": "^1.5.2",
"socket.io": "^4.6.1",
"tiny-invariant": "^1.3.1",
"winston": "^3.8.2"
},
"bin": {
@@ -5124,6 +5125,11 @@
"resolved": "https://registry.npmjs.org/text-hex/-/text-hex-1.0.0.tgz",
"integrity": "sha512-uuVGNWzgJ4yhRaNSiubPY7OjISw4sw4E5Uv0wbjp+OzcbmVU/rsT8ujgcXJhn9ypzsgr5vlzpPqP+MBBKcGvbg=="
},
"node_modules/tiny-invariant": {
"version": "1.3.1",
"resolved": "https://registry.npmjs.org/tiny-invariant/-/tiny-invariant-1.3.1.tgz",
"integrity": "sha512-AD5ih2NlSssTCwsMznbvwMZpJ1cbhkGd2uueNxzv2jDlEeZdU04JQfRnggJQ8DrcVBGjAsCKwFBbDlVNtEMlzw=="
},
"node_modules/tmpl": {
"version": "1.0.5",
"resolved": "https://registry.npmjs.org/tmpl/-/tmpl-1.0.5.tgz",
@@ -9547,6 +9553,11 @@
"resolved": "https://registry.npmjs.org/text-hex/-/text-hex-1.0.0.tgz",
"integrity": "sha512-uuVGNWzgJ4yhRaNSiubPY7OjISw4sw4E5Uv0wbjp+OzcbmVU/rsT8ujgcXJhn9ypzsgr5vlzpPqP+MBBKcGvbg=="
},
"tiny-invariant": {
"version": "1.3.1",
"resolved": "https://registry.npmjs.org/tiny-invariant/-/tiny-invariant-1.3.1.tgz",
"integrity": "sha512-AD5ih2NlSssTCwsMznbvwMZpJ1cbhkGd2uueNxzv2jDlEeZdU04JQfRnggJQ8DrcVBGjAsCKwFBbDlVNtEMlzw=="
},
"tmpl": {
"version": "1.0.5",
"resolved": "https://registry.npmjs.org/tmpl/-/tmpl-1.0.5.tgz",

View File

@@ -75,6 +75,7 @@
"nunjucks": "^3.2.4",
"opener": "^1.5.2",
"socket.io": "^4.6.1",
"tiny-invariant": "^1.3.1",
"winston": "^3.8.2"
}
}

View File

@@ -1,3 +1,4 @@
import invariant from 'tiny-invariant';
import nunjucks from 'nunjucks';
import { DefaultEmbeddingProvider, DefaultGradingProvider } from './providers/openai.js';
@@ -5,45 +6,120 @@ import { cosineSimilarity } from './util.js';
import { loadApiProvider } from './providers.js';
import { DEFAULT_GRADING_PROMPT } from './prompts.js';
import type { EvaluateOptions, GradingConfig, TokenUsage } from './types.js';
interface GradingResult {
pass: boolean;
reason: string;
tokensUsed: TokenUsage;
}
import type { Assertion, GradingConfig, TestCase, GradingResult } from './types.js';
const SIMILAR_REGEX = /similar(?::|\((\d+(\.\d+)?)\):)/;
const DEFAULT_SEMANTIC_SIMILARITY_THRESHOLD = 0.8;
export async function matchesExpectedValue(
expected: string,
export async function runAssertions(test: TestCase, output: string): Promise<GradingResult> {
const tokensUsed = {
total: 0,
prompt: 0,
completion: 0,
};
if (!test.assert) {
return { pass: true, reason: 'No assertions', tokensUsed };
}
for (const assertion of test.assert) {
const result = await runAssertion(assertion, test, output);
if (!result.pass) {
return result;
}
if (result.tokensUsed) {
tokensUsed.total += result.tokensUsed.total;
tokensUsed.prompt += result.tokensUsed.prompt;
tokensUsed.completion += result.tokensUsed.completion;
}
}
return { pass: true, reason: 'All assertions passed', tokensUsed };
}
export async function runAssertion(
assertion: Assertion,
test: TestCase,
output: string,
options: EvaluateOptions,
): Promise<{ pass: boolean; reason?: string }> {
const match = expected.match(SIMILAR_REGEX);
): Promise<GradingResult> {
let pass: boolean = false;
if (match) {
const threshold = parseFloat(match[1]) || DEFAULT_SEMANTIC_SIMILARITY_THRESHOLD;
const rest = expected.replace(SIMILAR_REGEX, '').trim();
return matchesSimilarity(rest, output, threshold);
} else if (expected.startsWith('fn:') || expected.startsWith('eval:')) {
// TODO(1.0): delete eval: legacy option
const sliceLength = expected.startsWith('fn:') ? 'fn:'.length : 'eval:'.length;
const functionBody = expected.slice(sliceLength);
const customFunction = new Function('output', `return ${functionBody}`);
return { pass: customFunction(output) };
} else if (expected.startsWith('grade:')) {
return matchesLlmRubric(expected.slice(6), output, options.grading);
} else {
const pass = expected === output;
if (assertion.type === 'equals') {
pass = assertion.value === output;
return {
pass,
reason: pass ? undefined : `Expected: ${expected}, Output: ${output}`,
reason: pass ? 'Assertion passed' : `Expected output "${assertion.value}"`,
};
}
if (assertion.type === 'is-json') {
try {
JSON.parse(output);
return { pass: true, reason: 'Assertion passed' };
} catch (err) {
return {
pass: false,
reason: `Expected output to be valid JSON, but it isn't.\nError: ${err}`,
};
}
}
if (assertion.type === 'contains-json') {
const pass = containsJSON(output);
return {
pass,
reason: pass ? 'Assertion passed' : 'Expected output to contain valid JSON',
};
}
if (assertion.type === 'javascript') {
try {
const customFunction = new Function('output', `return ${assertion.value}`);
pass = customFunction(output);
} catch (err) {
return {
pass: false,
reason: `Custom function threw error: ${(err as Error).message}`,
};
}
return {
pass,
reason: pass ? 'Assertion passed' : `Custom function returned false`,
};
}
if (assertion.type === 'similar') {
invariant(assertion.value, 'Similarity assertion must have a string value');
invariant(assertion.threshold, 'Similarity assertion must have a threshold');
return matchesSimilarity(assertion.value, output, assertion.threshold);
}
if (assertion.type === 'llm-rubric') {
invariant(assertion.value, 'Similarity assertion must have a string value');
return matchesLlmRubric(assertion.value, output, test.options);
}
throw new Error('Unknown assertion type: ' + assertion.type);
}
function containsJSON(str: string): boolean {
// Regular expression to check for JSON-like pattern
const jsonPattern = /({[\s\S]*}|\[[\s\S]*])/;
const match = str.match(jsonPattern);
if (!match) {
return false;
}
try {
JSON.parse(match[0]);
return true;
} catch (error) {
return false;
}
}
export async function matchesSimilarity(
@@ -105,7 +181,7 @@ export async function matchesLlmRubric(
);
}
const prompt = nunjucks.renderString(options.prompt || DEFAULT_GRADING_PROMPT, {
const prompt = nunjucks.renderString(options.rubricPrompt || DEFAULT_GRADING_PROMPT, {
content: output,
rubric: expected,
});
@@ -148,6 +224,43 @@ export async function matchesLlmRubric(
}
}
export function assertionFromString(expected: string): Assertion {
const match = expected.match(SIMILAR_REGEX);
if (match) {
const threshold = parseFloat(match[1]) || DEFAULT_SEMANTIC_SIMILARITY_THRESHOLD;
const rest = expected.replace(SIMILAR_REGEX, '').trim();
return {
type: 'similar',
value: rest,
threshold,
};
}
if (expected.startsWith('fn:') || expected.startsWith('eval:')) {
// TODO(1.0): delete eval: legacy option
const sliceLength = expected.startsWith('fn:') ? 'fn:'.length : 'eval:'.length;
const functionBody = expected.slice(sliceLength);
return {
type: 'javascript',
value: functionBody,
};
}
if (expected.startsWith('grade:')) {
return {
type: 'llm-rubric',
value: expected.slice(6),
};
}
if (expected === 'is-json' || expected === 'contains-json') {
return {
type: expected,
};
}
return {
type: 'equals',
value: expected,
};
}
export default {
matchesSimilarity,
matchesLlmRubric,

View File

@@ -2,7 +2,6 @@ import path from 'node:path';
import cacheManager from 'cache-manager';
import fsStore from 'cache-manager-fs-hash';
import fetch, { Response } from 'node-fetch';
import logger from './logger.js';
import { getConfigDirectoryPath, fetchWithTimeout } from './util.js';

View File

@@ -5,7 +5,7 @@ import chalk from 'chalk';
import nunjucks from 'nunjucks';
import logger from './logger.js';
import { matchesExpectedValue } from './assertions.js';
import { runAssertions } from './assertions.js';
import type { SingleBar } from 'cli-progress';
import type {
@@ -15,14 +15,18 @@ import type {
EvaluateStats,
EvaluateSummary,
EvaluateTable,
TestSuite,
Prompt,
TestCase,
} from './types.js';
import { generatePrompts } from './suggestions.js';
interface RunEvalOptions {
provider: ApiProvider;
prompt: string;
vars?: Record<string, string>;
test: TestCase;
includeProviderId?: boolean;
rowIndex: number;
@@ -32,10 +36,12 @@ interface RunEvalOptions {
const DEFAULT_MAX_CONCURRENCY = 4;
class Evaluator {
testSuite: TestSuite;
options: EvaluateOptions;
stats: EvaluateStats;
constructor(options: EvaluateOptions) {
constructor(testSuite: TestSuite, options: EvaluateOptions) {
this.testSuite = testSuite;
this.options = options;
this.stats = {
successes: 0,
@@ -52,10 +58,10 @@ class Evaluator {
async runEval({
provider,
prompt,
vars,
test,
includeProviderId,
}: RunEvalOptions): Promise<EvaluateResult> {
vars = vars || {};
const vars = test.vars || {};
const renderedPrompt = nunjucks.renderString(prompt, vars);
// Note that we're using original prompt, not renderedPrompt
@@ -79,23 +85,28 @@ class Evaluator {
if (response.error) {
ret.error = response.error;
} else if (response.output) {
const checkResult = vars.__expected
? await matchesExpectedValue(vars.__expected, response.output, this.options)
: { pass: true };
const checkResult = await runAssertions(test, response.output);
if (!checkResult.pass) {
ret.error = checkResult.reason || `Expected: ${vars.__expected}`;
ret.error = checkResult.reason;
}
ret.success = checkResult.pass;
if (checkResult.tokensUsed) {
this.stats.tokenUsage.total += checkResult.tokensUsed.total;
this.stats.tokenUsage.prompt += checkResult.tokensUsed.prompt;
this.stats.tokenUsage.completion += checkResult.tokensUsed.completion;
}
} else {
ret.success = false;
ret.error = 'No output';
}
// Update token usage stats
this.stats.tokenUsage.total += response.tokenUsage?.total || 0;
this.stats.tokenUsage.prompt += response.tokenUsage?.prompt || 0;
this.stats.tokenUsage.completion += response.tokenUsage?.completion || 0;
this.stats.tokenUsage.cached += response.tokenUsage?.cached || 0;
if (response.tokenUsage) {
this.stats.tokenUsage.total += response.tokenUsage.total || 0;
this.stats.tokenUsage.prompt += response.tokenUsage.prompt || 0;
this.stats.tokenUsage.completion += response.tokenUsage.completion || 0;
this.stats.tokenUsage.cached += response.tokenUsage.cached || 0;
}
if (ret.success) {
this.stats.successes++;
@@ -114,12 +125,13 @@ class Evaluator {
}
async evaluate(): Promise<EvaluateSummary> {
const options = this.options;
const { testSuite, options } = this;
const prompts: Prompt[] = [];
if (options.prompt?.generateSuggestions) {
if (options.generateSuggestions) {
// TODO(ian): Move this into its own command/file
logger.info(`Generating prompt variations...`);
const { prompts: newPrompts, error } = await generatePrompts(options.prompts[0], 1);
const { prompts: newPrompts, error } = await generatePrompts(testSuite.prompts[0], 1);
if (error || !newPrompts) {
throw new Error(`Failed to generate prompts: ${error}`);
}
@@ -142,7 +154,7 @@ class Evaluator {
async (answer) => {
rl.close();
if (answer.toLowerCase().startsWith('y')) {
options.prompts.push(prompt);
testSuite.prompts.push(prompt);
numAdded++;
} else {
logger.info('Skipping this prompt.');
@@ -159,10 +171,11 @@ class Evaluator {
}
}
for (const promptContent of options.prompts) {
for (const provider of options.providers) {
// Split prompts by provider
for (const promptContent of testSuite.prompts) {
for (const provider of testSuite.providers) {
const display =
options.providers.length > 1 ? `[${provider.id()}] ${promptContent}` : promptContent;
testSuite.providers.length > 1 ? `[${provider.id()}] ${promptContent}` : promptContent;
prompts.push({
raw: promptContent,
display,
@@ -170,29 +183,49 @@ class Evaluator {
}
}
const vars = options.vars && options.vars.length > 0 ? options.vars : [{}];
const varsWithSpecialColsRemoved = vars.map((v) => {
const ret = { ...v };
Object.keys(ret).forEach((key) => {
if (key.startsWith('__')) {
delete ret[key];
}
});
return ret;
// Aggregate all vars across test cases
const tests = (
testSuite.tests || [
{
// Dummy test for cases when we're only comparing raw prompts.
},
]
).map((test) => {
const finalTestCase: TestCase = Object.assign({}, testSuite.defaultTest);
return Object.assign(finalTestCase, test);
});
const isTest = vars[0].__expected;
const varNames: Set<string> = new Set();
const varsWithSpecialColsRemoved: Record<string, string>[] = [];
for (const testCase of tests) {
if (testCase.vars) {
const varWithSpecialColsRemoved: Record<string, string> = {};
for (const varName of Object.keys(testCase.vars)) {
varNames.add(varName);
varWithSpecialColsRemoved[varName] = testCase.vars[varName];
}
varsWithSpecialColsRemoved.push(varWithSpecialColsRemoved);
}
}
// Set up table...
const isTest = tests.some((t) => !!t.assert);
const table: EvaluateTable = {
head: {
prompts: prompts.map((p) => p.display),
vars: Object.keys(varsWithSpecialColsRemoved[0]),
vars: Array.from(varNames).sort(),
// TODO(ian): add assertions to table?
},
body: [],
};
// And progress bar...
let progressbar: SingleBar | undefined;
if (options.showProgressBar) {
const totalNumRuns =
options.prompts.length * options.providers.length * (options.vars?.length || 1);
testSuite.prompts.length * testSuite.providers.length * (tests.length || 1);
const cliProgress = await import('cli-progress');
progressbar = new cliProgress.SingleBar(
{
@@ -208,21 +241,31 @@ class Evaluator {
});
}
// Set up eval cases
const runEvalOptions: RunEvalOptions[] = [];
let rowIndex = 0;
for (const row of vars) {
for (const testCase of tests) {
let colIndex = 0;
const prependToPrompt = row.__prefix || options.prompt?.prefix || '';
const appendToPrompt = row.__suffix || options.prompt?.suffix || '';
// Handle default properties
testCase.vars = Object.assign({}, testSuite.defaultTest?.vars, testCase.vars);
testCase.assert = [...(testSuite.defaultTest?.assert || []), ...(testCase.assert || [])];
testCase.options = testCase.options || {};
testCase.options.provider =
testCase.options.provider || testSuite.defaultTest?.options?.provider;
const prependToPrompt =
testCase.options?.prefix || testSuite.defaultTest?.options?.prefix || '';
const appendToPrompt =
testCase.options?.suffix || testSuite.defaultTest?.options?.suffix || '';
for (const promptContent of options.prompts) {
for (const provider of options.providers) {
// Finalize test case eval
for (const promptContent of testSuite.prompts) {
for (const provider of testSuite.providers) {
runEvalOptions.push({
provider,
prompt: prependToPrompt + promptContent + appendToPrompt,
vars: row,
includeProviderId: options.providers.length > 1,
test: testCase,
includeProviderId: testSuite.providers.length > 1,
rowIndex,
colIndex,
});
@@ -232,6 +275,7 @@ class Evaluator {
rowIndex++;
}
// Actually run the eval
const results: EvaluateResult[] = [];
await async.forEachOfLimit(
runEvalOptions,
@@ -245,7 +289,7 @@ class Evaluator {
progressbar.increment({
provider: options.provider.id(),
prompt: options.prompt.slice(0, 10),
vars: Object.entries(options.vars || {})
vars: Object.entries(options.test.vars || {})
.map(([k, v]) => `${k}=${v}`)
.join(' ')
.slice(0, 10),
@@ -276,7 +320,7 @@ class Evaluator {
if (!table.body[rowIndex]) {
table.body[rowIndex] = {
outputs: [],
vars: Object.values(options.vars || {}),
vars: table.head.vars.map((varName) => options.test.vars?.[varName] || ''),
};
}
table.body[rowIndex].outputs[colIndex] = resultText;
@@ -291,7 +335,7 @@ class Evaluator {
}
}
export function evaluate(options: EvaluateOptions) {
const ev = new Evaluator(options);
export function evaluate(testSuite: TestSuite, options: EvaluateOptions) {
const ev = new Evaluator(testSuite, options);
return ev.evaluate();
}

View File

@@ -1,37 +1,25 @@
import { evaluate as doEvaluate } from './evaluator.js';
import { loadApiProvider } from './providers.js';
import { loadApiProviders } from './providers.js';
import assertions from './assertions.js';
import providers from './providers.js';
import type { ApiProvider, EvaluateOptions, EvaluateSummary } from './types.js';
import type { EvaluateOptions, TestSuite, TestSuiteConfig } from './types.js';
import { readTests } from './util.js';
export * from './types.js';
async function evaluate(
providers: (string | ApiProvider)[] | (string | ApiProvider),
options: Omit<EvaluateOptions, 'providers'>,
): Promise<EvaluateSummary> {
let apiProviders: ApiProvider[] = [];
const addProvider = async (provider: ApiProvider | string) => {
if (typeof provider === 'string') {
apiProviders.push(await loadApiProvider(provider));
} else {
apiProviders.push(provider);
}
interface EvaluateTestSuite extends TestSuiteConfig {
prompts: string[];
}
async function evaluate(testSuite: EvaluateTestSuite, options: EvaluateOptions = {}) {
const constructedTestSuite: TestSuite = {
...testSuite,
prompts: testSuite.prompts, // raw prompts expected
providers: await loadApiProviders(testSuite.providers),
tests: readTests(testSuite.tests),
};
if (Array.isArray(providers)) {
for (const provider of providers) {
await addProvider(provider);
}
} else {
await addProvider(providers);
}
return doEvaluate({
...options,
providers: apiProviders,
});
return doEvaluate(constructedTestSuite, options);
}
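A minimal sketch of what the simplified library API might look like after this change, assuming `evaluate` stays exported from the package entry point; the prompt, provider, and assertion values are hypothetical:
```ts
import promptfoo from 'promptfoo';

async function run() {
  // Raw prompt strings, provider IDs, and inline test cases.
  const summary = await promptfoo.evaluate(
    {
      prompts: ['Rate the sentiment of: {{text}}'],
      providers: ['openai:gpt-3.5-turbo'],
      tests: [
        {
          vars: { text: 'I love this product!' },
          assert: [{ type: 'llm-rubric', value: 'says the sentiment is positive' }],
        },
      ],
    },
    { maxConcurrency: 2 },
  );
  console.log(summary.stats);
}

run();
```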
module.exports = {

View File

@@ -1,21 +1,35 @@
#!/usr/bin/env node
import { readFileSync, writeFileSync, existsSync, mkdirSync } from 'fs';
import { parse, join as pathJoin } from 'path';
import { join as pathJoin } from 'path';
import Table from 'cli-table3';
import chalk from 'chalk';
import { Command } from 'commander';
import logger, { setLogLevel } from './logger.js';
import { loadApiProvider } from './providers.js';
import { loadApiProvider, loadApiProviders } from './providers.js';
import { evaluate } from './evaluator.js';
import { readPrompts, readVars, writeLatestResults, writeOutput } from './util.js';
import {
maybeReadConfig,
readConfig,
readPrompts,
readTests,
writeLatestResults,
writeOutput,
} from './util.js';
import { getDirectory } from './esm.js';
import { init } from './web/server.js';
import type { CommandLineOptions, EvaluateOptions, VarMapping } from './types.js';
import { disableCache } from './cache.js';
import type {
CommandLineOptions,
EvaluateOptions,
TestCase,
TestSuite,
UnifiedConfig,
} from './types.js';
import { DEFAULT_README, DEFAULT_YAML_CONFIG, DEFAULT_PROMPTS } from './onboarding.js';
function createDummyFiles(directory: string | null) {
if (directory) {
// Make the directory if it doesn't exist
@@ -23,31 +37,6 @@ function createDummyFiles(directory: string | null) {
mkdirSync(directory);
}
}
const dummyPrompts = `Your first prompt goes here
---
Next prompt goes here. You can substitute variables like this: {{var1}} {{var2}} {{var3}}
---
This is the next prompt.
These prompts are nunjucks templates, so you can use logic like this:
{% if var1 %}
{{ var1 }}
{% endif %}`;
const dummyVars =
'var1,var2,var3\nvalue1,value2,value3\nanother value1,another value2,another value3';
const dummyConfig = `module.exports = {
prompts: ['prompts.txt'],
providers: ['openai:gpt-3.5-turbo'],
vars: 'vars.csv',
maxConcurrency: 4,
};`;
const readme = `To get started, set your OPENAI_API_KEY environment variable. Then run:
\`\`\`
promptfoo eval
\`\`\`
You'll probably want to change a few of the prompts in prompts.txt and the variables in vars.csv before letting it rip.
`;
if (directory) {
if (!existsSync(directory)) {
@@ -58,10 +47,9 @@ You'll probably want to change a few of the prompts in prompts.txt and the varia
directory = '.';
}
writeFileSync(pathJoin(process.cwd(), directory, 'prompts.txt'), dummyPrompts);
writeFileSync(pathJoin(process.cwd(), directory, 'vars.csv'), dummyVars);
writeFileSync(pathJoin(process.cwd(), directory, 'promptfooconfig.js'), dummyConfig);
writeFileSync(pathJoin(process.cwd(), directory, 'README.md'), readme);
writeFileSync(pathJoin(process.cwd(), directory, 'prompts.txt'), DEFAULT_PROMPTS);
writeFileSync(pathJoin(process.cwd(), directory, 'promptfooconfig.yaml'), DEFAULT_YAML_CONFIG);
writeFileSync(pathJoin(process.cwd(), directory, 'README.md'), DEFAULT_README);
if (directory === '.') {
logger.info(
@@ -74,15 +62,26 @@ You'll probably want to change a few of the prompts in prompts.txt and the varia
}
async function main() {
let defaultConfig: Partial<CommandLineOptions> = {};
if (existsSync('promptfooconfig.js')) {
// @ts-ignore
defaultConfig = (await import(pathJoin(process.cwd(), './promptfooconfig.js'))).default;
logger.info('Loaded default config from promptfooconfig.js');
const pwd = process.cwd();
const potentialPaths = [
pathJoin(pwd, 'promptfooconfig.js'),
pathJoin(pwd, 'promptfooconfig.json'),
pathJoin(pwd, 'promptfooconfig.yaml'),
];
let config: Partial<UnifiedConfig> = {};
for (const path of potentialPaths) {
const maybeConfig = maybeReadConfig(path);
if (maybeConfig) {
config = maybeConfig;
break;
}
}
if (existsSync('promptfooconfig.json')) {
defaultConfig = JSON.parse(readFileSync('promptfooconfig.json', 'utf-8'));
logger.info('Loaded default config from promptfooconfig.json');
let evaluateOptions: EvaluateOptions = {};
if (config.evaluateOptions) {
evaluateOptions.generateSuggestions = config.evaluateOptions.generateSuggestions;
evaluateOptions.maxConcurrency = config.evaluateOptions.maxConcurrency;
evaluateOptions.showProgressBar = config.evaluateOptions.showProgressBar;
}
const program = new Command();
@@ -113,35 +112,29 @@ async function main() {
program
.command('eval')
.description('Evaluate prompts')
.requiredOption(
'-p, --prompts <paths...>',
'Paths to prompt files (.txt)',
defaultConfig.prompts,
)
.requiredOption('-p, --prompts <paths...>', 'Paths to prompt files (.txt)', config.prompts)
.requiredOption(
'-r, --providers <name or path...>',
'One of: openai:chat, openai:completion, openai:<model name>, or path to custom API caller module',
defaultConfig.providers,
)
.option(
'-o, --output <path>',
'Path to output file (csv, json, yaml, html)',
defaultConfig.output,
)
.option(
'-v, --vars <path>',
'Path to file with prompt variables (csv, json, yaml)',
defaultConfig.vars,
config?.providers,
)
.option(
'-c, --config <path>',
'Path to configuration file. Automatically loads promptfooconfig.js',
defaultConfig.config,
'Path to configuration file. Automatically loads promptfooconfig.js/json/yaml',
)
.option(
// TODO(ian): Remove `vars` for v1
'-v, --vars, -t, --tests <path>',
'Path to CSV with test cases',
config?.commandLineOptions?.vars,
)
.option('-o, --output <path>', 'Path to output file (csv, json, yaml, html)', config.outputPath)
.option(
'-j, --max-concurrency <number>',
'Maximum number of concurrent API calls',
String(defaultConfig.maxConcurrency),
config.evaluateOptions?.maxConcurrency
? String(config.evaluateOptions.maxConcurrency)
: undefined,
)
.option(
'--table-cell-max-length <number>',
@@ -155,36 +148,20 @@ async function main() {
.option(
'--prompt-prefix <path>',
'This prefix is prepended to every prompt',
defaultConfig.promptPrefix,
config.defaultTest?.options?.prefix,
)
.option(
'--prompt-suffix <path>',
'This suffix is appended to every prompt',
defaultConfig.promptSuffix,
config.defaultTest?.options?.suffix,
)
.option('--no-write', 'Do not write results to promptfoo directory')
.option('--no-cache', 'Do not read or write results to disk cache')
.option('--grader', 'Model that will grade outputs', defaultConfig.grader)
.option('--verbose', 'Show debug logs', defaultConfig.verbose)
.option('--grader', 'Model that will grade outputs', config?.commandLineOptions?.grader)
.option('--verbose', 'Show debug logs', config?.commandLineOptions?.verbose)
.option('--view [port]', 'View in browser ui')
.action(async (cmdObj: CommandLineOptions & Command) => {
const configPath = cmdObj.config;
let config = {};
if (configPath) {
const ext = parse(configPath).ext;
switch (ext) {
case '.json':
const content = readFileSync(configPath, 'utf-8');
config = JSON.parse(content);
break;
case '.js':
config = require(configPath);
break;
default:
throw new Error(`Unsupported configuration file format: ${ext}`);
}
}
// Misc settings
if (cmdObj.verbose) {
setLogLevel('debug');
}
@@ -192,38 +169,74 @@ async function main() {
disableCache();
}
let vars: VarMapping[] = [];
if (cmdObj.vars) {
vars = readVars(cmdObj.vars);
}
const providers = await Promise.all(
cmdObj.providers.map(async (p) => await loadApiProvider(p)),
);
// Config parsing
const maxConcurrency = parseInt(cmdObj.maxConcurrency || '', 10);
const options: EvaluateOptions = {
prompts: readPrompts(cmdObj.prompts),
vars,
providers,
showProgressBar: true,
maxConcurrency: !isNaN(maxConcurrency) && maxConcurrency > 0 ? maxConcurrency : undefined,
prompt: {
prefix: cmdObj.promptPrefix,
suffix: cmdObj.promptSuffix,
},
...config,
};
if (cmdObj.grader) {
options.grading = {
provider: await loadApiProvider(cmdObj.grader),
const configPath = cmdObj.config;
if (configPath) {
config = readConfig(configPath);
} else {
config = {
prompts: cmdObj.prompts || config.prompts,
providers: cmdObj.providers || config.providers,
tests: cmdObj.vars || config.tests,
};
}
if (cmdObj.generateSuggestions) {
options.prompt!.generateSuggestions = true;
// Validation
if (!config.prompts || config.prompts.length === 0) {
logger.error(chalk.red('You must provide at least 1 prompt file'));
process.exit(1);
}
if (!config.providers || config.providers.length === 0) {
logger.error(
chalk.red('You must specify at least 1 provider (for example, openai:gpt-3.5-turbo)'),
);
process.exit(1);
}
const summary = await evaluate(options);
// Parse prompts, providers, and tests
const parsedPrompts = readPrompts(config.prompts);
const parsedProviders = await loadApiProviders(config.providers);
const parsedTests: TestCase[] = readTests(config.tests);
if (parsedPrompts.length === 0) {
logger.error(chalk.red('No prompts found'));
process.exit(1);
}
const defaultTest: TestCase = {
options: {
prefix: cmdObj.promptPrefix,
suffix: cmdObj.promptSuffix,
provider: cmdObj.grader,
// rubricPrompt:
},
...config.defaultTest,
};
const testSuite: TestSuite = {
description: config.description,
prompts: parsedPrompts,
providers: parsedProviders,
tests: parsedTests,
defaultTest,
};
const options: EvaluateOptions = {
showProgressBar: true,
maxConcurrency: !isNaN(maxConcurrency) && maxConcurrency > 0 ? maxConcurrency : undefined,
...evaluateOptions,
};
if (cmdObj.grader && testSuite.defaultTest) {
testSuite.defaultTest.options = testSuite.defaultTest.options || {};
testSuite.defaultTest.options.provider = await loadApiProvider(cmdObj.grader);
}
if (cmdObj.generateSuggestions) {
options.generateSuggestions = true;
}
const summary = await evaluate(testSuite, options);
if (cmdObj.output) {
logger.info(chalk.yellow(`Writing output to ${cmdObj.output}`));

61
src/onboarding.ts Normal file
View File

@@ -0,0 +1,61 @@
export const DEFAULT_PROMPTS = `Your first prompt goes here
---
Next prompt goes here. You can substitute variables like this: {{var1}} {{var2}} {{var3}}
---
This is the next prompt.
These prompts are nunjucks templates, so you can use logic like this:
{% if var1 %}
{{ var1 }}
{% endif %}
---
If you prefer, you can break prompts into multiple files (make sure to edit promptfooconfig.yaml accordingly)
`;
export const DEFAULT_YAML_CONFIG = `# This configuration runs each prompt through a series of example inputs and checks if they meet requirements.
prompts: [prompts.txt]
providers: [openai:gpt-3.5-turbo]
tests:
- description: First test case - automatic review
vars:
var1: first variable's value
var2: another value
var3: some other value
assert:
- type: equality
value: expected LLM output goes here
- type: function
value: output.includes('some text')
- description: Second test case - manual review
# Test cases don't need assertions if you prefer to manually review the output
vars:
var1: new value
var2: another value
var3: third value
- description: Third test case - other types of automatic review
vars:
var1: yet another value
var2: and another
var3: dear llm, please output your response in json format
assert:
- type: contains-json
- type: similarity
value: ensures that output is semantically similar to this text
- type: llm-rubric
value: ensure that output contains a reference to X
`;
export const DEFAULT_README = `To get started, set your OPENAI_API_KEY environment variable.
Next, change a few of the prompts in prompts.txt and edit promptfooconfig.yaml.
Then run:
\`\`\`
promptfoo eval
\`\`\`
Afterwards, you can view the results by running \`promptfoo view\`
`;

View File

@@ -5,6 +5,15 @@ import { ApiProvider } from './types.js';
import { OpenAiCompletionProvider, OpenAiChatCompletionProvider } from './providers/openai.js';
import { LocalAiCompletionProvider, LocalAiChatProvider } from './providers/localai.js';
export async function loadApiProviders(providerPaths: string | string[]): Promise<ApiProvider[]> {
if (typeof providerPaths === 'string') {
return [await loadApiProvider(providerPaths)];
} else if (Array.isArray(providerPaths)) {
return Promise.all(providerPaths.map((provider) => loadApiProvider(provider)));
}
throw new Error('Invalid providers list');
}
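In other words, `loadApiProviders` accepts either a single provider ID or an array and always resolves to `ApiProvider[]`. A hypothetical example (inside an async context):
```ts
const single = await loadApiProviders('openai:gpt-3.5-turbo'); // one provider
const several = await loadApiProviders(['openai:gpt-4', 'localai:chat:vicuna']); // many providers
```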
export async function loadApiProvider(providerPath: string): Promise<ApiProvider> {
if (providerPath?.startsWith('openai:')) {
// Load OpenAI module

View File

@@ -1,11 +1,16 @@
export interface CommandLineOptions {
// Shared with TestSuite
prompts: string[];
providers: string[];
output?: string;
output: string;
// Shared with EvaluateOptions
maxConcurrency: string;
// Command line only
vars?: string;
config?: string;
verbose?: boolean;
maxConcurrency?: string;
grader?: string;
view?: string;
tableCellMaxLength?: string;
@@ -48,27 +53,19 @@ export interface CsvRow {
export type VarMapping = Record<string, string>;
export interface GradingConfig {
prompt?: string;
rubricPrompt?: string;
provider?: string | ApiProvider;
}
export interface PromptConfig {
prefix?: string;
suffix?: string;
generateSuggestions?: boolean;
}
export interface EvaluateOptions {
providers: ApiProvider[];
prompts: string[];
vars?: VarMapping[];
maxConcurrency?: number;
showProgressBar?: boolean;
grading?: GradingConfig;
prompt?: PromptConfig;
generateSuggestions?: boolean;
}
export interface Prompt {
@@ -108,3 +105,83 @@ export interface EvaluateSummary {
table: EvaluateTable;
stats: EvaluateStats;
}
export interface GradingResult {
pass: boolean;
reason: string;
tokensUsed?: TokenUsage;
}
// TODO(ian): maybe Assertion should support {type: config} to make the yaml cleaner
export interface Assertion {
// Type of assertion
type: 'equals' | 'is-json' | 'contains-json' | 'javascript' | 'similar' | 'llm-rubric';
// The expected value, if applicable
value?: string;
// The threshold value, only applicable for similarity (cosine similarity)
threshold?: number;
// Some assertions (similarity, llm-rubric) require an LLM provider
provider?: ApiProvider;
}
// Each test case is graded pass/fail. A test case represents a unique input to the LLM after substituting `vars` in the prompt.
export interface TestCase {
// Optional description of what you're testing
description?: string;
// Key-value pairs to substitute in the prompt
vars?: Record<string, string>;
// Optional list of automatic checks to run on the LLM output
assert?: Assertion[];
// Additional configuration settings for the prompt
options?: PromptConfig & GradingConfig;
}
// The test suite defines the "knobs" that we are tuning in prompt engineering: providers and prompts
export interface TestSuite {
// Optional description of what your LLM is trying to do
description?: string;
// One or more LLM APIs to use
providers: ApiProvider[];
// One or more prompt strings
prompts: string[];
// Test cases
tests?: TestCase[];
// Default test case config
defaultTest?: Partial<TestCase>;
}
// TestSuiteConfig = TestSuite, but before everything is parsed and resolved. Providers are just strings, prompts are filepaths, and tests can be a filepath or inline.
export interface TestSuiteConfig {
// Optional description of what your LLM is trying to do
description?: string;
// One or more LLM APIs to use, for example: openai:gpt-3.5-turbo, openai:gpt-4, localai:chat:vicuna
providers: string | string[];
// One or more prompt files to load
prompts: string | string[];
// Path to a test file, OR an inline list of test cases (example inputs to run against each prompt)
tests: string | TestCase[];
// Sets default properties applied to every test case. Useful for adding an assertion to all test cases, for example.
defaultTest?: Omit<TestCase, 'description'>;
// Path to write output. Writes to console/web viewer if not set.
outputPath?: string;
}
export type UnifiedConfig = TestSuiteConfig & {
evaluateOptions: EvaluateOptions;
commandLineOptions: Partial<CommandLineOptions>;
};
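To illustrate how `defaultTest` interacts with individual test cases, here is a rough sketch matching the merge logic in the evaluator diff above (values are made up): default assertions are prepended to each test case's own assertions, and a test case falls back to the default grading provider, prefix, and suffix when it doesn't set its own.
```ts
const defaultTest: Partial<TestCase> = {
  options: { provider: 'openai:gpt-4' }, // grader used by llm-rubric assertions
  assert: [{ type: 'javascript', value: 'output.length < 280' }],
};

const testCase: TestCase = {
  vars: { topic: 'bananas' },
  assert: [{ type: 'llm-rubric', value: 'mentions the topic' }],
};

// Effective test case after merging (conceptually):
// {
//   vars: { topic: 'bananas' },
//   assert: [
//     { type: 'javascript', value: 'output.length < 280' },
//     { type: 'llm-rubric', value: 'mentions the topic' },
//   ],
//   options: { provider: 'openai:gpt-4' },
// }
```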

View File

@@ -7,7 +7,6 @@ import yaml from 'js-yaml';
import nunjucks from 'nunjucks';
import { globSync } from 'glob';
import { parse as parsePath } from 'path';
import { CsvRow } from './types.js';
import { parse as parseCsv } from 'csv-parse/sync';
import { stringify } from 'csv-stringify/sync';
@@ -16,7 +15,16 @@ import { getDirectory } from './esm.js';
import type { RequestInfo, RequestInit, Response } from 'node-fetch';
import type { EvaluateSummary } from './types.js';
import type {
Assertion,
CsvRow,
EvaluateSummary,
CommandLineOptions,
TestSuite,
UnifiedConfig,
TestCase,
} from './types.js';
import { assertionFromString } from './assertions.js';
const PROMPT_DELIMITER = '---';
@@ -28,7 +36,35 @@ function parseJson(json: string): any | undefined {
}
}
export function readPrompts(promptPathsOrGlobs: string[]): string[] {
export function maybeReadConfig(configPath: string): UnifiedConfig | undefined {
try {
return readConfig(configPath);
} catch {
return undefined;
}
}
export function readConfig(configPath: string): UnifiedConfig {
if (!fs.existsSync(configPath)) {
throw new Error(`Config file not found: ${configPath}`);
}
const ext = path.parse(configPath).ext;
switch (ext) {
case '.json':
const content = fs.readFileSync(configPath, 'utf-8');
return JSON.parse(content) as UnifiedConfig;
case '.js':
return require(configPath) as UnifiedConfig;
case '.yaml':
return yaml.load(fs.readFileSync(configPath, 'utf-8')) as UnifiedConfig;
default:
throw new Error(`Unsupported configuration file format: ${ext}`);
}
}
export function readPrompts(promptPathsOrGlobs: string | string[]): string[] {
promptPathsOrGlobs =
typeof promptPathsOrGlobs === 'string' ? [promptPathsOrGlobs] : promptPathsOrGlobs;
const promptPaths = promptPathsOrGlobs.flatMap((pathOrGlob) => globSync(pathOrGlob));
let promptContents: string[] = [];
@@ -49,6 +85,9 @@ export function readPrompts(promptPathsOrGlobs: string[]): string[] {
if (promptContents.length === 1) {
promptContents = promptContents[0].split(PROMPT_DELIMITER).map((p) => p.trim());
}
if (promptContents.length === 0) {
throw new Error(`There are no prompts in ${promptPathsOrGlobs.join(', ')}`);
}
return promptContents;
}
@@ -67,6 +106,37 @@ export function readVars(varsPath: string): CsvRow[] {
return rows;
}
export function readTests(tests: string | TestCase[] | undefined): TestCase[] {
if (!tests) {
return [];
}
if (typeof tests === 'string') {
// It's a filepath, load from CSV
const vars = readVars(tests);
return vars.map((row, idx) => {
const test = testCaseFromCsvRow(row);
test.description = `Row #${idx + 1}`;
return test;
});
}
// Some validation of the shape of tests
for (const test of tests) {
if (!test.assert && !test.vars) {
throw new Error(
`Test case must have either "assert" or "vars" property. Instead got ${JSON.stringify(
test,
null,
2,
)}`,
);
}
}
return tests;
}
export function writeOutput(outputPath: string, summary: EvaluateSummary): void {
const outputExtension = outputPath.split('.').pop()?.toLowerCase();
@@ -153,3 +223,20 @@ export function cosineSimilarity(vecA: number[], vecB: number[]) {
const vecBMagnitude = Math.sqrt(vecB.reduce((acc, val) => acc + val * val, 0));
return dotProduct / (vecAMagnitude * vecBMagnitude);
}
export function testCaseFromCsvRow(row: CsvRow): TestCase {
const vars: Record<string, string> = {};
const asserts: Assertion[] = [];
for (const [key, value] of Object.entries(row)) {
if (key === '__expected') {
asserts.push(assertionFromString(value));
} else {
vars[key] = value;
}
}
return {
vars,
assert: asserts,
};
}
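So a legacy CSV test file still works: each row's ordinary columns become `vars`, and the special `__expected` column becomes an assertion. A small sketch with made-up values:
```ts
// CSV:
//   var1,var2,__expected
//   hello,world,fn:output.includes('hello')
const row = { var1: 'hello', var2: 'world', __expected: "fn:output.includes('hello')" };

testCaseFromCsvRow(row);
// => {
//      vars: { var1: 'hello', var2: 'world' },
//      assert: [{ type: 'javascript', value: "output.includes('hello')" }],
//    }
```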

View File

@@ -32,24 +32,6 @@ export function init(port = 15500) {
},
});
interface EvaluateRequestBody {
provider: string;
options: {
prompts: string[];
vars: Record<string, string>[];
};
}
app.post('/evaluate', async (req: Request, res: Response) => {
try {
const { provider, options } = req.body as EvaluateRequestBody;
const summary = await promptfoo.evaluate(provider, options);
res.json(summary);
} catch (error) {
res.status(500).json({ message: 'Error evaluating prompts' });
}
});
const latestJsonPath = getLatestResultsPath();
const readLatestJson = () => {
const data = fs.readFileSync(latestJsonPath, 'utf8');

258
test/assertions.test.ts Normal file
View File

@@ -0,0 +1,258 @@
import {
runAssertions,
runAssertion,
matchesSimilarity,
matchesLlmRubric,
assertionFromString,
} from '../src/assertions';
import { DefaultEmbeddingProvider } from '../src/providers/openai';
import type {
Assertion,
ApiProvider,
TestCase,
GradingConfig,
ProviderResponse,
GradingResult,
} from '../src/types';
describe('runAssertions', () => {
const test: TestCase = {
assert: [
{
type: 'equals',
value: 'Expected output',
},
],
};
it('should pass when all assertions pass', async () => {
const output = 'Expected output';
const result: GradingResult = await runAssertions(test, output);
expect(result.pass).toBeTruthy();
expect(result.reason).toBe('All assertions passed');
});
it('should fail when any assertion fails', async () => {
const output = 'Different output';
const result: GradingResult = await runAssertions(test, output);
expect(result.pass).toBeFalsy();
expect(result.reason).toBe('Expected output "Expected output"');
});
});
describe('runAssertion', () => {
const equalityAssertion: Assertion = {
type: 'equals',
value: 'Expected output',
};
const isJsonAssertion: Assertion = {
type: 'is-json',
};
const containsJsonAssertion: Assertion = {
type: 'contains-json',
};
const functionAssertion: Assertion = {
type: 'javascript',
value: 'output === "Expected output"',
};
it('should pass when the equality assertion passes', async () => {
const output = 'Expected output';
const result: GradingResult = await runAssertion(equalityAssertion, {} as TestCase, output);
expect(result.pass).toBeTruthy();
expect(result.reason).toBe('Assertion passed');
});
it('should fail when the equality assertion fails', async () => {
const output = 'Different output';
const result: GradingResult = await runAssertion(equalityAssertion, {} as TestCase, output);
expect(result.pass).toBeFalsy();
expect(result.reason).toBe('Expected output "Expected output"');
});
it('should pass when the is-json assertion passes', async () => {
const output = '{"key": "value"}';
const result: GradingResult = await runAssertion(isJsonAssertion, {} as TestCase, output);
expect(result.pass).toBeTruthy();
expect(result.reason).toBe('Assertion passed');
});
it('should fail when the is-json assertion fails', async () => {
const output = 'Not valid JSON';
const result: GradingResult = await runAssertion(isJsonAssertion, {} as TestCase, output);
expect(result.pass).toBeFalsy();
expect(result.reason).toContain('Expected output to be valid JSON');
});
it('should pass when the contains-json assertion passes', async () => {
const output =
'this is some other stuff \n\n {"key": "value", "key2": {"key3": "value2", "key4": ["value3", "value4"]}} \n\n blah blah';
const result: GradingResult = await runAssertion(containsJsonAssertion, {} as TestCase, output);
expect(result.pass).toBeTruthy();
expect(result.reason).toBe('Assertion passed');
});
it('should fail when the contains-json assertion fails', async () => {
const output = 'Not valid JSON';
const result: GradingResult = await runAssertion(containsJsonAssertion, {} as TestCase, output);
expect(result.pass).toBeFalsy();
expect(result.reason).toContain('Expected output to contain valid JSON');
});
it('should pass when the function assertion passes', async () => {
const output = 'Expected output';
const result: GradingResult = await runAssertion(functionAssertion, {} as TestCase, output);
expect(result.pass).toBeTruthy();
expect(result.reason).toBe('Assertion passed');
});
it('should fail when the function assertion fails', async () => {
const output = 'Different output';
const result: GradingResult = await runAssertion(functionAssertion, {} as TestCase, output);
expect(result.pass).toBeFalsy();
expect(result.reason).toBe('Custom function returned false');
});
});
describe('assertionFromString', () => {
it('should create an equality assertion', () => {
const expected = 'Expected output';
const result: Assertion = assertionFromString(expected);
expect(result.type).toBe('equals');
expect(result.value).toBe(expected);
});
it('should create an is-json assertion', () => {
const expected = 'is-json';
const result: Assertion = assertionFromString(expected);
expect(result.type).toBe('is-json');
});
it('should create an contains-json assertion', () => {
const expected = 'contains-json';
const result: Assertion = assertionFromString(expected);
expect(result.type).toBe('contains-json');
});
it('should create a function assertion', () => {
const expected = 'fn:output === "Expected output"';
const result: Assertion = assertionFromString(expected);
expect(result.type).toBe('javascript');
expect(result.value).toBe('output === "Expected output"');
});
it('should create a similarity assertion', () => {
const expected = 'similar(0.9):Expected output';
const result: Assertion = assertionFromString(expected);
expect(result.type).toBe('similar');
expect(result.value).toBe('Expected output');
expect(result.threshold).toBe(0.9);
});
});
describe('matchesSimilarity', () => {
beforeEach(() => {
jest.spyOn(DefaultEmbeddingProvider, 'callEmbeddingApi').mockImplementation((text) => {
if (text === 'Expected output' || text === 'Sample output') {
return Promise.resolve({
embedding: [1, 0, 0],
tokenUsage: { total: 5, prompt: 2, completion: 3 },
});
} else if (text === 'Different output') {
return Promise.resolve({
embedding: [0, 1, 0],
tokenUsage: { total: 5, prompt: 2, completion: 3 },
});
}
return Promise.reject(new Error('Unexpected input'));
});
});
afterEach(() => {
jest.restoreAllMocks();
});
it('should pass when similarity is above the threshold', async () => {
const expected = 'Expected output';
const output = 'Sample output';
const threshold = 0.5;
const result = await matchesSimilarity(expected, output, threshold);
expect(result.pass).toBeTruthy();
expect(result.reason).toBe('Similarity 1 is greater than threshold 0.5');
});
it('should fail when similarity is below the threshold', async () => {
const expected = 'Expected output';
const output = 'Different output';
const threshold = 0.9;
const result = await matchesSimilarity(expected, output, threshold);
expect(result.pass).toBeFalsy();
expect(result.reason).toBe('Similarity 0 is less than threshold 0.9');
});
});
describe('matchesLlmRubric', () => {
class TestGrader implements ApiProvider {
async callApi(): Promise<ProviderResponse> {
return {
output: JSON.stringify({ pass: true }),
tokenUsage: { total: 10, prompt: 5, completion: 5 },
};
}
id(): string {
return 'TestGradingProvider';
}
}
const Grader = new TestGrader();
it('should pass when the grading provider returns a passing result', async () => {
const expected = 'Expected output';
const output = 'Sample output';
const options: GradingConfig = {
rubricPrompt: 'Grading prompt',
provider: Grader,
};
const result = await matchesLlmRubric(expected, output, options);
expect(result.pass).toBeTruthy();
});
it('should fail when the grading provider returns a failing result', async () => {
const expected = 'Expected output';
const output = 'Different output';
const options: GradingConfig = {
rubricPrompt: 'Grading prompt',
provider: Grader,
};
jest.spyOn(Grader, 'callApi').mockResolvedValueOnce({
output: JSON.stringify({ pass: false, reason: 'Grading failed' }),
tokenUsage: { total: 10, prompt: 5, completion: 5 },
});
const result = await matchesLlmRubric(expected, output, options);
expect(result.pass).toBeFalsy();
expect(result.reason).toBe('Grading failed');
});
});

View File

@@ -1,7 +1,8 @@
import { evaluate } from '../src/evaluator.js';
import type { ApiProvider } from '../src/types.js';
import { TestSuite } from '../src/types.js';
jest.mock('node-fetch', () => jest.fn());
jest.mock('../src/esm.js');
@@ -36,13 +37,17 @@ describe('evaluator', () => {
});
test('evaluate with vars', async () => {
const options = {
prompts: ['Test prompt {{ var1 }} {{ var2 }}'],
vars: [{ var1: 'value1', var2: 'value2' }],
const testSuite: TestSuite = {
providers: [mockApiProvider],
prompts: ['Test prompt {{ var1 }} {{ var2 }}'],
tests: [
{
vars: { var1: 'value1', var2: 'value2' },
},
],
};
const summary = await evaluate(options);
const summary = await evaluate(testSuite, {});
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
expect(summary.stats.successes).toBe(1);
@@ -54,13 +59,17 @@ describe('evaluator', () => {
});
test('evaluate with multiple providers', async () => {
const options = {
prompts: ['Test prompt {{ var1 }} {{ var2 }}'],
vars: [{ var1: 'value1', var2: 'value2' }],
const testSuite: TestSuite = {
providers: [mockApiProvider, mockApiProvider],
prompts: ['Test prompt {{ var1 }} {{ var2 }}'],
tests: [
{
vars: { var1: 'value1', var2: 'value2' },
},
],
};
const summary = await evaluate(options);
const summary = await evaluate(testSuite, {});
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(2);
expect(summary.stats.successes).toBe(2);
@@ -73,13 +82,13 @@ describe('evaluator', () => {
expect(summary.results[0].response?.output).toBe('Test output');
});
test('evaluate without vars', async () => {
const options = {
prompts: ['Test prompt'],
test('evaluate without tests', async () => {
const testSuite: TestSuite = {
providers: [mockApiProvider],
prompts: ['Test prompt'],
};
const summary = await evaluate(options);
const summary = await evaluate(testSuite, {});
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
expect(summary.stats.successes).toBe(1);
@@ -90,13 +99,13 @@ describe('evaluator', () => {
expect(summary.results[0].response?.output).toBe('Test output');
});
test('evaluate without vars with multiple providers', async () => {
const options = {
prompts: ['Test prompt'],
test('evaluate without tests with multiple providers', async () => {
const testSuite: TestSuite = {
providers: [mockApiProvider, mockApiProvider, mockApiProvider],
prompts: ['Test prompt'],
};
const summary = await evaluate(options);
const summary = await evaluate(testSuite, {});
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(3);
expect(summary.stats.successes).toBe(3);
@@ -108,13 +117,22 @@ describe('evaluator', () => {
});
test('evaluate with expected value matching output', async () => {
const options = {
prompts: ['Test prompt'],
vars: [{ __expected: 'Test output' }],
const testSuite: TestSuite = {
providers: [mockApiProvider],
prompts: ['Test prompt'],
tests: [
{
assert: [
{
type: 'equals',
value: 'Test output',
},
],
},
],
};
const summary = await evaluate(options);
const summary = await evaluate(testSuite, {});
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
expect(summary.stats.successes).toBe(1);
@@ -124,13 +142,22 @@ describe('evaluator', () => {
});
test('evaluate with expected value not matching output', async () => {
const options = {
prompts: ['Test prompt'],
vars: [{ __expected: 'Different output' }],
const testSuite: TestSuite = {
providers: [mockApiProvider],
prompts: ['Test prompt'],
tests: [
{
assert: [
{
type: 'equals',
value: 'Different output',
},
],
},
],
};
const summary = await evaluate(options);
const summary = await evaluate(testSuite, {});
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
expect(summary.stats.successes).toBe(0);
@@ -140,13 +167,22 @@ describe('evaluator', () => {
});
test('evaluate with fn: expected value', async () => {
const options = {
prompts: ['Test prompt'],
vars: [{ __expected: 'fn:output === "Test output";' }],
const testSuite: TestSuite = {
providers: [mockApiProvider],
prompts: ['Test prompt'],
tests: [
{
assert: [
{
type: 'javascript',
value: 'output === "Test output";',
},
],
},
],
};
const summary = await evaluate(options);
const summary = await evaluate(testSuite, {});
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
expect(summary.stats.successes).toBe(1);
@@ -156,46 +192,22 @@ describe('evaluator', () => {
});
test('evaluate with fn: expected value not matching output', async () => {
const options = {
prompts: ['Test prompt'],
vars: [{ __expected: 'fn:output === "Different output";' }],
const testSuite: TestSuite = {
providers: [mockApiProvider],
prompts: ['Test prompt'],
tests: [
{
assert: [
{
type: 'javascript',
value: 'output === "Different output";',
},
],
},
],
};
const summary = await evaluate(options);
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
expect(summary.stats.successes).toBe(0);
expect(summary.stats.failures).toBe(1);
expect(summary.results[0].success).toBe(false);
expect(summary.results[0].response?.output).toBe('Test output');
});
// TODO(1.0): remove legacy test
test('evaluate with eval: (legacy) expected value', async () => {
const options = {
prompts: ['Test prompt'],
vars: [{ __expected: 'eval:output === "Test output";' }],
providers: [mockApiProvider],
};
const summary = await evaluate(options);
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
expect(summary.stats.successes).toBe(1);
expect(summary.stats.failures).toBe(0);
expect(summary.results[0].success).toBe(true);
expect(summary.results[0].response?.output).toBe('Test output');
});
test('evaluate with eval: (legacy) expected value not matching output', async () => {
const options = {
prompts: ['Test prompt'],
vars: [{ __expected: 'eval:output === "Different output";' }],
providers: [mockApiProvider],
};
const summary = await evaluate(options);
const summary = await evaluate(testSuite, {});
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
expect(summary.stats.successes).toBe(0);
@@ -205,16 +217,27 @@ describe('evaluator', () => {
});
test('evaluate with grading expected value', async () => {
const options = {
prompts: ['Test prompt'],
vars: [{ __expected: 'grade:output is a test output' }],
const testSuite: TestSuite = {
providers: [mockApiProvider],
grading: {
provider: mockGradingApiProviderPasses,
prompts: ['Test prompt'],
tests: [
{
assert: [
{
type: 'llm-rubric',
value: 'output is a test output',
},
],
},
],
defaultTest: {
options: {
provider: mockGradingApiProviderPasses,
},
},
};
const summary = await evaluate(options);
const summary = await evaluate(testSuite, {});
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
expect(summary.stats.successes).toBe(1);
@@ -224,16 +247,27 @@ describe('evaluator', () => {
});
test('evaluate with grading expected value does not pass', async () => {
const options = {
prompts: ['Test prompt'],
vars: [{ __expected: 'grade:output is a test output' }],
const testSuite: TestSuite = {
providers: [mockApiProvider],
grading: {
provider: mockGradingApiProviderFails,
prompts: ['Test prompt'],
tests: [
{
assert: [
{
type: 'llm-rubric',
value: 'output is a test output',
},
],
},
],
defaultTest: {
options: {
provider: mockGradingApiProviderFails,
},
},
};
const summary = await evaluate(options);
const summary = await evaluate(testSuite, {});
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
expect(summary.stats.successes).toBe(0);