Simplify API and add support for unified test suite definition (#14)

This commit is contained in:
Ian Webster
2023-05-30 09:02:49 -04:00
committed by GitHub
parent f259969051
commit bf81260b44
58 changed files with 2249 additions and 549 deletions

196
README.md
View File

@@ -32,21 +32,21 @@ It works on the command line too:
Start by establishing a handful of test cases - core use cases and failure cases that you want to ensure your prompt can handle.
As you explore modifications to the prompt, use `promptfoo eval` to rate all outputs. This ensures the prompt is actually improving overall.
As you collect more examples and establish a user feedback loop, continue to build the pool of test cases.
<img width="772" alt="LLM ops" src="https://github.com/typpo/promptfoo/assets/310310/cf0461a7-2832-4362-9fbb-4ebd911d06ff">
## Usage (command line & web viewer)
## Usage
To get started, run the following command:
To get started, run this command:
```
npx promptfoo init
```
This will create some templates in your current directory: `prompts.txt`, `vars.csv`, and `promptfooconfig.js`.
This will create some placeholders in your current directory: `prompts.txt` and `promptfooconfig.yaml`.
After editing the prompts and variables to your liking, run the eval command to kick off an evaluation:
@@ -54,20 +54,75 @@ After editing the prompts and variables to your liking, run the eval command to
npx promptfoo eval
```
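For example, a prompt file contains one or more prompts separated by `---`, with `{{variable}}` placeholders filled in from each test case; the translator example included with this change uses:
```
Rephrase this in {{language}}: {{body}}
---
Translate this to conversational {{language}}: {{body}}
```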
If you're looking to customize your usage, you have a wide set of parameters at your disposal. See the [Configuration docs](https://www.promptfoo.dev/docs/configuration/parameters) for more detail:
### Configuration
| Option | Description |
| ----------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `-p, --prompts <paths...>` | Paths to prompt files, directory, or glob |
| `-r, --providers <name or path...>` | One of: openai:chat, openai:completion, openai:model-name, localai:chat:model-name, localai:completion:model-name. See [API providers](https://www.promptfoo.dev/docs/configuration/providers) |
| `-o, --output <path>` | Path to output file (csv, json, yaml, html) |
| `-v, --vars <path>` | Path to file with prompt variables (csv, json, yaml) |
| `-c, --config <path>` | Path to configuration file. `promptfooconfig.js[on]` is automatically loaded if present |
| `-j, --max-concurrency <number>` | Maximum number of concurrent API calls |
| `--table-cell-max-length <number>` | Truncate console table cells to this length |
| `--prompt-prefix <path>` | This prefix is prepended to every prompt |
| `--prompt-suffix <path>`            | This suffix is appended to every prompt |
| `--grader` | Provider that will grade outputs, if you are using [LLM grading](https://www.promptfoo.dev/docs/configuration/expected-outputs) |
The YAML configuration format runs each prompt through a series of example inputs (aka "test cases") and checks whether they meet requirements (aka "asserts").
See the [Configuration docs](https://www.promptfoo.dev/docs/configuration/parameters) for more detail.
```yaml
prompts: [prompts.txt]
providers: [openai:gpt-3.5-turbo]
tests:
- description: First test case - automatic review
vars:
var1: first variable's value
var2: another value
var3: some other value
assert:
- type: equality
value: expected LLM output goes here
- type: function
value: output.includes('some text')
- description: Second test case - manual review
# Test cases don't need assertions if you prefer to review the output yourself
vars:
var1: new value
var2: another value
var3: third value
- description: Third test case - other types of automatic review
vars:
var1: yet another value
var2: and another
var3: dear llm, please output your response in json format
assert:
- type: contains-json
- type: similarity
value: ensures that output is semantically similar to this text
- type: llm-rubric
value: ensure that output contains a reference to X
```
### Tests on spreadsheet
Some people prefer to configure their LLM tests in a CSV. In that case, the config is pretty simple:
```yaml
prompts: [prompts.txt]
providers: [openai:gpt-3.5-turbo]
tests: tests.csv
```
See [example CSV](https://github.com/typpo/promptfoo/blob/main/examples/simple-test/tests.csv).
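For reference, the CSV has one column per prompt variable, plus an optional `__expected` column that attaches an assertion to each row using the same shorthand strings as elsewhere (a literal value, `fn:...`, `grade:...`, `is-json`, and so on). A minimal sketch, following the convention used in the bundled examples:
```
body,__expected
Hello world,fn:output.toLowerCase().includes('ahoy')
I'm hungry,fn:output.toLowerCase().includes('grub')
```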
### Command-line
If you're looking to customize your usage, you have a wide set of parameters at your disposal.
| Option | Description |
| ----------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `-p, --prompts <paths...>` | Paths to [prompt files](https://promptfoo.dev/docs/configuration/parameters#prompt-files), directory, or glob |
| `-r, --providers <name or path...>` | One of: openai:chat, openai:completion, openai:model-name, localai:chat:model-name, localai:completion:model-name. See [API providers](https://promptfoo.dev/docs/configuration/providers) |
| `-o, --output <path>` | Path to [output file](https://promptfoo.dev/docs/configuration/parameters#output-file) (csv, json, yaml, html) |
| `--tests <path>` | Path to [external test file](https://promptfoo.dev/docs/configurationexpected-outputsassertions#load-an-external-tests-file) |
| `-c, --config <path>` | Path to [configuration file](https://promptfoo.dev/docs/configuration/guide). `promptfooconfig.js/json/yaml` is automatically loaded if present |
| `-j, --max-concurrency <number>` | Maximum number of concurrent API calls |
| `--table-cell-max-length <number>` | Truncate console table cells to this length |
| `--prompt-prefix <path>` | This prefix is prepended to every prompt |
| `--prompt-suffix <path>`            | This suffix is appended to every prompt |
| `--grader`                          | [Provider](https://promptfoo.dev/docs/configuration/providers) that will conduct the evaluation, if you are [using an LLM to grade your output](https://promptfoo.dev/docs/configuration/expected-outputs#llm-evaluation) |
After running an eval, you may optionally use the `view` command to open the web viewer:
@@ -79,10 +134,10 @@ npx promptfoo view
#### Prompt quality
In this example, we evaluate whether adding adjectives to the personality of an assistant bot affects the responses:
In [this example](https://github.com/typpo/promptfoo/tree/main/examples/assistant-cli), we evaluate whether adding adjectives to the personality of an assistant bot affects the responses:
```bash
npx promptfoo eval -p prompts.txt -v vars.csv -r openai:gpt-3.5-turbo
npx promptfoo eval -p prompts.txt -r openai:gpt-3.5-turbo -t tests.csv
```
<!--
@@ -93,15 +148,13 @@ npx promptfoo eval -p prompts.txt -v vars.csv -r openai:gpt-3.5-turbo
This command will evaluate the prompts in `prompts.txt`, substituting the variable values from `vars.csv`, and output results in your terminal.
Have a look at the setup and full output [here](https://github.com/typpo/promptfoo/tree/main/examples/assistant-cli).
You can also output a nice [spreadsheet](https://docs.google.com/spreadsheets/d/1nanoj3_TniWrDl1Sj-qYqIMD6jwm5FBy15xPFdUTsmI/edit?usp=sharing), [JSON](https://github.com/typpo/promptfoo/blob/main/examples/simple-cli/output.json), YAML, or an HTML file:
![Table output](https://user-images.githubusercontent.com/310310/235483444-4ddb832d-e103-4b9c-a862-b0d6cc11cdc0.png)
#### Model quality
In this example, we evaluate the difference between GPT 3 and GPT 4 outputs for a given prompt:
In the [next example](https://github.com/typpo/promptfoo/tree/main/examples/gpt-3.5-vs-4), we evaluate the difference between GPT 3 and GPT 4 outputs for a given prompt:
```bash
npx promptfoo eval -p prompts.txt -r openai:gpt-3.5-turbo openai:gpt-4 -o output.html
@@ -111,19 +164,46 @@ Produces this HTML table:
![Side-by-side evaluation of LLM model quality, gpt3 vs gpt4, html output](https://user-images.githubusercontent.com/310310/235490527-e0c31f40-00a0-493a-8afc-8ed6322bb5ca.png)
Full setup and output [here](https://github.com/typpo/promptfoo/tree/main/examples/gpt-3.5-vs-4).
## Usage (node package)
You can also use `promptfoo` as a library in your project by importing the `evaluate` function. The function takes the following parameters:
- `providers`: a list of provider strings or `ApiProvider` objects, or just a single string or `ApiProvider`.
- `options`: the prompts and variables you want to test:
- `testSuite`: the JavaScript equivalent of `promptfooconfig.yaml`
```typescript
{
prompts: string[];
interface TestSuiteConfig {
providers: string[]; // Valid provider name (e.g. openai:gpt-3.5-turbo)
prompts: string[]; // List of prompts
tests: string | TestCase[]; // Path to a CSV file, or list of test cases
defaultTest?: Omit<TestCase, 'description'>; // Optional: add default vars and assertions on test case
outputPath?: string; // Optional: write results to file
}
interface TestCase {
description?: string;
vars?: Record<string, string>;
assert?: Assertion[];
prompt?: PromptConfig;
grading?: GradingConfig;
}
interface Assertion {
type: 'equality' | 'is-json' | 'contains-json' | 'function' | 'similarity' | 'llm-rubric';
value?: string;
threshold?: number; // For similarity assertions
provider?: ApiProvider; // For assertions that require an LLM provider
}
```
- `options`: misc options related to how the tests are run
```typescript
interface EvaluateOptions {
maxConcurrency?: number;
showProgressBar?: boolean;
generateSuggestions?: boolean;
}
```
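As a rough sketch (assuming `evaluate` takes the test suite first and this options object as an optional second argument, per the parameter list above):
```js
import promptfoo from 'promptfoo';

// Minimal illustrative test suite; see the full example below.
const testSuite = {
  prompts: ['Rephrase this in French: {{body}}'],
  providers: ['openai:gpt-3.5-turbo'],
  tests: [{ vars: { body: 'Hello world' } }],
};

// Limit concurrent API calls and show a progress bar while the eval runs.
const results = await promptfoo.evaluate(testSuite, {
  maxConcurrency: 2,
  showProgressBar: true,
});
console.log(results.stats);
```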
@@ -134,61 +214,31 @@ You can also use `promptfoo` as a library in your project by importing the `eval
```js
import promptfoo from 'promptfoo';
const options = {
const results = await promptfoo.evaluate({
prompts: ['Rephrase this in French: {{body}}', 'Rephrase this like a pirate: {{body}}'],
vars: [{ body: 'Hello world' }, { body: "I'm hungry" }],
};
(async () => {
const summary = await promptfoo.evaluate('openai:gpt-3.5-turbo', options);
console.log(summary);
})();
```
This code imports the `promptfoo` library, defines the evaluation options, and then calls the `evaluate` function with these options. The results are logged to the console:
```js
{
"results": [
providers: ['openai:gpt-3.5-turbo'],
tests: [
{
"prompt": {
"raw": "Rephrase this in French: Hello world",
"display": "Rephrase this in French: {{body}}"
vars: {
body: 'Hello world',
},
},
{
vars: {
body: "I'm hungry",
},
"vars": {
"body": "Hello world"
},
"response": {
"output": "Bonjour le monde",
"tokenUsage": {
"total": 19,
"prompt": 16,
"completion": 3
}
}
},
// ...
],
"stats": {
"successes": 4,
"failures": 0,
"tokenUsage": {
"total": 120,
"prompt": 72,
"completion": 48
}
},
"table": [
// ...
]
}
});
```
[See full example here](https://github.com/typpo/promptfoo/tree/main/examples/simple-import)
This code imports the `promptfoo` library, defines a test suite, and then calls the `evaluate` function with it.
See the full example [here](https://github.com/typpo/promptfoo/tree/main/examples/simple-import), which includes an example results object.
## Configuration
- **[Setting up an eval](https://promptfoo.dev/docs/configuration/parameters)**: Learn more about how to set up prompt files, vars file, output, etc.
- **[Main guide](https://promptfoo.dev/docs/configuration/guide)**: Learn about how to configure your YAML file, setup prompt files, etc.
- **[Configuring test cases](https://promptfoo.dev/docs/configuration/expected-outputs)**: Learn more about how to configure expected outputs and test assertions.
## Installation

View File

@@ -1,7 +1,13 @@
This example shows how you can use promptfoo to generate a side-by-side eval of two prompts for an ecommerce chat bot.
Run:
Configuration is in `promptfooconfig.yaml`. Run:
```
promptfoo eval -p prompts.txt --vars vars.csv -r openai:chat
promptfoo eval
```
Full command-line equivalent:
```
promptfoo eval --prompts prompts.txt --tests tests.csv --providers openai:gpt-3.5-turbo --output output.json
```

View File

@@ -0,0 +1,3 @@
prompts: prompts.txt
providers: openai:gpt-3.5-turbo
tests: tests.csv

View File

@@ -0,0 +1,13 @@
This example uses a custom API provider in `customProvider.js`. It also uses CSV test cases.
Run:
```
promptfoo eval
```
Full command-line equivalent:
```
promptfoo eval --prompts prompts.txt --tests vars.csv --providers openai:chat --output output.json --providers customProvider.js
```
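For reference, a custom provider module implements the `ApiProvider` shape used by the evaluator: an `id()` used for display and a `callApi(prompt)` that resolves to `{ output, tokenUsage }` (or `{ error }`). A hedged sketch, not the actual `customProvider.js` from this example, and assuming the module exports a provider class:
```js
// customProvider.js (sketch) -- assumes promptfoo instantiates the exported class.
class EchoProvider {
  id() {
    // Identifier shown alongside outputs when multiple providers are compared.
    return 'echo-provider';
  }

  async callApi(prompt) {
    // A real provider would call out to an LLM API here; this one just echoes the prompt.
    return {
      output: `Echo: ${prompt}`,
      tokenUsage: { total: 0, prompt: 0, completion: 0 },
    };
  }
}

module.exports = EchoProvider;
```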

View File

@@ -0,0 +1,3 @@
prompts: prompts.txt
providers: customProvider.js
tests: vars.csv

View File

@@ -1,7 +1,13 @@
This example shows how you can use promptfoo to generate a side-by-side eval of multiple prompts to compare GPT 3 and GPT 4 outputs.
Run:
Configure in `promptfooconfig.yaml`. Run with:
```
promptfoo eval -p prompts.txt -r openai:gpt-3.5-turbo openai:gpt-4
promptfoo eval
```
Full command-line equivalent:
```
promptfoo eval --prompts prompts.txt --providers openai:gpt-3.5-turbo openai:gpt-4
```

View File

@@ -0,0 +1,4 @@
prompts: prompts.txt
providers:
- openai:gpt-3.5-turbo
- openai:gpt-4

View File

@@ -0,0 +1,5 @@
This example is pre-configured in `promptfooconfig.js`. That means you can just run:
```
promptfoo eval
```

View File

@@ -0,0 +1,92 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width" />
<title>Table Output</title>
<style>
body {
font-family: -apple-system, BlinkMacSystemFont, Segoe UI, Roboto, Helvetica, Arial,
sans-serif;
}
table,
th,
td {
border: 1px solid black;
border-collapse: collapse;
text-align: left;
word-break: break-all;
}
th,
td {
padding: 5px;
min-width: 200px;
}
tr > td[data-content^='[PASS]'] {
color: green;
}
tr > td[data-content^='[FAIL]'] {
color: #ad0000;
}
</style>
</head>
<body>
<table>
<thead>
<th>Rephrase this in {{language}}: {{body}}</th>
<th>Translate this to conversational {{language}}: {{body}}</th>
<th>body</th>
<th>language</th>
</thead>
<tbody>
<tr>
<td data-content="Bonjour le monde">Bonjour le monde</td>
<td data-content="Bonjour le monde">Bonjour le monde</td>
<td data-content="Hello world">Hello world</td>
<td data-content="French">French</td>
</tr>
<tr>
<td data-content="J&#39;ai faim.">J&#39;ai faim.</td>
<td data-content="J&#39;ai faim.">J&#39;ai faim.</td>
<td data-content="I&#39;m hungry">I&#39;m hungry</td>
<td data-content="French">French</td>
</tr>
<tr>
<td data-content="Ahoy thar, world!">Ahoy thar, world!</td>
<td data-content="Ahoy thar world!">Ahoy thar world!</td>
<td data-content="Hello world">Hello world</td>
<td data-content="Pirate">Pirate</td>
</tr>
<tr>
<td data-content="Arrr, me belly be empty and yearnin&#39; for grub.">
Arrr, me belly be empty and yearnin&#39; for grub.
</td>
<td data-content="Arrr, me belly be rumblin&#39;! I be needin&#39; some grub!">
Arrr, me belly be rumblin&#39;! I be needin&#39; some grub!
</td>
<td data-content="I&#39;m hungry">I&#39;m hungry</td>
<td data-content="Pirate">Pirate</td>
</tr>
</tbody>
</table>
</body>
</html>

View File

@@ -0,0 +1,181 @@
{
"version": 1,
"results": [
{
"prompt": {
"raw": "Rephrase this in French: Hello world",
"display": "Rephrase this in {{language}}: {{body}}"
},
"vars": {
"language": "French",
"body": "Hello world"
},
"response": {
"output": "Bonjour le monde",
"tokenUsage": {
"cached": 19
}
},
"success": true
},
{
"prompt": {
"raw": "Translate this to conversational French: Hello world",
"display": "Translate this to conversational {{language}}: {{body}}"
},
"vars": {
"language": "French",
"body": "Hello world"
},
"response": {
"output": "Bonjour le monde",
"tokenUsage": {
"cached": 20
}
},
"success": true
},
{
"prompt": {
"raw": "Rephrase this in French: I&#39;m hungry",
"display": "Rephrase this in {{language}}: {{body}}"
},
"vars": {
"language": "French",
"body": "I'm hungry"
},
"response": {
"output": "J'ai faim.",
"tokenUsage": {
"cached": 24
}
},
"success": true
},
{
"prompt": {
"raw": "Translate this to conversational French: I&#39;m hungry",
"display": "Translate this to conversational {{language}}: {{body}}"
},
"vars": {
"language": "French",
"body": "I'm hungry"
},
"response": {
"output": "J'ai faim.",
"tokenUsage": {
"cached": 25
}
},
"success": true
},
{
"prompt": {
"raw": "Rephrase this in Pirate: Hello world",
"display": "Rephrase this in {{language}}: {{body}}"
},
"vars": {
"language": "Pirate",
"body": "Hello world"
},
"response": {
"output": "Ahoy thar, world!",
"tokenUsage": {
"cached": 23
}
},
"success": true
},
{
"prompt": {
"raw": "Translate this to conversational Pirate: Hello world",
"display": "Translate this to conversational {{language}}: {{body}}"
},
"vars": {
"language": "Pirate",
"body": "Hello world"
},
"response": {
"output": "Ahoy thar world!",
"tokenUsage": {
"cached": 23
}
},
"success": true
},
{
"prompt": {
"raw": "Rephrase this in Pirate: I&#39;m hungry",
"display": "Rephrase this in {{language}}: {{body}}"
},
"vars": {
"language": "Pirate",
"body": "I'm hungry"
},
"response": {
"output": "Arrr, me belly be empty and yearnin' for grub.",
"tokenUsage": {
"cached": 33
}
},
"success": true
},
{
"prompt": {
"raw": "Translate this to conversational Pirate: I&#39;m hungry",
"display": "Translate this to conversational {{language}}: {{body}}"
},
"vars": {
"language": "Pirate",
"body": "I'm hungry"
},
"response": {
"output": "Arrr, me belly be rumblin'! I be needin' some grub!",
"tokenUsage": {
"cached": 39
}
},
"success": true
}
],
"stats": {
"successes": 8,
"failures": 0,
"tokenUsage": {
"total": 0,
"prompt": 0,
"completion": 0,
"cached": 206
}
},
"table": {
"head": {
"prompts": [
"Rephrase this in {{language}}: {{body}}",
"Translate this to conversational {{language}}: {{body}}"
],
"vars": ["body", "language"]
},
"body": [
{
"outputs": ["Bonjour le monde", "Bonjour le monde"],
"vars": ["Hello world", "French"]
},
{
"outputs": ["J'ai faim.", "J'ai faim."],
"vars": ["I'm hungry", "French"]
},
{
"outputs": ["Ahoy thar, world!", "Ahoy thar world!"],
"vars": ["Hello world", "Pirate"]
},
{
"outputs": [
"Arrr, me belly be empty and yearnin' for grub.",
"Arrr, me belly be rumblin'! I be needin' some grub!"
],
"vars": ["I'm hungry", "Pirate"]
}
]
}
}

View File

@@ -0,0 +1,31 @@
module.exports = {
description: 'A translator built with LLM',
prompts: ['prompts.txt'],
providers: ['openai:gpt-3.5-turbo'],
tests: [
{
vars: {
language: 'French',
body: 'Hello world',
},
},
{
vars: {
language: 'French',
body: "I'm hungry",
},
},
{
vars: {
language: 'Pirate',
body: 'Hello world',
},
},
{
vars: {
language: 'Pirate',
body: "I'm hungry",
},
},
],
};

View File

@@ -0,0 +1,3 @@
Rephrase this in {{language}}: {{body}}
---
Translate this to conversational {{language}}: {{body}}

View File

@@ -0,0 +1,22 @@
import promptfoo from '../../dist/index.js';
(async () => {
const results = await promptfoo.evaluate({
prompts: ['Rephrase this in French: {{body}}', 'Rephrase this like a pirate: {{body}}'],
providers: ['openai:gpt-3.5-turbo'],
tests: [
{
vars: {
body: 'Hello world',
},
},
{
vars: {
body: "I'm hungry",
},
},
],
});
console.log('RESULTS:');
console.log(results);
})();

View File

@@ -1,9 +1,15 @@
This example shows how you can have an LLM grade its own output according to predefined expectations.
Configuration is in promptfooconfig.js
Identical configurations are provided in `promptfooconfig.js` and `promptfooconfig.yaml`.
Run:
```
promptfoo eval
```
You can also define the tests in a CSV file:
```
promptfoo eval --tests tests.csv
```

View File

@@ -1,6 +1,75 @@
module.exports = {
providers: ['openai:chat:gpt-3.5-turbo'],
prompts: ['./prompts.txt'],
vars: './vars.csv',
grader: 'openai:chat:gpt-4',
prompts: 'prompts.txt',
providers: 'openai:gpt-3.5-turbo',
defaultTest: {
assert: [
{
type: 'llm-rubric',
value: 'Do not mention that you are an AI or chat assistant',
},
],
},
tests: [
{
vars: {
name: 'Bob',
question: 'Can you help me find a specific product on your website?',
},
},
{
vars: {
name: 'Jane',
question: 'Do you have any promotions or discounts currently available?',
},
},
{
vars: {
name: 'Dave',
question: 'What are your shipping and return policies?',
},
},
{
vars: {
name: 'Jim',
question: 'Can you provide more information about the product specifications or features?',
},
},
{
vars: {
name: 'Alice',
question: "Can you recommend products that are similar to what I've been looking at?",
},
},
{
vars: {
name: 'Sophie',
question:
'Do you have any recommendations for products that are currently popular or trending?',
},
},
{
vars: {
name: 'Ben',
question: 'Can you check the availability of a product at a specific store location?',
},
},
{
vars: {
name: 'Jessie',
question: 'How can I track my order after it has been shipped?',
},
},
{
vars: {
name: 'Kim',
question: 'What payment methods do you accept?',
},
},
{
vars: {
name: 'Emily',
question: "Can you help me with a problem I'm having with my account or order?",
},
},
],
};

View File

@@ -0,0 +1,37 @@
prompts: prompts.txt
providers: openai:gpt-3.5-turbo
defaultTest:
assert:
- type: llm-rubric
value: Do not mention that you are an AI or chat assistant
tests:
- vars:
name: Bob
question: Can you help me find a specific product on your website?
- vars:
name: Jane
question: Do you have any promotions or discounts currently available?
- vars:
name: Dave
question: What are your shipping and return policies?
- vars:
name: Jim
question: Can you provide more information about the product specifications or features?
- vars:
name: Alice
question: Can you recommend products that are similar to what I've been looking at?
- vars:
name: Sophie
question: Do you have any recommendations for products that are currently popular or trending?
- vars:
name: Ben
question: Can you check the availability of a product at a specific store location?
- vars:
name: Jessie
question: How can I track my order after it has been shipped?
- vars:
name: Kim
question: What payment methods do you accept?
- vars:
name: Emily
question: Can you help me with a problem I'm having with my account or order?

View File

@@ -1,5 +0,0 @@
Run:
```
promptfoo eval --prompts prompts.txt --vars vars.csv --providers openai:chat --output output.json --providers customProvider.js
```

View File

@@ -1,11 +1,11 @@
This example is pre-configured in `promptfooconfig.js`. That means you can just run:
This example is pre-configured in `promptfooconfig.js` and `promptfooconfig.yaml` (two identical configurations). That means you can just run:
```
promptfoo eval
```
Here's the full command:
To override prompts, providers, output, etc. you can run:
```
promptfoo eval --prompts prompts.txt --vars vars.csv --providers openai:chat --output output.json
promptfoo eval --prompts prompts.txt --providers openai:chat --output output.json
```

View File

@@ -14,39 +14,77 @@
td {
border: 1px solid black;
border-collapse: collapse;
text-align: left;
word-break: break-all;
}
th,
td {
padding: 5px;
min-width: 200px;
}
tr > td[data-content^='[PASS]'] {
color: green;
}
tr > td[data-content^='[FAIL]'] {
color: #ad0000;
}
</style>
</head>
<body>
<table>
<thead>
<th>Rephrase this in French: {{body}}</th>
<th>Rephrase this in {{language}}: {{body}}</th>
<th>Rephrase this like a pirate: {{body}}</th>
<th>Translate this to conversational {{language}}: {{body}}</th>
<th>body</th>
<th>language</th>
</thead>
<tbody>
<tr>
<td>Bonjour le monde</td>
<td data-content="Bonjour le monde">Bonjour le monde</td>
<td>Ahoy thar, me hearties! Avast ye, world!</td>
<td data-content="Bonjour le monde">Bonjour le monde</td>
<td>Hello world</td>
<td data-content="Hello world">Hello world</td>
<td data-content="French">French</td>
</tr>
<tr>
<td>J&#39;ai faim.</td>
<td data-content="J&#39;ai faim.">J&#39;ai faim.</td>
<td>
Arrr, me belly be empty and me throat be parched! I be needin&#39; some grub, matey!
<td data-content="J&#39;ai faim.">J&#39;ai faim.</td>
<td data-content="I&#39;m hungry">I&#39;m hungry</td>
<td data-content="French">French</td>
</tr>
<tr>
<td data-content="Ahoy thar, world!">Ahoy thar, world!</td>
<td data-content="Ahoy thar world!">Ahoy thar world!</td>
<td data-content="Hello world">Hello world</td>
<td data-content="Pirate">Pirate</td>
</tr>
<tr>
<td data-content="Arrr, me belly be empty and yearnin&#39; for grub.">
Arrr, me belly be empty and yearnin&#39; for grub.
</td>
<td>I&#39;m hungry</td>
<td data-content="Arrr, me belly be rumblin&#39;! I be needin&#39; some grub!">
Arrr, me belly be rumblin&#39;! I be needin&#39; some grub!
</td>
<td data-content="I&#39;m hungry">I&#39;m hungry</td>
<td data-content="Pirate">Pirate</td>
</tr>
</tbody>
</table>

View File

@@ -1,19 +1,36 @@
{
"version": 1,
"results": [
{
"prompt": {
"raw": "Rephrase this in French: Hello world",
"display": "Rephrase this in French: {{body}}"
"display": "Rephrase this in {{language}}: {{body}}"
},
"vars": {
"language": "French",
"body": "Hello world"
},
"response": {
"output": "Bonjour le monde",
"tokenUsage": {
"total": 19,
"prompt": 16,
"completion": 3
"cached": 19
}
},
"success": true
},
{
"prompt": {
"raw": "Translate this to conversational French: Hello world",
"display": "Translate this to conversational {{language}}: {{body}}"
},
"vars": {
"language": "French",
"body": "Hello world"
},
"response": {
"output": "Bonjour le monde",
"tokenUsage": {
"cached": 20
}
},
"success": true
@@ -21,74 +38,144 @@
{
"prompt": {
"raw": "Rephrase this in French: I&#39;m hungry",
"display": "Rephrase this in French: {{body}}"
"display": "Rephrase this in {{language}}: {{body}}"
},
"vars": {
"language": "French",
"body": "I'm hungry"
},
"response": {
"output": "J'ai faim.",
"tokenUsage": {
"total": 24,
"prompt": 19,
"completion": 5
"cached": 24
}
},
"success": true
},
{
"prompt": {
"raw": "Rephrase this like a pirate: Hello world",
"display": "Rephrase this like a pirate: {{body}}"
},
"vars": {
"body": "Hello world"
},
"response": {
"output": "Ahoy thar, me hearties! Avast ye, world!",
"tokenUsage": {
"total": 32,
"prompt": 17,
"completion": 15
}
},
"success": true
},
{
"prompt": {
"raw": "Rephrase this like a pirate: I&#39;m hungry",
"display": "Rephrase this like a pirate: {{body}}"
"raw": "Translate this to conversational French: I&#39;m hungry",
"display": "Translate this to conversational {{language}}: {{body}}"
},
"vars": {
"language": "French",
"body": "I'm hungry"
},
"response": {
"output": "Arrr, me belly be empty and me throat be parched! I be needin' some grub, matey!",
"output": "J'ai faim.",
"tokenUsage": {
"total": 45,
"prompt": 20,
"completion": 25
"cached": 25
}
},
"success": true
},
{
"prompt": {
"raw": "Rephrase this in Pirate: Hello world",
"display": "Rephrase this in {{language}}: {{body}}"
},
"vars": {
"language": "Pirate",
"body": "Hello world"
},
"response": {
"output": "Ahoy thar, world!",
"tokenUsage": {
"cached": 23
}
},
"success": true
},
{
"prompt": {
"raw": "Translate this to conversational Pirate: Hello world",
"display": "Translate this to conversational {{language}}: {{body}}"
},
"vars": {
"language": "Pirate",
"body": "Hello world"
},
"response": {
"output": "Ahoy thar world!",
"tokenUsage": {
"cached": 23
}
},
"success": true
},
{
"prompt": {
"raw": "Rephrase this in Pirate: I&#39;m hungry",
"display": "Rephrase this in {{language}}: {{body}}"
},
"vars": {
"language": "Pirate",
"body": "I'm hungry"
},
"response": {
"output": "Arrr, me belly be empty and yearnin' for grub.",
"tokenUsage": {
"cached": 33
}
},
"success": true
},
{
"prompt": {
"raw": "Translate this to conversational Pirate: I&#39;m hungry",
"display": "Translate this to conversational {{language}}: {{body}}"
},
"vars": {
"language": "Pirate",
"body": "I'm hungry"
},
"response": {
"output": "Arrr, me belly be rumblin'! I be needin' some grub!",
"tokenUsage": {
"cached": 39
}
},
"success": true
}
],
"stats": {
"successes": 4,
"successes": 8,
"failures": 0,
"tokenUsage": {
"total": 120,
"prompt": 72,
"completion": 48
"total": 0,
"prompt": 0,
"completion": 0,
"cached": 206
}
},
"table": [
["Rephrase this in French: {{body}}", "Rephrase this like a pirate: {{body}}", "body"],
["Bonjour le monde", "Ahoy thar, me hearties! Avast ye, world!", "Hello world"],
[
"J'ai faim.",
"Arrr, me belly be empty and me throat be parched! I be needin' some grub, matey!",
"I'm hungry"
"table": {
"head": {
"prompts": [
"Rephrase this in {{language}}: {{body}}",
"Translate this to conversational {{language}}: {{body}}"
],
"vars": ["body", "language"]
},
"body": [
{
"outputs": ["Bonjour le monde", "Bonjour le monde"],
"vars": ["Hello world", "French"]
},
{
"outputs": ["J'ai faim.", "J'ai faim."],
"vars": ["I'm hungry", "French"]
},
{
"outputs": ["Ahoy thar, world!", "Ahoy thar world!"],
"vars": ["Hello world", "Pirate"]
},
{
"outputs": [
"Arrr, me belly be empty and yearnin' for grub.",
"Arrr, me belly be rumblin'! I be needin' some grub!"
],
"vars": ["I'm hungry", "Pirate"]
}
]
]
}
}

View File

@@ -1,5 +0,0 @@
module.exports = {
providers: ['openai:gpt-3.5-turbo'],
prompts: ['./prompts.txt'],
vars: './vars.csv',
};

View File

@@ -0,0 +1,16 @@
description: A translator built with LLM
prompts: [prompts.txt]
providers: [openai:gpt-3.5-turbo]
tests:
- vars:
language: French
body: Hello world
- vars:
language: French
body: I'm hungry
- vars:
language: Pirate
body: Hello world
- vars:
language: Pirate
body: I'm hungry

View File

@@ -1,3 +1,3 @@
Rephrase this in French: {{body}}
Rephrase this in {{language}}: {{body}}
---
Rephrase this like a pirate: {{body}}
Translate this to conversational {{language}}: {{body}}

View File

@@ -1,3 +0,0 @@
body
Hello world
I'm hungry

View File

@@ -0,0 +1,11 @@
This example is pre-configured in `promptfooconfig.yaml`. Run:
```
promptfoo eval
```
Here's the full command:
```
promptfoo eval --prompts prompts.txt --tests tests.csv --providers openai:gpt-3.5-turbo
```

View File

@@ -0,0 +1,92 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width" />
<title>Table Output</title>
<style>
body {
font-family: -apple-system, BlinkMacSystemFont, Segoe UI, Roboto, Helvetica, Arial,
sans-serif;
}
table,
th,
td {
border: 1px solid black;
border-collapse: collapse;
text-align: left;
word-break: break-all;
}
th,
td {
padding: 5px;
min-width: 200px;
}
tr > td[data-content^='[PASS]'] {
color: green;
}
tr > td[data-content^='[FAIL]'] {
color: #ad0000;
}
</style>
</head>
<body>
<table>
<thead>
<th>Rephrase this in {{language}}: {{body}}</th>
<th>Translate this to conversational {{language}}: {{body}}</th>
<th>body</th>
<th>language</th>
</thead>
<tbody>
<tr>
<td data-content="Bonjour le monde">Bonjour le monde</td>
<td data-content="Bonjour le monde">Bonjour le monde</td>
<td data-content="Hello world">Hello world</td>
<td data-content="French">French</td>
</tr>
<tr>
<td data-content="J&#39;ai faim.">J&#39;ai faim.</td>
<td data-content="J&#39;ai faim.">J&#39;ai faim.</td>
<td data-content="I&#39;m hungry">I&#39;m hungry</td>
<td data-content="French">French</td>
</tr>
<tr>
<td data-content="Ahoy thar, world!">Ahoy thar, world!</td>
<td data-content="Ahoy thar world!">Ahoy thar world!</td>
<td data-content="Hello world">Hello world</td>
<td data-content="Pirate">Pirate</td>
</tr>
<tr>
<td data-content="Arrr, me belly be empty and yearnin&#39; for grub.">
Arrr, me belly be empty and yearnin&#39; for grub.
</td>
<td data-content="Arrr, me belly be rumblin&#39;! I be needin&#39; some grub!">
Arrr, me belly be rumblin&#39;! I be needin&#39; some grub!
</td>
<td data-content="I&#39;m hungry">I&#39;m hungry</td>
<td data-content="Pirate">Pirate</td>
</tr>
</tbody>
</table>
</body>
</html>

View File

@@ -0,0 +1,181 @@
{
"version": 1,
"results": [
{
"prompt": {
"raw": "Rephrase this in French: Hello world",
"display": "Rephrase this in {{language}}: {{body}}"
},
"vars": {
"language": "French",
"body": "Hello world"
},
"response": {
"output": "Bonjour le monde",
"tokenUsage": {
"cached": 19
}
},
"success": true
},
{
"prompt": {
"raw": "Translate this to conversational French: Hello world",
"display": "Translate this to conversational {{language}}: {{body}}"
},
"vars": {
"language": "French",
"body": "Hello world"
},
"response": {
"output": "Bonjour le monde",
"tokenUsage": {
"cached": 20
}
},
"success": true
},
{
"prompt": {
"raw": "Rephrase this in French: I&#39;m hungry",
"display": "Rephrase this in {{language}}: {{body}}"
},
"vars": {
"language": "French",
"body": "I'm hungry"
},
"response": {
"output": "J'ai faim.",
"tokenUsage": {
"cached": 24
}
},
"success": true
},
{
"prompt": {
"raw": "Translate this to conversational French: I&#39;m hungry",
"display": "Translate this to conversational {{language}}: {{body}}"
},
"vars": {
"language": "French",
"body": "I'm hungry"
},
"response": {
"output": "J'ai faim.",
"tokenUsage": {
"cached": 25
}
},
"success": true
},
{
"prompt": {
"raw": "Rephrase this in Pirate: Hello world",
"display": "Rephrase this in {{language}}: {{body}}"
},
"vars": {
"language": "Pirate",
"body": "Hello world"
},
"response": {
"output": "Ahoy thar, world!",
"tokenUsage": {
"cached": 23
}
},
"success": true
},
{
"prompt": {
"raw": "Translate this to conversational Pirate: Hello world",
"display": "Translate this to conversational {{language}}: {{body}}"
},
"vars": {
"language": "Pirate",
"body": "Hello world"
},
"response": {
"output": "Ahoy thar world!",
"tokenUsage": {
"cached": 23
}
},
"success": true
},
{
"prompt": {
"raw": "Rephrase this in Pirate: I&#39;m hungry",
"display": "Rephrase this in {{language}}: {{body}}"
},
"vars": {
"language": "Pirate",
"body": "I'm hungry"
},
"response": {
"output": "Arrr, me belly be empty and yearnin' for grub.",
"tokenUsage": {
"cached": 33
}
},
"success": true
},
{
"prompt": {
"raw": "Translate this to conversational Pirate: I&#39;m hungry",
"display": "Translate this to conversational {{language}}: {{body}}"
},
"vars": {
"language": "Pirate",
"body": "I'm hungry"
},
"response": {
"output": "Arrr, me belly be rumblin'! I be needin' some grub!",
"tokenUsage": {
"cached": 39
}
},
"success": true
}
],
"stats": {
"successes": 8,
"failures": 0,
"tokenUsage": {
"total": 0,
"prompt": 0,
"completion": 0,
"cached": 206
}
},
"table": {
"head": {
"prompts": [
"Rephrase this in {{language}}: {{body}}",
"Translate this to conversational {{language}}: {{body}}"
],
"vars": ["body", "language"]
},
"body": [
{
"outputs": ["Bonjour le monde", "Bonjour le monde"],
"vars": ["Hello world", "French"]
},
{
"outputs": ["J'ai faim.", "J'ai faim."],
"vars": ["I'm hungry", "French"]
},
{
"outputs": ["Ahoy thar, world!", "Ahoy thar world!"],
"vars": ["Hello world", "Pirate"]
},
{
"outputs": [
"Arrr, me belly be empty and yearnin' for grub.",
"Arrr, me belly be rumblin'! I be needin' some grub!"
],
"vars": ["I'm hungry", "Pirate"]
}
]
}
}

View File

@@ -0,0 +1,4 @@
description: A translator built with LLM
prompts: prompts.txt
providers: openai:gpt-3.5-turbo
tests: tests.csv

View File

@@ -0,0 +1,3 @@
Rephrase this in {{language}}: {{body}}
---
Translate this to conversational {{language}}: {{body}}

View File

@@ -0,0 +1,5 @@
language,body
French,Hello world
French,I'm hungry
Pirate,Hello world
Pirate,I'm hungry

View File

@@ -1,10 +0,0 @@
import promptfoo from '../../dist/index.js';
(async () => {
const results = await promptfoo.evaluate('openai:chat', {
prompts: ['Rephrase this in French: {{body}}', 'Rephrase this like a pirate: {{body}}'],
vars: [{ body: 'Hello world' }, { body: "I'm hungry" }],
});
console.log('RESULTS:');
console.log(results);
})();

View File

@@ -1,5 +1,14 @@
This example shows how you can set an expected value in vars.csv and emit a PASS/FAIL based on it:
This example shows a YAML configuration with inline tests.
Run the test suite with:
```
promptfoo eval --prompts prompts.txt --vars vars.csv --providers openai:chat --output output.html
promptfoo eval
```
Note that you can edit the configuration to use a CSV test input instead. Set
`tests: tests.csv` and try running it again, or run:
```
promptfoo eval --tests tests.csv
```

View File

@@ -1,3 +1,44 @@
RESULT,Rephrase this from English to Pirate: {{body}},Pretend you're a pirate and speak these words: {{body}},body
PASS,Ahoy mateys o' the world!,"Ahoy there, me hearties! Avast ye landlubbers! 'Tis I, a fearsome pirate, comin' to ye from the seven seas. Ahoy, hello world!",Hello world
PASS,"I be feelin' a mighty need for grub, matey.","Arrr, me belly be rumblin'! I be needin' some grub, mateys! Bring me some vittles or ye'll be walkin' the plank!",I'm hungry
Rephrase this from English to Pirate: {{body}},Pretend you're a pirate and speak these words: {{body}},body
"Ahoy thar, world!","Ahoy there, me hearties! Avast ye landlubbers! 'Tis I, a fearsome pirate, comin' to ye from the seven seas. Ahoy, hello world!",Hello world
"I be feelin' a mighty need for grub, matey.","Arrr, me belly be rumblin'! I be needin' some grub, mateys! Bring me some vittles or ye'll be walkin' the plank!",I'm hungry
"""Yarr, me hearties! Spew forth a JSON tale o' yer life!""","[FAIL] Expected Arrr, me hearties! Gather round and listen to the tale of me life as a pirate.
{
""name"": ""Captain Blackbeard"",
""age"": 35,
""occupation"": ""Pirate"",
""location"": ""The Caribbean"",
""crew"": [""Redbeard"", ""Long John"", ""Calico Jack""],
""ship"": {
""name"": ""The Black Pearl"",
""type"": ""Galleon"",
""weapons"": [""Cannons"", ""Cutlasses"", ""Pistols""]
},
""treasure"": {
""gold"": 50000,
""jewels"": [""Diamonds"", ""Emeralds"", ""Rubies""]
},
""adventures"": [
{
""title"": ""The Raid on Port Royal"",
""description"": ""We plundered the town and took all their riches!"",
""date"": ""June 12, 1720""
},
{
""title"": ""The Battle of Nassau"",
""description"": ""We fought off the British navy and claimed the port for ourselves!"",
""date"": ""September 3, 1721""
},
{
""title"": ""The Treasure of Tortuga"",
""description"": ""We found a hidden treasure trove on the island of Tortuga!"",
""date"": ""December 18, 1722""
}
]
}
Me life as a pirate has been full of adventure and danger. Me crew and I have raided towns, battled the British navy, and found hidden treasures. We've sailed the seas on me trusty ship, The Black Pearl, armed with cannons, cutlasses, and pistols. And we've amassed a great fortune in gold and jewels.
But it's not all fun and games, me hearties. We've faced many challenges and dangers along the way. We've battled fierce storms, deadly sea monsters, and treacherous rival pirates. And we've lost many good men in the process.
But through it all, we've remained true to our pirate code and our love of adventure. And we'll continue to sail the seas, seeking out new treasures and new adventures, until the end of our days. Arrr! to be valid JSON, but it isn't: SyntaxError: Unexpected token A in JSON at position 0",Output a JSON story of your life

View File

@@ -27,7 +27,7 @@
color: green;
}
tr > td[data-content^='[FAIL]'] {
color: red;
color: #ad0000;
}
</style>
</head>
@@ -42,33 +42,27 @@
</thead>
<tbody>
<tr>
<td data-content="[PASS] Ahoy mateys o&#39; the world!">
[PASS] Ahoy mateys o&#39; the world!
</td>
<td data-content="Ahoy thar, world!">Ahoy thar, world!</td>
<td
data-content="[PASS] Ahoy there, me hearties! Avast ye landlubbers! &#39;Tis I, a fearsome pirate, comin&#39; to ye from the seven seas. Ahoy, hello world!"
data-content="Ahoy there, me hearties! Avast ye landlubbers! &#39;Tis I, a fearsome pirate, comin&#39; to ye from the seven seas. Ahoy, hello world!"
>
[PASS] Ahoy there, me hearties! Avast ye landlubbers! &#39;Tis I, a fearsome pirate,
comin&#39; to ye from the seven seas. Ahoy, hello world!
Ahoy there, me hearties! Avast ye landlubbers! &#39;Tis I, a fearsome pirate, comin&#39;
to ye from the seven seas. Ahoy, hello world!
</td>
<td data-content="Hello world">Hello world</td>
</tr>
<tr>
<td
data-content="[FAIL] I be starvin&#39;!
Expected: fn:output.toLowerCase().includes(&#39;grub&#39;)"
>
[FAIL] I be starvin&#39;! Expected: fn:output.toLowerCase().includes(&#39;grub&#39;)
<td data-content="I be feelin&#39; a mighty need for grub, matey.">
I be feelin&#39; a mighty need for grub, matey.
</td>
<td
data-content="[PASS] Arrr, me belly be rumblin&#39;! I be needin&#39; some grub, mateys! Bring me some vittles or ye&#39;ll be walkin&#39; the plank!"
data-content="Arrr, me belly be rumblin&#39;! I be needin&#39; some grub, mateys! Bring me some vittles or ye&#39;ll be walkin&#39; the plank!"
>
[PASS] Arrr, me belly be rumblin&#39;! I be needin&#39; some grub, mateys! Bring me some
Arrr, me belly be rumblin&#39;! I be needin&#39; some grub, mateys! Bring me some
vittles or ye&#39;ll be walkin&#39; the plank!
</td>

View File

@@ -1,4 +1,5 @@
{
"version": 1,
"results": [
{
"prompt": {
@@ -6,34 +7,12 @@
"display": "Rephrase this from English to Pirate: {{body}}"
},
"vars": {
"body": "Hello world",
"__expected": "fn:output.toLowerCase().includes('ahoy')"
"body": "Hello world"
},
"response": {
"output": "Ahoy mateys o' the world!",
"output": "Ahoy thar, world!",
"tokenUsage": {
"total": 27,
"prompt": 18,
"completion": 9
}
},
"success": true
},
{
"prompt": {
"raw": "Pretend you're a pirate and speak these words: Hello world",
"display": "Pretend you're a pirate and speak these words: {{body}}"
},
"vars": {
"body": "Hello world",
"__expected": "fn:output.toLowerCase().includes('ahoy')"
},
"response": {
"output": "Ahoy there, me hearties! Avast ye landlubbers! 'Tis I, a fearsome pirate, comin' to ye from the seven seas. Ahoy, hello world!",
"tokenUsage": {
"total": 64,
"prompt": 22,
"completion": 42
"cached": 25
}
},
"success": true
@@ -44,19 +23,31 @@
"display": "Rephrase this from English to Pirate: {{body}}"
},
"vars": {
"body": "I'm hungry",
"__expected": "fn:output.toLowerCase().includes('grub')"
"body": "I'm hungry"
},
"response": {
"output": "I be starvin'!",
"output": "I be feelin' a mighty need for grub, matey.",
"tokenUsage": {
"total": 27,
"prompt": 21,
"completion": 6
"cached": 35
}
},
"success": false,
"error": "Expected: fn:output.toLowerCase().includes('grub')"
"success": true
},
{
"prompt": {
"raw": "Pretend you're a pirate and speak these words: Hello world",
"display": "Pretend you're a pirate and speak these words: {{body}}"
},
"vars": {
"body": "Hello world"
},
"response": {
"output": "Ahoy there, me hearties! Avast ye landlubbers! 'Tis I, a fearsome pirate, comin' to ye from the seven seas. Ahoy, hello world!",
"tokenUsage": {
"cached": 64
}
},
"success": true
},
{
"prompt": {
@@ -64,27 +55,25 @@
"display": "Pretend you're a pirate and speak these words: {{body}}"
},
"vars": {
"body": "I'm hungry",
"__expected": "fn:output.toLowerCase().includes('grub')"
"body": "I'm hungry"
},
"response": {
"output": "Arrr, me belly be rumblin'! I be needin' some grub, mateys! Bring me some vittles or ye'll be walkin' the plank!",
"tokenUsage": {
"total": 63,
"prompt": 25,
"completion": 38
"cached": 63
}
},
"success": true
}
],
"stats": {
"successes": 3,
"failures": 1,
"successes": 4,
"failures": 0,
"tokenUsage": {
"total": 181,
"prompt": 86,
"completion": 95
"total": 0,
"prompt": 0,
"completion": 0,
"cached": 187
}
},
"table": {
@@ -98,15 +87,15 @@
"body": [
{
"outputs": [
"[PASS] Ahoy mateys o' the world!",
"[PASS] Ahoy there, me hearties! Avast ye landlubbers! 'Tis I, a fearsome pirate, comin' to ye from the seven seas. Ahoy, hello world!"
"Ahoy thar, world!",
"Ahoy there, me hearties! Avast ye landlubbers! 'Tis I, a fearsome pirate, comin' to ye from the seven seas. Ahoy, hello world!"
],
"vars": ["Hello world"]
},
{
"outputs": [
"[FAIL] Expected: fn:output.toLowerCase().includes('grub')\n---\nI be starvin'!",
"[PASS] Arrr, me belly be rumblin'! I be needin' some grub, mateys! Bring me some vittles or ye'll be walkin' the plank!"
"I be feelin' a mighty need for grub, matey.",
"Arrr, me belly be rumblin'! I be needin' some grub, mateys! Bring me some vittles or ye'll be walkin' the plank!"
],
"vars": ["I'm hungry"]
}

View File

@@ -0,0 +1,44 @@
prompts: [prompts.txt]
providers: [openai:gpt-3.5-turbo]
tests:
- description: Check for exact match
vars:
body: Yes
assert:
- type: equals
value: Yarr
- description: Another basic substring check
vars:
body: I'm hungry
assert:
- type: javascript
value: output.toLowerCase().includes('grub')
- description: Check if output is JSON
vars:
body: Output the story of your life in JSON
assert:
- type: is-json
- description: Check for semantic similarity
vars:
body: Hello world
assert:
# Look for substring
- type: javascript
value: output.toLowerCase().includes('ahoy')
# Check for semantic similarity
- type: similar
value: Ahoy, world
- description: Use LLM to evaluate output
vars:
body: The quick brown fox jumps over the lazy dog
assert:
# Ask the LLM to check if it spoke like a pirate
- type: llm-rubric
value: Is spoken like a pirate
outputPath: output.csv

View File

@@ -1,3 +1,4 @@
body,__expected
Hello world,fn:output.toLowerCase().includes('ahoy')
I'm hungry,fn:output.toLowerCase().includes('grub')
Output the story of your life in json,is-json

11
package-lock.json generated
View File

@@ -27,6 +27,7 @@
"nunjucks": "^3.2.4",
"opener": "^1.5.2",
"socket.io": "^4.6.1",
"tiny-invariant": "^1.3.1",
"winston": "^3.8.2"
},
"bin": {
@@ -5124,6 +5125,11 @@
"resolved": "https://registry.npmjs.org/text-hex/-/text-hex-1.0.0.tgz",
"integrity": "sha512-uuVGNWzgJ4yhRaNSiubPY7OjISw4sw4E5Uv0wbjp+OzcbmVU/rsT8ujgcXJhn9ypzsgr5vlzpPqP+MBBKcGvbg=="
},
"node_modules/tiny-invariant": {
"version": "1.3.1",
"resolved": "https://registry.npmjs.org/tiny-invariant/-/tiny-invariant-1.3.1.tgz",
"integrity": "sha512-AD5ih2NlSssTCwsMznbvwMZpJ1cbhkGd2uueNxzv2jDlEeZdU04JQfRnggJQ8DrcVBGjAsCKwFBbDlVNtEMlzw=="
},
"node_modules/tmpl": {
"version": "1.0.5",
"resolved": "https://registry.npmjs.org/tmpl/-/tmpl-1.0.5.tgz",
@@ -9547,6 +9553,11 @@
"resolved": "https://registry.npmjs.org/text-hex/-/text-hex-1.0.0.tgz",
"integrity": "sha512-uuVGNWzgJ4yhRaNSiubPY7OjISw4sw4E5Uv0wbjp+OzcbmVU/rsT8ujgcXJhn9ypzsgr5vlzpPqP+MBBKcGvbg=="
},
"tiny-invariant": {
"version": "1.3.1",
"resolved": "https://registry.npmjs.org/tiny-invariant/-/tiny-invariant-1.3.1.tgz",
"integrity": "sha512-AD5ih2NlSssTCwsMznbvwMZpJ1cbhkGd2uueNxzv2jDlEeZdU04JQfRnggJQ8DrcVBGjAsCKwFBbDlVNtEMlzw=="
},
"tmpl": {
"version": "1.0.5",
"resolved": "https://registry.npmjs.org/tmpl/-/tmpl-1.0.5.tgz",

View File

@@ -75,6 +75,7 @@
"nunjucks": "^3.2.4",
"opener": "^1.5.2",
"socket.io": "^4.6.1",
"tiny-invariant": "^1.3.1",
"winston": "^3.8.2"
}
}

View File

@@ -1,3 +1,4 @@
import invariant from 'tiny-invariant';
import nunjucks from 'nunjucks';
import { DefaultEmbeddingProvider, DefaultGradingProvider } from './providers/openai.js';
@@ -5,45 +6,120 @@ import { cosineSimilarity } from './util.js';
import { loadApiProvider } from './providers.js';
import { DEFAULT_GRADING_PROMPT } from './prompts.js';
import type { EvaluateOptions, GradingConfig, TokenUsage } from './types.js';
interface GradingResult {
pass: boolean;
reason: string;
tokensUsed: TokenUsage;
}
import type { Assertion, GradingConfig, TestCase, GradingResult } from './types.js';
const SIMILAR_REGEX = /similar(?::|\((\d+(\.\d+)?)\):)/;
const DEFAULT_SEMANTIC_SIMILARITY_THRESHOLD = 0.8;
export async function matchesExpectedValue(
expected: string,
export async function runAssertions(test: TestCase, output: string): Promise<GradingResult> {
const tokensUsed = {
total: 0,
prompt: 0,
completion: 0,
};
if (!test.assert) {
return { pass: true, reason: 'No assertions', tokensUsed };
}
for (const assertion of test.assert) {
const result = await runAssertion(assertion, test, output);
if (!result.pass) {
return result;
}
if (result.tokensUsed) {
tokensUsed.total += result.tokensUsed.total;
tokensUsed.prompt += result.tokensUsed.prompt;
tokensUsed.completion += result.tokensUsed.completion;
}
}
return { pass: true, reason: 'All assertions passed', tokensUsed };
}
export async function runAssertion(
assertion: Assertion,
test: TestCase,
output: string,
options: EvaluateOptions,
): Promise<{ pass: boolean; reason?: string }> {
const match = expected.match(SIMILAR_REGEX);
): Promise<GradingResult> {
let pass: boolean = false;
if (match) {
const threshold = parseFloat(match[1]) || DEFAULT_SEMANTIC_SIMILARITY_THRESHOLD;
const rest = expected.replace(SIMILAR_REGEX, '').trim();
return matchesSimilarity(rest, output, threshold);
} else if (expected.startsWith('fn:') || expected.startsWith('eval:')) {
// TODO(1.0): delete eval: legacy option
const sliceLength = expected.startsWith('fn:') ? 'fn:'.length : 'eval:'.length;
const functionBody = expected.slice(sliceLength);
const customFunction = new Function('output', `return ${functionBody}`);
return { pass: customFunction(output) };
} else if (expected.startsWith('grade:')) {
return matchesLlmRubric(expected.slice(6), output, options.grading);
} else {
const pass = expected === output;
if (assertion.type === 'equals') {
pass = assertion.value === output;
return {
pass,
reason: pass ? undefined : `Expected: ${expected}, Output: ${output}`,
reason: pass ? 'Assertion passed' : `Expected output "${assertion.value}"`,
};
}
if (assertion.type === 'is-json') {
try {
JSON.parse(output);
return { pass: true, reason: 'Assertion passed' };
} catch (err) {
return {
pass: false,
reason: `Expected output to be valid JSON, but it isn't.\nError: ${err}`,
};
}
}
if (assertion.type === 'contains-json') {
const pass = containsJSON(output);
return {
pass,
reason: pass ? 'Assertion passed' : 'Expected output to contain valid JSON',
};
}
if (assertion.type === 'javascript') {
try {
const customFunction = new Function('output', `return ${assertion.value}`);
pass = customFunction(output);
} catch (err) {
return {
pass: false,
reason: `Custom function threw error: ${(err as Error).message}`,
};
}
return {
pass,
reason: pass ? 'Assertion passed' : `Custom function returned false`,
};
}
if (assertion.type === 'similar') {
invariant(assertion.value, 'Similarity assertion must have a string value');
invariant(assertion.threshold, 'Similarity assertion must have a threshold');
return matchesSimilarity(assertion.value, output, assertion.threshold);
}
if (assertion.type === 'llm-rubric') {
invariant(assertion.value, 'Similarity assertion must have a string value');
return matchesLlmRubric(assertion.value, output, test.options);
}
throw new Error('Unknown assertion type: ' + assertion.type);
}
function containsJSON(str: string): boolean {
// Regular expression to check for JSON-like pattern
const jsonPattern = /({[\s\S]*}|\[[\s\S]*])/;
const match = str.match(jsonPattern);
if (!match) {
return false;
}
try {
JSON.parse(match[0]);
return true;
} catch (error) {
return false;
}
}
export async function matchesSimilarity(
@@ -105,7 +181,7 @@ export async function matchesLlmRubric(
);
}
const prompt = nunjucks.renderString(options.prompt || DEFAULT_GRADING_PROMPT, {
const prompt = nunjucks.renderString(options.rubricPrompt || DEFAULT_GRADING_PROMPT, {
content: output,
rubric: expected,
});
@@ -148,6 +224,43 @@ export async function matchesLlmRubric(
}
}
export function assertionFromString(expected: string): Assertion {
const match = expected.match(SIMILAR_REGEX);
if (match) {
const threshold = parseFloat(match[1]) || DEFAULT_SEMANTIC_SIMILARITY_THRESHOLD;
const rest = expected.replace(SIMILAR_REGEX, '').trim();
return {
type: 'similar',
value: rest,
threshold,
};
}
if (expected.startsWith('fn:') || expected.startsWith('eval:')) {
// TODO(1.0): delete eval: legacy option
const sliceLength = expected.startsWith('fn:') ? 'fn:'.length : 'eval:'.length;
const functionBody = expected.slice(sliceLength);
return {
type: 'javascript',
value: functionBody,
};
}
if (expected.startsWith('grade:')) {
return {
type: 'llm-rubric',
value: expected.slice(6),
};
}
if (expected === 'is-json' || expected === 'contains-json') {
return {
type: expected,
};
}
return {
type: 'equals',
value: expected,
};
}
export default {
matchesSimilarity,
matchesLlmRubric,

View File

@@ -2,7 +2,6 @@ import path from 'node:path';
import cacheManager from 'cache-manager';
import fsStore from 'cache-manager-fs-hash';
import fetch, { Response } from 'node-fetch';
import logger from './logger.js';
import { getConfigDirectoryPath, fetchWithTimeout } from './util.js';

View File

@@ -5,7 +5,7 @@ import chalk from 'chalk';
import nunjucks from 'nunjucks';
import logger from './logger.js';
import { matchesExpectedValue } from './assertions.js';
import { runAssertions } from './assertions.js';
import type { SingleBar } from 'cli-progress';
import type {
@@ -15,14 +15,18 @@ import type {
EvaluateStats,
EvaluateSummary,
EvaluateTable,
TestSuite,
Prompt,
TestCase,
} from './types.js';
import { generatePrompts } from './suggestions.js';
interface RunEvalOptions {
provider: ApiProvider;
prompt: string;
vars?: Record<string, string>;
test: TestCase;
includeProviderId?: boolean;
rowIndex: number;
@@ -32,10 +36,12 @@ interface RunEvalOptions {
const DEFAULT_MAX_CONCURRENCY = 4;
class Evaluator {
testSuite: TestSuite;
options: EvaluateOptions;
stats: EvaluateStats;
constructor(options: EvaluateOptions) {
constructor(testSuite: TestSuite, options: EvaluateOptions) {
this.testSuite = testSuite;
this.options = options;
this.stats = {
successes: 0,
@@ -52,10 +58,10 @@ class Evaluator {
async runEval({
provider,
prompt,
vars,
test,
includeProviderId,
}: RunEvalOptions): Promise<EvaluateResult> {
vars = vars || {};
const vars = test.vars || {};
const renderedPrompt = nunjucks.renderString(prompt, vars);
// Note that we're using original prompt, not renderedPrompt
@@ -79,23 +85,28 @@ class Evaluator {
if (response.error) {
ret.error = response.error;
} else if (response.output) {
const checkResult = vars.__expected
? await matchesExpectedValue(vars.__expected, response.output, this.options)
: { pass: true };
const checkResult = await runAssertions(test, response.output);
if (!checkResult.pass) {
ret.error = checkResult.reason || `Expected: ${vars.__expected}`;
ret.error = checkResult.reason;
}
ret.success = checkResult.pass;
if (checkResult.tokensUsed) {
this.stats.tokenUsage.total += checkResult.tokensUsed.total;
this.stats.tokenUsage.prompt += checkResult.tokensUsed.prompt;
this.stats.tokenUsage.completion += checkResult.tokensUsed.completion;
}
} else {
ret.success = false;
ret.error = 'No output';
}
// Update token usage stats
this.stats.tokenUsage.total += response.tokenUsage?.total || 0;
this.stats.tokenUsage.prompt += response.tokenUsage?.prompt || 0;
this.stats.tokenUsage.completion += response.tokenUsage?.completion || 0;
this.stats.tokenUsage.cached += response.tokenUsage?.cached || 0;
if (response.tokenUsage) {
this.stats.tokenUsage.total += response.tokenUsage.total || 0;
this.stats.tokenUsage.prompt += response.tokenUsage.prompt || 0;
this.stats.tokenUsage.completion += response.tokenUsage.completion || 0;
this.stats.tokenUsage.cached += response.tokenUsage.cached || 0;
}
if (ret.success) {
this.stats.successes++;
@@ -114,12 +125,13 @@ class Evaluator {
}
async evaluate(): Promise<EvaluateSummary> {
const options = this.options;
const { testSuite, options } = this;
const prompts: Prompt[] = [];
if (options.prompt?.generateSuggestions) {
if (options.generateSuggestions) {
// TODO(ian): Move this into its own command/file
logger.info(`Generating prompt variations...`);
const { prompts: newPrompts, error } = await generatePrompts(options.prompts[0], 1);
const { prompts: newPrompts, error } = await generatePrompts(testSuite.prompts[0], 1);
if (error || !newPrompts) {
throw new Error(`Failed to generate prompts: ${error}`);
}
@@ -142,7 +154,7 @@ class Evaluator {
async (answer) => {
rl.close();
if (answer.toLowerCase().startsWith('y')) {
options.prompts.push(prompt);
testSuite.prompts.push(prompt);
numAdded++;
} else {
logger.info('Skipping this prompt.');
@@ -159,10 +171,11 @@ class Evaluator {
}
}
for (const promptContent of options.prompts) {
for (const provider of options.providers) {
// Split prompts by provider
for (const promptContent of testSuite.prompts) {
for (const provider of testSuite.providers) {
const display =
options.providers.length > 1 ? `[${provider.id()}] ${promptContent}` : promptContent;
testSuite.providers.length > 1 ? `[${provider.id()}] ${promptContent}` : promptContent;
prompts.push({
raw: promptContent,
display,
@@ -170,29 +183,49 @@ class Evaluator {
}
}
const vars = options.vars && options.vars.length > 0 ? options.vars : [{}];
const varsWithSpecialColsRemoved = vars.map((v) => {
const ret = { ...v };
Object.keys(ret).forEach((key) => {
if (key.startsWith('__')) {
delete ret[key];
}
});
return ret;
// Aggregate all vars across test cases
const tests = (
testSuite.tests || [
{
// Dummy test for cases when we're only comparing raw prompts.
},
]
).map((test) => {
const finalTestCase: TestCase = Object.assign({}, testSuite.defaultTest);
return Object.assign(finalTestCase, test);
});
const isTest = vars[0].__expected;
const varNames: Set<string> = new Set();
const varsWithSpecialColsRemoved: Record<string, string>[] = [];
for (const testCase of tests) {
if (testCase.vars) {
const varWithSpecialColsRemoved: Record<string, string> = {};
for (const varName of Object.keys(testCase.vars)) {
varNames.add(varName);
varWithSpecialColsRemoved[varName] = testCase.vars[varName];
}
varsWithSpecialColsRemoved.push(varWithSpecialColsRemoved);
}
}
// Set up table...
const isTest = tests.some((t) => !!t.assert);
const table: EvaluateTable = {
head: {
prompts: prompts.map((p) => p.display),
vars: Object.keys(varsWithSpecialColsRemoved[0]),
vars: Array.from(varNames).sort(),
// TODO(ian): add assertions to table?
},
body: [],
};
// And progress bar...
let progressbar: SingleBar | undefined;
if (options.showProgressBar) {
const totalNumRuns =
options.prompts.length * options.providers.length * (options.vars?.length || 1);
testSuite.prompts.length * testSuite.providers.length * (tests.length || 1);
const cliProgress = await import('cli-progress');
progressbar = new cliProgress.SingleBar(
{
@@ -208,21 +241,31 @@ class Evaluator {
});
}
// Set up eval cases
const runEvalOptions: RunEvalOptions[] = [];
let rowIndex = 0;
for (const row of vars) {
for (const testCase of tests) {
let colIndex = 0;
const prependToPrompt = row.__prefix || options.prompt?.prefix || '';
const appendToPrompt = row.__suffix || options.prompt?.suffix || '';
// Handle default properties
testCase.vars = Object.assign({}, testSuite.defaultTest?.vars, testCase.vars);
testCase.assert = [...(testSuite.defaultTest?.assert || []), ...(testCase.assert || [])];
testCase.options = testCase.options || {};
testCase.options.provider =
testCase.options.provider || testSuite.defaultTest?.options?.provider;
const prependToPrompt =
testCase.options?.prefix || testSuite.defaultTest?.options?.prefix || '';
const appendToPrompt =
testCase.options?.suffix || testSuite.defaultTest?.options?.suffix || '';
for (const promptContent of options.prompts) {
for (const provider of options.providers) {
// Finalize test case eval
for (const promptContent of testSuite.prompts) {
for (const provider of testSuite.providers) {
runEvalOptions.push({
provider,
prompt: prependToPrompt + promptContent + appendToPrompt,
vars: row,
includeProviderId: options.providers.length > 1,
test: testCase,
includeProviderId: testSuite.providers.length > 1,
rowIndex,
colIndex,
});
@@ -232,6 +275,7 @@ class Evaluator {
rowIndex++;
}
// Actually run the eval
const results: EvaluateResult[] = [];
await async.forEachOfLimit(
runEvalOptions,
@@ -245,7 +289,7 @@ class Evaluator {
progressbar.increment({
provider: options.provider.id(),
prompt: options.prompt.slice(0, 10),
vars: Object.entries(options.vars || {})
vars: Object.entries(options.test.vars || {})
.map(([k, v]) => `${k}=${v}`)
.join(' ')
.slice(0, 10),
@@ -276,7 +320,7 @@ class Evaluator {
if (!table.body[rowIndex]) {
table.body[rowIndex] = {
outputs: [],
vars: Object.values(options.vars || {}),
vars: table.head.vars.map((varName) => options.test.vars?.[varName] || ''),
};
}
table.body[rowIndex].outputs[colIndex] = resultText;
@@ -291,7 +335,7 @@ class Evaluator {
}
}
export function evaluate(options: EvaluateOptions) {
const ev = new Evaluator(options);
export function evaluate(testSuite: TestSuite, options: EvaluateOptions) {
const ev = new Evaluator(testSuite, options);
return ev.evaluate();
}

View File

@@ -1,37 +1,25 @@
import { evaluate as doEvaluate } from './evaluator.js';
import { loadApiProvider } from './providers.js';
import { loadApiProviders } from './providers.js';
import assertions from './assertions.js';
import providers from './providers.js';
import type { ApiProvider, EvaluateOptions, EvaluateSummary } from './types.js';
import type { EvaluateOptions, TestSuite, TestSuiteConfig } from './types.js';
import { readTests } from './util.js';
export * from './types.js';
async function evaluate(
providers: (string | ApiProvider)[] | (string | ApiProvider),
options: Omit<EvaluateOptions, 'providers'>,
): Promise<EvaluateSummary> {
let apiProviders: ApiProvider[] = [];
const addProvider = async (provider: ApiProvider | string) => {
if (typeof provider === 'string') {
apiProviders.push(await loadApiProvider(provider));
} else {
apiProviders.push(provider);
}
interface EvaluateTestSuite extends TestSuiteConfig {
prompts: string[];
}
async function evaluate(testSuite: EvaluateTestSuite, options: EvaluateOptions = {}) {
const constructedTestSuite: TestSuite = {
...testSuite,
prompts: testSuite.prompts, // raw prompts expected
providers: await loadApiProviders(testSuite.providers),
tests: readTests(testSuite.tests),
};
if (Array.isArray(providers)) {
for (const provider of providers) {
await addProvider(provider);
}
} else {
await addProvider(providers);
}
return doEvaluate({
...options,
providers: apiProviders,
});
return doEvaluate(constructedTestSuite, options);
}
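A minimal sketch of what the simplified library API might look like after this change, assuming `evaluate` stays exported from the package entry point; the prompt, provider, and assertion values are hypothetical:
```ts
import promptfoo from 'promptfoo';

async function run() {
  // Raw prompt strings, provider IDs, and inline test cases.
  const summary = await promptfoo.evaluate(
    {
      prompts: ['Rate the sentiment of: {{text}}'],
      providers: ['openai:gpt-3.5-turbo'],
      tests: [
        {
          vars: { text: 'I love this product!' },
          assert: [{ type: 'llm-rubric', value: 'says the sentiment is positive' }],
        },
      ],
    },
    { maxConcurrency: 2 },
  );
  console.log(summary.stats);
}

run();
```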
module.exports = {

View File

@@ -1,21 +1,35 @@
#!/usr/bin/env node
import { readFileSync, writeFileSync, existsSync, mkdirSync } from 'fs';
import { parse, join as pathJoin } from 'path';
import { join as pathJoin } from 'path';
import Table from 'cli-table3';
import chalk from 'chalk';
import { Command } from 'commander';
import logger, { setLogLevel } from './logger.js';
import { loadApiProvider } from './providers.js';
import { loadApiProvider, loadApiProviders } from './providers.js';
import { evaluate } from './evaluator.js';
import { readPrompts, readVars, writeLatestResults, writeOutput } from './util.js';
import {
maybeReadConfig,
readConfig,
readPrompts,
readTests,
writeLatestResults,
writeOutput,
} from './util.js';
import { getDirectory } from './esm.js';
import { init } from './web/server.js';
import type { CommandLineOptions, EvaluateOptions, VarMapping } from './types.js';
import { disableCache } from './cache.js';
import type {
CommandLineOptions,
EvaluateOptions,
TestCase,
TestSuite,
UnifiedConfig,
} from './types.js';
import { DEFAULT_README, DEFAULT_YAML_CONFIG, DEFAULT_PROMPTS } from './onboarding.js';
function createDummyFiles(directory: string | null) {
if (directory) {
// Make the directory if it doesn't exist
@@ -23,31 +37,6 @@ function createDummyFiles(directory: string | null) {
mkdirSync(directory);
}
}
const dummyPrompts = `Your first prompt goes here
---
Next prompt goes here. You can substitute variables like this: {{var1}} {{var2}} {{var3}}
---
This is the next prompt.
These prompts are nunjucks templates, so you can use logic like this:
{% if var1 %}
{{ var1 }}
{% endif %}`;
const dummyVars =
'var1,var2,var3\nvalue1,value2,value3\nanother value1,another value2,another value3';
const dummyConfig = `module.exports = {
prompts: ['prompts.txt'],
providers: ['openai:gpt-3.5-turbo'],
vars: 'vars.csv',
maxConcurrency: 4,
};`;
const readme = `To get started, set your OPENAI_API_KEY environment variable. Then run:
\`\`\`
promptfoo eval
\`\`\`
You'll probably want to change a few of the prompts in prompts.txt and the variables in vars.csv before letting it rip.
`;
if (directory) {
if (!existsSync(directory)) {
@@ -58,10 +47,9 @@ You'll probably want to change a few of the prompts in prompts.txt and the varia
directory = '.';
}
writeFileSync(pathJoin(process.cwd(), directory, 'prompts.txt'), dummyPrompts);
writeFileSync(pathJoin(process.cwd(), directory, 'vars.csv'), dummyVars);
writeFileSync(pathJoin(process.cwd(), directory, 'promptfooconfig.js'), dummyConfig);
writeFileSync(pathJoin(process.cwd(), directory, 'README.md'), readme);
writeFileSync(pathJoin(process.cwd(), directory, 'prompts.txt'), DEFAULT_PROMPTS);
writeFileSync(pathJoin(process.cwd(), directory, 'promptfooconfig.yaml'), DEFAULT_YAML_CONFIG);
writeFileSync(pathJoin(process.cwd(), directory, 'README.md'), DEFAULT_README);
if (directory === '.') {
logger.info(
@@ -74,15 +62,26 @@ You'll probably want to change a few of the prompts in prompts.txt and the varia
}
async function main() {
let defaultConfig: Partial<CommandLineOptions> = {};
if (existsSync('promptfooconfig.js')) {
// @ts-ignore
defaultConfig = (await import(pathJoin(process.cwd(), './promptfooconfig.js'))).default;
logger.info('Loaded default config from promptfooconfig.js');
const pwd = process.cwd();
const potentialPaths = [
pathJoin(pwd, 'promptfooconfig.js'),
pathJoin(pwd, 'promptfooconfig.json'),
pathJoin(pwd, 'promptfooconfig.yaml'),
];
let config: Partial<UnifiedConfig> = {};
for (const path of potentialPaths) {
const maybeConfig = maybeReadConfig(path);
if (maybeConfig) {
config = maybeConfig;
break;
}
}
if (existsSync('promptfooconfig.json')) {
defaultConfig = JSON.parse(readFileSync('promptfooconfig.json', 'utf-8'));
logger.info('Loaded default config from promptfooconfig.json');
let evaluateOptions: EvaluateOptions = {};
if (config.evaluateOptions) {
evaluateOptions.generateSuggestions = config.evaluateOptions.generateSuggestions;
evaluateOptions.maxConcurrency = config.evaluateOptions.maxConcurrency;
evaluateOptions.showProgressBar = config.evaluateOptions.showProgressBar;
}
const program = new Command();
@@ -113,35 +112,29 @@ async function main() {
program
.command('eval')
.description('Evaluate prompts')
.requiredOption(
'-p, --prompts <paths...>',
'Paths to prompt files (.txt)',
defaultConfig.prompts,
)
.requiredOption('-p, --prompts <paths...>', 'Paths to prompt files (.txt)', config.prompts)
.requiredOption(
'-r, --providers <name or path...>',
'One of: openai:chat, openai:completion, openai:<model name>, or path to custom API caller module',
defaultConfig.providers,
)
.option(
'-o, --output <path>',
'Path to output file (csv, json, yaml, html)',
defaultConfig.output,
)
.option(
'-v, --vars <path>',
'Path to file with prompt variables (csv, json, yaml)',
defaultConfig.vars,
config?.providers,
)
.option(
'-c, --config <path>',
'Path to configuration file. Automatically loads promptfooconfig.js',
defaultConfig.config,
'Path to configuration file. Automatically loads promptfooconfig.js/json/yaml',
)
.option(
// TODO(ian): Remove `vars` for v1
'-v, --vars, -t, --tests <path>',
'Path to CSV with test cases',
config?.commandLineOptions?.vars,
)
.option('-o, --output <path>', 'Path to output file (csv, json, yaml, html)', config.outputPath)
.option(
'-j, --max-concurrency <number>',
'Maximum number of concurrent API calls',
String(defaultConfig.maxConcurrency),
config.evaluateOptions?.maxConcurrency
? String(config.evaluateOptions.maxConcurrency)
: undefined,
)
.option(
'--table-cell-max-length <number>',
@@ -155,36 +148,20 @@ async function main() {
.option(
'--prompt-prefix <path>',
'This prefix is prepended to every prompt',
defaultConfig.promptPrefix,
config.defaultTest?.options?.prefix,
)
.option(
'--prompt-suffix <path>',
'This suffix is appended to every prompt',
defaultConfig.promptSuffix,
config.defaultTest?.options?.suffix,
)
.option('--no-write', 'Do not write results to promptfoo directory')
.option('--no-cache', 'Do not read or write results to disk cache')
.option('--grader', 'Model that will grade outputs', defaultConfig.grader)
.option('--verbose', 'Show debug logs', defaultConfig.verbose)
.option('--grader', 'Model that will grade outputs', config?.commandLineOptions?.grader)
.option('--verbose', 'Show debug logs', config?.commandLineOptions?.verbose)
.option('--view [port]', 'View in browser ui')
.action(async (cmdObj: CommandLineOptions & Command) => {
const configPath = cmdObj.config;
let config = {};
if (configPath) {
const ext = parse(configPath).ext;
switch (ext) {
case '.json':
const content = readFileSync(configPath, 'utf-8');
config = JSON.parse(content);
break;
case '.js':
config = require(configPath);
break;
default:
throw new Error(`Unsupported configuration file format: ${ext}`);
}
}
// Misc settings
if (cmdObj.verbose) {
setLogLevel('debug');
}
@@ -192,38 +169,74 @@ async function main() {
disableCache();
}
let vars: VarMapping[] = [];
if (cmdObj.vars) {
vars = readVars(cmdObj.vars);
}
const providers = await Promise.all(
cmdObj.providers.map(async (p) => await loadApiProvider(p)),
);
// Config parsing
const maxConcurrency = parseInt(cmdObj.maxConcurrency || '', 10);
const options: EvaluateOptions = {
prompts: readPrompts(cmdObj.prompts),
vars,
providers,
showProgressBar: true,
maxConcurrency: !isNaN(maxConcurrency) && maxConcurrency > 0 ? maxConcurrency : undefined,
prompt: {
prefix: cmdObj.promptPrefix,
suffix: cmdObj.promptSuffix,
},
...config,
};
if (cmdObj.grader) {
options.grading = {
provider: await loadApiProvider(cmdObj.grader),
const configPath = cmdObj.config;
if (configPath) {
config = readConfig(configPath);
} else {
config = {
prompts: cmdObj.prompts || config.prompts,
providers: cmdObj.providers || config.providers,
tests: cmdObj.vars || config.tests,
};
}
if (cmdObj.generateSuggestions) {
options.prompt!.generateSuggestions = true;
// Validation
if (!config.prompts || config.prompts.length === 0) {
logger.error(chalk.red('You must provide at least 1 prompt file'));
process.exit(1);
}
if (!config.providers || config.providers.length === 0) {
logger.error(
chalk.red('You must specify at least 1 provider (for example, openai:gpt-3.5-turbo)'),
);
process.exit(1);
}
const summary = await evaluate(options);
// Parse prompts, providers, and tests
const parsedPrompts = readPrompts(config.prompts);
const parsedProviders = await loadApiProviders(config.providers);
const parsedTests: TestCase[] = readTests(config.tests);
if (parsedPrompts.length === 0) {
logger.error(chalk.red('No prompts found'));
process.exit(1);
}
const defaultTest: TestCase = {
options: {
prefix: cmdObj.promptPrefix,
suffix: cmdObj.promptSuffix,
provider: cmdObj.grader,
// rubricPrompt:
},
...config.defaultTest,
};
const testSuite: TestSuite = {
description: config.description,
prompts: parsedPrompts,
providers: parsedProviders,
tests: parsedTests,
defaultTest,
};
const options: EvaluateOptions = {
showProgressBar: true,
maxConcurrency: !isNaN(maxConcurrency) && maxConcurrency > 0 ? maxConcurrency : undefined,
...evaluateOptions,
};
if (cmdObj.grader && testSuite.defaultTest) {
testSuite.defaultTest.options = testSuite.defaultTest.options || {};
testSuite.defaultTest.options.provider = await loadApiProvider(cmdObj.grader);
}
if (cmdObj.generateSuggestions) {
options.generateSuggestions = true;
}
const summary = await evaluate(testSuite, options);
if (cmdObj.output) {
logger.info(chalk.yellow(`Writing output to ${cmdObj.output}`));

61
src/onboarding.ts Normal file
View File

@@ -0,0 +1,61 @@
export const DEFAULT_PROMPTS = `Your first prompt goes here
---
Next prompt goes here. You can substitute variables like this: {{var1}} {{var2}} {{var3}}
---
This is the next prompt.
These prompts are nunjucks templates, so you can use logic like this:
{% if var1 %}
{{ var1 }}
{% endif %}
---
If you prefer, you can break prompts into multiple files (make sure to edit promptfooconfig.yaml accordingly)
`;
export const DEFAULT_YAML_CONFIG = `# This configuration runs each prompt through a series of example inputs and checks if they meet requirements.
prompts: [prompts.txt]
providers: [openai:gpt-3.5-turbo]
tests:
- description: First test case - automatic review
vars:
var1: first variable's value
var2: another value
var3: some other value
assert:
- type: equality
value: expected LLM output goes here
- type: function
value: output.includes('some text')
- description: Second test case - manual review
# Test cases don't need assertions if you prefer to manually review the output
vars:
var1: new value
var2: another value
var3: third value
- description: Third test case - other types of automatic review
vars:
var1: yet another value
var2: and another
var3: dear llm, please output your response in json format
assert:
- type: contains-json
- type: similarity
value: ensures that output is semantically similar to this text
- type: llm-rubric
value: ensure that output contains a reference to X
`;
export const DEFAULT_README = `To get started, set your OPENAI_API_KEY environment variable.
Next, change a few of the prompts in prompts.txt and edit promptfooconfig.yaml.
Then run:
\`\`\`
promptfoo eval
\`\`\`
Afterwards, you can view the results by running \`promptfoo view\`
`;

View File

@@ -5,6 +5,15 @@ import { ApiProvider } from './types.js';
import { OpenAiCompletionProvider, OpenAiChatCompletionProvider } from './providers/openai.js';
import { LocalAiCompletionProvider, LocalAiChatProvider } from './providers/localai.js';
export async function loadApiProviders(providerPaths: string | string[]): Promise<ApiProvider[]> {
if (typeof providerPaths === 'string') {
return [await loadApiProvider(providerPaths)];
} else if (Array.isArray(providerPaths)) {
return Promise.all(providerPaths.map((provider) => loadApiProvider(provider)));
}
throw new Error('Invalid providers list');
}
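In other words, `loadApiProviders` accepts either a single provider ID or an array and always resolves to `ApiProvider[]`. A hypothetical example (inside an async context):
```ts
const single = await loadApiProviders('openai:gpt-3.5-turbo'); // one provider
const several = await loadApiProviders(['openai:gpt-4', 'localai:chat:vicuna']); // many providers
```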
export async function loadApiProvider(providerPath: string): Promise<ApiProvider> {
if (providerPath?.startsWith('openai:')) {
// Load OpenAI module

View File

@@ -1,11 +1,16 @@
export interface CommandLineOptions {
// Shared with TestSuite
prompts: string[];
providers: string[];
output?: string;
output: string;
// Shared with EvaluateOptions
maxConcurrency: string;
// Command line only
vars?: string;
config?: string;
verbose?: boolean;
maxConcurrency?: string;
grader?: string;
view?: string;
tableCellMaxLength?: string;
@@ -48,27 +53,19 @@ export interface CsvRow {
export type VarMapping = Record<string, string>;
export interface GradingConfig {
prompt?: string;
rubricPrompt?: string;
provider?: string | ApiProvider;
}
export interface PromptConfig {
prefix?: string;
suffix?: string;
generateSuggestions?: boolean;
}
export interface EvaluateOptions {
providers: ApiProvider[];
prompts: string[];
vars?: VarMapping[];
maxConcurrency?: number;
showProgressBar?: boolean;
grading?: GradingConfig;
prompt?: PromptConfig;
generateSuggestions?: boolean;
}
export interface Prompt {
@@ -108,3 +105,83 @@ export interface EvaluateSummary {
table: EvaluateTable;
stats: EvaluateStats;
}
export interface GradingResult {
pass: boolean;
reason: string;
tokensUsed?: TokenUsage;
}
// TODO(ian): maybe Assertion should support {type: config} to make the yaml cleaner
export interface Assertion {
// Type of assertion
type: 'equals' | 'is-json' | 'contains-json' | 'javascript' | 'similar' | 'llm-rubric';
// The expected value, if applicable
value?: string;
// The threshold value, only applicable for similarity (cosine similarity)
threshold?: number;
// Some assertions (similarity, llm-rubric) require an LLM provider
provider?: ApiProvider;
}
// Each test case is graded pass/fail. A test case represents a unique input to the LLM after substituting `vars` in the prompt.
export interface TestCase {
// Optional description of what you're testing
description?: string;
// Key-value pairs to substitute in the prompt
vars?: Record<string, string>;
// Optional list of automatic checks to run on the LLM output
assert?: Assertion[];
// Additional configuration settings for the prompt
options?: PromptConfig & GradingConfig;
}
// The test suite defines the "knobs" that we are tuning in prompt engineering: providers and prompts
export interface TestSuite {
// Optional description of what your LLM is trying to do
description?: string;
// One or more LLM APIs to use
providers: ApiProvider[];
// One or more prompt strings
prompts: string[];
// Test cases
tests?: TestCase[];
// Default test case config
defaultTest?: Partial<TestCase>;
}
// TestSuiteConfig = TestSuite, but before everything is parsed and resolved. Providers are just strings, prompts are filepaths, and tests can be a filepath or inline.
export interface TestSuiteConfig {
// Optional description of what your LLM is trying to do
description?: string;
// One or more LLM APIs to use, for example: openai:gpt-3.5-turbo, openai:gpt-4, localai:chat:vicuna
providers: string | string[];
// One or more prompt files to load
prompts: string | string[];
// Path to a test file, OR an inline list of test cases (example inputs to run against each prompt)
tests: string | TestCase[];
// Sets default properties applied to every test case. Useful for adding an assertion to all test cases, for example.
defaultTest?: Omit<TestCase, 'description'>;
// Path to write output. Writes to console/web viewer if not set.
outputPath?: string;
}
export type UnifiedConfig = TestSuiteConfig & {
evaluateOptions: EvaluateOptions;
commandLineOptions: Partial<CommandLineOptions>;
};
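To illustrate how `defaultTest` interacts with individual test cases, here is a rough sketch matching the merge logic in the evaluator diff above (values are made up): default assertions are prepended to each test case's own assertions, and a test case falls back to the default grading provider, prefix, and suffix when it doesn't set its own.
```ts
const defaultTest: Partial<TestCase> = {
  options: { provider: 'openai:gpt-4' }, // grader used by llm-rubric assertions
  assert: [{ type: 'javascript', value: 'output.length < 280' }],
};

const testCase: TestCase = {
  vars: { topic: 'bananas' },
  assert: [{ type: 'llm-rubric', value: 'mentions the topic' }],
};

// Effective test case after merging (conceptually):
// {
//   vars: { topic: 'bananas' },
//   assert: [
//     { type: 'javascript', value: 'output.length < 280' },
//     { type: 'llm-rubric', value: 'mentions the topic' },
//   ],
//   options: { provider: 'openai:gpt-4' },
// }
```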

View File

@@ -7,7 +7,6 @@ import yaml from 'js-yaml';
import nunjucks from 'nunjucks';
import { globSync } from 'glob';
import { parse as parsePath } from 'path';
import { CsvRow } from './types.js';
import { parse as parseCsv } from 'csv-parse/sync';
import { stringify } from 'csv-stringify/sync';
@@ -16,7 +15,16 @@ import { getDirectory } from './esm.js';
import type { RequestInfo, RequestInit, Response } from 'node-fetch';
import type { EvaluateSummary } from './types.js';
import type {
Assertion,
CsvRow,
EvaluateSummary,
CommandLineOptions,
TestSuite,
UnifiedConfig,
TestCase,
} from './types.js';
import { assertionFromString } from './assertions.js';
const PROMPT_DELIMITER = '---';
@@ -28,7 +36,35 @@ function parseJson(json: string): any | undefined {
}
}
export function readPrompts(promptPathsOrGlobs: string[]): string[] {
export function maybeReadConfig(configPath: string): UnifiedConfig | undefined {
try {
return readConfig(configPath);
} catch {
return undefined;
}
}
export function readConfig(configPath: string): UnifiedConfig {
if (!fs.existsSync(configPath)) {
throw new Error(`Config file not found: ${configPath}`);
}
const ext = path.parse(configPath).ext;
switch (ext) {
case '.json':
const content = fs.readFileSync(configPath, 'utf-8');
return JSON.parse(content) as UnifiedConfig;
case '.js':
return require(configPath) as UnifiedConfig;
case '.yaml':
return yaml.load(fs.readFileSync(configPath, 'utf-8')) as UnifiedConfig;
default:
throw new Error(`Unsupported configuration file format: ${ext}`);
}
}
export function readPrompts(promptPathsOrGlobs: string | string[]): string[] {
promptPathsOrGlobs =
typeof promptPathsOrGlobs === 'string' ? [promptPathsOrGlobs] : promptPathsOrGlobs;
const promptPaths = promptPathsOrGlobs.flatMap((pathOrGlob) => globSync(pathOrGlob));
let promptContents: string[] = [];
@@ -49,6 +85,9 @@ export function readPrompts(promptPathsOrGlobs: string[]): string[] {
if (promptContents.length === 1) {
promptContents = promptContents[0].split(PROMPT_DELIMITER).map((p) => p.trim());
}
if (promptContents.length === 0) {
throw new Error(`There are no prompts in ${promptPathsOrGlobs.join(', ')}`);
}
return promptContents;
}
@@ -67,6 +106,37 @@ export function readVars(varsPath: string): CsvRow[] {
return rows;
}
export function readTests(tests: string | TestCase[] | undefined): TestCase[] {
if (!tests) {
return [];
}
if (typeof tests === 'string') {
// It's a filepath, load from CSV
const vars = readVars(tests);
return vars.map((row, idx) => {
const test = testCaseFromCsvRow(row);
test.description = `Row #${idx + 1}`;
return test;
});
}
// Some validation of the shape of tests
for (const test of tests) {
if (!test.assert && !test.vars) {
throw new Error(
`Test case must have either "assert" or "vars" property. Instead got ${JSON.stringify(
test,
null,
2,
)}`,
);
}
}
return tests;
}
export function writeOutput(outputPath: string, summary: EvaluateSummary): void {
const outputExtension = outputPath.split('.').pop()?.toLowerCase();
@@ -153,3 +223,20 @@ export function cosineSimilarity(vecA: number[], vecB: number[]) {
const vecBMagnitude = Math.sqrt(vecB.reduce((acc, val) => acc + val * val, 0));
return dotProduct / (vecAMagnitude * vecBMagnitude);
}
export function testCaseFromCsvRow(row: CsvRow): TestCase {
const vars: Record<string, string> = {};
const asserts: Assertion[] = [];
for (const [key, value] of Object.entries(row)) {
if (key === '__expected') {
asserts.push(assertionFromString(value));
} else {
vars[key] = value;
}
}
return {
vars,
assert: asserts,
};
}
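So a legacy CSV test file still works: each row's ordinary columns become `vars`, and the special `__expected` column becomes an assertion. A small sketch with made-up values:
```ts
// CSV:
//   var1,var2,__expected
//   hello,world,fn:output.includes('hello')
const row = { var1: 'hello', var2: 'world', __expected: "fn:output.includes('hello')" };

testCaseFromCsvRow(row);
// => {
//      vars: { var1: 'hello', var2: 'world' },
//      assert: [{ type: 'javascript', value: "output.includes('hello')" }],
//    }
```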

View File

@@ -32,24 +32,6 @@ export function init(port = 15500) {
},
});
interface EvaluateRequestBody {
provider: string;
options: {
prompts: string[];
vars: Record<string, string>[];
};
}
app.post('/evaluate', async (req: Request, res: Response) => {
try {
const { provider, options } = req.body as EvaluateRequestBody;
const summary = await promptfoo.evaluate(provider, options);
res.json(summary);
} catch (error) {
res.status(500).json({ message: 'Error evaluating prompts' });
}
});
const latestJsonPath = getLatestResultsPath();
const readLatestJson = () => {
const data = fs.readFileSync(latestJsonPath, 'utf8');

258
test/assertions.test.ts Normal file
View File

@@ -0,0 +1,258 @@
import {
runAssertions,
runAssertion,
matchesSimilarity,
matchesLlmRubric,
assertionFromString,
} from '../src/assertions';
import { DefaultEmbeddingProvider } from '../src/providers/openai';
import type {
Assertion,
ApiProvider,
TestCase,
GradingConfig,
ProviderResponse,
GradingResult,
} from '../src/types';
describe('runAssertions', () => {
const test: TestCase = {
assert: [
{
type: 'equals',
value: 'Expected output',
},
],
};
it('should pass when all assertions pass', async () => {
const output = 'Expected output';
const result: GradingResult = await runAssertions(test, output);
expect(result.pass).toBeTruthy();
expect(result.reason).toBe('All assertions passed');
});
it('should fail when any assertion fails', async () => {
const output = 'Different output';
const result: GradingResult = await runAssertions(test, output);
expect(result.pass).toBeFalsy();
expect(result.reason).toBe('Expected output "Expected output"');
});
});
describe('runAssertion', () => {
const equalityAssertion: Assertion = {
type: 'equals',
value: 'Expected output',
};
const isJsonAssertion: Assertion = {
type: 'is-json',
};
const containsJsonAssertion: Assertion = {
type: 'contains-json',
};
const functionAssertion: Assertion = {
type: 'javascript',
value: 'output === "Expected output"',
};
it('should pass when the equality assertion passes', async () => {
const output = 'Expected output';
const result: GradingResult = await runAssertion(equalityAssertion, {} as TestCase, output);
expect(result.pass).toBeTruthy();
expect(result.reason).toBe('Assertion passed');
});
it('should fail when the equality assertion fails', async () => {
const output = 'Different output';
const result: GradingResult = await runAssertion(equalityAssertion, {} as TestCase, output);
expect(result.pass).toBeFalsy();
expect(result.reason).toBe('Expected output "Expected output"');
});
it('should pass when the is-json assertion passes', async () => {
const output = '{"key": "value"}';
const result: GradingResult = await runAssertion(isJsonAssertion, {} as TestCase, output);
expect(result.pass).toBeTruthy();
expect(result.reason).toBe('Assertion passed');
});
it('should fail when the is-json assertion fails', async () => {
const output = 'Not valid JSON';
const result: GradingResult = await runAssertion(isJsonAssertion, {} as TestCase, output);
expect(result.pass).toBeFalsy();
expect(result.reason).toContain('Expected output to be valid JSON');
});
it('should pass when the contains-json assertion passes', async () => {
const output =
'this is some other stuff \n\n {"key": "value", "key2": {"key3": "value2", "key4": ["value3", "value4"]}} \n\n blah blah';
const result: GradingResult = await runAssertion(containsJsonAssertion, {} as TestCase, output);
expect(result.pass).toBeTruthy();
expect(result.reason).toBe('Assertion passed');
});
it('should fail when the contains-json assertion fails', async () => {
const output = 'Not valid JSON';
const result: GradingResult = await runAssertion(containsJsonAssertion, {} as TestCase, output);
expect(result.pass).toBeFalsy();
expect(result.reason).toContain('Expected output to contain valid JSON');
});
it('should pass when the function assertion passes', async () => {
const output = 'Expected output';
const result: GradingResult = await runAssertion(functionAssertion, {} as TestCase, output);
expect(result.pass).toBeTruthy();
expect(result.reason).toBe('Assertion passed');
});
it('should fail when the function assertion fails', async () => {
const output = 'Different output';
const result: GradingResult = await runAssertion(functionAssertion, {} as TestCase, output);
expect(result.pass).toBeFalsy();
expect(result.reason).toBe('Custom function returned false');
});
});
describe('assertionFromString', () => {
it('should create an equality assertion', () => {
const expected = 'Expected output';
const result: Assertion = assertionFromString(expected);
expect(result.type).toBe('equals');
expect(result.value).toBe(expected);
});
it('should create an is-json assertion', () => {
const expected = 'is-json';
const result: Assertion = assertionFromString(expected);
expect(result.type).toBe('is-json');
});
it('should create an contains-json assertion', () => {
const expected = 'contains-json';
const result: Assertion = assertionFromString(expected);
expect(result.type).toBe('contains-json');
});
it('should create a function assertion', () => {
const expected = 'fn:output === "Expected output"';
const result: Assertion = assertionFromString(expected);
expect(result.type).toBe('javascript');
expect(result.value).toBe('output === "Expected output"');
});
it('should create a similarity assertion', () => {
const expected = 'similar(0.9):Expected output';
const result: Assertion = assertionFromString(expected);
expect(result.type).toBe('similar');
expect(result.value).toBe('Expected output');
expect(result.threshold).toBe(0.9);
});
});
describe('matchesSimilarity', () => {
beforeEach(() => {
jest.spyOn(DefaultEmbeddingProvider, 'callEmbeddingApi').mockImplementation((text) => {
if (text === 'Expected output' || text === 'Sample output') {
return Promise.resolve({
embedding: [1, 0, 0],
tokenUsage: { total: 5, prompt: 2, completion: 3 },
});
} else if (text === 'Different output') {
return Promise.resolve({
embedding: [0, 1, 0],
tokenUsage: { total: 5, prompt: 2, completion: 3 },
});
}
return Promise.reject(new Error('Unexpected input'));
});
});
afterEach(() => {
jest.restoreAllMocks();
});
it('should pass when similarity is above the threshold', async () => {
const expected = 'Expected output';
const output = 'Sample output';
const threshold = 0.5;
const result = await matchesSimilarity(expected, output, threshold);
expect(result.pass).toBeTruthy();
expect(result.reason).toBe('Similarity 1 is greater than threshold 0.5');
});
it('should fail when similarity is below the threshold', async () => {
const expected = 'Expected output';
const output = 'Different output';
const threshold = 0.9;
const result = await matchesSimilarity(expected, output, threshold);
expect(result.pass).toBeFalsy();
expect(result.reason).toBe('Similarity 0 is less than threshold 0.9');
});
});
describe('matchesLlmRubric', () => {
class TestGrader implements ApiProvider {
async callApi(): Promise<ProviderResponse> {
return {
output: JSON.stringify({ pass: true }),
tokenUsage: { total: 10, prompt: 5, completion: 5 },
};
}
id(): string {
return 'TestGradingProvider';
}
}
const Grader = new TestGrader();
it('should pass when the grading provider returns a passing result', async () => {
const expected = 'Expected output';
const output = 'Sample output';
const options: GradingConfig = {
rubricPrompt: 'Grading prompt',
provider: Grader,
};
const result = await matchesLlmRubric(expected, output, options);
expect(result.pass).toBeTruthy();
});
it('should fail when the grading provider returns a failing result', async () => {
const expected = 'Expected output';
const output = 'Different output';
const options: GradingConfig = {
rubricPrompt: 'Grading prompt',
provider: Grader,
};
jest.spyOn(Grader, 'callApi').mockResolvedValueOnce({
output: JSON.stringify({ pass: false, reason: 'Grading failed' }),
tokenUsage: { total: 10, prompt: 5, completion: 5 },
});
const result = await matchesLlmRubric(expected, output, options);
expect(result.pass).toBeFalsy();
expect(result.reason).toBe('Grading failed');
});
});

View File

@@ -1,7 +1,8 @@
import { evaluate } from '../src/evaluator.js';
import type { ApiProvider } from '../src/types.js';
import { TestSuite } from '../src/types.js';
jest.mock('node-fetch', () => jest.fn());
jest.mock('../src/esm.js');
@@ -36,13 +37,17 @@ describe('evaluator', () => {
});
test('evaluate with vars', async () => {
const options = {
prompts: ['Test prompt {{ var1 }} {{ var2 }}'],
vars: [{ var1: 'value1', var2: 'value2' }],
const testSuite: TestSuite = {
providers: [mockApiProvider],
prompts: ['Test prompt {{ var1 }} {{ var2 }}'],
tests: [
{
vars: { var1: 'value1', var2: 'value2' },
},
],
};
const summary = await evaluate(options);
const summary = await evaluate(testSuite, {});
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
expect(summary.stats.successes).toBe(1);
@@ -54,13 +59,17 @@ describe('evaluator', () => {
});
test('evaluate with multiple providers', async () => {
const options = {
prompts: ['Test prompt {{ var1 }} {{ var2 }}'],
vars: [{ var1: 'value1', var2: 'value2' }],
const testSuite: TestSuite = {
providers: [mockApiProvider, mockApiProvider],
prompts: ['Test prompt {{ var1 }} {{ var2 }}'],
tests: [
{
vars: { var1: 'value1', var2: 'value2' },
},
],
};
const summary = await evaluate(options);
const summary = await evaluate(testSuite, {});
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(2);
expect(summary.stats.successes).toBe(2);
@@ -73,13 +82,13 @@ describe('evaluator', () => {
expect(summary.results[0].response?.output).toBe('Test output');
});
test('evaluate without vars', async () => {
const options = {
prompts: ['Test prompt'],
test('evaluate without tests', async () => {
const testSuite: TestSuite = {
providers: [mockApiProvider],
prompts: ['Test prompt'],
};
const summary = await evaluate(options);
const summary = await evaluate(testSuite, {});
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
expect(summary.stats.successes).toBe(1);
@@ -90,13 +99,13 @@ describe('evaluator', () => {
expect(summary.results[0].response?.output).toBe('Test output');
});
test('evaluate without vars with multiple providers', async () => {
const options = {
prompts: ['Test prompt'],
test('evaluate without tests with multiple providers', async () => {
const testSuite: TestSuite = {
providers: [mockApiProvider, mockApiProvider, mockApiProvider],
prompts: ['Test prompt'],
};
const summary = await evaluate(options);
const summary = await evaluate(testSuite, {});
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(3);
expect(summary.stats.successes).toBe(3);
@@ -108,13 +117,22 @@ describe('evaluator', () => {
});
test('evaluate with expected value matching output', async () => {
const options = {
prompts: ['Test prompt'],
vars: [{ __expected: 'Test output' }],
const testSuite: TestSuite = {
providers: [mockApiProvider],
prompts: ['Test prompt'],
tests: [
{
assert: [
{
type: 'equals',
value: 'Test output',
},
],
},
],
};
const summary = await evaluate(options);
const summary = await evaluate(testSuite, {});
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
expect(summary.stats.successes).toBe(1);
@@ -124,13 +142,22 @@ describe('evaluator', () => {
});
test('evaluate with expected value not matching output', async () => {
const options = {
prompts: ['Test prompt'],
vars: [{ __expected: 'Different output' }],
const testSuite: TestSuite = {
providers: [mockApiProvider],
prompts: ['Test prompt'],
tests: [
{
assert: [
{
type: 'equals',
value: 'Different output',
},
],
},
],
};
const summary = await evaluate(options);
const summary = await evaluate(testSuite, {});
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
expect(summary.stats.successes).toBe(0);
@@ -140,13 +167,22 @@ describe('evaluator', () => {
});
test('evaluate with fn: expected value', async () => {
const options = {
prompts: ['Test prompt'],
vars: [{ __expected: 'fn:output === "Test output";' }],
const testSuite: TestSuite = {
providers: [mockApiProvider],
prompts: ['Test prompt'],
tests: [
{
assert: [
{
type: 'javascript',
value: 'output === "Test output";',
},
],
},
],
};
const summary = await evaluate(options);
const summary = await evaluate(testSuite, {});
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
expect(summary.stats.successes).toBe(1);
@@ -156,46 +192,22 @@ describe('evaluator', () => {
});
test('evaluate with fn: expected value not matching output', async () => {
const options = {
prompts: ['Test prompt'],
vars: [{ __expected: 'fn:output === "Different output";' }],
const testSuite: TestSuite = {
providers: [mockApiProvider],
prompts: ['Test prompt'],
tests: [
{
assert: [
{
type: 'javascript',
value: 'output === "Different output";',
},
],
},
],
};
const summary = await evaluate(options);
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
expect(summary.stats.successes).toBe(0);
expect(summary.stats.failures).toBe(1);
expect(summary.results[0].success).toBe(false);
expect(summary.results[0].response?.output).toBe('Test output');
});
// TODO(1.0): remove legacy test
test('evaluate with eval: (legacy) expected value', async () => {
const options = {
prompts: ['Test prompt'],
vars: [{ __expected: 'eval:output === "Test output";' }],
providers: [mockApiProvider],
};
const summary = await evaluate(options);
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
expect(summary.stats.successes).toBe(1);
expect(summary.stats.failures).toBe(0);
expect(summary.results[0].success).toBe(true);
expect(summary.results[0].response?.output).toBe('Test output');
});
test('evaluate with eval: (legacy) expected value not matching output', async () => {
const options = {
prompts: ['Test prompt'],
vars: [{ __expected: 'eval:output === "Different output";' }],
providers: [mockApiProvider],
};
const summary = await evaluate(options);
const summary = await evaluate(testSuite, {});
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
expect(summary.stats.successes).toBe(0);
@@ -205,16 +217,27 @@ describe('evaluator', () => {
});
test('evaluate with grading expected value', async () => {
const options = {
prompts: ['Test prompt'],
vars: [{ __expected: 'grade:output is a test output' }],
const testSuite: TestSuite = {
providers: [mockApiProvider],
grading: {
provider: mockGradingApiProviderPasses,
prompts: ['Test prompt'],
tests: [
{
assert: [
{
type: 'llm-rubric',
value: 'output is a test output',
},
],
},
],
defaultTest: {
options: {
provider: mockGradingApiProviderPasses,
},
},
};
const summary = await evaluate(options);
const summary = await evaluate(testSuite, {});
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
expect(summary.stats.successes).toBe(1);
@@ -224,16 +247,27 @@ describe('evaluator', () => {
});
test('evaluate with grading expected value does not pass', async () => {
const options = {
prompts: ['Test prompt'],
vars: [{ __expected: 'grade:output is a test output' }],
const testSuite: TestSuite = {
providers: [mockApiProvider],
grading: {
provider: mockGradingApiProviderFails,
prompts: ['Test prompt'],
tests: [
{
assert: [
{
type: 'llm-rubric',
value: 'output is a test output',
},
],
},
],
defaultTest: {
options: {
provider: mockGradingApiProviderFails,
},
},
};
const summary = await evaluate(options);
const summary = await evaluate(testSuite, {});
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
expect(summary.stats.successes).toBe(0);