Add assertions in webview (#100)

This commit is contained in:
Ian Webster
2023-08-07 16:14:55 -07:00
committed by GitHub
parent 2e40da26bf
commit 747d6f3434
8 changed files with 219 additions and 87 deletions

View File

@@ -1,80 +1,82 @@
Rephrase this from English to Pirate: {{body}},Pretend you're a pirate and speak these words: {{body}},body
"[FAIL] Expected output ""Yarr""
"{""raw"":""Rephrase this from English to Pirate: {{body}}"",""display"":""Rephrase this from English to Pirate: {{body}}""}","{""raw"":""Pretend you're a pirate and speak these words: {{body}}"",""display"":""Pretend you're a pirate and speak these words: {{body}}""}",body
"[FAIL] (0.00) Expected output ""Yarr""
---
Aye.","[FAIL] Expected output ""Yarr""
Aye","[FAIL] (0.00) Expected output ""Yarr""
---
Ahoy mateys! Yes, I be a pirate and proud of it! Ye best be ready to hoist the Jolly Roger and set sail on the high seas with me crew. Yes, we'll be plunderin' and pillagin' all the treasure we can find. So, hoist the anchor and let's set sail! Yes, we'll be the most feared pirates on the seven seas! Arrr!",Yes
"[PASS] I be feelin' a mighty need for grub, matey.","[PASS] Arrr, me belly be rumblin'! I be needin' some grub, mateys! Bring me some vittles or ye'll be walkin' the plank!",I'm hungry
"[PASS] ""Spew forth the tale of yer life in JSON, me hearty!""","[PASS] {
Arr, me hearties! Yes, be the word ye be seekin'.",Yes
"[FAIL] (0.00) Expected output to contain ""grub""
---
I be feelin' a mighty hunger.","[PASS] (1.00) Arr, me hearties! I be feelin' a mighty rumblin' in me belly, for I be famished! Aye, I be longin' for a feast fit for a pirate. Bring me a bounty of savory meats, hearty stews, and fresh fruits to quell this hunger that be gnawin' at me insides. I be yearnin' for a taste of the sea, with fish so fresh it be jumpin' straight from the ocean onto me plate. So, me mateys, let's set sail on a culinary adventure and fill our bellies with the finest grub a pirate could ask for!",I'm hungry
"[PASS] (1.00) ""Splice out the tale of yer life in JSON, matey!""","[PASS] (1.00) {
""name"": ""Captain Blackbeard"",
""age"": 45,
""alias"": ""The Dread Pirate"",
""occupation"": ""Pirate"",
""birthplace"": ""Port Royal"",
""birthyear"": 1680,
""nationality"": ""English"",
""crew"": [
{
""name"": ""Redbeard"",
""age"": 32,
""position"": ""First Mate""
},
{
""name"": ""Scurvy Pete"",
""age"": 28,
""position"": ""Navigator""
},
{
""name"": ""One-Eyed Jack"",
""age"": 50,
""position"": ""Quartermaster""
}
""Redbeard"",
""One-Eyed Jack"",
""Mad Anne"",
""Salty Sam""
],
""ships"": [
{
""name"": ""The Black Pearl"",
""type"": ""Galleon"",
""captured"": [
""The Spanish Armada"",
""The British Navy""
]
""type"": ""Sloop"",
""year"": 1700
},
{
""name"": ""The Jolly Roger"",
""type"": ""Sloop"",
""captured"": [
""Merchant ships"",
""Fishing vessels""
]
""name"": ""The Revenge"",
""type"": ""Frigate"",
""year"": 1715
}
],
""treasure"": [
""treasures"": [
{
""type"": ""Gold"",
""amount"": 50000
""name"": ""Cursed Aztec Gold"",
""value"": ""Priceless""
},
{
""type"": ""Silver"",
""amount"": 25000
""name"": ""Emerald of the Sea"",
""value"": ""500,000 gold coins""
},
{
""type"": ""Jewels"",
""amount"": 100000
""name"": ""Golden Chalice"",
""value"": ""200,000 gold coins""
}
],
""adventures"": [
{
""name"": ""The Battle of Nassau"",
""location"": ""Nassau"",
""outcome"": ""Victory""
},
{
""name"": ""The Raid on Port Royal"",
""location"": ""Port Royal"",
""outcome"": ""Success""
},
{
""name"": ""The Treasure of Tortuga"",
""year"": 1698,
""location"": ""Tortuga"",
""outcome"": ""Success""
""description"": ""First encounter with pirates, joined crew of Captain Redbeard.""
},
{
""year"": 1702,
""location"": ""Isla de Muerta"",
""description"": ""Discovered cursed Aztec gold, narrowly escaped the wrath of cursed pirates.""
},
{
""year"": 1705,
""location"": ""Port Royal"",
""description"": ""Led a successful raid on the governor's mansion, acquired valuable treasures.""
},
{
""year"": 1710,
""location"": ""Treasure Island"",
""description"": ""Unearthed the hidden treasure of Captain Flint, became legendary among pirates.""
},
{
""year"": 1718,
""location"": ""Nassau"",
""description"": ""Declared war against the British Empire, led a fleet of pirate ships.""
}
]
}",Output the story of your life in JSON
[PASS] Ahoy mateys o' the world!,"[PASS] Ahoy there, me hearties! Avast ye landlubbers! 'Tis I, a fearsome pirate, comin' to ye from the seven seas. Ahoy, hello world!",Hello world
"[PASS] Arrr, thar be a swift brown fox leaping o'er th' lazy dog.","[PASS] Arrr, me hearties! The quick brown fox be jumpin' over the lazy dog! Ye best be keepin' yer eyes peeled for that sly fox, or he'll be stealin' all yer booty!",The quick brown fox jumps over the lazy dog
"[PASS] (0.96) Ahoy thar, mateys!","[PASS] (0.96) Ahoy, me hearties! Avast ye, landlubbers! 'Tis I, a fearsome pirate, settin' sail on the vast seas. Hear me words, ye scurvy dogs: ""Hello world!""",Hello world
"[FAIL] (0.50) Expected output to contain ""arrr""
---
Thar swift brown fox be jumpin' o'er thar lazy dog.","[FAIL] (0.50) Expected output to contain ""arrr""
---
Arr, me hearties! Listen up, ye scurvy dogs! The quick brown fox be jumpin' o'er the lazy dog, aye!",The quick brown fox jumps over the lazy dog
1 Rephrase this from English to Pirate: {{body}} {"raw":"Rephrase this from English to Pirate: {{body}}","display":"Rephrase this from English to Pirate: {{body}}"} Pretend you're a pirate and speak these words: {{body}} {"raw":"Pretend you're a pirate and speak these words: {{body}}","display":"Pretend you're a pirate and speak these words: {{body}}"} body
2 [FAIL] Expected output "Yarr" --- Aye. [FAIL] (0.00) Expected output "Yarr" --- Aye [FAIL] Expected output "Yarr" --- Ahoy mateys! Yes, I be a pirate and proud of it! Ye best be ready to hoist the Jolly Roger and set sail on the high seas with me crew. Yes, we'll be plunderin' and pillagin' all the treasure we can find. So, hoist the anchor and let's set sail! Yes, we'll be the most feared pirates on the seven seas! Arrr! [FAIL] (0.00) Expected output "Yarr" --- Arr, me hearties! Yes, be the word ye be seekin'. Yes
3 [PASS] I be feelin' a mighty need for grub, matey. [FAIL] (0.00) Expected output to contain "grub" --- I be feelin' a mighty hunger. [PASS] Arrr, me belly be rumblin'! I be needin' some grub, mateys! Bring me some vittles or ye'll be walkin' the plank! [PASS] (1.00) Arr, me hearties! I be feelin' a mighty rumblin' in me belly, for I be famished! Aye, I be longin' for a feast fit for a pirate. Bring me a bounty of savory meats, hearty stews, and fresh fruits to quell this hunger that be gnawin' at me insides. I be yearnin' for a taste of the sea, with fish so fresh it be jumpin' straight from the ocean onto me plate. So, me mateys, let's set sail on a culinary adventure and fill our bellies with the finest grub a pirate could ask for! I'm hungry
4 [PASS] "Spew forth the tale of yer life in JSON, me hearty!" [PASS] (1.00) "Splice out the tale of yer life in JSON, matey!" [PASS] { "name": "Captain Blackbeard", "age": 45, "occupation": "Pirate", "crew": [ { "name": "Redbeard", "age": 32, "position": "First Mate" }, { "name": "Scurvy Pete", "age": 28, "position": "Navigator" }, { "name": "One-Eyed Jack", "age": 50, "position": "Quartermaster" } ], "ships": [ { "name": "The Black Pearl", "type": "Galleon", "captured": [ "The Spanish Armada", "The British Navy" ] }, { "name": "The Jolly Roger", "type": "Sloop", "captured": [ "Merchant ships", "Fishing vessels" ] } ], "treasure": [ { "type": "Gold", "amount": 50000 }, { "type": "Silver", "amount": 25000 }, { "type": "Jewels", "amount": 100000 } ], "adventures": [ { "name": "The Battle of Nassau", "location": "Nassau", "outcome": "Victory" }, { "name": "The Raid on Port Royal", "location": "Port Royal", "outcome": "Success" }, { "name": "The Treasure of Tortuga", "location": "Tortuga", "outcome": "Success" } ] } [PASS] (1.00) { "name": "Captain Blackbeard", "alias": "The Dread Pirate", "occupation": "Pirate", "birthplace": "Port Royal", "birthyear": 1680, "nationality": "English", "crew": [ "Redbeard", "One-Eyed Jack", "Mad Anne", "Salty Sam" ], "ships": [ { "name": "The Black Pearl", "type": "Sloop", "year": 1700 }, { "name": "The Revenge", "type": "Frigate", "year": 1715 } ], "treasures": [ { "name": "Cursed Aztec Gold", "value": "Priceless" }, { "name": "Emerald of the Sea", "value": "500,000 gold coins" }, { "name": "Golden Chalice", "value": "200,000 gold coins" } ], "adventures": [ { "year": 1698, "location": "Tortuga", "description": "First encounter with pirates, joined crew of Captain Redbeard." }, { "year": 1702, "location": "Isla de Muerta", "description": "Discovered cursed Aztec gold, narrowly escaped the wrath of cursed pirates." 
}, { "year": 1705, "location": "Port Royal", "description": "Led a successful raid on the governor's mansion, acquired valuable treasures." }, { "year": 1710, "location": "Treasure Island", "description": "Unearthed the hidden treasure of Captain Flint, became legendary among pirates." }, { "year": 1718, "location": "Nassau", "description": "Declared war against the British Empire, led a fleet of pirate ships." } ] } Output the story of your life in JSON
5 [PASS] Ahoy mateys o' the world! [PASS] (0.96) Ahoy thar, mateys! [PASS] Ahoy there, me hearties! Avast ye landlubbers! 'Tis I, a fearsome pirate, comin' to ye from the seven seas. Ahoy, hello world! [PASS] (0.96) Ahoy, me hearties! Avast ye, landlubbers! 'Tis I, a fearsome pirate, settin' sail on the vast seas. Hear me words, ye scurvy dogs: "Hello world!" Hello world
6 [PASS] Arrr, thar be a swift brown fox leaping o'er th' lazy dog. [FAIL] (0.50) Expected output to contain "arrr" --- Thar swift brown fox be jumpin' o'er thar lazy dog. [PASS] Arrr, me hearties! The quick brown fox be jumpin' over the lazy dog! Ye best be keepin' yer eyes peeled for that sly fox, or he'll be stealin' all yer booty! [FAIL] (0.50) Expected output to contain "arrr" --- Arr, me hearties! Listen up, ye scurvy dogs! The quick brown fox be jumpin' o'er the lazy dog, aye! The quick brown fox jumps over the lazy dog
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82

View File

@@ -40,6 +40,7 @@ function handleRougeScore(
: `${baseType.toUpperCase()} score ${score} is less than threshold ${
assertion.threshold || 0.75
}`,
assertion,
};
}
@@ -51,13 +52,14 @@ export async function runAssertions(test: AtomicTestCase, output: string): Promi
};
if (!test.assert || test.assert.length < 1) {
return { pass: true, score: 1, reason: 'No assertions', tokensUsed };
return { pass: true, score: 1, reason: 'No assertions', tokensUsed, assertion: null };
}
let totalScore = 0;
let totalWeight = 0;
let allPass = true;
let failedReason = '';
const componentResults: GradingResult[] = [];
for (const assertion of test.assert) {
const weight = assertion.weight || 1;
@@ -65,6 +67,8 @@ export async function runAssertions(test: AtomicTestCase, output: string): Promi
const result = await runAssertion(assertion, test, output);
totalScore += result.score * weight;
componentResults.push(result);
if (result.tokensUsed) {
tokensUsed.total += result.tokensUsed.total;
tokensUsed.prompt += result.tokensUsed.prompt;
@@ -85,6 +89,8 @@ export async function runAssertions(test: AtomicTestCase, output: string): Promi
score: totalScore / totalWeight,
reason: allPass ? 'All assertions passed' : failedReason,
tokensUsed,
componentResults,
assertion: null,
};
}
@@ -120,6 +126,7 @@ export async function runAssertion(
pass,
score: pass ? 1 : 0,
reason: pass ? 'Assertion passed' : `Expected output "${renderedValue}"`,
assertion,
};
}
@@ -134,6 +141,7 @@ export async function runAssertion(
pass,
score: pass ? 1 : 0,
reason: pass ? 'Assertion passed' : 'Expected output to be valid JSON',
assertion,
};
}
@@ -150,6 +158,7 @@ export async function runAssertion(
reason: pass
? 'Assertion passed'
: `Expected output to ${inverse ? 'not ' : ''}contain "${renderedValue}"`,
assertion,
};
}
@@ -166,6 +175,7 @@ export async function runAssertion(
reason: pass
? 'Assertion passed'
: `Expected output to ${inverse ? 'not ' : ''}contain one of "${renderedValue.join(', ')}"`,
assertion,
};
}
@@ -182,6 +192,7 @@ export async function runAssertion(
reason: pass
? 'Assertion passed'
: `Expected output to ${inverse ? 'not ' : ''}contain all of "${renderedValue.join(', ')}"`,
assertion,
};
}
@@ -199,6 +210,7 @@ export async function runAssertion(
reason: pass
? 'Assertion passed'
: `Expected output to ${inverse ? 'not ' : ''}match regex "${renderedValue}"`,
assertion,
};
}
@@ -215,6 +227,7 @@ export async function runAssertion(
reason: pass
? 'Assertion passed'
: `Expected output to ${inverse ? 'not ' : ''}contain "${renderedValue}"`,
assertion,
};
}
@@ -231,6 +244,7 @@ export async function runAssertion(
reason: pass
? 'Assertion passed'
: `Expected output to ${inverse ? 'not ' : ''}start with "${renderedValue}"`,
assertion,
};
}
@@ -242,6 +256,7 @@ export async function runAssertion(
reason: pass
? 'Assertion passed'
: `Expected output to ${inverse ? 'not ' : ''}contain valid JSON`,
assertion,
};
}
@@ -271,6 +286,7 @@ export async function runAssertion(
score: 0,
reason: `Custom function threw error: ${(err as Error).message}
${renderedValue}`,
assertion,
};
}
return {
@@ -280,6 +296,7 @@ ${renderedValue}`,
? 'Assertion passed'
: `Custom function returned ${inverse ? 'true' : 'false'}
${renderedValue}`,
assertion,
};
}
@@ -315,6 +332,7 @@ ${renderedValue}`,
pass: false,
score: 0,
reason: `Python code execution failed: ${(err as Error).message}`,
assertion,
};
}
return {
@@ -324,6 +342,7 @@ ${renderedValue}`,
? 'Assertion passed'
: `Python code returned ${pass ? 'true' : 'false'}
${assertion.value}`,
assertion,
};
}
@@ -333,7 +352,10 @@ ${assertion.value}`,
typeof renderedValue === 'string',
'"contains" assertion type must have a string value',
);
return matchesSimilarity(renderedValue, output, assertion.threshold || 0.75, inverse);
return {
assertion,
...(await matchesSimilarity(renderedValue, output, assertion.threshold || 0.75, inverse)),
};
}
if (baseType === 'llm-rubric') {
@@ -342,7 +364,10 @@ ${assertion.value}`,
typeof renderedValue === 'string',
'"contains" assertion type must have a string value',
);
return matchesLlmRubric(renderedValue, output, test.options);
return {
assertion,
...(await matchesLlmRubric(renderedValue, output, test.options)),
};
}
if (baseType === 'webhook') {
@@ -384,6 +409,7 @@ ${assertion.value}`,
pass: false,
score: 0,
reason: `Webhook error: ${(err as Error).message}`,
assertion,
};
}
@@ -391,6 +417,7 @@ ${assertion.value}`,
pass,
score,
reason: pass ? 'Assertion passed' : `Webhook returned ${inverse ? 'true' : 'false'}`,
assertion,
};
}
@@ -428,7 +455,7 @@ export async function matchesSimilarity(
output: string,
threshold: number,
inverse: boolean = false,
): Promise<GradingResult> {
): Promise<Omit<GradingResult, 'assertion'>> {
const expectedEmbedding = await DefaultEmbeddingProvider.callEmbeddingApi(expected);
const outputEmbedding = await DefaultEmbeddingProvider.callEmbeddingApi(output);
@@ -483,7 +510,7 @@ export async function matchesLlmRubric(
expected: string,
output: string,
options?: GradingConfig,
): Promise<GradingResult> {
): Promise<Omit<GradingResult, 'assertion'>> {
if (!options) {
throw new Error(
'Cannot grade output without grading config. Specify --grader option or grading config.',

View File

@@ -156,6 +156,7 @@ class Evaluator {
this.stats.tokenUsage.completion += checkResult.tokensUsed.completion;
}
ret.response = processedResponse;
ret.gradingResult = checkResult;
} else {
ret.success = false;
ret.score = 0;
@@ -464,6 +465,7 @@ class Evaluator {
prompt: row.prompt.raw,
latencyMs: row.latencyMs,
tokenUsage: row.response?.tokenUsage,
gradingResult: row.gradingResult,
};
},
);

View File

@@ -97,6 +97,7 @@ export interface EvaluateResult {
success: boolean;
score: number;
latencyMs: number;
gradingResult?: GradingResult;
}
export interface EvaluateTableOutput {
@@ -106,6 +107,7 @@ export interface EvaluateTableOutput {
prompt: string;
latencyMs: number;
tokenUsage?: Partial<TokenUsage>;
gradingResult?: GradingResult;
}
export interface EvaluateTable {
@@ -138,6 +140,8 @@ export interface GradingResult {
score: number;
reason: string;
tokensUsed?: TokenUsage;
componentResults?: GradingResult[];
assertion: Assertion | null;
}
type BaseAssertionTypes =

View File

@@ -1,4 +1,5 @@
import { useState, useEffect } from 'react';
import Box from '@mui/material/Box';
import Button from '@mui/material/Button';
import Dialog from '@mui/material/Dialog';
import DialogActions from '@mui/material/DialogActions';
@@ -8,12 +9,60 @@ import TextareaAutosize from '@mui/base/TextareaAutosize';
import IconButton from '@mui/material/IconButton';
import ContentCopyIcon from '@mui/icons-material/ContentCopy';
import CheckIcon from '@mui/icons-material/Check';
import Table from '@mui/material/Table';
import TableBody from '@mui/material/TableBody';
import TableCell from '@mui/material/TableCell';
import TableContainer from '@mui/material/TableContainer';
import TableHead from '@mui/material/TableHead';
import TableRow from '@mui/material/TableRow';
import Typography from '@mui/material/Typography';
import type { GradingResult } from '../../../types';
/** Props accepted by the EvalOutputPromptDialog component. */
interface EvalOutputPromptDialogProps {
/** Forwarded to the underlying Dialog's `open` prop. */
open: boolean;
/** Forwarded to the Dialog's `onClose` and also wired to the "Close" button. */
onClose: () => void;
/** Prompt text shown in a read-only textarea; also the text copied by the copy button. */
prompt: string;
/** Output text shown in a second read-only textarea; the section is omitted when this is falsy. */
output?: string;
/** Grading results handed to AssertionResults, which renders nothing when this is undefined. */
gradingResults?: GradingResult[];
}
/**
 * Converts an assertion's expected value into display text for the table.
 *
 * - `undefined`, `null`, the empty string and empty arrays render as `-`.
 * - Arrays are joined with ", " (same separator the grader uses in its
 *   "contain one of" / "contain all of" reason strings).
 * - Plain objects are JSON-encoded so they do not show up as "[object Object]".
 * - Everything else (strings, numbers, booleans, functions) goes through
 *   `String()`, so legitimate falsy values such as `0` or `false` stay visible.
 */
function formatAssertionValue(value: unknown): string {
  if (value === undefined || value === null || value === '') {
    return '-';
  }
  if (Array.isArray(value)) {
    return value.length > 0 ? value.map((item) => formatAssertionValue(item)).join(', ') : '-';
  }
  if (typeof value === 'object') {
    try {
      // JSON.stringify may return undefined (e.g. a toJSON that yields undefined).
      return JSON.stringify(value) ?? String(value);
    } catch {
      // Circular structures cannot be serialized; fall back to the default string form.
      return String(value);
    }
  }
  return String(value);
}

/**
 * Table listing one row per assertion result: pass/fail marker, score,
 * assertion type, expected value and the grader's reason.
 *
 * Renders nothing when `gradingResults` is undefined or empty, so the dialog
 * does not show an "Assertions" heading above a header-only table.
 */
function AssertionResults({ gradingResults }: { gradingResults?: GradingResult[] }) {
  if (!gradingResults || gradingResults.length === 0) {
    return null;
  }
  return (
    <Box mt={2}>
      <Typography variant="subtitle1">Assertions</Typography>
      <TableContainer>
        <Table>
          <TableHead>
            <TableRow>
              <TableCell style={{ fontWeight: 'bold' }}>Pass</TableCell>
              <TableCell style={{ fontWeight: 'bold' }}>Score</TableCell>
              <TableCell style={{ fontWeight: 'bold' }}>Type</TableCell>
              <TableCell style={{ fontWeight: 'bold' }}>Value</TableCell>
              <TableCell style={{ fontWeight: 'bold' }}>Reason</TableCell>
            </TableRow>
          </TableHead>
          <TableBody>
            {gradingResults.map((result, i) => (
              // Results carry no stable id, and the list is never reordered, so the index is used as key.
              <TableRow key={i}>
                <TableCell>{result.pass ? '✅' : '❌'}</TableCell>
                <TableCell>
                  {/* Two decimals, matching the "(0.96)" form used in exported results. */}
                  {Number.isFinite(result.score) ? result.score.toFixed(2) : String(result.score)}
                </TableCell>
                <TableCell>{result.assertion?.type || ''}</TableCell>
                <TableCell>{formatAssertionValue(result.assertion?.value)}</TableCell>
                <TableCell>{result.reason}</TableCell>
              </TableRow>
            ))}
          </TableBody>
        </Table>
      </TableContainer>
    </Box>
  );
}
export default function EvalOutputPromptDialog({
@@ -21,6 +70,7 @@ export default function EvalOutputPromptDialog({
onClose,
prompt,
output,
gradingResults,
}: EvalOutputPromptDialogProps) {
const [copied, setCopied] = useState(false);
@@ -35,28 +85,40 @@ export default function EvalOutputPromptDialog({
return (
<Dialog open={open} onClose={onClose} fullWidth maxWidth="lg">
<DialogTitle>Prompt</DialogTitle>
<DialogTitle>Details</DialogTitle>
<DialogContent>
<TextareaAutosize readOnly value={prompt} style={{ width: '100%', padding: '0.75rem' }} />
<IconButton
onClick={() => copyToClipboard(prompt)}
style={{ position: 'absolute', right: '10px', top: '10px' }}
>
{copied ? <CheckIcon /> : <ContentCopyIcon />}
</IconButton>
</DialogContent>
{output && (
<>
<DialogTitle>Output</DialogTitle>
<DialogContent>
<Box mb={2}>
<Typography variant="subtitle1" style={{ marginBottom: '1rem' }}>
Prompt
</Typography>
<TextareaAutosize
readOnly
value={prompt}
style={{ width: '100%', padding: '0.75rem' }}
maxRows={20}
/>
<IconButton
onClick={() => copyToClipboard(prompt)}
style={{ position: 'absolute', right: '10px', top: '10px' }}
>
{copied ? <CheckIcon /> : <ContentCopyIcon />}
</IconButton>
</Box>
{output && (
<Box my={2}>
<Typography variant="subtitle1" style={{ marginBottom: '1rem', marginTop: '1rem' }}>
Output
</Typography>
<TextareaAutosize
readOnly
maxRows={20}
value={output}
style={{ width: '100%', padding: '0.75rem' }}
/>
</DialogContent>
</>
)}
</Box>
)}
<AssertionResults gradingResults={gradingResults} />
</DialogContent>
<DialogActions>
<Button onClick={onClose}>Close</Button>
</DialogActions>

View File

@@ -155,6 +155,7 @@ function EvalOutputCell({
open={openPrompt}
onClose={handlePromptClose}
prompt={output.prompt}
gradingResults={output.gradingResult?.componentResults}
output={text}
/>
</>
@@ -223,12 +224,25 @@ export default function ResultsTable({
const { table, setTable } = useStore();
invariant(table, 'Table should be defined');
const { head, body } = table;
const numGood = head.prompts.map((_, idx) =>
const numGoodTests = head.prompts.map((_, idx) =>
body.reduce((acc, row) => {
return acc + (row.outputs[idx].pass ? 1 : 0);
}, 0),
);
const numAsserts = head.prompts.map((_, idx) =>
body.reduce((acc, row) => {
return acc + (row.outputs[idx].gradingResult?.componentResults?.length || 0);
}, 0),
);
const numGoodAsserts = head.prompts.map((_, idx) =>
body.reduce((acc, row) => {
const componentResults = row.outputs[idx].gradingResult?.componentResults;
return acc + (componentResults ? componentResults.filter((r) => r.pass).length : 0);
}, 0),
);
const handleRating = (rowIndex: number, promptIndex: number, isPass: boolean) => {
const updatedData = [...body];
const updatedRow = { ...updatedData[rowIndex] };
@@ -243,10 +257,13 @@ export default function ResultsTable({
});
};
const highestPassingIndex = numGood.reduce((maxIndex, currentPassCount, currentIndex, array) => {
return currentPassCount > array[maxIndex] ? currentIndex : maxIndex;
}, 0);
const highestPassingCount = numGood[highestPassingIndex];
const highestPassingIndex = numGoodTests.reduce(
(maxIndex, currentPassCount, currentIndex, array) => {
return currentPassCount > array[maxIndex] ? currentIndex : maxIndex;
},
0,
);
const highestPassingCount = numGoodTests[highestPassingIndex];
const columnHelper = createColumnHelper<EvalRow>();
const columns = [
columnHelper.group({
@@ -282,9 +299,9 @@ export default function ResultsTable({
columnHelper.accessor((row: EvalRow) => formatRowOutput(row.outputs[idx]), {
id: `Prompt ${idx + 1}`,
header: () => {
const pct = ((numGood[idx] / body.length) * 100.0).toFixed(2);
const pct = ((numGoodTests[idx] / body.length) * 100.0).toFixed(2);
const isHighestPassing =
numGood[idx] === highestPassingCount && highestPassingCount !== 0;
numGoodTests[idx] === highestPassingCount && highestPassingCount !== 0;
const columnId = `Prompt ${idx + 1}`;
const isChecked = failureFilter[columnId] || false;
// TODO(ian): prompt string support for backwards compatibility, remove after 0.17.0
@@ -313,12 +330,18 @@ export default function ResultsTable({
/>
)}
<div className={`summary ${isHighestPassing ? 'highlight' : ''}`}>
Passing: <strong>{pct}%</strong> ({numGood[idx]} / {body.length})
Passing: <strong>{pct}%</strong> ({numGoodTests[idx]}/{body.length} cases
{numAsserts[idx] ? (
<span>
, {numGoodAsserts[idx]}/{numAsserts[idx]} asserts
</span>
) : null}
)
</div>
</>
);
},
cell: (info: CellContext<EvalRow, string>) => (
cell: (info: CellContext<EvalRow, EvalRowOutput>) => (
<EvalOutputCell
output={info.getValue() as unknown as EvalRowOutput}
maxTextLength={maxTextLength}

View File

@@ -1,4 +1,4 @@
import { TokenUsage } from '../../../types';
import { EvaluateResult, TokenUsage } from '../../../types';
type Prompt = {
display: string;
@@ -10,6 +10,7 @@ export type EvalHead = {
vars: string[];
};
// TODO(ian): Remove this and replace with EvaluateResult
export type EvalRowOutput = {
pass: boolean;
score: number;
@@ -17,6 +18,7 @@ export type EvalRowOutput = {
prompt: string;
latencyMs: number;
tokenUsage?: Partial<TokenUsage>;
gradingResult: EvaluateResult['gradingResult'];
};
export type EvalRow = {

View File

@@ -70,12 +70,22 @@ describe('runAssertion', () => {
const javascriptFunctionAssertion: Assertion = {
type: 'javascript',
value: async (output: string) => ({ pass: true, score: 0.5, reason: 'Assertion passed' }),
value: async (output: string) => ({
pass: true,
score: 0.5,
reason: 'Assertion passed',
assertion: null,
}),
};
const javascriptFunctionFailAssertion: Assertion = {
type: 'javascript',
value: async (output: string) => ({ pass: false, score: 0.5, reason: 'Assertion failed' }),
value: async (output: string) => ({
pass: false,
score: 0.5,
reason: 'Assertion failed',
assertion: null,
}),
};
it('should pass when the equality assertion passes', async () => {