Add assertions in webview (#100)

This commit is contained in:
Ian Webster
2023-08-07 16:14:55 -07:00
committed by GitHub
parent 2e40da26bf
commit 747d6f3434
8 changed files with 219 additions and 87 deletions

View File

@@ -1,80 +1,82 @@
Rephrase this from English to Pirate: {{body}},Pretend you're a pirate and speak these words: {{body}},body "{""raw"":""Rephrase this from English to Pirate: {{body}}"",""display"":""Rephrase this from English to Pirate: {{body}}""}","{""raw"":""Pretend you're a pirate and speak these words: {{body}}"",""display"":""Pretend you're a pirate and speak these words: {{body}}""}",body
"[FAIL] Expected output ""Yarr"" "[FAIL] (0.00) Expected output ""Yarr""
--- ---
Aye.","[FAIL] Expected output ""Yarr"" Aye","[FAIL] (0.00) Expected output ""Yarr""
--- ---
Ahoy mateys! Yes, I be a pirate and proud of it! Ye best be ready to hoist the Jolly Roger and set sail on the high seas with me crew. Yes, we'll be plunderin' and pillagin' all the treasure we can find. So, hoist the anchor and let's set sail! Yes, we'll be the most feared pirates on the seven seas! Arrr!",Yes Arr, me hearties! Yes, be the word ye be seekin'.",Yes
"[PASS] I be feelin' a mighty need for grub, matey.","[PASS] Arrr, me belly be rumblin'! I be needin' some grub, mateys! Bring me some vittles or ye'll be walkin' the plank!",I'm hungry "[FAIL] (0.00) Expected output to contain ""grub""
"[PASS] ""Spew forth the tale of yer life in JSON, me hearty!""","[PASS] { ---
I be feelin' a mighty hunger.","[PASS] (1.00) Arr, me hearties! I be feelin' a mighty rumblin' in me belly, for I be famished! Aye, I be longin' for a feast fit for a pirate. Bring me a bounty of savory meats, hearty stews, and fresh fruits to quell this hunger that be gnawin' at me insides. I be yearnin' for a taste of the sea, with fish so fresh it be jumpin' straight from the ocean onto me plate. So, me mateys, let's set sail on a culinary adventure and fill our bellies with the finest grub a pirate could ask for!",I'm hungry
"[PASS] (1.00) ""Splice out the tale of yer life in JSON, matey!""","[PASS] (1.00) {
""name"": ""Captain Blackbeard"", ""name"": ""Captain Blackbeard"",
""age"": 45, ""alias"": ""The Dread Pirate"",
""occupation"": ""Pirate"", ""occupation"": ""Pirate"",
""birthplace"": ""Port Royal"",
""birthyear"": 1680,
""nationality"": ""English"",
""crew"": [ ""crew"": [
{ ""Redbeard"",
""name"": ""Redbeard"", ""One-Eyed Jack"",
""age"": 32, ""Mad Anne"",
""position"": ""First Mate"" ""Salty Sam""
},
{
""name"": ""Scurvy Pete"",
""age"": 28,
""position"": ""Navigator""
},
{
""name"": ""One-Eyed Jack"",
""age"": 50,
""position"": ""Quartermaster""
}
], ],
""ships"": [ ""ships"": [
{ {
""name"": ""The Black Pearl"", ""name"": ""The Black Pearl"",
""type"": ""Galleon"", ""type"": ""Sloop"",
""captured"": [ ""year"": 1700
""The Spanish Armada"",
""The British Navy""
]
}, },
{ {
""name"": ""The Jolly Roger"", ""name"": ""The Revenge"",
""type"": ""Sloop"", ""type"": ""Frigate"",
""captured"": [ ""year"": 1715
""Merchant ships"",
""Fishing vessels""
]
} }
], ],
""treasure"": [ ""treasures"": [
{ {
""type"": ""Gold"", ""name"": ""Cursed Aztec Gold"",
""amount"": 50000 ""value"": ""Priceless""
}, },
{ {
""type"": ""Silver"", ""name"": ""Emerald of the Sea"",
""amount"": 25000 ""value"": ""500,000 gold coins""
}, },
{ {
""type"": ""Jewels"", ""name"": ""Golden Chalice"",
""amount"": 100000 ""value"": ""200,000 gold coins""
} }
], ],
""adventures"": [ ""adventures"": [
{ {
""name"": ""The Battle of Nassau"", ""year"": 1698,
""location"": ""Nassau"",
""outcome"": ""Victory""
},
{
""name"": ""The Raid on Port Royal"",
""location"": ""Port Royal"",
""outcome"": ""Success""
},
{
""name"": ""The Treasure of Tortuga"",
""location"": ""Tortuga"", ""location"": ""Tortuga"",
""outcome"": ""Success"" ""description"": ""First encounter with pirates, joined crew of Captain Redbeard.""
},
{
""year"": 1702,
""location"": ""Isla de Muerta"",
""description"": ""Discovered cursed Aztec gold, narrowly escaped the wrath of cursed pirates.""
},
{
""year"": 1705,
""location"": ""Port Royal"",
""description"": ""Led a successful raid on the governor's mansion, acquired valuable treasures.""
},
{
""year"": 1710,
""location"": ""Treasure Island"",
""description"": ""Unearthed the hidden treasure of Captain Flint, became legendary among pirates.""
},
{
""year"": 1718,
""location"": ""Nassau"",
""description"": ""Declared war against the British Empire, led a fleet of pirate ships.""
} }
] ]
}",Output the story of your life in JSON }",Output the story of your life in JSON
[PASS] Ahoy mateys o' the world!,"[PASS] Ahoy there, me hearties! Avast ye landlubbers! 'Tis I, a fearsome pirate, comin' to ye from the seven seas. Ahoy, hello world!",Hello world "[PASS] (0.96) Ahoy thar, mateys!","[PASS] (0.96) Ahoy, me hearties! Avast ye, landlubbers! 'Tis I, a fearsome pirate, settin' sail on the vast seas. Hear me words, ye scurvy dogs: ""Hello world!""",Hello world
"[PASS] Arrr, thar be a swift brown fox leaping o'er th' lazy dog.","[PASS] Arrr, me hearties! The quick brown fox be jumpin' over the lazy dog! Ye best be keepin' yer eyes peeled for that sly fox, or he'll be stealin' all yer booty!",The quick brown fox jumps over the lazy dog "[FAIL] (0.50) Expected output to contain ""arrr""
---
Thar swift brown fox be jumpin' o'er thar lazy dog.","[FAIL] (0.50) Expected output to contain ""arrr""
---
Arr, me hearties! Listen up, ye scurvy dogs! The quick brown fox be jumpin' o'er the lazy dog, aye!",The quick brown fox jumps over the lazy dog
1 Rephrase this from English to Pirate: {{body}} {"raw":"Rephrase this from English to Pirate: {{body}}","display":"Rephrase this from English to Pirate: {{body}}"} Pretend you're a pirate and speak these words: {{body}} {"raw":"Pretend you're a pirate and speak these words: {{body}}","display":"Pretend you're a pirate and speak these words: {{body}}"} body
2 [FAIL] Expected output "Yarr" --- Aye. [FAIL] (0.00) Expected output "Yarr" --- Aye [FAIL] Expected output "Yarr" --- Ahoy mateys! Yes, I be a pirate and proud of it! Ye best be ready to hoist the Jolly Roger and set sail on the high seas with me crew. Yes, we'll be plunderin' and pillagin' all the treasure we can find. So, hoist the anchor and let's set sail! Yes, we'll be the most feared pirates on the seven seas! Arrr! [FAIL] (0.00) Expected output "Yarr" --- Arr, me hearties! Yes, be the word ye be seekin'. Yes
3 [PASS] I be feelin' a mighty need for grub, matey. [FAIL] (0.00) Expected output to contain "grub" --- I be feelin' a mighty hunger. [PASS] Arrr, me belly be rumblin'! I be needin' some grub, mateys! Bring me some vittles or ye'll be walkin' the plank! [PASS] (1.00) Arr, me hearties! I be feelin' a mighty rumblin' in me belly, for I be famished! Aye, I be longin' for a feast fit for a pirate. Bring me a bounty of savory meats, hearty stews, and fresh fruits to quell this hunger that be gnawin' at me insides. I be yearnin' for a taste of the sea, with fish so fresh it be jumpin' straight from the ocean onto me plate. So, me mateys, let's set sail on a culinary adventure and fill our bellies with the finest grub a pirate could ask for! I'm hungry
4 [PASS] "Spew forth the tale of yer life in JSON, me hearty!" [PASS] (1.00) "Splice out the tale of yer life in JSON, matey!" [PASS] { "name": "Captain Blackbeard", "age": 45, "occupation": "Pirate", "crew": [ { "name": "Redbeard", "age": 32, "position": "First Mate" }, { "name": "Scurvy Pete", "age": 28, "position": "Navigator" }, { "name": "One-Eyed Jack", "age": 50, "position": "Quartermaster" } ], "ships": [ { "name": "The Black Pearl", "type": "Galleon", "captured": [ "The Spanish Armada", "The British Navy" ] }, { "name": "The Jolly Roger", "type": "Sloop", "captured": [ "Merchant ships", "Fishing vessels" ] } ], "treasure": [ { "type": "Gold", "amount": 50000 }, { "type": "Silver", "amount": 25000 }, { "type": "Jewels", "amount": 100000 } ], "adventures": [ { "name": "The Battle of Nassau", "location": "Nassau", "outcome": "Victory" }, { "name": "The Raid on Port Royal", "location": "Port Royal", "outcome": "Success" }, { "name": "The Treasure of Tortuga", "location": "Tortuga", "outcome": "Success" } ] } [PASS] (1.00) { "name": "Captain Blackbeard", "alias": "The Dread Pirate", "occupation": "Pirate", "birthplace": "Port Royal", "birthyear": 1680, "nationality": "English", "crew": [ "Redbeard", "One-Eyed Jack", "Mad Anne", "Salty Sam" ], "ships": [ { "name": "The Black Pearl", "type": "Sloop", "year": 1700 }, { "name": "The Revenge", "type": "Frigate", "year": 1715 } ], "treasures": [ { "name": "Cursed Aztec Gold", "value": "Priceless" }, { "name": "Emerald of the Sea", "value": "500,000 gold coins" }, { "name": "Golden Chalice", "value": "200,000 gold coins" } ], "adventures": [ { "year": 1698, "location": "Tortuga", "description": "First encounter with pirates, joined crew of Captain Redbeard." }, { "year": 1702, "location": "Isla de Muerta", "description": "Discovered cursed Aztec gold, narrowly escaped the wrath of cursed pirates." }, { "year": 1705, "location": "Port Royal", "description": "Led a successful raid on the governor's mansion, acquired valuable treasures." }, { "year": 1710, "location": "Treasure Island", "description": "Unearthed the hidden treasure of Captain Flint, became legendary among pirates." }, { "year": 1718, "location": "Nassau", "description": "Declared war against the British Empire, led a fleet of pirate ships." } ] } Output the story of your life in JSON
5 [PASS] Ahoy mateys o' the world! [PASS] (0.96) Ahoy thar, mateys! [PASS] Ahoy there, me hearties! Avast ye landlubbers! 'Tis I, a fearsome pirate, comin' to ye from the seven seas. Ahoy, hello world! [PASS] (0.96) Ahoy, me hearties! Avast ye, landlubbers! 'Tis I, a fearsome pirate, settin' sail on the vast seas. Hear me words, ye scurvy dogs: "Hello world!" Hello world
6 [PASS] Arrr, thar be a swift brown fox leaping o'er th' lazy dog. [FAIL] (0.50) Expected output to contain "arrr" --- Thar swift brown fox be jumpin' o'er thar lazy dog. [PASS] Arrr, me hearties! The quick brown fox be jumpin' over the lazy dog! Ye best be keepin' yer eyes peeled for that sly fox, or he'll be stealin' all yer booty! [FAIL] (0.50) Expected output to contain "arrr" --- Arr, me hearties! Listen up, ye scurvy dogs! The quick brown fox be jumpin' o'er the lazy dog, aye! The quick brown fox jumps over the lazy dog
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82

View File

@@ -40,6 +40,7 @@ function handleRougeScore(
: `${baseType.toUpperCase()} score ${score} is less than threshold ${ : `${baseType.toUpperCase()} score ${score} is less than threshold ${
assertion.threshold || 0.75 assertion.threshold || 0.75
}`, }`,
assertion,
}; };
} }
@@ -51,13 +52,14 @@ export async function runAssertions(test: AtomicTestCase, output: string): Promi
}; };
if (!test.assert || test.assert.length < 1) { if (!test.assert || test.assert.length < 1) {
return { pass: true, score: 1, reason: 'No assertions', tokensUsed }; return { pass: true, score: 1, reason: 'No assertions', tokensUsed, assertion: null };
} }
let totalScore = 0; let totalScore = 0;
let totalWeight = 0; let totalWeight = 0;
let allPass = true; let allPass = true;
let failedReason = ''; let failedReason = '';
const componentResults: GradingResult[] = [];
for (const assertion of test.assert) { for (const assertion of test.assert) {
const weight = assertion.weight || 1; const weight = assertion.weight || 1;
@@ -65,6 +67,8 @@ export async function runAssertions(test: AtomicTestCase, output: string): Promi
const result = await runAssertion(assertion, test, output); const result = await runAssertion(assertion, test, output);
totalScore += result.score * weight; totalScore += result.score * weight;
componentResults.push(result);
if (result.tokensUsed) { if (result.tokensUsed) {
tokensUsed.total += result.tokensUsed.total; tokensUsed.total += result.tokensUsed.total;
tokensUsed.prompt += result.tokensUsed.prompt; tokensUsed.prompt += result.tokensUsed.prompt;
@@ -85,6 +89,8 @@ export async function runAssertions(test: AtomicTestCase, output: string): Promi
score: totalScore / totalWeight, score: totalScore / totalWeight,
reason: allPass ? 'All assertions passed' : failedReason, reason: allPass ? 'All assertions passed' : failedReason,
tokensUsed, tokensUsed,
componentResults,
assertion: null,
}; };
} }
@@ -120,6 +126,7 @@ export async function runAssertion(
pass, pass,
score: pass ? 1 : 0, score: pass ? 1 : 0,
reason: pass ? 'Assertion passed' : `Expected output "${renderedValue}"`, reason: pass ? 'Assertion passed' : `Expected output "${renderedValue}"`,
assertion,
}; };
} }
@@ -134,6 +141,7 @@ export async function runAssertion(
pass, pass,
score: pass ? 1 : 0, score: pass ? 1 : 0,
reason: pass ? 'Assertion passed' : 'Expected output to be valid JSON', reason: pass ? 'Assertion passed' : 'Expected output to be valid JSON',
assertion,
}; };
} }
@@ -150,6 +158,7 @@ export async function runAssertion(
reason: pass reason: pass
? 'Assertion passed' ? 'Assertion passed'
: `Expected output to ${inverse ? 'not ' : ''}contain "${renderedValue}"`, : `Expected output to ${inverse ? 'not ' : ''}contain "${renderedValue}"`,
assertion,
}; };
} }
@@ -166,6 +175,7 @@ export async function runAssertion(
reason: pass reason: pass
? 'Assertion passed' ? 'Assertion passed'
: `Expected output to ${inverse ? 'not ' : ''}contain one of "${renderedValue.join(', ')}"`, : `Expected output to ${inverse ? 'not ' : ''}contain one of "${renderedValue.join(', ')}"`,
assertion,
}; };
} }
@@ -182,6 +192,7 @@ export async function runAssertion(
reason: pass reason: pass
? 'Assertion passed' ? 'Assertion passed'
: `Expected output to ${inverse ? 'not ' : ''}contain all of "${renderedValue.join(', ')}"`, : `Expected output to ${inverse ? 'not ' : ''}contain all of "${renderedValue.join(', ')}"`,
assertion,
}; };
} }
@@ -199,6 +210,7 @@ export async function runAssertion(
reason: pass reason: pass
? 'Assertion passed' ? 'Assertion passed'
: `Expected output to ${inverse ? 'not ' : ''}match regex "${renderedValue}"`, : `Expected output to ${inverse ? 'not ' : ''}match regex "${renderedValue}"`,
assertion,
}; };
} }
@@ -215,6 +227,7 @@ export async function runAssertion(
reason: pass reason: pass
? 'Assertion passed' ? 'Assertion passed'
: `Expected output to ${inverse ? 'not ' : ''}contain "${renderedValue}"`, : `Expected output to ${inverse ? 'not ' : ''}contain "${renderedValue}"`,
assertion,
}; };
} }
@@ -231,6 +244,7 @@ export async function runAssertion(
reason: pass reason: pass
? 'Assertion passed' ? 'Assertion passed'
: `Expected output to ${inverse ? 'not ' : ''}start with "${renderedValue}"`, : `Expected output to ${inverse ? 'not ' : ''}start with "${renderedValue}"`,
assertion,
}; };
} }
@@ -242,6 +256,7 @@ export async function runAssertion(
reason: pass reason: pass
? 'Assertion passed' ? 'Assertion passed'
: `Expected output to ${inverse ? 'not ' : ''}contain valid JSON`, : `Expected output to ${inverse ? 'not ' : ''}contain valid JSON`,
assertion,
}; };
} }
@@ -271,6 +286,7 @@ export async function runAssertion(
score: 0, score: 0,
reason: `Custom function threw error: ${(err as Error).message} reason: `Custom function threw error: ${(err as Error).message}
${renderedValue}`, ${renderedValue}`,
assertion,
}; };
} }
return { return {
@@ -280,6 +296,7 @@ ${renderedValue}`,
? 'Assertion passed' ? 'Assertion passed'
: `Custom function returned ${inverse ? 'true' : 'false'} : `Custom function returned ${inverse ? 'true' : 'false'}
${renderedValue}`, ${renderedValue}`,
assertion,
}; };
} }
@@ -315,6 +332,7 @@ ${renderedValue}`,
pass: false, pass: false,
score: 0, score: 0,
reason: `Python code execution failed: ${(err as Error).message}`, reason: `Python code execution failed: ${(err as Error).message}`,
assertion,
}; };
} }
return { return {
@@ -324,6 +342,7 @@ ${renderedValue}`,
? 'Assertion passed' ? 'Assertion passed'
: `Python code returned ${pass ? 'true' : 'false'} : `Python code returned ${pass ? 'true' : 'false'}
${assertion.value}`, ${assertion.value}`,
assertion,
}; };
} }
@@ -333,7 +352,10 @@ ${assertion.value}`,
typeof renderedValue === 'string', typeof renderedValue === 'string',
'"contains" assertion type must have a string value', '"contains" assertion type must have a string value',
); );
return matchesSimilarity(renderedValue, output, assertion.threshold || 0.75, inverse); return {
assertion,
...(await matchesSimilarity(renderedValue, output, assertion.threshold || 0.75, inverse)),
};
} }
if (baseType === 'llm-rubric') { if (baseType === 'llm-rubric') {
@@ -342,7 +364,10 @@ ${assertion.value}`,
typeof renderedValue === 'string', typeof renderedValue === 'string',
'"contains" assertion type must have a string value', '"contains" assertion type must have a string value',
); );
return matchesLlmRubric(renderedValue, output, test.options); return {
assertion,
...(await matchesLlmRubric(renderedValue, output, test.options)),
};
} }
if (baseType === 'webhook') { if (baseType === 'webhook') {
@@ -384,6 +409,7 @@ ${assertion.value}`,
pass: false, pass: false,
score: 0, score: 0,
reason: `Webhook error: ${(err as Error).message}`, reason: `Webhook error: ${(err as Error).message}`,
assertion,
}; };
} }
@@ -391,6 +417,7 @@ ${assertion.value}`,
pass, pass,
score, score,
reason: pass ? 'Assertion passed' : `Webhook returned ${inverse ? 'true' : 'false'}`, reason: pass ? 'Assertion passed' : `Webhook returned ${inverse ? 'true' : 'false'}`,
assertion,
}; };
} }
@@ -428,7 +455,7 @@ export async function matchesSimilarity(
output: string, output: string,
threshold: number, threshold: number,
inverse: boolean = false, inverse: boolean = false,
): Promise<GradingResult> { ): Promise<Omit<GradingResult, 'assertion'>> {
const expectedEmbedding = await DefaultEmbeddingProvider.callEmbeddingApi(expected); const expectedEmbedding = await DefaultEmbeddingProvider.callEmbeddingApi(expected);
const outputEmbedding = await DefaultEmbeddingProvider.callEmbeddingApi(output); const outputEmbedding = await DefaultEmbeddingProvider.callEmbeddingApi(output);
@@ -483,7 +510,7 @@ export async function matchesLlmRubric(
expected: string, expected: string,
output: string, output: string,
options?: GradingConfig, options?: GradingConfig,
): Promise<GradingResult> { ): Promise<Omit<GradingResult, 'assertion'>> {
if (!options) { if (!options) {
throw new Error( throw new Error(
'Cannot grade output without grading config. Specify --grader option or grading config.', 'Cannot grade output without grading config. Specify --grader option or grading config.',

View File

@@ -156,6 +156,7 @@ class Evaluator {
this.stats.tokenUsage.completion += checkResult.tokensUsed.completion; this.stats.tokenUsage.completion += checkResult.tokensUsed.completion;
} }
ret.response = processedResponse; ret.response = processedResponse;
ret.gradingResult = checkResult;
} else { } else {
ret.success = false; ret.success = false;
ret.score = 0; ret.score = 0;
@@ -464,6 +465,7 @@ class Evaluator {
prompt: row.prompt.raw, prompt: row.prompt.raw,
latencyMs: row.latencyMs, latencyMs: row.latencyMs,
tokenUsage: row.response?.tokenUsage, tokenUsage: row.response?.tokenUsage,
gradingResult: row.gradingResult,
}; };
}, },
); );

View File

@@ -97,6 +97,7 @@ export interface EvaluateResult {
success: boolean; success: boolean;
score: number; score: number;
latencyMs: number; latencyMs: number;
gradingResult?: GradingResult;
} }
export interface EvaluateTableOutput { export interface EvaluateTableOutput {
@@ -106,6 +107,7 @@ export interface EvaluateTableOutput {
prompt: string; prompt: string;
latencyMs: number; latencyMs: number;
tokenUsage?: Partial<TokenUsage>; tokenUsage?: Partial<TokenUsage>;
gradingResult?: GradingResult;
} }
export interface EvaluateTable { export interface EvaluateTable {
@@ -138,6 +140,8 @@ export interface GradingResult {
score: number; score: number;
reason: string; reason: string;
tokensUsed?: TokenUsage; tokensUsed?: TokenUsage;
componentResults?: GradingResult[];
assertion: Assertion | null;
} }
type BaseAssertionTypes = type BaseAssertionTypes =

View File

@@ -1,4 +1,5 @@
import { useState, useEffect } from 'react'; import { useState, useEffect } from 'react';
import Box from '@mui/material/Box';
import Button from '@mui/material/Button'; import Button from '@mui/material/Button';
import Dialog from '@mui/material/Dialog'; import Dialog from '@mui/material/Dialog';
import DialogActions from '@mui/material/DialogActions'; import DialogActions from '@mui/material/DialogActions';
@@ -8,12 +9,60 @@ import TextareaAutosize from '@mui/base/TextareaAutosize';
import IconButton from '@mui/material/IconButton'; import IconButton from '@mui/material/IconButton';
import ContentCopyIcon from '@mui/icons-material/ContentCopy'; import ContentCopyIcon from '@mui/icons-material/ContentCopy';
import CheckIcon from '@mui/icons-material/Check'; import CheckIcon from '@mui/icons-material/Check';
import Table from '@mui/material/Table';
import TableBody from '@mui/material/TableBody';
import TableCell from '@mui/material/TableCell';
import TableContainer from '@mui/material/TableContainer';
import TableHead from '@mui/material/TableHead';
import TableRow from '@mui/material/TableRow';
import Typography from '@mui/material/Typography';
import type { GradingResult } from '../../../types';
interface EvalOutputPromptDialogProps { interface EvalOutputPromptDialogProps {
open: boolean; open: boolean;
onClose: () => void; onClose: () => void;
prompt: string; prompt: string;
output?: string; output?: string;
gradingResults?: GradingResult[];
}
function AssertionResults({ gradingResults }: { gradingResults?: GradingResult[] }) {
if (!gradingResults) {
return null;
}
return (
<Box mt={2}>
<Typography variant="subtitle1">Assertions</Typography>
<TableContainer>
<Table>
<TableHead>
<TableRow>
<TableCell style={{ fontWeight: 'bold' }}>Pass</TableCell>
<TableCell style={{ fontWeight: 'bold' }}>Score</TableCell>
<TableCell style={{ fontWeight: 'bold' }}>Type</TableCell>
<TableCell style={{ fontWeight: 'bold' }}>Value</TableCell>
<TableCell style={{ fontWeight: 'bold' }}>Reason</TableCell>
</TableRow>
</TableHead>
<TableBody>
{gradingResults.map((result, i) => (
<TableRow key={i}>
<TableCell>{result.pass ? '✅' : '❌'}</TableCell>
<TableCell>{result.score}</TableCell>
<TableCell>{result.assertion?.type || ''}</TableCell>
<TableCell>
{result.assertion?.value ? String(result.assertion.value) : '-'}
</TableCell>
<TableCell>{result.reason}</TableCell>
</TableRow>
))}
</TableBody>
</Table>
</TableContainer>
</Box>
);
} }
export default function EvalOutputPromptDialog({ export default function EvalOutputPromptDialog({
@@ -21,6 +70,7 @@ export default function EvalOutputPromptDialog({
onClose, onClose,
prompt, prompt,
output, output,
gradingResults,
}: EvalOutputPromptDialogProps) { }: EvalOutputPromptDialogProps) {
const [copied, setCopied] = useState(false); const [copied, setCopied] = useState(false);
@@ -35,28 +85,40 @@ export default function EvalOutputPromptDialog({
return ( return (
<Dialog open={open} onClose={onClose} fullWidth maxWidth="lg"> <Dialog open={open} onClose={onClose} fullWidth maxWidth="lg">
<DialogTitle>Prompt</DialogTitle> <DialogTitle>Details</DialogTitle>
<DialogContent> <DialogContent>
<TextareaAutosize readOnly value={prompt} style={{ width: '100%', padding: '0.75rem' }} /> <Box mb={2}>
<IconButton <Typography variant="subtitle1" style={{ marginBottom: '1rem' }}>
onClick={() => copyToClipboard(prompt)} Prompt
style={{ position: 'absolute', right: '10px', top: '10px' }} </Typography>
> <TextareaAutosize
{copied ? <CheckIcon /> : <ContentCopyIcon />} readOnly
</IconButton> value={prompt}
</DialogContent> style={{ width: '100%', padding: '0.75rem' }}
{output && ( maxRows={20}
<> />
<DialogTitle>Output</DialogTitle> <IconButton
<DialogContent> onClick={() => copyToClipboard(prompt)}
style={{ position: 'absolute', right: '10px', top: '10px' }}
>
{copied ? <CheckIcon /> : <ContentCopyIcon />}
</IconButton>
</Box>
{output && (
<Box my={2}>
<Typography variant="subtitle1" style={{ marginBottom: '1rem', marginTop: '1rem' }}>
Output
</Typography>
<TextareaAutosize <TextareaAutosize
readOnly readOnly
maxRows={20}
value={output} value={output}
style={{ width: '100%', padding: '0.75rem' }} style={{ width: '100%', padding: '0.75rem' }}
/> />
</DialogContent> </Box>
</> )}
)} <AssertionResults gradingResults={gradingResults} />
</DialogContent>
<DialogActions> <DialogActions>
<Button onClick={onClose}>Close</Button> <Button onClick={onClose}>Close</Button>
</DialogActions> </DialogActions>

View File

@@ -155,6 +155,7 @@ function EvalOutputCell({
open={openPrompt} open={openPrompt}
onClose={handlePromptClose} onClose={handlePromptClose}
prompt={output.prompt} prompt={output.prompt}
gradingResults={output.gradingResult?.componentResults}
output={text} output={text}
/> />
</> </>
@@ -223,12 +224,25 @@ export default function ResultsTable({
const { table, setTable } = useStore(); const { table, setTable } = useStore();
invariant(table, 'Table should be defined'); invariant(table, 'Table should be defined');
const { head, body } = table; const { head, body } = table;
const numGood = head.prompts.map((_, idx) => const numGoodTests = head.prompts.map((_, idx) =>
body.reduce((acc, row) => { body.reduce((acc, row) => {
return acc + (row.outputs[idx].pass ? 1 : 0); return acc + (row.outputs[idx].pass ? 1 : 0);
}, 0), }, 0),
); );
const numAsserts = head.prompts.map((_, idx) =>
body.reduce((acc, row) => {
return acc + (row.outputs[idx].gradingResult?.componentResults?.length || 0);
}, 0),
);
const numGoodAsserts = head.prompts.map((_, idx) =>
body.reduce((acc, row) => {
const componentResults = row.outputs[idx].gradingResult?.componentResults;
return acc + (componentResults ? componentResults.filter((r) => r.pass).length : 0);
}, 0),
);
const handleRating = (rowIndex: number, promptIndex: number, isPass: boolean) => { const handleRating = (rowIndex: number, promptIndex: number, isPass: boolean) => {
const updatedData = [...body]; const updatedData = [...body];
const updatedRow = { ...updatedData[rowIndex] }; const updatedRow = { ...updatedData[rowIndex] };
@@ -243,10 +257,13 @@ export default function ResultsTable({
}); });
}; };
const highestPassingIndex = numGood.reduce((maxIndex, currentPassCount, currentIndex, array) => { const highestPassingIndex = numGoodTests.reduce(
return currentPassCount > array[maxIndex] ? currentIndex : maxIndex; (maxIndex, currentPassCount, currentIndex, array) => {
}, 0); return currentPassCount > array[maxIndex] ? currentIndex : maxIndex;
const highestPassingCount = numGood[highestPassingIndex]; },
0,
);
const highestPassingCount = numGoodTests[highestPassingIndex];
const columnHelper = createColumnHelper<EvalRow>(); const columnHelper = createColumnHelper<EvalRow>();
const columns = [ const columns = [
columnHelper.group({ columnHelper.group({
@@ -282,9 +299,9 @@ export default function ResultsTable({
columnHelper.accessor((row: EvalRow) => formatRowOutput(row.outputs[idx]), { columnHelper.accessor((row: EvalRow) => formatRowOutput(row.outputs[idx]), {
id: `Prompt ${idx + 1}`, id: `Prompt ${idx + 1}`,
header: () => { header: () => {
const pct = ((numGood[idx] / body.length) * 100.0).toFixed(2); const pct = ((numGoodTests[idx] / body.length) * 100.0).toFixed(2);
const isHighestPassing = const isHighestPassing =
numGood[idx] === highestPassingCount && highestPassingCount !== 0; numGoodTests[idx] === highestPassingCount && highestPassingCount !== 0;
const columnId = `Prompt ${idx + 1}`; const columnId = `Prompt ${idx + 1}`;
const isChecked = failureFilter[columnId] || false; const isChecked = failureFilter[columnId] || false;
// TODO(ian): prompt string support for backwards compatibility, remove after 0.17.0 // TODO(ian): prompt string support for backwards compatibility, remove after 0.17.0
@@ -313,12 +330,18 @@ export default function ResultsTable({
/> />
)} )}
<div className={`summary ${isHighestPassing ? 'highlight' : ''}`}> <div className={`summary ${isHighestPassing ? 'highlight' : ''}`}>
Passing: <strong>{pct}%</strong> ({numGood[idx]} / {body.length}) Passing: <strong>{pct}%</strong> ({numGoodTests[idx]}/{body.length} cases
{numAsserts[idx] ? (
<span>
, {numGoodAsserts[idx]}/{numAsserts[idx]} asserts
</span>
) : null}
)
</div> </div>
</> </>
); );
}, },
cell: (info: CellContext<EvalRow, string>) => ( cell: (info: CellContext<EvalRow, EvalRowOutput>) => (
<EvalOutputCell <EvalOutputCell
output={info.getValue() as unknown as EvalRowOutput} output={info.getValue() as unknown as EvalRowOutput}
maxTextLength={maxTextLength} maxTextLength={maxTextLength}

View File

@@ -1,4 +1,4 @@
import { TokenUsage } from '../../../types'; import { EvaluateResult, TokenUsage } from '../../../types';
type Prompt = { type Prompt = {
display: string; display: string;
@@ -10,6 +10,7 @@ export type EvalHead = {
vars: string[]; vars: string[];
}; };
// TODO(ian): Remove this and replace with EvaluateResult
export type EvalRowOutput = { export type EvalRowOutput = {
pass: boolean; pass: boolean;
score: number; score: number;
@@ -17,6 +18,7 @@ export type EvalRowOutput = {
prompt: string; prompt: string;
latencyMs: number; latencyMs: number;
tokenUsage?: Partial<TokenUsage>; tokenUsage?: Partial<TokenUsage>;
gradingResult: EvaluateResult['gradingResult'];
}; };
export type EvalRow = { export type EvalRow = {

View File

@@ -70,12 +70,22 @@ describe('runAssertion', () => {
const javascriptFunctionAssertion: Assertion = { const javascriptFunctionAssertion: Assertion = {
type: 'javascript', type: 'javascript',
value: async (output: string) => ({ pass: true, score: 0.5, reason: 'Assertion passed' }), value: async (output: string) => ({
pass: true,
score: 0.5,
reason: 'Assertion passed',
assertion: null,
}),
}; };
const javascriptFunctionFailAssertion: Assertion = { const javascriptFunctionFailAssertion: Assertion = {
type: 'javascript', type: 'javascript',
value: async (output: string) => ({ pass: false, score: 0.5, reason: 'Assertion failed' }), value: async (output: string) => ({
pass: false,
score: 0.5,
reason: 'Assertion failed',
assertion: null,
}),
}; };
it('should pass when the equality assertion passes', async () => { it('should pass when the equality assertion passes', async () => {