mirror of
https://github.com/promptfoo/promptfoo.git
synced 2023-08-15 01:10:51 +03:00
Add assertions in webview (#100)
This commit is contained in:
@@ -1,80 +1,82 @@
|
||||
Rephrase this from English to Pirate: {{body}},Pretend you're a pirate and speak these words: {{body}},body
|
||||
"[FAIL] Expected output ""Yarr""
|
||||
"{""raw"":""Rephrase this from English to Pirate: {{body}}"",""display"":""Rephrase this from English to Pirate: {{body}}""}","{""raw"":""Pretend you're a pirate and speak these words: {{body}}"",""display"":""Pretend you're a pirate and speak these words: {{body}}""}",body
|
||||
"[FAIL] (0.00) Expected output ""Yarr""
|
||||
---
|
||||
Aye.","[FAIL] Expected output ""Yarr""
|
||||
Aye","[FAIL] (0.00) Expected output ""Yarr""
|
||||
---
|
||||
Ahoy mateys! Yes, I be a pirate and proud of it! Ye best be ready to hoist the Jolly Roger and set sail on the high seas with me crew. Yes, we'll be plunderin' and pillagin' all the treasure we can find. So, hoist the anchor and let's set sail! Yes, we'll be the most feared pirates on the seven seas! Arrr!",Yes
|
||||
"[PASS] I be feelin' a mighty need for grub, matey.","[PASS] Arrr, me belly be rumblin'! I be needin' some grub, mateys! Bring me some vittles or ye'll be walkin' the plank!",I'm hungry
|
||||
"[PASS] ""Spew forth the tale of yer life in JSON, me hearty!""","[PASS] {
|
||||
Arr, me hearties! Yes, be the word ye be seekin'.",Yes
|
||||
"[FAIL] (0.00) Expected output to contain ""grub""
|
||||
---
|
||||
I be feelin' a mighty hunger.","[PASS] (1.00) Arr, me hearties! I be feelin' a mighty rumblin' in me belly, for I be famished! Aye, I be longin' for a feast fit for a pirate. Bring me a bounty of savory meats, hearty stews, and fresh fruits to quell this hunger that be gnawin' at me insides. I be yearnin' for a taste of the sea, with fish so fresh it be jumpin' straight from the ocean onto me plate. So, me mateys, let's set sail on a culinary adventure and fill our bellies with the finest grub a pirate could ask for!",I'm hungry
|
||||
"[PASS] (1.00) ""Splice out the tale of yer life in JSON, matey!""","[PASS] (1.00) {
|
||||
""name"": ""Captain Blackbeard"",
|
||||
""age"": 45,
|
||||
""alias"": ""The Dread Pirate"",
|
||||
""occupation"": ""Pirate"",
|
||||
""birthplace"": ""Port Royal"",
|
||||
""birthyear"": 1680,
|
||||
""nationality"": ""English"",
|
||||
""crew"": [
|
||||
{
|
||||
""name"": ""Redbeard"",
|
||||
""age"": 32,
|
||||
""position"": ""First Mate""
|
||||
},
|
||||
{
|
||||
""name"": ""Scurvy Pete"",
|
||||
""age"": 28,
|
||||
""position"": ""Navigator""
|
||||
},
|
||||
{
|
||||
""name"": ""One-Eyed Jack"",
|
||||
""age"": 50,
|
||||
""position"": ""Quartermaster""
|
||||
}
|
||||
""Redbeard"",
|
||||
""One-Eyed Jack"",
|
||||
""Mad Anne"",
|
||||
""Salty Sam""
|
||||
],
|
||||
""ships"": [
|
||||
{
|
||||
""name"": ""The Black Pearl"",
|
||||
""type"": ""Galleon"",
|
||||
""captured"": [
|
||||
""The Spanish Armada"",
|
||||
""The British Navy""
|
||||
]
|
||||
""type"": ""Sloop"",
|
||||
""year"": 1700
|
||||
},
|
||||
{
|
||||
""name"": ""The Jolly Roger"",
|
||||
""type"": ""Sloop"",
|
||||
""captured"": [
|
||||
""Merchant ships"",
|
||||
""Fishing vessels""
|
||||
]
|
||||
""name"": ""The Revenge"",
|
||||
""type"": ""Frigate"",
|
||||
""year"": 1715
|
||||
}
|
||||
],
|
||||
""treasure"": [
|
||||
""treasures"": [
|
||||
{
|
||||
""type"": ""Gold"",
|
||||
""amount"": 50000
|
||||
""name"": ""Cursed Aztec Gold"",
|
||||
""value"": ""Priceless""
|
||||
},
|
||||
{
|
||||
""type"": ""Silver"",
|
||||
""amount"": 25000
|
||||
""name"": ""Emerald of the Sea"",
|
||||
""value"": ""500,000 gold coins""
|
||||
},
|
||||
{
|
||||
""type"": ""Jewels"",
|
||||
""amount"": 100000
|
||||
""name"": ""Golden Chalice"",
|
||||
""value"": ""200,000 gold coins""
|
||||
}
|
||||
],
|
||||
""adventures"": [
|
||||
{
|
||||
""name"": ""The Battle of Nassau"",
|
||||
""location"": ""Nassau"",
|
||||
""outcome"": ""Victory""
|
||||
},
|
||||
{
|
||||
""name"": ""The Raid on Port Royal"",
|
||||
""location"": ""Port Royal"",
|
||||
""outcome"": ""Success""
|
||||
},
|
||||
{
|
||||
""name"": ""The Treasure of Tortuga"",
|
||||
""year"": 1698,
|
||||
""location"": ""Tortuga"",
|
||||
""outcome"": ""Success""
|
||||
""description"": ""First encounter with pirates, joined crew of Captain Redbeard.""
|
||||
},
|
||||
{
|
||||
""year"": 1702,
|
||||
""location"": ""Isla de Muerta"",
|
||||
""description"": ""Discovered cursed Aztec gold, narrowly escaped the wrath of cursed pirates.""
|
||||
},
|
||||
{
|
||||
""year"": 1705,
|
||||
""location"": ""Port Royal"",
|
||||
""description"": ""Led a successful raid on the governor's mansion, acquired valuable treasures.""
|
||||
},
|
||||
{
|
||||
""year"": 1710,
|
||||
""location"": ""Treasure Island"",
|
||||
""description"": ""Unearthed the hidden treasure of Captain Flint, became legendary among pirates.""
|
||||
},
|
||||
{
|
||||
""year"": 1718,
|
||||
""location"": ""Nassau"",
|
||||
""description"": ""Declared war against the British Empire, led a fleet of pirate ships.""
|
||||
}
|
||||
]
|
||||
}",Output the story of your life in JSON
|
||||
[PASS] Ahoy mateys o' the world!,"[PASS] Ahoy there, me hearties! Avast ye landlubbers! 'Tis I, a fearsome pirate, comin' to ye from the seven seas. Ahoy, hello world!",Hello world
|
||||
"[PASS] Arrr, thar be a swift brown fox leaping o'er th' lazy dog.","[PASS] Arrr, me hearties! The quick brown fox be jumpin' over the lazy dog! Ye best be keepin' yer eyes peeled for that sly fox, or he'll be stealin' all yer booty!",The quick brown fox jumps over the lazy dog
|
||||
"[PASS] (0.96) Ahoy thar, mateys!","[PASS] (0.96) Ahoy, me hearties! Avast ye, landlubbers! 'Tis I, a fearsome pirate, settin' sail on the vast seas. Hear me words, ye scurvy dogs: ""Hello world!""",Hello world
|
||||
"[FAIL] (0.50) Expected output to contain ""arrr""
|
||||
---
|
||||
Thar swift brown fox be jumpin' o'er thar lazy dog.","[FAIL] (0.50) Expected output to contain ""arrr""
|
||||
---
|
||||
Arr, me hearties! Listen up, ye scurvy dogs! The quick brown fox be jumpin' o'er the lazy dog, aye!",The quick brown fox jumps over the lazy dog
|
||||
|
||||
|
@@ -40,6 +40,7 @@ function handleRougeScore(
|
||||
: `${baseType.toUpperCase()} score ${score} is less than threshold ${
|
||||
assertion.threshold || 0.75
|
||||
}`,
|
||||
assertion,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -51,13 +52,14 @@ export async function runAssertions(test: AtomicTestCase, output: string): Promi
|
||||
};
|
||||
|
||||
if (!test.assert || test.assert.length < 1) {
|
||||
return { pass: true, score: 1, reason: 'No assertions', tokensUsed };
|
||||
return { pass: true, score: 1, reason: 'No assertions', tokensUsed, assertion: null };
|
||||
}
|
||||
|
||||
let totalScore = 0;
|
||||
let totalWeight = 0;
|
||||
let allPass = true;
|
||||
let failedReason = '';
|
||||
const componentResults: GradingResult[] = [];
|
||||
|
||||
for (const assertion of test.assert) {
|
||||
const weight = assertion.weight || 1;
|
||||
@@ -65,6 +67,8 @@ export async function runAssertions(test: AtomicTestCase, output: string): Promi
|
||||
|
||||
const result = await runAssertion(assertion, test, output);
|
||||
totalScore += result.score * weight;
|
||||
componentResults.push(result);
|
||||
|
||||
if (result.tokensUsed) {
|
||||
tokensUsed.total += result.tokensUsed.total;
|
||||
tokensUsed.prompt += result.tokensUsed.prompt;
|
||||
@@ -85,6 +89,8 @@ export async function runAssertions(test: AtomicTestCase, output: string): Promi
|
||||
score: totalScore / totalWeight,
|
||||
reason: allPass ? 'All assertions passed' : failedReason,
|
||||
tokensUsed,
|
||||
componentResults,
|
||||
assertion: null,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -120,6 +126,7 @@ export async function runAssertion(
|
||||
pass,
|
||||
score: pass ? 1 : 0,
|
||||
reason: pass ? 'Assertion passed' : `Expected output "${renderedValue}"`,
|
||||
assertion,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -134,6 +141,7 @@ export async function runAssertion(
|
||||
pass,
|
||||
score: pass ? 1 : 0,
|
||||
reason: pass ? 'Assertion passed' : 'Expected output to be valid JSON',
|
||||
assertion,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -150,6 +158,7 @@ export async function runAssertion(
|
||||
reason: pass
|
||||
? 'Assertion passed'
|
||||
: `Expected output to ${inverse ? 'not ' : ''}contain "${renderedValue}"`,
|
||||
assertion,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -166,6 +175,7 @@ export async function runAssertion(
|
||||
reason: pass
|
||||
? 'Assertion passed'
|
||||
: `Expected output to ${inverse ? 'not ' : ''}contain one of "${renderedValue.join(', ')}"`,
|
||||
assertion,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -182,6 +192,7 @@ export async function runAssertion(
|
||||
reason: pass
|
||||
? 'Assertion passed'
|
||||
: `Expected output to ${inverse ? 'not ' : ''}contain all of "${renderedValue.join(', ')}"`,
|
||||
assertion,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -199,6 +210,7 @@ export async function runAssertion(
|
||||
reason: pass
|
||||
? 'Assertion passed'
|
||||
: `Expected output to ${inverse ? 'not ' : ''}match regex "${renderedValue}"`,
|
||||
assertion,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -215,6 +227,7 @@ export async function runAssertion(
|
||||
reason: pass
|
||||
? 'Assertion passed'
|
||||
: `Expected output to ${inverse ? 'not ' : ''}contain "${renderedValue}"`,
|
||||
assertion,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -231,6 +244,7 @@ export async function runAssertion(
|
||||
reason: pass
|
||||
? 'Assertion passed'
|
||||
: `Expected output to ${inverse ? 'not ' : ''}start with "${renderedValue}"`,
|
||||
assertion,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -242,6 +256,7 @@ export async function runAssertion(
|
||||
reason: pass
|
||||
? 'Assertion passed'
|
||||
: `Expected output to ${inverse ? 'not ' : ''}contain valid JSON`,
|
||||
assertion,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -271,6 +286,7 @@ export async function runAssertion(
|
||||
score: 0,
|
||||
reason: `Custom function threw error: ${(err as Error).message}
|
||||
${renderedValue}`,
|
||||
assertion,
|
||||
};
|
||||
}
|
||||
return {
|
||||
@@ -280,6 +296,7 @@ ${renderedValue}`,
|
||||
? 'Assertion passed'
|
||||
: `Custom function returned ${inverse ? 'true' : 'false'}
|
||||
${renderedValue}`,
|
||||
assertion,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -315,6 +332,7 @@ ${renderedValue}`,
|
||||
pass: false,
|
||||
score: 0,
|
||||
reason: `Python code execution failed: ${(err as Error).message}`,
|
||||
assertion,
|
||||
};
|
||||
}
|
||||
return {
|
||||
@@ -324,6 +342,7 @@ ${renderedValue}`,
|
||||
? 'Assertion passed'
|
||||
: `Python code returned ${pass ? 'true' : 'false'}
|
||||
${assertion.value}`,
|
||||
assertion,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -333,7 +352,10 @@ ${assertion.value}`,
|
||||
typeof renderedValue === 'string',
|
||||
'"contains" assertion type must have a string value',
|
||||
);
|
||||
return matchesSimilarity(renderedValue, output, assertion.threshold || 0.75, inverse);
|
||||
return {
|
||||
assertion,
|
||||
...(await matchesSimilarity(renderedValue, output, assertion.threshold || 0.75, inverse)),
|
||||
};
|
||||
}
|
||||
|
||||
if (baseType === 'llm-rubric') {
|
||||
@@ -342,7 +364,10 @@ ${assertion.value}`,
|
||||
typeof renderedValue === 'string',
|
||||
'"contains" assertion type must have a string value',
|
||||
);
|
||||
return matchesLlmRubric(renderedValue, output, test.options);
|
||||
return {
|
||||
assertion,
|
||||
...(await matchesLlmRubric(renderedValue, output, test.options)),
|
||||
};
|
||||
}
|
||||
|
||||
if (baseType === 'webhook') {
|
||||
@@ -384,6 +409,7 @@ ${assertion.value}`,
|
||||
pass: false,
|
||||
score: 0,
|
||||
reason: `Webhook error: ${(err as Error).message}`,
|
||||
assertion,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -391,6 +417,7 @@ ${assertion.value}`,
|
||||
pass,
|
||||
score,
|
||||
reason: pass ? 'Assertion passed' : `Webhook returned ${inverse ? 'true' : 'false'}`,
|
||||
assertion,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -428,7 +455,7 @@ export async function matchesSimilarity(
|
||||
output: string,
|
||||
threshold: number,
|
||||
inverse: boolean = false,
|
||||
): Promise<GradingResult> {
|
||||
): Promise<Omit<GradingResult, 'assertion'>> {
|
||||
const expectedEmbedding = await DefaultEmbeddingProvider.callEmbeddingApi(expected);
|
||||
const outputEmbedding = await DefaultEmbeddingProvider.callEmbeddingApi(output);
|
||||
|
||||
@@ -483,7 +510,7 @@ export async function matchesLlmRubric(
|
||||
expected: string,
|
||||
output: string,
|
||||
options?: GradingConfig,
|
||||
): Promise<GradingResult> {
|
||||
): Promise<Omit<GradingResult, 'assertion'>> {
|
||||
if (!options) {
|
||||
throw new Error(
|
||||
'Cannot grade output without grading config. Specify --grader option or grading config.',
|
||||
|
||||
@@ -156,6 +156,7 @@ class Evaluator {
|
||||
this.stats.tokenUsage.completion += checkResult.tokensUsed.completion;
|
||||
}
|
||||
ret.response = processedResponse;
|
||||
ret.gradingResult = checkResult;
|
||||
} else {
|
||||
ret.success = false;
|
||||
ret.score = 0;
|
||||
@@ -464,6 +465,7 @@ class Evaluator {
|
||||
prompt: row.prompt.raw,
|
||||
latencyMs: row.latencyMs,
|
||||
tokenUsage: row.response?.tokenUsage,
|
||||
gradingResult: row.gradingResult,
|
||||
};
|
||||
},
|
||||
);
|
||||
|
||||
@@ -97,6 +97,7 @@ export interface EvaluateResult {
|
||||
success: boolean;
|
||||
score: number;
|
||||
latencyMs: number;
|
||||
gradingResult?: GradingResult;
|
||||
}
|
||||
|
||||
export interface EvaluateTableOutput {
|
||||
@@ -106,6 +107,7 @@ export interface EvaluateTableOutput {
|
||||
prompt: string;
|
||||
latencyMs: number;
|
||||
tokenUsage?: Partial<TokenUsage>;
|
||||
gradingResult?: GradingResult;
|
||||
}
|
||||
|
||||
export interface EvaluateTable {
|
||||
@@ -138,6 +140,8 @@ export interface GradingResult {
|
||||
score: number;
|
||||
reason: string;
|
||||
tokensUsed?: TokenUsage;
|
||||
componentResults?: GradingResult[];
|
||||
assertion: Assertion | null;
|
||||
}
|
||||
|
||||
type BaseAssertionTypes =
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import { useState, useEffect } from 'react';
|
||||
import Box from '@mui/material/Box';
|
||||
import Button from '@mui/material/Button';
|
||||
import Dialog from '@mui/material/Dialog';
|
||||
import DialogActions from '@mui/material/DialogActions';
|
||||
@@ -8,12 +9,60 @@ import TextareaAutosize from '@mui/base/TextareaAutosize';
|
||||
import IconButton from '@mui/material/IconButton';
|
||||
import ContentCopyIcon from '@mui/icons-material/ContentCopy';
|
||||
import CheckIcon from '@mui/icons-material/Check';
|
||||
import Table from '@mui/material/Table';
|
||||
import TableBody from '@mui/material/TableBody';
|
||||
import TableCell from '@mui/material/TableCell';
|
||||
import TableContainer from '@mui/material/TableContainer';
|
||||
import TableHead from '@mui/material/TableHead';
|
||||
import TableRow from '@mui/material/TableRow';
|
||||
import Typography from '@mui/material/Typography';
|
||||
|
||||
import type { GradingResult } from '../../../types';
|
||||
|
||||
interface EvalOutputPromptDialogProps {
|
||||
open: boolean;
|
||||
onClose: () => void;
|
||||
prompt: string;
|
||||
output?: string;
|
||||
gradingResults?: GradingResult[];
|
||||
}
|
||||
|
||||
/**
 * Converts an assertion's expected value into text for the "Value" column.
 *
 * - `null`, `undefined`, and the empty string are shown as `-`.
 * - Plain objects are serialized with `JSON.stringify`, because `String(obj)`
 *   would yield the unhelpful `[object Object]`.
 * - Everything else (strings, arrays, functions, numbers, booleans) goes
 *   through `String(...)`, so other falsy values such as `0` or `false` are
 *   displayed rather than being replaced by `-`.
 *
 * NOTE(review): the full `Assertion['value']` type is not visible from this
 * file; this helper accepts `unknown` so it is safe for any shape.
 */
function formatAssertionValue(value: unknown): string {
  if (value === undefined || value === null || value === '') {
    return '-';
  }
  if (typeof value === 'object' && !Array.isArray(value)) {
    try {
      return JSON.stringify(value);
    } catch {
      // e.g. circular structures; fall back to the default conversion.
      return String(value);
    }
  }
  return String(value);
}

/**
 * Renders a table with one row per grading result: pass/fail marker, score,
 * assertion type, assertion value, and the reason string.
 *
 * Returns `null` when `gradingResults` is undefined or empty, so no
 * header-only table is rendered when there is nothing to show.
 */
function AssertionResults({ gradingResults }: { gradingResults?: GradingResult[] }) {
  if (!gradingResults || gradingResults.length === 0) {
    return null;
  }

  const headerStyle = { fontWeight: 'bold' };

  return (
    <Box mt={2}>
      <Typography variant="subtitle1">Assertions</Typography>
      <TableContainer>
        <Table>
          <TableHead>
            <TableRow>
              <TableCell style={headerStyle}>Pass</TableCell>
              <TableCell style={headerStyle}>Score</TableCell>
              <TableCell style={headerStyle}>Type</TableCell>
              <TableCell style={headerStyle}>Value</TableCell>
              <TableCell style={headerStyle}>Reason</TableCell>
            </TableRow>
          </TableHead>
          <TableBody>
            {gradingResults.map((result, i) => (
              // Results have no stable id; the list is static per render, so the index is used as key.
              <TableRow key={i}>
                <TableCell>{result.pass ? '✅' : '❌'}</TableCell>
                <TableCell>{result.score}</TableCell>
                <TableCell>{result.assertion?.type || ''}</TableCell>
                <TableCell>{formatAssertionValue(result.assertion?.value)}</TableCell>
                <TableCell>{result.reason}</TableCell>
              </TableRow>
            ))}
          </TableBody>
        </Table>
      </TableContainer>
    </Box>
  );
}
|
||||
|
||||
export default function EvalOutputPromptDialog({
|
||||
@@ -21,6 +70,7 @@ export default function EvalOutputPromptDialog({
|
||||
onClose,
|
||||
prompt,
|
||||
output,
|
||||
gradingResults,
|
||||
}: EvalOutputPromptDialogProps) {
|
||||
const [copied, setCopied] = useState(false);
|
||||
|
||||
@@ -35,28 +85,40 @@ export default function EvalOutputPromptDialog({
|
||||
|
||||
return (
|
||||
<Dialog open={open} onClose={onClose} fullWidth maxWidth="lg">
|
||||
<DialogTitle>Prompt</DialogTitle>
|
||||
<DialogTitle>Details</DialogTitle>
|
||||
<DialogContent>
|
||||
<TextareaAutosize readOnly value={prompt} style={{ width: '100%', padding: '0.75rem' }} />
|
||||
<IconButton
|
||||
onClick={() => copyToClipboard(prompt)}
|
||||
style={{ position: 'absolute', right: '10px', top: '10px' }}
|
||||
>
|
||||
{copied ? <CheckIcon /> : <ContentCopyIcon />}
|
||||
</IconButton>
|
||||
</DialogContent>
|
||||
{output && (
|
||||
<>
|
||||
<DialogTitle>Output</DialogTitle>
|
||||
<DialogContent>
|
||||
<Box mb={2}>
|
||||
<Typography variant="subtitle1" style={{ marginBottom: '1rem' }}>
|
||||
Prompt
|
||||
</Typography>
|
||||
<TextareaAutosize
|
||||
readOnly
|
||||
value={prompt}
|
||||
style={{ width: '100%', padding: '0.75rem' }}
|
||||
maxRows={20}
|
||||
/>
|
||||
<IconButton
|
||||
onClick={() => copyToClipboard(prompt)}
|
||||
style={{ position: 'absolute', right: '10px', top: '10px' }}
|
||||
>
|
||||
{copied ? <CheckIcon /> : <ContentCopyIcon />}
|
||||
</IconButton>
|
||||
</Box>
|
||||
{output && (
|
||||
<Box my={2}>
|
||||
<Typography variant="subtitle1" style={{ marginBottom: '1rem', marginTop: '1rem' }}>
|
||||
Output
|
||||
</Typography>
|
||||
<TextareaAutosize
|
||||
readOnly
|
||||
maxRows={20}
|
||||
value={output}
|
||||
style={{ width: '100%', padding: '0.75rem' }}
|
||||
/>
|
||||
</DialogContent>
|
||||
</>
|
||||
)}
|
||||
</Box>
|
||||
)}
|
||||
<AssertionResults gradingResults={gradingResults} />
|
||||
</DialogContent>
|
||||
<DialogActions>
|
||||
<Button onClick={onClose}>Close</Button>
|
||||
</DialogActions>
|
||||
|
||||
@@ -155,6 +155,7 @@ function EvalOutputCell({
|
||||
open={openPrompt}
|
||||
onClose={handlePromptClose}
|
||||
prompt={output.prompt}
|
||||
gradingResults={output.gradingResult?.componentResults}
|
||||
output={text}
|
||||
/>
|
||||
</>
|
||||
@@ -223,12 +224,25 @@ export default function ResultsTable({
|
||||
const { table, setTable } = useStore();
|
||||
invariant(table, 'Table should be defined');
|
||||
const { head, body } = table;
|
||||
const numGood = head.prompts.map((_, idx) =>
|
||||
const numGoodTests = head.prompts.map((_, idx) =>
|
||||
body.reduce((acc, row) => {
|
||||
return acc + (row.outputs[idx].pass ? 1 : 0);
|
||||
}, 0),
|
||||
);
|
||||
|
||||
const numAsserts = head.prompts.map((_, idx) =>
|
||||
body.reduce((acc, row) => {
|
||||
return acc + (row.outputs[idx].gradingResult?.componentResults?.length || 0);
|
||||
}, 0),
|
||||
);
|
||||
|
||||
const numGoodAsserts = head.prompts.map((_, idx) =>
|
||||
body.reduce((acc, row) => {
|
||||
const componentResults = row.outputs[idx].gradingResult?.componentResults;
|
||||
return acc + (componentResults ? componentResults.filter((r) => r.pass).length : 0);
|
||||
}, 0),
|
||||
);
|
||||
|
||||
const handleRating = (rowIndex: number, promptIndex: number, isPass: boolean) => {
|
||||
const updatedData = [...body];
|
||||
const updatedRow = { ...updatedData[rowIndex] };
|
||||
@@ -243,10 +257,13 @@ export default function ResultsTable({
|
||||
});
|
||||
};
|
||||
|
||||
const highestPassingIndex = numGood.reduce((maxIndex, currentPassCount, currentIndex, array) => {
|
||||
return currentPassCount > array[maxIndex] ? currentIndex : maxIndex;
|
||||
}, 0);
|
||||
const highestPassingCount = numGood[highestPassingIndex];
|
||||
const highestPassingIndex = numGoodTests.reduce(
|
||||
(maxIndex, currentPassCount, currentIndex, array) => {
|
||||
return currentPassCount > array[maxIndex] ? currentIndex : maxIndex;
|
||||
},
|
||||
0,
|
||||
);
|
||||
const highestPassingCount = numGoodTests[highestPassingIndex];
|
||||
const columnHelper = createColumnHelper<EvalRow>();
|
||||
const columns = [
|
||||
columnHelper.group({
|
||||
@@ -282,9 +299,9 @@ export default function ResultsTable({
|
||||
columnHelper.accessor((row: EvalRow) => formatRowOutput(row.outputs[idx]), {
|
||||
id: `Prompt ${idx + 1}`,
|
||||
header: () => {
|
||||
const pct = ((numGood[idx] / body.length) * 100.0).toFixed(2);
|
||||
const pct = ((numGoodTests[idx] / body.length) * 100.0).toFixed(2);
|
||||
const isHighestPassing =
|
||||
numGood[idx] === highestPassingCount && highestPassingCount !== 0;
|
||||
numGoodTests[idx] === highestPassingCount && highestPassingCount !== 0;
|
||||
const columnId = `Prompt ${idx + 1}`;
|
||||
const isChecked = failureFilter[columnId] || false;
|
||||
// TODO(ian): prompt string support for backwards compatibility, remove after 0.17.0
|
||||
@@ -313,12 +330,18 @@ export default function ResultsTable({
|
||||
/>
|
||||
)}
|
||||
<div className={`summary ${isHighestPassing ? 'highlight' : ''}`}>
|
||||
Passing: <strong>{pct}%</strong> ({numGood[idx]} / {body.length})
|
||||
Passing: <strong>{pct}%</strong> ({numGoodTests[idx]}/{body.length} cases
|
||||
{numAsserts[idx] ? (
|
||||
<span>
|
||||
, {numGoodAsserts[idx]}/{numAsserts[idx]} asserts
|
||||
</span>
|
||||
) : null}
|
||||
)
|
||||
</div>
|
||||
</>
|
||||
);
|
||||
},
|
||||
cell: (info: CellContext<EvalRow, string>) => (
|
||||
cell: (info: CellContext<EvalRow, EvalRowOutput>) => (
|
||||
<EvalOutputCell
|
||||
output={info.getValue() as unknown as EvalRowOutput}
|
||||
maxTextLength={maxTextLength}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
import { TokenUsage } from '../../../types';
|
||||
import { EvaluateResult, TokenUsage } from '../../../types';
|
||||
|
||||
type Prompt = {
|
||||
display: string;
|
||||
@@ -10,6 +10,7 @@ export type EvalHead = {
|
||||
vars: string[];
|
||||
};
|
||||
|
||||
// TODO(ian): Remove this and replace with EvaluateResult
|
||||
export type EvalRowOutput = {
|
||||
pass: boolean;
|
||||
score: number;
|
||||
@@ -17,6 +18,7 @@ export type EvalRowOutput = {
|
||||
prompt: string;
|
||||
latencyMs: number;
|
||||
tokenUsage?: Partial<TokenUsage>;
|
||||
gradingResult: EvaluateResult['gradingResult'];
|
||||
};
|
||||
|
||||
export type EvalRow = {
|
||||
|
||||
@@ -70,12 +70,22 @@ describe('runAssertion', () => {
|
||||
|
||||
const javascriptFunctionAssertion: Assertion = {
|
||||
type: 'javascript',
|
||||
value: async (output: string) => ({ pass: true, score: 0.5, reason: 'Assertion passed' }),
|
||||
value: async (output: string) => ({
|
||||
pass: true,
|
||||
score: 0.5,
|
||||
reason: 'Assertion passed',
|
||||
assertion: null,
|
||||
}),
|
||||
};
|
||||
|
||||
const javascriptFunctionFailAssertion: Assertion = {
|
||||
type: 'javascript',
|
||||
value: async (output: string) => ({ pass: false, score: 0.5, reason: 'Assertion failed' }),
|
||||
value: async (output: string) => ({
|
||||
pass: false,
|
||||
score: 0.5,
|
||||
reason: 'Assertion failed',
|
||||
assertion: null,
|
||||
}),
|
||||
};
|
||||
|
||||
it('should pass when the equality assertion passes', async () => {
|
||||
|
||||
Reference in New Issue
Block a user