add evaluations
This commit is contained in:
@@ -1,43 +1,216 @@
|
||||
import { Text, Heading, Stack } from "@chakra-ui/react";
|
||||
import { useState } from "react";
|
||||
import {
|
||||
Text,
|
||||
Button,
|
||||
HStack,
|
||||
Heading,
|
||||
Icon,
|
||||
Input,
|
||||
Stack,
|
||||
VStack,
|
||||
FormControl,
|
||||
FormLabel,
|
||||
Select,
|
||||
FormHelperText,
|
||||
} from "@chakra-ui/react";
|
||||
import { type Evaluation, EvaluationMatchType } from "@prisma/client";
|
||||
import { useCallback, useState } from "react";
|
||||
import { BsPencil, BsX } from "react-icons/bs";
|
||||
import { api } from "~/utils/api";
|
||||
import { useExperiment, useHandledAsyncCallback } from "~/utils/hooks";
|
||||
import { useStore } from "~/utils/store";
|
||||
|
||||
type EvalValues = Pick<Evaluation, "name" | "matchString" | "matchType">;
|
||||
|
||||
/**
 * Inline form used to create a new evaluation or edit an existing one.
 *
 * The form keeps a local draft of the `name`, `matchType` and `matchString`
 * fields. Nothing is reported to the parent until "Save" is clicked, at which
 * point `onSave` receives the id of the evaluation being edited (`undefined`
 * when `evaluation` is `null`) together with the current draft. "Cancel"
 * simply invokes `onCancel`.
 *
 * @param props.evaluation  Evaluation whose fields seed the draft, or `null`.
 * @param props.defaultName Name used to seed the draft when `evaluation` has none.
 * @param props.onSave      Called with `(evaluation?.id, draft)` on "Save".
 * @param props.onCancel    Called on "Cancel".
 */
export function EvaluationEditor(props: {
  evaluation: Evaluation | null;
  defaultName?: string;
  onSave: (id: string | undefined, vals: EvalValues) => void;
  onCancel: () => void;
}) {
  const { evaluation, defaultName, onSave, onCancel } = props;

  // Only the first render's initial value is ever used by useState, so build it lazily.
  const [values, setValues] = useState<EvalValues>(() => ({
    name: evaluation?.name ?? defaultName ?? "",
    matchString: evaluation?.matchString ?? "",
    matchType: evaluation?.matchType ?? "CONTAINS",
  }));

  // One small updater per draft field; each merges into the previous draft.
  const changeName = (name: string) => setValues((draft) => ({ ...draft, name }));
  const changeMatchType = (matchType: EvaluationMatchType) =>
    setValues((draft) => ({ ...draft, matchType }));
  const changeMatchString = (matchString: string) =>
    setValues((draft) => ({ ...draft, matchString }));

  const submit = () => onSave(evaluation?.id, values);

  // One <option> per member of the EvaluationMatchType enum.
  const matchTypeOptions = Object.values(EvaluationMatchType).map((matchType) => (
    <option key={matchType} value={matchType}>
      {matchType}
    </option>
  ));

  return (
    <VStack borderTopWidth={1} borderColor="gray.200" py={4}>
      <HStack w="100%">
        <FormControl flex={1}>
          <FormLabel fontSize="sm">Evaluation Name</FormLabel>
          <Input
            size="sm"
            value={values.name}
            onChange={(event) => changeName(event.target.value)}
          />
        </FormControl>
        <FormControl flex={1}>
          <FormLabel fontSize="sm">Match Type</FormLabel>
          <Select
            size="sm"
            value={values.matchType}
            // The <select> only ever offers enum members, so the cast is safe here.
            onChange={(event) => changeMatchType(event.target.value as EvaluationMatchType)}
          >
            {matchTypeOptions}
          </Select>
        </FormControl>
      </HStack>
      <FormControl>
        <FormLabel fontSize="sm">Match String</FormLabel>
        <FormHelperText>
          This string will be interpreted as a regex and checked against each model output.
        </FormHelperText>
        <Input
          size="sm"
          value={values.matchString}
          onChange={(event) => changeMatchString(event.target.value)}
        />
      </FormControl>
      <HStack alignSelf="flex-end">
        <Button size="sm" onClick={onCancel} colorScheme="gray">
          Cancel
        </Button>
        <Button size="sm" onClick={submit} colorScheme="blue">
          Save
        </Button>
      </HStack>
    </VStack>
  );
}
|
||||
|
||||
export default function EditEvaluations() {
|
||||
const experiment = useExperiment();
|
||||
const vars =
|
||||
api.templateVars.list.useQuery({ experimentId: experiment.data?.id ?? "" }).data ?? [];
|
||||
const evaluations =
|
||||
api.evaluations.list.useQuery({ experimentId: experiment.data?.id ?? "" }).data ?? [];
|
||||
|
||||
const [newVariable, setNewVariable] = useState<string>("");
|
||||
const newVarIsValid = newVariable.length > 0 && !vars.map((v) => v.label).includes(newVariable);
|
||||
const [editingId, setEditingId] = useState<string | null>(null);
|
||||
|
||||
const utils = api.useContext();
|
||||
const addVarMutation = api.templateVars.create.useMutation();
|
||||
const [onAddVar] = useHandledAsyncCallback(async () => {
|
||||
if (!experiment.data?.id) return;
|
||||
if (!newVarIsValid) return;
|
||||
await addVarMutation.mutateAsync({
|
||||
experimentId: experiment.data.id,
|
||||
label: newVariable,
|
||||
});
|
||||
await utils.templateVars.list.invalidate();
|
||||
setNewVariable("");
|
||||
}, [addVarMutation, experiment.data?.id, newVarIsValid, newVariable]);
|
||||
const createMutation = api.evaluations.create.useMutation();
|
||||
const updateMutation = api.evaluations.update.useMutation();
|
||||
|
||||
const deleteMutation = api.templateVars.delete.useMutation();
|
||||
const [onDeleteVar] = useHandledAsyncCallback(async (id: string) => {
|
||||
const deleteMutation = api.evaluations.delete.useMutation();
|
||||
const [onDelete] = useHandledAsyncCallback(async (id: string) => {
|
||||
await deleteMutation.mutateAsync({ id });
|
||||
await utils.templateVars.list.invalidate();
|
||||
await utils.evaluations.list.invalidate();
|
||||
await utils.evaluations.results.invalidate();
|
||||
}, []);
|
||||
|
||||
const closeDrawer = useStore((state) => state.closeDrawer);
|
||||
const [onSave] = useHandledAsyncCallback(async (id: string | undefined, vals: EvalValues) => {
|
||||
setEditingId(null);
|
||||
if (!experiment.data?.id) return;
|
||||
|
||||
if (id) {
|
||||
await updateMutation.mutateAsync({
|
||||
id,
|
||||
updates: vals,
|
||||
});
|
||||
} else {
|
||||
await createMutation.mutateAsync({
|
||||
experimentId: experiment.data.id,
|
||||
...vals,
|
||||
});
|
||||
}
|
||||
await utils.evaluations.list.invalidate();
|
||||
await utils.evaluations.results.invalidate();
|
||||
}, []);
|
||||
|
||||
const onCancel = useCallback(() => {
|
||||
setEditingId(null);
|
||||
}, []);
|
||||
|
||||
return (
|
||||
<Stack>
|
||||
<Heading size="sm">Edit Evaluations</Heading>
|
||||
<Stack spacing={2} pt={2}>
|
||||
<Text fontSize="sm"></Text>
|
||||
<Heading size="sm">Evaluations</Heading>
|
||||
<Stack spacing={4}>
|
||||
<Text fontSize="sm">
|
||||
Evaluations allow you to compare prompt performance in an automated way.
|
||||
</Text>
|
||||
<Stack spacing={2}>
|
||||
{evaluations.map((evaluation) =>
|
||||
editingId == evaluation.id ? (
|
||||
<EvaluationEditor
|
||||
evaluation={evaluation}
|
||||
onSave={onSave}
|
||||
onCancel={onCancel}
|
||||
key={evaluation.id}
|
||||
/>
|
||||
) : (
|
||||
<HStack
|
||||
fontSize="sm"
|
||||
borderTopWidth={1}
|
||||
borderColor="gray.200"
|
||||
py={4}
|
||||
align="center"
|
||||
key={evaluation.id}
|
||||
>
|
||||
<Text fontWeight="bold">{evaluation.name}</Text>
|
||||
<Text flex={1}>
|
||||
{evaluation.matchType}: "{evaluation.matchString}"
|
||||
</Text>
|
||||
<Button
|
||||
variant="unstyled"
|
||||
color="gray.400"
|
||||
height="unset"
|
||||
width="unset"
|
||||
minW="unset"
|
||||
onClick={() => setEditingId(evaluation.id)}
|
||||
_hover={{
|
||||
color: "gray.800",
|
||||
cursor: "pointer",
|
||||
}}
|
||||
>
|
||||
<Icon as={BsPencil} boxSize={4} />
|
||||
</Button>
|
||||
<Button
|
||||
variant="unstyled"
|
||||
color="gray.400"
|
||||
height="unset"
|
||||
width="unset"
|
||||
minW="unset"
|
||||
onClick={() => onDelete(evaluation.id)}
|
||||
_hover={{
|
||||
color: "gray.800",
|
||||
cursor: "pointer",
|
||||
}}
|
||||
>
|
||||
<Icon as={BsX} boxSize={6} />
|
||||
</Button>
|
||||
</HStack>
|
||||
)
|
||||
)}
|
||||
{editingId == null && (
|
||||
<Button
|
||||
onClick={() => setEditingId("new")}
|
||||
alignSelf="flex-start"
|
||||
size="sm"
|
||||
mt={4}
|
||||
colorScheme="blue"
|
||||
>
|
||||
Add Evaluation
|
||||
</Button>
|
||||
)}
|
||||
{editingId == "new" && (
|
||||
<EvaluationEditor
|
||||
evaluation={null}
|
||||
defaultName={`Eval${evaluations.length + 1}`}
|
||||
onSave={onSave}
|
||||
onCancel={onCancel}
|
||||
/>
|
||||
)}
|
||||
</Stack>
|
||||
</Stack>
|
||||
</Stack>
|
||||
);
|
||||
|
||||
@@ -33,8 +33,8 @@ export default function EditScenarioVars() {
|
||||
|
||||
return (
|
||||
<Stack>
|
||||
<Heading size="sm">Edit Scenario Variables</Heading>
|
||||
<Stack spacing={2} pt={2}>
|
||||
<Heading size="sm">Scenario Variables</Heading>
|
||||
<Stack spacing={2}>
|
||||
<Text fontSize="sm">
|
||||
Scenario variables can be used in your prompt variants as well as evaluations. Reference
|
||||
them using <Code>{"{{curly_braces}}"}</Code>.
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
import { Button } from "@chakra-ui/react";
|
||||
import { Button, Icon, Spinner } from "@chakra-ui/react";
|
||||
import { BsPlus } from "react-icons/bs";
|
||||
import { api } from "~/utils/api";
|
||||
import { useExperiment, useHandledAsyncCallback } from "~/utils/hooks";
|
||||
@@ -9,7 +9,7 @@ export default function NewVariantButton() {
|
||||
const mutation = api.promptVariants.create.useMutation();
|
||||
const utils = api.useContext();
|
||||
|
||||
const [onClick] = useHandledAsyncCallback(async () => {
|
||||
const [onClick, loading] = useHandledAsyncCallback(async () => {
|
||||
if (!experiment.data) return;
|
||||
await mutation.mutateAsync({
|
||||
experimentId: experiment.data.id,
|
||||
@@ -30,7 +30,7 @@ export default function NewVariantButton() {
|
||||
height="unset"
|
||||
minH={headerMinHeight}
|
||||
>
|
||||
<BsPlus size={24} />
|
||||
<Icon as={loading ? Spinner : BsPlus} boxSize={6} mr={loading ? 1 : 0} />
|
||||
Add Variant
|
||||
</Button>
|
||||
);
|
||||
|
||||
@@ -1,17 +1,18 @@
|
||||
import { api } from "~/utils/api";
|
||||
import { type PromptVariant, type Scenario } from "./types";
|
||||
import { Spinner, Text, Box, Center, Flex, Icon } from "@chakra-ui/react";
|
||||
import { Spinner, Text, Box, Center, Flex, Icon, HStack } from "@chakra-ui/react";
|
||||
import { useExperiment } from "~/utils/hooks";
|
||||
import SyntaxHighlighter from "react-syntax-highlighter";
|
||||
import { docco } from "react-syntax-highlighter/dist/cjs/styles/hljs";
|
||||
import stringify from "json-stringify-pretty-compact";
|
||||
import { useMemo, type ReactElement } from "react";
|
||||
import { BsClock } from "react-icons/bs";
|
||||
import { BsCheck, BsClock, BsX } from "react-icons/bs";
|
||||
import { type ModelOutput } from "@prisma/client";
|
||||
import { type ChatCompletion } from "openai/resources/chat";
|
||||
import { generateChannel } from "~/utils/generateChannel";
|
||||
import { isObject } from "lodash";
|
||||
import useSocket from "~/utils/useSocket";
|
||||
import { evaluateOutput } from "~/server/utils/evaluateOutput";
|
||||
|
||||
export default function OutputCell({
|
||||
scenario,
|
||||
@@ -109,7 +110,7 @@ export default function OutputCell({
|
||||
{ maxLength: 40 }
|
||||
)}
|
||||
</SyntaxHighlighter>
|
||||
<OutputStats modelOutput={output.data} />
|
||||
<OutputStats modelOutput={output.data} scenario={scenario} />
|
||||
</Box>
|
||||
);
|
||||
}
|
||||
@@ -120,18 +121,44 @@ export default function OutputCell({
|
||||
return (
|
||||
<Flex w="100%" h="100%" direction="column" justifyContent="space-between" whiteSpace="pre-wrap">
|
||||
{contentToDisplay}
|
||||
{output.data && <OutputStats modelOutput={output.data} />}
|
||||
{output.data && <OutputStats modelOutput={output.data} scenario={scenario} />}
|
||||
</Flex>
|
||||
);
|
||||
}
|
||||
|
||||
const OutputStats = ({ modelOutput }: { modelOutput: ModelOutput }) => {
|
||||
const OutputStats = ({
|
||||
modelOutput,
|
||||
scenario,
|
||||
}: {
|
||||
modelOutput: ModelOutput;
|
||||
scenario: Scenario;
|
||||
}) => {
|
||||
const timeToComplete = modelOutput.timeToComplete;
|
||||
const experiment = useExperiment();
|
||||
const evals =
|
||||
api.evaluations.list.useQuery({ experimentId: experiment.data?.id ?? "" }).data ?? [];
|
||||
|
||||
return (
|
||||
<Flex justifyContent="flex-end" alignItems="center" color="gray.500" fontSize="xs" mt={2}>
|
||||
<Icon as={BsClock} mr={0.5} />
|
||||
<Text>{(timeToComplete / 1000).toFixed(2)}s</Text>
|
||||
</Flex>
|
||||
<HStack align="center" color="gray.500" fontSize="xs" mt={2}>
|
||||
<HStack flex={1}>
|
||||
{evals.map((evaluation) => {
|
||||
const passed = evaluateOutput(modelOutput, scenario, evaluation);
|
||||
return (
|
||||
<HStack spacing={0} key={evaluation.id}>
|
||||
<Text>{evaluation.name}</Text>
|
||||
<Icon
|
||||
as={passed ? BsCheck : BsX}
|
||||
color={passed ? "green.500" : "red.500"}
|
||||
boxSize={6}
|
||||
/>
|
||||
</HStack>
|
||||
);
|
||||
})}
|
||||
</HStack>
|
||||
<HStack>
|
||||
<Icon as={BsClock} mr={0.5} />
|
||||
<Text>{(timeToComplete / 1000).toFixed(2)}s</Text>
|
||||
</HStack>
|
||||
</HStack>
|
||||
);
|
||||
};
|
||||
|
||||
@@ -6,9 +6,11 @@ import {
|
||||
DrawerHeader,
|
||||
DrawerOverlay,
|
||||
Heading,
|
||||
Stack,
|
||||
} from "@chakra-ui/react";
|
||||
import { useStore } from "~/utils/store";
|
||||
import EditScenarioVars from "./EditScenarioVars";
|
||||
import EditEvaluations from "./EditEvaluations";
|
||||
|
||||
export default function SettingsDrawer() {
|
||||
const isOpen = useStore((state) => state.drawerOpen);
|
||||
@@ -23,8 +25,10 @@ export default function SettingsDrawer() {
|
||||
<Heading size="md">Settings</Heading>
|
||||
</DrawerHeader>
|
||||
<DrawerBody>
|
||||
<EditScenarioVars />
|
||||
{/* <EditEvaluations /> */}
|
||||
<Stack spacing={6}>
|
||||
<EditScenarioVars />
|
||||
<EditEvaluations />
|
||||
</Stack>
|
||||
</DrawerBody>
|
||||
</DrawerContent>
|
||||
</Drawer>
|
||||
|
||||
38
src/components/OutputsTable/VariantStats.tsx
Normal file
38
src/components/OutputsTable/VariantStats.tsx
Normal file
@@ -0,0 +1,38 @@
|
||||
import { HStack, Text, useToken } from "@chakra-ui/react";
|
||||
import { type PromptVariant } from "./types";
|
||||
import { cellPadding } from "../constants";
|
||||
import { api } from "~/utils/api";
|
||||
import chroma from "chroma-js";
|
||||
|
||||
/**
 * Summary row showing, for one prompt variant, the pass rate of each evaluation.
 *
 * Results come from the `evaluations.results` query for `props.variant.id`.
 * Each result is rendered as the evaluation's name followed by its pass
 * percentage, colored on a red -> gray -> green scale (0% -> 50% -> 100%).
 * Renders nothing while there are no results.
 *
 * @param props.variant The prompt variant whose evaluation results are shown.
 */
export default function VariantStats(props: { variant: PromptVariant }) {
  const evalResults =
    api.evaluations.results.useQuery({
      variantId: props.variant.id,
    }).data ?? [];

  const [passColor, neutralColor, failColor] = useToken("colors", [
    "green.500",
    "gray.500",
    "red.500",
  ]);

  // Maps a pass fraction in [0, 1] onto fail -> neutral -> pass colors.
  const scale = chroma.scale([failColor, neutralColor, passColor]).domain([0, 0.5, 1]);

  if (!(evalResults.length > 0)) return null;

  return (
    <HStack px={cellPadding.x} py={cellPadding.y} fontSize="sm">
      {evalResults.map((result) => {
        const totalCount = result.passCount + result.failCount;
        // With no recorded outcomes the fraction would be 0/0 = NaN, which would
        // render as "NaN%"; treat that case as "no data" instead.
        const passedFrac = totalCount > 0 ? result.passCount / totalCount : null;
        const color = passedFrac === null ? neutralColor : scale(passedFrac).hex();
        const label = passedFrac === null ? "N/A" : `${(passedFrac * 100).toFixed(1)}%`;

        return (
          <HStack key={result.id}>
            <Text>{result.evaluation.name}</Text>
            <Text color={color} fontWeight="bold">
              {label}
            </Text>
          </HStack>
        );
      })}
    </HStack>
  );
}
|
||||
@@ -1,11 +1,4 @@
|
||||
import {
|
||||
Button,
|
||||
Grid,
|
||||
GridItem,
|
||||
HStack,
|
||||
Heading,
|
||||
type SystemStyleObject,
|
||||
} from "@chakra-ui/react";
|
||||
import { Button, Grid, GridItem, HStack, Heading, type SystemStyleObject } from "@chakra-ui/react";
|
||||
import { api } from "~/utils/api";
|
||||
import NewScenarioButton from "./NewScenarioButton";
|
||||
import NewVariantButton from "./NewVariantButton";
|
||||
@@ -15,6 +8,7 @@ import VariantHeader from "./VariantHeader";
|
||||
import { cellPadding } from "../constants";
|
||||
import { BsPencil } from "react-icons/bs";
|
||||
import { useStore } from "~/utils/store";
|
||||
import VariantStats from "./VariantStats";
|
||||
|
||||
const stickyHeaderStyle: SystemStyleObject = {
|
||||
position: "sticky",
|
||||
@@ -38,6 +32,7 @@ export default function OutputsTable({ experimentId }: { experimentId: string |
|
||||
if (!variants.data || !scenarios.data) return null;
|
||||
|
||||
const allCols = variants.data.length + 1;
|
||||
const headerRows = 3;
|
||||
|
||||
return (
|
||||
<Grid
|
||||
@@ -55,7 +50,7 @@ export default function OutputsTable({ experimentId }: { experimentId: string |
|
||||
<GridItem
|
||||
display="flex"
|
||||
alignItems="flex-end"
|
||||
rowSpan={2}
|
||||
rowSpan={headerRows}
|
||||
px={cellPadding.x}
|
||||
py={cellPadding.y}
|
||||
>
|
||||
@@ -82,7 +77,7 @@ export default function OutputsTable({ experimentId }: { experimentId: string |
|
||||
</GridItem>
|
||||
))}
|
||||
<GridItem
|
||||
rowSpan={scenarios.data.length + 2}
|
||||
rowSpan={scenarios.data.length + headerRows}
|
||||
padding={0}
|
||||
// Have to use `style` instead of emotion style props to work around css specificity issues conflicting with the "> *" selector on Grid
|
||||
style={{ borderRightWidth: 0, borderBottomWidth: 0 }}
|
||||
@@ -92,10 +87,15 @@ export default function OutputsTable({ experimentId }: { experimentId: string |
|
||||
</GridItem>
|
||||
|
||||
{variants.data.map((variant) => (
|
||||
<GridItem key={variant.uiId} padding={0}>
|
||||
<GridItem key={variant.uiId}>
|
||||
<VariantConfigEditor variant={variant} />
|
||||
</GridItem>
|
||||
))}
|
||||
{variants.data.map((variant) => (
|
||||
<GridItem key={variant.uiId}>
|
||||
<VariantStats variant={variant} />
|
||||
</GridItem>
|
||||
))}
|
||||
{scenarios.data.map((scenario) => (
|
||||
<ScenarioRow key={scenario.uiId} scenario={scenario} variants={variants.data} />
|
||||
))}
|
||||
|
||||
Reference in New Issue
Block a user