import {
  Text,
  Button,
  HStack,
  Heading,
  Icon,
  Input,
  Stack,
  VStack,
  FormControl,
  FormLabel,
  Select,
  FormHelperText,
  Code,
} from "@chakra-ui/react";
import { type Evaluation, EvalType } from "@prisma/client";
import { useCallback, useState } from "react";
import { BsPencil, BsX } from "react-icons/bs";
import { api } from "~/utils/api";
import { useExperiment, useHandledAsyncCallback } from "~/utils/hooks";
import AutoResizeTextArea from "../AutoResizeTextArea";

// The subset of Evaluation fields managed by the editor form.
type EvalValues = Pick<Evaluation, "label" | "value" | "evalType">;

// Form for creating a new evaluation or editing an existing one.
export function EvaluationEditor(props: {
  evaluation: Evaluation | null;
  defaultName?: string;
  onSave: (id: string | undefined, vals: EvalValues) => void;
  onCancel: () => void;
}) {
  const [values, setValues] = useState<EvalValues>({
    label: props.evaluation?.label ?? props.defaultName ?? "",
    value: props.evaluation?.value ?? "",
    evalType: props.evaluation?.evalType ?? "CONTAINS",
  });

  return (
    <Stack>
      <FormControl>
        <FormLabel>Eval Name</FormLabel>
        <Input
          value={values.label}
          onChange={(e) => setValues((values) => ({ ...values, label: e.target.value }))}
        />
      </FormControl>

      <FormControl>
        <FormLabel>Eval Type</FormLabel>
        <Select
          value={values.evalType}
          onChange={(e) =>
            setValues((values) => ({ ...values, evalType: e.target.value as EvalType }))
          }
        >
          {Object.values(EvalType).map((evalType) => (
            <option key={evalType} value={evalType}>
              {evalType}
            </option>
          ))}
        </Select>
      </FormControl>

      {["CONTAINS", "DOES_NOT_CONTAIN"].includes(values.evalType) && (
        <FormControl>
          <FormLabel>Match String</FormLabel>
          <Input
            value={values.value}
            onChange={(e) => setValues((values) => ({ ...values, value: e.target.value }))}
          />
          <FormHelperText>
            This string will be interpreted as a regex and checked against each model output. You
            can include scenario variables using <Code>{"{{curly_braces}}"}</Code>
          </FormHelperText>
        </FormControl>
      )}

      {values.evalType === "GPT4_EVAL" && (
        <FormControl>
          <FormLabel>GPT4 Instructions</FormLabel>
          <AutoResizeTextArea
            value={values.value}
            onChange={(e) => setValues((values) => ({ ...values, value: e.target.value }))}
            minRows={3}
          />
          <FormHelperText>
            Give instructions to GPT-4 for how to evaluate your prompt. It will have access to the
            full scenario as well as the output it is evaluating. It will not have access to the
            specific prompt variant, so be sure to be clear about the task you want it to perform.
          </FormHelperText>
        </FormControl>
      )}

      <HStack justifyContent="flex-end">
        <Button onClick={props.onCancel}>Cancel</Button>
        <Button colorScheme="blue" onClick={() => props.onSave(props.evaluation?.id, values)}>
          Save
        </Button>
      </HStack>
    </Stack>
  );
}

export default function EditEvaluations() {
  const experiment = useExperiment();
  const evaluations =
    api.evaluations.list.useQuery({ experimentId: experiment.data?.id ?? "" }).data ?? [];

  // Id of the evaluation currently being edited, "new" for a fresh one, or null when idle.
  const [editingId, setEditingId] = useState<string | null>(null);

  const utils = api.useContext();

  const createMutation = api.evaluations.create.useMutation();
  const updateMutation = api.evaluations.update.useMutation();
  const deleteMutation = api.evaluations.delete.useMutation();

  const [onDelete] = useHandledAsyncCallback(async (id: string) => {
    await deleteMutation.mutateAsync({ id });
    await utils.evaluations.list.invalidate();
    await utils.promptVariants.stats.invalidate();
  }, []);

  const [onSave] = useHandledAsyncCallback(async (id: string | undefined, vals: EvalValues) => {
    setEditingId(null);
    if (!experiment.data?.id) return;

    if (id) {
      // Existing evaluation: apply the edited fields as an update.
      await updateMutation.mutateAsync({
        id,
        updates: vals,
      });
    } else {
      // New evaluation: create it under the current experiment.
      await createMutation.mutateAsync({
        experimentId: experiment.data.id,
        ...vals,
      });
    }

    // Refresh everything that depends on the evaluation set.
    await utils.evaluations.list.invalidate();
    await utils.promptVariants.stats.invalidate();
    await utils.scenarioVariantCells.get.invalidate();
  }, []);

  const onCancel = useCallback(() => {
    setEditingId(null);
  }, []);

  return (
    <VStack alignItems="flex-start">
      <Heading size="md">Evaluations</Heading>
      <Text>Evaluations allow you to compare prompt performance in an automated way.</Text>

      <Stack w="full">
        {evaluations.map((evaluation) =>
          editingId == evaluation.id ? (
            <EvaluationEditor
              key={evaluation.id}
              evaluation={evaluation}
              onSave={onSave}
              onCancel={onCancel}
            />
          ) : (
            <HStack key={evaluation.id}>
              <Text fontWeight="bold">{evaluation.label}</Text>
              <Text flex={1}>
                {evaluation.evalType}: &quot;{evaluation.value}&quot;
              </Text>
              <Icon as={BsPencil} cursor="pointer" onClick={() => setEditingId(evaluation.id)} />
              <Icon as={BsX} cursor="pointer" onClick={() => onDelete(evaluation.id)} />
            </HStack>
          ),
        )}

        {editingId == null && (
          <Button alignSelf="flex-start" onClick={() => setEditingId("new")}>
            Add Evaluation
          </Button>
        )}

        {editingId == "new" && (
          <EvaluationEditor evaluation={null} onSave={onSave} onCancel={onCancel} />
        )}
      </Stack>
    </VStack>
  );
}