Cache eval outputs and add GPT-4 eval

This commit is contained in:
Kyle Corbitt
2023-07-17 17:55:36 -07:00
parent 011b12abb9
commit 7d41e94ca2
8 changed files with 168 additions and 115 deletions

View File

@@ -4,7 +4,7 @@ import { runOneEval } from "./runOneEval";
import { type Scenario } from "~/components/OutputsTable/types";
const saveResult = async (evaluation: Evaluation, scenario: Scenario, modelOutput: ModelOutput) => {
const result = runOneEval(evaluation, scenario, modelOutput);
const result = await runOneEval(evaluation, scenario, modelOutput);
return await prisma.outputEvaluation.upsert({
where: {
modelOutputId_evaluationId: {
@@ -15,10 +15,10 @@ const saveResult = async (evaluation: Evaluation, scenario: Scenario, modelOutpu
create: {
modelOutputId: modelOutput.id,
evaluationId: evaluation.id,
result,
...result,
},
update: {
result,
...result,
},
});
};
@@ -35,43 +35,6 @@ export const runEvalsForOutput = async (
await Promise.all(
evaluations.map(async (evaluation) => await saveResult(evaluation, scenario, modelOutput)),
);
// const cells = await prisma.scenarioVariantCell.findMany({
// where: {
// promptVariantId: variantId,
// retrievalStatus: "COMPLETE",
// testScenario: { visible: true },
// },
// include: { testScenario: true, modelOutput: { include: { OutputEvaluation: true } } },
// });
// await Promise.all(
// evaluations.map(async (evaluation) => {
// const passCount = cells.filter((cell) =>
// runOneEval(cell.modelOutput as ModelOutput, cell.testScenario, evaluation),
// ).length;
// const failCount = cells.length - passCount;
// await prisma.evaluationResult.upsert({
// where: {
// evaluationId_promptVariantId: {
// evaluationId: evaluation.id,
// promptVariantId: variantId,
// },
// },
// create: {
// evaluationId: evaluation.id,
// promptVariantId: variantId,
// passCount,
// failCount,
// },
// update: {
// passCount,
// failCount,
// },
// });
// }),
// );
};
export const runAllEvals = async (experimentId: string) => {
@@ -113,42 +76,4 @@ export const runAllEvals = async (experimentId: string) => {
);
}),
);
// const cells = await prisma.scenarioVariantCell.findMany({
// where: {
// promptVariantId: { in: variants.map((v) => v.id) },
// testScenario: { visible: true },
// statusCode: { notIn: [429] },
// },
// include: { testScenario: true, modelOutput: true },
// });
// await Promise.all(
// variants.map(async (variant) => {
// const variantCells = cells.filter((cell) => cell.promptVariantId === variant.id);
// const passCount = variantCells.filter((cell) =>
// runOneEval(cell.modelOutput as ModelOutput, cell.testScenario, evaluation),
// ).length;
// const failCount = variantCells.length - passCount;
// await prisma.evaluationResult.upsert({
// where: {
// evaluationId_promptVariantId: {
// evaluationId: evaluation.id,
// promptVariantId: variant.id,
// },
// },
// create: {
// evaluationId: evaluation.id,
// promptVariantId: variant.id,
// passCount,
// failCount,
// },
// update: {
// passCount,
// failCount,
// },
// });
// }),
// );
};