cache eval outputs and add gpt4 eval

2023-07-17 17:55:36 -07:00
parent 011b12abb9
commit 7d41e94ca2
8 changed files with 168 additions and 115 deletions
--- a/src/server/utils/evaluations.ts
+++ b/src/server/utils/evaluations.ts
@@ -4,7 +4,7 @@ import { runOneEval } from "./runOneEval";
 import { type Scenario } from "~/components/OutputsTable/types";

 const saveResult = async (evaluation: Evaluation, scenario: Scenario, modelOutput: ModelOutput) => {
-  const result = runOneEval(evaluation, scenario, modelOutput);
+  const result = await runOneEval(evaluation, scenario, modelOutput);
  return await prisma.outputEvaluation.upsert({
    where: {
      modelOutputId_evaluationId: {
@@ -15,10 +15,10 @@ const saveResult = async (evaluation: Evaluation, scenario: Scenario, modelOutpu
    create: {
      modelOutputId: modelOutput.id,
      evaluationId: evaluation.id,
-      result,
+      ...result,
    },
    update: {
-      result,
+      ...result,
    },
  });
 };
@@ -35,43 +35,6 @@ export const runEvalsForOutput = async (
  await Promise.all(
    evaluations.map(async (evaluation) => await saveResult(evaluation, scenario, modelOutput)),
  );
-
-  // const cells = await prisma.scenarioVariantCell.findMany({
-  //   where: {
-  //     promptVariantId: variantId,
-  //     retrievalStatus: "COMPLETE",
-  //     testScenario: { visible: true },
-  //   },
-  //   include: { testScenario: true, modelOutput: { include: { OutputEvaluation: true } } },
-  // });
-
-  // await Promise.all(
-  //   evaluations.map(async (evaluation) => {
-  //     const passCount = cells.filter((cell) =>
-  //       runOneEval(cell.modelOutput as ModelOutput, cell.testScenario, evaluation),
-  //     ).length;
-  //     const failCount = cells.length - passCount;
-
-  //     await prisma.evaluationResult.upsert({
-  //       where: {
-  //         evaluationId_promptVariantId: {
-  //           evaluationId: evaluation.id,
-  //           promptVariantId: variantId,
-  //         },
-  //       },
-  //       create: {
-  //         evaluationId: evaluation.id,
-  //         promptVariantId: variantId,
-  //         passCount,
-  //         failCount,
-  //       },
-  //       update: {
-  //         passCount,
-  //         failCount,
-  //       },
-  //     });
-  //   }),
-  // );
 };

 export const runAllEvals = async (experimentId: string) => {
@@ -113,42 +76,4 @@ export const runAllEvals = async (experimentId: string) => {
      );
    }),
  );
-
-  // const cells = await prisma.scenarioVariantCell.findMany({
-  //   where: {
-  //     promptVariantId: { in: variants.map((v) => v.id) },
-  //     testScenario: { visible: true },
-  //     statusCode: { notIn: [429] },
-  //   },
-  //   include: { testScenario: true, modelOutput: true },
-  // });
-
-  // await Promise.all(
-  //   variants.map(async (variant) => {
-  //     const variantCells = cells.filter((cell) => cell.promptVariantId === variant.id);
-  //     const passCount = variantCells.filter((cell) =>
-  //       runOneEval(cell.modelOutput as ModelOutput, cell.testScenario, evaluation),
-  //     ).length;
-  //     const failCount = variantCells.length - passCount;
-
-  //     await prisma.evaluationResult.upsert({
-  //       where: {
-  //         evaluationId_promptVariantId: {
-  //           evaluationId: evaluation.id,
-  //           promptVariantId: variant.id,
-  //         },
-  //       },
-  //       create: {
-  //         evaluationId: evaluation.id,
-  //         promptVariantId: variant.id,
-  //         passCount,
-  //         failCount,
-  //       },
-  //       update: {
-  //         passCount,
-  //         failCount,
-  //       },
-  //     });
-  //   }),
-  // );
 };
--- a/src/server/utils/runOneEval.ts
+++ b/src/server/utils/runOneEval.ts
@@ -1,32 +1,93 @@
 import { type Evaluation, type ModelOutput, type TestScenario } from "@prisma/client";
 import { type ChatCompletion } from "openai/resources/chat";
 import { type VariableMap, fillTemplate } from "./fillTemplate";
+import { openai } from "./openai";
+import dedent from "dedent";

-export const runOneEval = (
+/**
+ * Asks GPT-4 to grade a model's message against the success criteria in `evaluation.value`.
+ *
+ * The grader is forced to answer through the `report_success` function call, whose arguments
+ * carry a `success` flag and the grader's `thoughts`.
+ *
+ * @param evaluation - Evaluation whose `value` is placed in the system prompt as the definition
+ *   of success.
+ * @param scenario - Scenario whose `variableValues` are shown to the grader as JSON.
+ * @param message - Message being graded; its `content` is sent, or its `function_call` when
+ *   `content` is nullish.
+ * @returns `result` is 1 for a pass and 0 for a fail; `details` is the grader's reasoning, the
+ *   raw arguments when no textual reasoning was given, or a fixed error message when the
+ *   arguments cannot be parsed.
+ */
+export const runGpt4Eval = async (
+  evaluation: Evaluation,
+  scenario: TestScenario,
+  message: ChatCompletion.Choice.Message,
+): Promise<{ result: number; details: string }> => {
+  const output = await openai.chat.completions.create({
+    model: "gpt-4-0613",
+    messages: [
+      {
+        role: "system",
+        content: dedent`
+        You are a highly intelligent AI model and have been tasked with evaluating the quality of a simpler model. Your objective is to determine whether the simpler model has produced a successful and correct output. You should return "true" if the output was successful and "false" if it was not. Pay more attention to the semantics of the output than the formatting. Success is defined in the following terms:
+        ---
+        ${evaluation.value}
+        `,
+      },
+      {
+        role: "user",
+        content: `Scenario:\n---\n${JSON.stringify(scenario.variableValues, null, 2)}`,
+      },
+      {
+        role: "user",
+        content: `The full output of the simpler message:\n---\n${JSON.stringify(
+          message.content ?? message.function_call,
+          null,
+          2,
+        )}`,
+      },
+    ],
+    // Force the grader to reply via the function below instead of free text.
+    function_call: {
+      name: "report_success",
+    },
+    functions: [
+      {
+        name: "report_success",
+        parameters: {
+          type: "object",
+          required: ["thoughts", "success"],
+          properties: {
+            thoughts: {
+              type: "string",
+              description: "Explain your reasoning for considering this a pass or fail",
+            },
+            success: {
+              type: "boolean",
+              description:
+                "Whether the simpler model successfully completed the task for this scenario",
+            },
+          },
+        },
+      },
+    ],
+  });
+
+  try {
+    const out = JSON.parse(output.choices[0]?.message?.function_call?.arguments ?? "") as {
+      success?: unknown;
+      thoughts?: unknown;
+    };
+    // The arguments are generated by the model and may not follow the declared schema; the
+    // system prompt even asks for "true"/"false". Accept the string form explicitly, because a
+    // plain truthiness check would score the non-empty string "false" as a pass.
+    const success =
+      out.success === true ||
+      (typeof out.success === "string" && out.success.trim().toLowerCase() === "true");
+    // Keep `details` a string even when the grader omits or mistypes `thoughts`.
+    const details = typeof out.thoughts === "string" ? out.thoughts : JSON.stringify(out);
+    return { result: success ? 1 : 0, details };
+  } catch (e) {
+    // Missing or malformed arguments (including the "" fallback) land here and count as a fail.
+    console.error(e);
+    return { result: 0, details: "Error parsing GPT-4 output" };
+  }
+};
+
+/**
+ * Scores one model output against one evaluation.
+ *
+ * Reads the first choice's message from `modelOutput.output`. `CONTAINS` and `DOES_NOT_CONTAIN`
+ * match the message text against `evaluation.value` after `fillTemplate` has processed it with
+ * the scenario's variable values; `GPT4_EVAL` delegates to `runGpt4Eval`.
+ *
+ * @returns `result` is 1 for a pass and 0 for a fail (0 when the output has no message);
+ *   `details` is only provided by the `GPT4_EVAL` branch.
+ */
+export const runOneEval = async (
  evaluation: Evaluation,
  scenario: TestScenario,
  modelOutput: ModelOutput,
-): number => {
+): Promise<{ result: number; details?: string }> => {
  const output = modelOutput.output as unknown as ChatCompletion;

  const message = output?.choices?.[0]?.message;

-  if (!message) return 0;
+  // An output without a first-choice message is scored as a fail.
+  if (!message) return { result: 0 };

+  // Grade the text content; when content is nullish, grade the serialized function call instead.
  const stringifiedMessage = message.content ?? JSON.stringify(message.function_call);

+  // NOTE(review): presumably fillTemplate returns a string; String.prototype.match would then
+  // compile it as a regular expression, so an invalid pattern would throw — confirm.
  const matchRegex = fillTemplate(evaluation.value, scenario.variableValues as VariableMap);

-  let result;
-
+  // NOTE(review): there is no default case, so an evalType other than the three below falls
+  // out of the switch and the function resolves to undefined.
  switch (evaluation.evalType) {
    case "CONTAINS":
-      result = stringifiedMessage.match(matchRegex) !== null ? 1 : 0;
-      break;
+      return { result: stringifiedMessage.match(matchRegex) !== null ? 1 : 0 };
    case "DOES_NOT_CONTAIN":
-      result = stringifiedMessage.match(matchRegex) === null ? 1 : 0;
-      break;
+      return { result: stringifiedMessage.match(matchRegex) === null ? 1 : 0 };
+    case "GPT4_EVAL":
+      return await runGpt4Eval(evaluation, scenario, message);
  }
-
-  return result;
 };