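"""Evaluate an OpenAI-compatible model on the google/frames-benchmark dataset.

Each question is sent (with its Wikipedia links) to the model via an optillm
proxy, the answer is graded against the ground truth by an LLM judge, and every
result is appended to a JSON file so an interrupted run can resume.

Example usage (the script filename here is illustrative):

    python eval_frames.py --model gpt-4o-mini
"""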
import argparse
import json
import os
import time
from typing import List, Dict

from openai import OpenAI
from datasets import load_dataset
from tqdm import tqdm

# Route requests through a local optillm proxy (OpenAI-compatible API).
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"), base_url="http://localhost:8000/v1")
# client = OpenAI()  # Uncomment to call the OpenAI API directly.

SLEEP_INTERVAL = 300  # Seconds between samples; used by the commented-out time.sleep() in main().

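# Results are persisted incrementally as a JSON list of dicts (one per sample),
# which lets a crashed or rate-limited run pick up from the last processed index.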
def load_existing_results(filename: str) -> List[Dict]:
    try:
        with open(filename, 'r') as f:
            return json.load(f)
    except FileNotFoundError:
        return []

def save_result(filename: str, result: Dict):
    results = load_existing_results(filename)
    results.append(result)
    with open(filename, 'w') as f:
        json.dump(results, f, indent=2)

def get_last_processed_index(results: List[Dict]) -> int:
    if not results:
        return -1
    return max(int(r.get('index', -1)) for r in results)

def generate_llm_prompt(prompt: str, wiki_links: List[str]) -> str:
    return f"Here are the relevant Wikipedia articles:\n{wiki_links}\n\nBased on all the information, answer the query. \n\nQuery: {prompt}\n\n"

def get_llm_response(prompt: str, model: str) -> str:
    response = client.with_options(timeout=1000.0).chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=1000,
        n=1,
        stop=None,
        temperature=0.7,
        # optillm extension: route this request through the readurls & memory approaches.
        extra_body={"optillm_approach": "readurls&memory"}
    )
    return response.choices[0].message.content.strip()

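# LLM-as-judge: ask the model whether the ground-truth answer is contained in
# the prediction, and parse out a TRUE/FALSE decision plus a short explanation.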
def evaluate_response(question: str, llm_response: str, ground_truth: str, model: str) -> Dict[str, str]:
    evaluation_prompt = f"""===Task===
I need your help in evaluating an answer provided by an LLM against a ground
truth answer. Your task is to determine if the ground truth answer is present in the LLM's
response. Please analyze the provided data and make a decision.

===Instructions===
1. Carefully compare the "Predicted Answer" with the "Ground Truth Answer".
2. Consider the substance of the answers - look for equivalent information or correct answers.
Do not focus on exact wording unless the exact wording is crucial to the meaning.
3. Your final decision should be based on whether the meaning and the vital facts of the
"Ground Truth Answer" are present in the "Predicted Answer:"

===Input Data===
- Question: {question}
- Predicted Answer: {llm_response}
- Ground Truth Answer: {ground_truth}

===Output Format===
Provide your final evaluation in the following format:

"Explanation:" (How you made the decision?)

"Decision:" ("TRUE" or "FALSE")

Please proceed with the evaluation."""

    evaluation_response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": evaluation_prompt}
        ],
        max_tokens=300,
        n=1,
        stop=None,
        temperature=0.3,
    )

    evaluation_text = evaluation_response.choices[0].message.content.strip()

    # Extract the decision and explanation from the judge's free-form reply.
    lines = evaluation_text.split('\n')
    decision = "FALSE"
    explanation = ""
    for line in lines:
        if line.startswith("Decision:"):
            # Split only on the first colon and tolerate quoted values such as '"TRUE"'.
            decision = line.split(":", 1)[1].strip().strip('"').upper()
        elif line.startswith("Explanation:"):
            explanation = line.split(":", 1)[1].strip()

    return {"decision": decision, "explanation": explanation}

def main(model: str):
    # Load the dataset
    dataset = load_dataset("google/frames-benchmark", split="test")

    filename = f"evaluation_results_{model.replace('/', '_')}.json"
    existing_results = load_existing_results(filename)
    last_processed_index = get_last_processed_index(existing_results)

    for item in tqdm(dataset, desc="Processing samples"):
        # The dataset's "Unnamed: 0" column is the original row index; use it to resume.
        index = int(item['Unnamed: 0'])
        if index <= last_processed_index:
            continue

        prompt = generate_llm_prompt(item['Prompt'], item['wiki_links'])
        llm_response = get_llm_response(prompt, model)
        evaluation = evaluate_response(item['Prompt'], llm_response, item['Answer'], model)

        result = {
            "index": index,
            "prompt": item['Prompt'],
            "ground_truth": item['Answer'],
            "llm_response": llm_response,
            "evaluation_decision": evaluation['decision'],
            "evaluation_explanation": evaluation['explanation'],
            "reasoning_type": item['reasoning_types']
        }

        save_result(filename, result)
        # print(f"Index: {index}, Decision: {result['evaluation_decision']}")
        # time.sleep(SLEEP_INTERVAL)

    # Calculate and print summary statistics
    results = load_existing_results(filename)
    total_samples = len(results)
    correct_answers = sum(1 for r in results if r['evaluation_decision'] == 'TRUE')
    accuracy = correct_answers / total_samples if total_samples else 0.0  # Guard against an empty results file.

    print(f"Model: {model}")
    print(f"Total samples: {total_samples}")
    print(f"Correct answers: {correct_answers}")
    print(f"Accuracy: {accuracy:.2%}")

    # Print accuracy by reasoning type
    reasoning_types = set(r['reasoning_type'] for r in results)
    for rt in reasoning_types:
        rt_samples = [r for r in results if r['reasoning_type'] == rt]
        rt_correct = sum(1 for r in rt_samples if r['evaluation_decision'] == 'TRUE')
        rt_accuracy = rt_correct / len(rt_samples)
        print(f"Accuracy for {rt}: {rt_accuracy:.2%}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Evaluate LLM performance on google/frames-benchmark")
    parser.add_argument("--model", type=str, required=True, help="OpenAI model to use (e.g., gpt-4o, gpt-4o-mini)")
    args = parser.parse_args()

    main(args.model)