[eval-basic] run precommit formatting

rishabhranawat
2025-02-09 22:40:45 -08:00
parent 75cfd31ec2
commit c214724a46
10 changed files with 53 additions and 67 deletions
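
The pre-commit hook configuration that produced these changes is not part of this diff. As a rough sketch, assuming the repository uses the standard pre-commit tooling, the same formatting pass could be reproduced with something like:

# Hypothetical reproduction of this commit's formatting pass; the repository's
# actual hook configuration is not shown in this diff.
import subprocess

# Install the configured git hooks, then run every hook against the whole tree.
subprocess.run(["pre-commit", "install"], check=True)
# Hooks that rewrite files exit non-zero, so don't raise on that.
subprocess.run(["pre-commit", "run", "--all-files"], check=False)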


@@ -1,18 +1,17 @@
 import argparse
-from datetime import datetime
 import json
 import os
-from openai import OpenAI
+from datetime import datetime
 from typing import Any, Dict, List
+from openai import OpenAI
 from reasoning_gym.factory import DATASETS, create_dataset
 class OpenRouterEvaluator:
     def __init__(self, model: str):
-        self.client = OpenAI(
-            base_url="https://openrouter.ai/api/v1",
-            api_key=os.getenv('OPENROUTER_API_KEY')
-        )
+        self.client = OpenAI(base_url="https://openrouter.ai/api/v1", api_key=os.getenv("OPENROUTER_API_KEY"))
         self.model = model
         self.extra_headers = {}
@@ -20,12 +19,7 @@ class OpenRouterEvaluator:
         """Get response from the model via OpenRouter API."""
         try:
             completion = self.client.chat.completions.create(
-                extra_headers=self.extra_headers,
-                model=self.model,
-                messages=[{
-                    "role": "user",
-                    "content": prompt
-                }]
+                extra_headers=self.extra_headers, model=self.model, messages=[{"role": "user", "content": prompt}]
             )
             return completion.choices[0].message.content
         except Exception as e:
@@ -37,7 +31,7 @@ class OpenRouterEvaluator:
         all_results = []
         for dataset_config in dataset_configs:
-            dataset_name = dataset_config.pop('name')
+            dataset_name = dataset_config.pop("name")
             print(f"\nEvaluating dataset: {dataset_name}")
             try:
@@ -47,15 +41,15 @@ class OpenRouterEvaluator:
                 for entry in data:
                     try:
-                        response = self.get_model_response(entry['question'])
+                        response = self.get_model_response(entry["question"])
                         score = data.score_answer(answer=response, entry=entry)
                         result = {
-                            'question': entry['question'],
-                            'expected_answer': entry['answer'],
-                            'model_answer': response,
-                            'score': score,
-                            'metadata': entry['metadata']
+                            "question": entry["question"],
+                            "expected_answer": entry["answer"],
+                            "model_answer": response,
+                            "score": score,
+                            "metadata": entry["metadata"],
                         }
                         results.append(result)
                         print(f"Processed question {len(results)}/{len(data)}. Score: {score}")
@@ -65,21 +59,18 @@ class OpenRouterEvaluator:
                         print(f"Error: {str(e)}")
                 # Calculate aggregate metrics
-                total_score = sum(r['score'] for r in results)
+                total_score = sum(r["score"] for r in results)
                 metrics = {
-                    'dataset_name': dataset_name,
-                    'model': self.model,
-                    'size': len(data),
-                    'average_score': total_score / len(results) if results else 0,
-                    'total_examples': len(results),
-                    'timestamp': datetime.now().isoformat(),
-                    'config': dataset_config
+                    "dataset_name": dataset_name,
+                    "model": self.model,
+                    "size": len(data),
+                    "average_score": total_score / len(results) if results else 0,
+                    "total_examples": len(results),
+                    "timestamp": datetime.now().isoformat(),
+                    "config": dataset_config,
                 }
-                all_results.append({
-                    'metrics': metrics,
-                    'results': results
-                })
+                all_results.append({"metrics": metrics, "results": results})
             except Exception as e:
                 print(f"Error evaluating dataset {dataset_name}: {str(e)}")
@@ -89,13 +80,10 @@ class OpenRouterEvaluator:
 def main():
-    parser = argparse.ArgumentParser(
-        description='Evaluate models on reasoning datasets')
-    parser.add_argument('--model', required=True, help='Model to evaluate')
-    parser.add_argument('--config', required=True,
-                        help='Path to JSON configuration file')
-    parser.add_argument('--output-dir', default='results',
-                        help='Output directory')
+    parser = argparse.ArgumentParser(description="Evaluate models on reasoning datasets")
+    parser.add_argument("--model", required=True, help="Model to evaluate")
+    parser.add_argument("--config", required=True, help="Path to JSON configuration file")
+    parser.add_argument("--output-dir", default="results", help="Output directory")
     args = parser.parse_args()
@@ -103,7 +91,7 @@ def main():
     os.makedirs(args.output_dir, exist_ok=True)
     # Load dataset configurations
-    with open(args.config, 'r') as f:
+    with open(args.config, "r") as f:
        dataset_configs = json.load(f)
     evaluator = OpenRouterEvaluator(model=args.model)
@@ -111,35 +99,33 @@ def main():
     # Save results
     output_file = os.path.join(
-        args.output_dir,
-        f"evaluation_{args.model.replace('/', '_')}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
+        args.output_dir, f"evaluation_{args.model.replace('/', '_')}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
     )
     # Save detailed results
-    with open(output_file, 'w') as f:
+    with open(output_file, "w") as f:
         json.dump(all_results, f, indent=2)
     # Create summary
     summary = []
     for result in all_results:
-        metrics = result['metrics']
+        metrics = result["metrics"]
         summary_entry = {
-            'dataset_name': metrics['dataset_name'],
-            'model': metrics['model'],
-            'average_score': metrics['average_score'],
-            'total_examples': metrics['total_examples'],
-            'timestamp': metrics['timestamp'],
-            'config': metrics['config']
+            "dataset_name": metrics["dataset_name"],
+            "model": metrics["model"],
+            "average_score": metrics["average_score"],
+            "total_examples": metrics["total_examples"],
+            "timestamp": metrics["timestamp"],
+            "config": metrics["config"],
         }
         summary.append(summary_entry)
     # Save summary to a separate file
     summary_file = os.path.join(
-        args.output_dir,
-        f"summary_{args.model.replace('/', '_')}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
+        args.output_dir, f"summary_{args.model.replace('/', '_')}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
     )
-    with open(summary_file, 'w') as f:
+    with open(summary_file, "w") as f:
         json.dump(summary, f, indent=2)
     # Print summary
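
For reference, the reformatted script still takes the same inputs: a --model id, a JSON --config listing dataset entries whose "name" key is popped before the remaining fields are treated as that dataset's configuration, and an --output-dir, with OPENROUTER_API_KEY read from the environment. Below is a minimal sketch of preparing such a config and invoking the script; the file name eval.py, the dataset entry, and the model id are illustrative assumptions, not values taken from this diff.

# Illustrative only: "eval.py", the dataset entry, and the model id below are
# assumptions; the CLI flags and the "name" key come from the diff above.
import json
import subprocess

dataset_configs = [
    # "name" selects the reasoning_gym dataset; the remaining keys are kept as
    # that dataset's configuration.
    {"name": "example_dataset", "size": 50, "seed": 42},
]

with open("eval_config.json", "w") as f:
    json.dump(dataset_configs, f, indent=2)

# Requires OPENROUTER_API_KEY to be set in the environment.
subprocess.run(
    ["python", "eval.py", "--model", "openai/gpt-4o-mini", "--config", "eval_config.json", "--output-dir", "results"],
    check=True,
)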