diff --git a/eval/eval_basic.json b/eval/eval_basic.json
index 8e96863c..6240ce9d 100644
--- a/eval/eval_basic.json
+++ b/eval/eval_basic.json
@@ -18,4 +18,4 @@
         "size": 10,
         "seed": 42
     }
-]
\ No newline at end of file
+]
diff --git a/eval/eval_basic.py b/eval/eval_basic.py
index 6b605487..f8952e10 100644
--- a/eval/eval_basic.py
+++ b/eval/eval_basic.py
@@ -1,18 +1,17 @@
 import argparse
-from datetime import datetime
 import json
 import os
-from openai import OpenAI
+from datetime import datetime
 from typing import Any, Dict, List
 
+from openai import OpenAI
+
 from reasoning_gym.factory import DATASETS, create_dataset
 
+
 class OpenRouterEvaluator:
     def __init__(self, model: str):
-        self.client = OpenAI(
-            base_url="https://openrouter.ai/api/v1",
-            api_key=os.getenv('OPENROUTER_API_KEY')
-        )
+        self.client = OpenAI(base_url="https://openrouter.ai/api/v1", api_key=os.getenv("OPENROUTER_API_KEY"))
         self.model = model
         self.extra_headers = {}
 
@@ -20,12 +19,7 @@ class OpenRouterEvaluator:
         """Get response from the model via OpenRouter API."""
         try:
             completion = self.client.chat.completions.create(
-                extra_headers=self.extra_headers,
-                model=self.model,
-                messages=[{
-                    "role": "user",
-                    "content": prompt
-                }]
+                extra_headers=self.extra_headers, model=self.model, messages=[{"role": "user", "content": prompt}]
             )
             return completion.choices[0].message.content
         except Exception as e:
@@ -35,27 +29,27 @@ class OpenRouterEvaluator:
     def evaluate_datasets(self, dataset_configs: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
         """Evaluate model on multiple datasets with their respective configurations."""
         all_results = []
-        
+
         for dataset_config in dataset_configs:
-            dataset_name = dataset_config.pop('name')
+            dataset_name = dataset_config.pop("name")
             print(f"\nEvaluating dataset: {dataset_name}")
-            
+
             try:
                 # Create dataset with its specific configuration
                 data = create_dataset(dataset_name, **dataset_config)
                 results = []
-                
+
                 for entry in data:
                     try:
-                        response = self.get_model_response(entry['question'])
+                        response = self.get_model_response(entry["question"])
                         score = data.score_answer(answer=response, entry=entry)
 
                         result = {
-                            'question': entry['question'],
-                            'expected_answer': entry['answer'],
-                            'model_answer': response,
-                            'score': score,
-                            'metadata': entry['metadata']
+                            "question": entry["question"],
+                            "expected_answer": entry["answer"],
+                            "model_answer": response,
+                            "score": score,
+                            "metadata": entry["metadata"],
                         }
                         results.append(result)
                         print(f"Processed question {len(results)}/{len(data)}. Score: {score}")
@@ -65,21 +59,18 @@ class OpenRouterEvaluator:
                         print(f"Error: {str(e)}")
 
                 # Calculate aggregate metrics
-                total_score = sum(r['score'] for r in results)
+                total_score = sum(r["score"] for r in results)
                 metrics = {
-                    'dataset_name': dataset_name,
-                    'model': self.model,
-                    'size': len(data),
-                    'average_score': total_score / len(results) if results else 0,
-                    'total_examples': len(results),
-                    'timestamp': datetime.now().isoformat(),
-                    'config': dataset_config
+                    "dataset_name": dataset_name,
+                    "model": self.model,
+                    "size": len(data),
+                    "average_score": total_score / len(results) if results else 0,
+                    "total_examples": len(results),
+                    "timestamp": datetime.now().isoformat(),
+                    "config": dataset_config,
                 }
 
-                all_results.append({
-                    'metrics': metrics,
-                    'results': results
-                })
+                all_results.append({"metrics": metrics, "results": results})
 
             except Exception as e:
                 print(f"Error evaluating dataset {dataset_name}: {str(e)}")
@@ -89,13 +80,10 @@ def main():
-    parser = argparse.ArgumentParser(
-        description='Evaluate models on reasoning datasets')
-    parser.add_argument('--model', required=True, help='Model to evaluate')
-    parser.add_argument('--config', required=True,
-                        help='Path to JSON configuration file')
-    parser.add_argument('--output-dir', default='results',
-                        help='Output directory')
+    parser = argparse.ArgumentParser(description="Evaluate models on reasoning datasets")
+    parser.add_argument("--model", required=True, help="Model to evaluate")
+    parser.add_argument("--config", required=True, help="Path to JSON configuration file")
+    parser.add_argument("--output-dir", default="results", help="Output directory")
 
     args = parser.parse_args()
 
@@ -103,7 +91,7 @@ def main():
     os.makedirs(args.output_dir, exist_ok=True)
 
     # Load dataset configurations
-    with open(args.config, 'r') as f:
+    with open(args.config, "r") as f:
         dataset_configs = json.load(f)
 
     evaluator = OpenRouterEvaluator(model=args.model)
@@ -111,35 +99,33 @@
 
     # Save results
     output_file = os.path.join(
-        args.output_dir,
-        f"evaluation_{args.model.replace('/', '_')}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
+        args.output_dir, f"evaluation_{args.model.replace('/', '_')}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
     )
 
     # Save detailed results
-    with open(output_file, 'w') as f:
+    with open(output_file, "w") as f:
         json.dump(all_results, f, indent=2)
 
     # Create summary
     summary = []
     for result in all_results:
-        metrics = result['metrics']
+        metrics = result["metrics"]
         summary_entry = {
-            'dataset_name': metrics['dataset_name'],
-            'model': metrics['model'],
-            'average_score': metrics['average_score'],
-            'total_examples': metrics['total_examples'],
-            'timestamp': metrics['timestamp'],
-            'config': metrics['config']
+            "dataset_name": metrics["dataset_name"],
+            "model": metrics["model"],
+            "average_score": metrics["average_score"],
+            "total_examples": metrics["total_examples"],
+            "timestamp": metrics["timestamp"],
+            "config": metrics["config"],
         }
         summary.append(summary_entry)
 
     # Save summary to a separate file
     summary_file = os.path.join(
-        args.output_dir,
-        f"summary_{args.model.replace('/', '_')}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
+        args.output_dir, f"summary_{args.model.replace('/', '_')}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
     )
-    
-    with open(summary_file, 'w') as f:
+
+    with open(summary_file, "w") as f:
         json.dump(summary, f, indent=2)
 
     # Print summary
@@ -148,10 +134,10 @@ def main():
         print(f"\nDataset: {entry['dataset_name']}")
         print(f"Average Score: {entry['average_score']:.2%}")
         print(f"Total Examples: {entry['total_examples']}")
-    
+
     print(f"\nDetailed results saved to: {output_file}")
     print(f"Summary saved to: {summary_file}")
 
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/eval/eval_basic.sh b/eval/eval_basic.sh
index 2a7eb9bc..69f5251e 100644
--- a/eval/eval_basic.sh
+++ b/eval/eval_basic.sh
@@ -27,4 +27,4 @@ for model in "${MODELS[@]}"; do
         --output-dir "$OUTPUT_DIR"
 done
 
-echo "All evaluations completed!"
\ No newline at end of file
+echo "All evaluations completed!"
diff --git a/eval/results/evaluation_google_gemini-2.0-flash-001_20250209_223527.json b/eval/results/evaluation_google_gemini-2.0-flash-001_20250209_223527.json
index 2d4a5a93..5e82c747 100644
--- a/eval/results/evaluation_google_gemini-2.0-flash-001_20250209_223527.json
+++ b/eval/results/evaluation_google_gemini-2.0-flash-001_20250209_223527.json
@@ -644,4 +644,4 @@
       }
     ]
   }
-]
\ No newline at end of file
+]
diff --git a/eval/results/leg_counting_google_gemini-2.0-flash-001_20250209_220608.json b/eval/results/leg_counting_google_gemini-2.0-flash-001_20250209_220608.json
index 92d49c86..cb8b45cc 100644
--- a/eval/results/leg_counting_google_gemini-2.0-flash-001_20250209_220608.json
+++ b/eval/results/leg_counting_google_gemini-2.0-flash-001_20250209_220608.json
@@ -9,4 +9,4 @@
     "timestamp": "2025-02-10T06:06:08.539389"
   },
   "results": []
-}
\ No newline at end of file
+}
diff --git a/eval/results/leg_counting_google_gemini-2.0-flash-001_20250209_220753.json b/eval/results/leg_counting_google_gemini-2.0-flash-001_20250209_220753.json
index cbac4104..16bd308e 100644
--- a/eval/results/leg_counting_google_gemini-2.0-flash-001_20250209_220753.json
+++ b/eval/results/leg_counting_google_gemini-2.0-flash-001_20250209_220753.json
@@ -183,4 +183,4 @@
       }
     }
   ]
-}
\ No newline at end of file
+}
diff --git a/eval/results/propositional_logic_google_gemini-2.0-flash-001_20250209_220610.json b/eval/results/propositional_logic_google_gemini-2.0-flash-001_20250209_220610.json
index 590a48d6..49697c4b 100644
--- a/eval/results/propositional_logic_google_gemini-2.0-flash-001_20250209_220610.json
+++ b/eval/results/propositional_logic_google_gemini-2.0-flash-001_20250209_220610.json
@@ -9,4 +9,4 @@
     "timestamp": "2025-02-10T06:06:10.638347"
   },
   "results": []
-}
\ No newline at end of file
+}
diff --git a/eval/results/propositional_logic_google_gemini-2.0-flash-001_20250209_220824.json b/eval/results/propositional_logic_google_gemini-2.0-flash-001_20250209_220824.json
index 8144479d..13f80c7a 100644
--- a/eval/results/propositional_logic_google_gemini-2.0-flash-001_20250209_220824.json
+++ b/eval/results/propositional_logic_google_gemini-2.0-flash-001_20250209_220824.json
@@ -198,4 +198,4 @@
       }
     }
   ]
-}
\ No newline at end of file
+}
diff --git a/eval/results/summary_google_gemini-2.0-flash-001_20250209_223527.json b/eval/results/summary_google_gemini-2.0-flash-001_20250209_223527.json
index 661bdc24..8c9d6a5c 100644
--- a/eval/results/summary_google_gemini-2.0-flash-001_20250209_223527.json
+++ b/eval/results/summary_google_gemini-2.0-flash-001_20250209_223527.json
@@ -36,4 +36,4 @@
       "seed": 42
     }
   }
-]
\ No newline at end of file
+]
diff --git a/requirements-dev.txt b/requirements-dev.txt
index b082f034..18cbc82d 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -5,4 +5,4 @@ isort>=5.13.2
 flake8>=7.1.1
 mypy>=1.14.1
 pre-commit>=4.1.0
-openai>=1.61.1
\ No newline at end of file
+openai>=1.61.1
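
Usage note (not part of the patch): a minimal sketch of driving the reformatted evaluator directly, for anyone reviewing this change. It assumes the snippet runs from the eval/ directory so that eval_basic is importable, that the reasoning_gym package is installed, and that OPENROUTER_API_KEY is exported; the model id and dataset configuration simply mirror the values visible in eval_basic.json and the result filenames above.

# Sketch only: run one dataset configuration through OpenRouterEvaluator.
# Assumptions: executed from eval/, OPENROUTER_API_KEY set, reasoning_gym installed.
from eval_basic import OpenRouterEvaluator

evaluator = OpenRouterEvaluator(model="google/gemini-2.0-flash-001")
# Each config dict carries the dataset "name" plus kwargs forwarded to create_dataset().
all_results = evaluator.evaluate_datasets([{"name": "leg_counting", "size": 10, "seed": 42}])
for block in all_results:
    print(block["metrics"]["dataset_name"], block["metrics"]["average_score"])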