feat: Add comprehensive visualization script for evaluation results (#376)

Author: Andreas Köpf
Date: 2025-03-16 12:26:27 +01:00
Committed by: GitHub
Parent: d6f399b8e4
Commit: 4e7d9296ee
3 changed files with 705 additions and 0 deletions


@@ -1,3 +1,4 @@
 openai>=1.64.0
 PyYAML>=6.0
 tqdm>=4.66.0
+matplotlib>=3.10.0

eval/visualize_results.py (new file, 704 additions)

@@ -0,0 +1,704 @@
#!/usr/bin/env python
"""
Visualization script for reasoning gym evaluation results.
This script generates visualizations from evaluation results stored in summary.json files.
Usage:
python visualize_results.py --results-dir results/ [options]
Options:
--output-dir DIR Directory to save visualizations (default: visualizations)
--plots PLOTS Comma-separated list of plots to generate (default: all)
Available: radar,bar,violin,heatmap,dashboard,distribution,top_datasets
--top-n N Number of datasets to show in top datasets plot (default: 15)
--top-mode MODE Mode for top datasets plot: hardest, easiest, variable (default: hardest)
--format FORMAT Output format for plots: png, pdf, svg (default: png)
--dpi DPI DPI for output images (default: 300)
--no-show Don't display plots, just save them
--debug Enable debug logging
"""
import argparse
import json
import logging
import os
import re
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.figure import Figure
from matplotlib.patches import Patch
# Configure logging
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
handlers=[logging.StreamHandler()],
)
logger = logging.getLogger("visualize_results")
def load_summaries(results_dir: str) -> Dict[str, Dict[str, Any]]:
"""Load all summary.json files from subdirectories.
Args:
results_dir: Directory containing model evaluation results
Returns:
Dictionary mapping model names to their summary data
"""
summaries = {}
results_path = Path(results_dir)
if not results_path.exists():
logger.error(f"Results directory {results_dir} does not exist")
return {}
# Find all summary.json files
for model_dir in results_path.iterdir():
if not model_dir.is_dir():
continue
summary_path = model_dir / "summary.json"
if not summary_path.exists():
logger.warning(f"No summary.json found in {model_dir}")
continue
try:
# Extract model name from directory name (remove timestamp)
model_name = re.sub(r"_\d{8}_\d{6}$", "", model_dir.name)
# Replace underscores with slashes in model name for better display
model_name = model_name.replace("_", "/")
with open(summary_path, "r") as f:
summary_data = json.load(f)
# Check if summary has required fields
if "dataset_best_scores" not in summary_data:
logger.warning(f"Summary in {model_dir} is missing required fields")
continue
summaries[model_name] = summary_data
logger.info(f"Loaded summary for {model_name}")
except Exception as e:
logger.error(f"Error loading summary from {model_dir}: {str(e)}")
if not summaries:
logger.error("No valid summary files found")
return summaries
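# Sketch of the minimal summary.json shape load_summaries() accepts; only the
# "dataset_best_scores" field is actually checked, and the dataset names and
# scores shown here are made up:
#
#   {
#       "dataset_best_scores": {
#           "chain_sum": 0.92,
#           "spell_backward": 0.41
#       }
#   }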
def get_dataset_categories(results_dir: str, summaries: Dict[str, Dict[str, Any]]) -> Dict[str, List[str]]:
"""Group datasets by their categories based on directory structure.
Args:
results_dir: Directory containing model evaluation results
summaries: Dictionary of model summaries
Returns:
Dictionary mapping category names to lists of dataset names
"""
categories = {}
results_path = Path(results_dir)
# Get all dataset names from the first summary
if not summaries:
return {}
first_summary = next(iter(summaries.values()))
all_datasets = set(first_summary["dataset_best_scores"].keys())
# Find categories by looking at directory structure
for model_dir in results_path.iterdir():
if not model_dir.is_dir():
continue
# Look for category directories
for category_dir in model_dir.iterdir():
if not category_dir.is_dir():
continue
category_name = category_dir.name
if category_name not in categories:
categories[category_name] = []
# Find all dataset JSON files in this category
for dataset_file in category_dir.glob("*.json"):
dataset_name = dataset_file.stem
if dataset_name in all_datasets and dataset_name not in categories[category_name]:
categories[category_name].append(dataset_name)
# Check if we found categories for all datasets
categorized_datasets = set()
for datasets in categories.values():
categorized_datasets.update(datasets)
uncategorized = all_datasets - categorized_datasets
if uncategorized:
logger.warning(f"Found {len(uncategorized)} datasets without categories")
categories["uncategorized"] = list(uncategorized)
return categories
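# Illustrative shape of the mapping returned by get_dataset_categories()
# (category and dataset names are placeholders):
#
#   {
#       "arithmetic": ["chain_sum", "products"],
#       "algorithmic": ["spell_backward"],
#       "uncategorized": ["datasets_with_no_category_directory"]
#   }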
def create_category_radar(summaries: Dict[str, Dict[str, Any]], categories: Dict[str, List[str]]) -> Figure:
"""Create a radar chart showing performance by category.
Args:
summaries: Dictionary of model summaries
categories: Dictionary mapping categories to dataset lists
Returns:
Matplotlib figure
"""
# Calculate average score per category for each model
category_scores = {}
for model_name, summary in summaries.items():
category_scores[model_name] = {}
for category, datasets in categories.items():
scores = [summary["dataset_best_scores"].get(dataset, 0) for dataset in datasets]
if scores: # Avoid division by zero
category_scores[model_name][category] = np.mean(scores)
else:
category_scores[model_name][category] = 0
# Create radar chart
categories_list = sorted(categories.keys())
angles = np.linspace(0, 2 * np.pi, len(categories_list), endpoint=False).tolist()
angles += angles[:1] # Close the loop
fig, ax = plt.subplots(figsize=(12, 10), subplot_kw=dict(polar=True))
# Use a color cycle for different models
colors = plt.cm.tab10.colors
for i, (model_name, scores) in enumerate(category_scores.items()):
color = colors[i % len(colors)]
values = [scores[cat] for cat in categories_list]
values += values[:1] # Close the loop
ax.plot(angles, values, linewidth=2, label=model_name, color=color)
ax.fill(angles, values, alpha=0.1, color=color)
# Set category labels
ax.set_xticks(angles[:-1])
ax.set_xticklabels(categories_list)
# Add radial grid lines at 0.2, 0.4, 0.6, 0.8
ax.set_rticks([0.2, 0.4, 0.6, 0.8])
ax.set_yticklabels(["0.2", "0.4", "0.6", "0.8"])
ax.set_rlabel_position(0) # Move radial labels away from plotted line
# Add legend and title
plt.legend(loc="upper right", bbox_to_anchor=(0.1, 0.1))
plt.title("Model Performance by Category", size=15)
return fig
def create_overall_performance_bar(summaries: Dict[str, Dict[str, Any]]) -> Figure:
"""Create a bar chart of overall model performance.
Args:
summaries: Dictionary of model summaries
Returns:
Matplotlib figure
"""
# Calculate overall average score for each model
overall_scores = {}
for model_name, summary in summaries.items():
scores = list(summary["dataset_best_scores"].values())
overall_scores[model_name] = np.mean(scores)
# Sort models by performance
sorted_models = sorted(overall_scores.items(), key=lambda x: x[1], reverse=True)
# Create bar chart
fig, ax = plt.subplots(figsize=(12, 6))
models = [m[0] for m in sorted_models]
scores = [m[1] for m in sorted_models]
# Use a color gradient based on performance
colors = plt.cm.viridis(np.linspace(0.2, 0.8, len(models)))
bars = ax.bar(models, scores, color=colors)
# Add value labels on top of bars
for bar in bars:
height = bar.get_height()
ax.text(bar.get_x() + bar.get_width() / 2.0, height + 0.01, f"{height:.2%}", ha="center", va="bottom")
ax.set_ylabel("Average Score")
ax.set_ylim(0, max(scores) * 1.1) # Add some space for labels
plt.xticks(rotation=45, ha="right")
plt.title("Overall Model Performance", size=15)
plt.tight_layout()
return fig
def create_top_datasets_comparison(summaries: Dict[str, Dict[str, Any]], n: int = 15, mode: str = "hardest") -> Figure:
"""Create a bar chart comparing performance on top N datasets.
Args:
summaries: Dictionary of model summaries
n: Number of datasets to show
mode: Selection mode - 'hardest', 'easiest', or 'variable'
Returns:
Matplotlib figure
"""
if not summaries:
logger.error("No summaries provided")
return plt.figure()
# Calculate average score across all models for each dataset
dataset_avg_scores = {}
for dataset in next(iter(summaries.values()))["dataset_best_scores"].keys():
scores = [summary["dataset_best_scores"].get(dataset, 0) for summary in summaries.values()]
dataset_avg_scores[dataset] = np.mean(scores)
# Select top N datasets based on mode
if mode == "hardest":
# Select datasets with lowest average scores
selected_datasets = sorted(dataset_avg_scores.items(), key=lambda x: x[1])[:n]
elif mode == "easiest":
# Select datasets with highest average scores
selected_datasets = sorted(dataset_avg_scores.items(), key=lambda x: x[1], reverse=True)[:n]
else: # 'variable'
# Select datasets with highest variance in scores
dataset_variances = {}
for dataset in next(iter(summaries.values()))["dataset_best_scores"].keys():
scores = [summary["dataset_best_scores"].get(dataset, 0) for summary in summaries.values()]
dataset_variances[dataset] = np.var(scores)
selected_datasets = sorted(dataset_variances.items(), key=lambda x: x[1], reverse=True)[:n]
selected_datasets = [(dataset, dataset_avg_scores[dataset]) for dataset, _ in selected_datasets]
# Create horizontal bar chart
fig, ax = plt.subplots(figsize=(12, n * 0.5))
datasets = [d[0] for d in selected_datasets]
x = np.arange(len(datasets))
width = 0.8 / len(summaries)
# Use a color cycle for different models
colors = plt.cm.tab10.colors
for i, (model_name, summary) in enumerate(summaries.items()):
scores = [summary["dataset_best_scores"].get(dataset, 0) for dataset, _ in selected_datasets]
ax.barh(
x + i * width - 0.4 + width / 2, scores, width, label=model_name, color=colors[i % len(colors)], alpha=0.8
)
ax.set_yticks(x)
ax.set_yticklabels(datasets)
ax.set_xlabel("Score")
ax.set_xlim(0, 1)
# Add legend and title
plt.legend(loc="upper right")
title = f'Model Performance on {n} {"Hardest" if mode=="hardest" else "Easiest" if mode=="easiest" else "Most Variable"} Datasets'
plt.title(title, size=15)
plt.tight_layout()
return fig
def create_performance_distribution_violin(summaries: Dict[str, Dict[str, Any]]) -> Figure:
"""Create a violin plot showing score distribution for each model.
Args:
summaries: Dictionary of model summaries
Returns:
Matplotlib figure
"""
# Prepare data for violin plot
data = []
labels = []
for model_name, summary in summaries.items():
scores = list(summary["dataset_best_scores"].values())
data.append(scores)
labels.append(model_name)
# Create violin plot
fig, ax = plt.subplots(figsize=(12, 6))
# Use a color cycle
colors = plt.cm.tab10.colors
parts = ax.violinplot(data, showmeans=True, showmedians=True)
# Customize violin plot
for i, pc in enumerate(parts["bodies"]):
pc.set_facecolor(colors[i % len(colors)])
pc.set_alpha(0.7)
# Add labels
ax.set_xticks(np.arange(1, len(labels) + 1))
ax.set_xticklabels(labels, rotation=45, ha="right")
ax.set_ylabel("Score Distribution")
ax.set_ylim(0, 1)
# Add grid for better readability
ax.yaxis.grid(True)
# Add mean and median to legend
legend_elements = [
Patch(facecolor="black", edgecolor="black", label="Mean", alpha=0.3),
Patch(facecolor="white", edgecolor="black", label="Median"),
]
ax.legend(handles=legend_elements, loc="upper right")
plt.title("Distribution of Scores Across All Datasets", size=15)
plt.tight_layout()
return fig
def create_performance_heatmap(summaries: Dict[str, Dict[str, Any]], categories: Dict[str, List[str]]) -> Figure:
"""Create a heatmap of model performance across datasets.
Args:
summaries: Dictionary of model summaries
categories: Dictionary mapping categories to dataset lists
Returns:
Matplotlib figure
"""
if not summaries:
logger.error("No summaries provided")
return plt.figure()
# Get all dataset names
all_datasets = []
for category, datasets in sorted(categories.items()):
all_datasets.extend(sorted(datasets))
models = list(summaries.keys())
# Create score matrix
score_matrix = np.zeros((len(models), len(all_datasets)))
for i, model in enumerate(models):
for j, dataset in enumerate(all_datasets):
score_matrix[i, j] = summaries[model]["dataset_best_scores"].get(dataset, 0)
# Create heatmap
fig, ax = plt.subplots(figsize=(max(20, len(all_datasets) * 0.25), max(8, len(models) * 0.5)))
im = ax.imshow(score_matrix, cmap="viridis", aspect="auto", vmin=0, vmax=1)
# Add colorbar
cbar = ax.figure.colorbar(im, ax=ax)
cbar.ax.set_ylabel("Score", rotation=-90, va="bottom")
# Set ticks and labels
ax.set_xticks(np.arange(len(all_datasets)))
ax.set_yticks(np.arange(len(models)))
ax.set_xticklabels(all_datasets, rotation=90, fontsize=8)
ax.set_yticklabels(models)
# Add category separators and labels
current_idx = 0
for category, datasets in sorted(categories.items()):
if datasets:
# Add vertical line after each category
next_idx = current_idx + len(datasets)
if next_idx < len(all_datasets):
ax.axvline(x=next_idx - 0.5, color="white", linestyle="-", linewidth=2)
# Add category label
middle_idx = current_idx + len(datasets) / 2 - 0.5
ax.text(
middle_idx,
-0.5,
category,
ha="center",
va="top",
fontsize=10,
bbox=dict(facecolor="white", alpha=0.7, edgecolor="none"),
)
current_idx = next_idx
# Add grid lines
ax.set_xticks(np.arange(-0.5, len(all_datasets), 1), minor=True)
ax.set_yticks(np.arange(-0.5, len(models), 1), minor=True)
ax.grid(which="minor", color="w", linestyle="-", linewidth=0.5)
plt.title("Model Performance Heatmap", size=15)
plt.tight_layout()
return fig
def create_dashboard(summaries: Dict[str, Dict[str, Any]], categories: Dict[str, List[str]]) -> Figure:
"""Create a comprehensive dashboard with multiple visualizations.
Args:
summaries: Dictionary of model summaries
categories: Dictionary mapping categories to dataset lists
Returns:
Matplotlib figure
"""
if not summaries:
logger.error("No summaries provided")
return plt.figure()
fig = plt.figure(figsize=(20, 15))
# 1. Overall performance comparison
ax1 = plt.subplot2grid((2, 2), (0, 0))
models = []
scores = []
for model_name, summary in summaries.items():
models.append(model_name)
scores.append(np.mean(list(summary["dataset_best_scores"].values())))
# Sort by performance
sorted_indices = np.argsort(scores)[::-1]
models = [models[i] for i in sorted_indices]
scores = [scores[i] for i in sorted_indices]
# Use a color gradient based on performance
colors = plt.cm.viridis(np.linspace(0.2, 0.8, len(models)))
bars = ax1.bar(models, scores, color=colors)
for bar in bars:
height = bar.get_height()
ax1.text(
bar.get_x() + bar.get_width() / 2.0, height + 0.01, f"{height:.2%}", ha="center", va="bottom", fontsize=8
)
ax1.set_ylabel("Average Score")
ax1.set_ylim(0, max(scores) * 1.1)
plt.setp(ax1.get_xticklabels(), rotation=45, ha="right", fontsize=8)
ax1.set_title("Overall Model Performance", size=12)
# 2. Top 10 hardest datasets comparison
ax2 = plt.subplot2grid((2, 2), (0, 1))
# Calculate average score across all models for each dataset
dataset_avg_scores = {}
for dataset in next(iter(summaries.values()))["dataset_best_scores"].keys():
scores = [summary["dataset_best_scores"].get(dataset, 0) for summary in summaries.values()]
dataset_avg_scores[dataset] = np.mean(scores)
# Select 10 hardest datasets
hardest_datasets = sorted(dataset_avg_scores.items(), key=lambda x: x[1])[:10]
datasets = [d[0] for d in hardest_datasets]
x = np.arange(len(datasets))
width = 0.8 / len(summaries)
# Use a color cycle for different models
colors = plt.cm.tab10.colors
for i, (model_name, summary) in enumerate(summaries.items()):
scores = [summary["dataset_best_scores"].get(dataset, 0) for dataset, _ in hardest_datasets]
ax2.barh(
x + i * width - 0.4 + width / 2, scores, width, label=model_name, color=colors[i % len(colors)], alpha=0.8
)
ax2.set_yticks(x)
ax2.set_yticklabels(datasets, fontsize=8)
ax2.set_xlabel("Score")
ax2.set_xlim(0, 1)
ax2.set_title("Performance on 10 Hardest Datasets", size=12)
ax2.legend(fontsize=8)
# 3. Category radar chart
ax3 = plt.subplot2grid((2, 2), (1, 0), polar=True)
# Calculate average score per category for each model
category_scores = {}
for model_name, summary in summaries.items():
category_scores[model_name] = {}
for category, datasets in categories.items():
scores = [summary["dataset_best_scores"].get(dataset, 0) for dataset in datasets]
if scores: # Avoid division by zero
category_scores[model_name][category] = np.mean(scores)
else:
category_scores[model_name][category] = 0
# Create radar chart
categories_list = sorted(categories.keys())
angles = np.linspace(0, 2 * np.pi, len(categories_list), endpoint=False).tolist()
angles += angles[:1] # Close the loop
for i, (model_name, scores) in enumerate(category_scores.items()):
color = colors[i % len(colors)]
values = [scores.get(cat, 0) for cat in categories_list]
values += values[:1] # Close the loop
ax3.plot(angles, values, linewidth=2, label=model_name, color=color)
ax3.fill(angles, values, alpha=0.1, color=color)
ax3.set_xticks(angles[:-1])
ax3.set_xticklabels(categories_list, fontsize=8)
ax3.set_title("Performance by Category", size=12)
# 4. Performance distribution violin plot
ax4 = plt.subplot2grid((2, 2), (1, 1))
data = []
labels = []
for model_name, summary in summaries.items():
scores = list(summary["dataset_best_scores"].values())
data.append(scores)
labels.append(model_name)
parts = ax4.violinplot(data, showmeans=True, showmedians=True)
for i, pc in enumerate(parts["bodies"]):
pc.set_facecolor(colors[i % len(colors)])
pc.set_alpha(0.7)
ax4.set_xticks(np.arange(1, len(labels) + 1))
ax4.set_xticklabels(labels, rotation=45, ha="right", fontsize=8)
ax4.set_ylabel("Score Distribution")
ax4.set_ylim(0, 1)
ax4.yaxis.grid(True)
ax4.set_title("Distribution of Scores", size=12)
plt.tight_layout()
plt.suptitle("Model Evaluation Dashboard", size=16, y=0.98)
plt.subplots_adjust(top=0.9)
return fig
def save_figure(fig: Figure, output_dir: str, name: str, fmt: str = "png", dpi: int = 300) -> str:
"""Save a figure to a file.
Args:
fig: Matplotlib figure to save
output_dir: Directory to save the figure
name: Base name for the figure file
fmt: File format (png, pdf, svg)
dpi: DPI for raster formats
Returns:
Path to the saved file
"""
# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)
# Create filename
filename = f"{name}.{fmt}"
filepath = os.path.join(output_dir, filename)
# Save figure
fig.savefig(filepath, dpi=dpi, bbox_inches="tight")
logger.info(f"Saved {filepath}")
return filepath
def main():
"""Main function."""
parser = argparse.ArgumentParser(description="Generate visualizations from evaluation results")
parser.add_argument("--results-dir", required=True, help="Directory containing evaluation results")
parser.add_argument("--output-dir", default="visualizations", help="Directory to save visualizations")
parser.add_argument("--plots", default="all", help="Comma-separated list of plots to generate")
parser.add_argument("--top-n", type=int, default=15, help="Number of datasets to show in top datasets plot")
parser.add_argument(
"--top-mode", default="hardest", choices=["hardest", "easiest", "variable"], help="Mode for top datasets plot"
)
parser.add_argument("--format", default="png", choices=["png", "pdf", "svg"], help="Output format for plots")
parser.add_argument("--dpi", type=int, default=300, help="DPI for output images")
parser.add_argument("--no-show", action="store_true", help="Don't display plots, just save them")
parser.add_argument("--debug", action="store_true", help="Enable debug logging")
args = parser.parse_args()
# Configure logging
if args.debug:
logger.setLevel(logging.DEBUG)
# Load summaries
logger.info(f"Loading summaries from {args.results_dir}")
summaries = load_summaries(args.results_dir)
if not summaries:
logger.error("No valid summaries found. Exiting.")
return 1
logger.info(f"Found {len(summaries)} model summaries")
# Get dataset categories
categories = get_dataset_categories(args.results_dir, summaries)
logger.info(f"Found {len(categories)} dataset categories")
# Determine which plots to generate
if args.plots.lower() == "all":
plots_to_generate = ["radar", "bar", "violin", "heatmap", "dashboard", "top_datasets"]
else:
plots_to_generate = [p.strip().lower() for p in args.plots.split(",")]
logger.info(f"Generating plots: {', '.join(plots_to_generate)}")
# Generate and save plots
for plot_type in plots_to_generate:
try:
if plot_type == "radar":
fig = create_category_radar(summaries, categories)
save_figure(fig, args.output_dir, "category_radar", args.format, args.dpi)
elif plot_type == "bar":
fig = create_overall_performance_bar(summaries)
save_figure(fig, args.output_dir, "overall_performance", args.format, args.dpi)
elif plot_type == "violin":
fig = create_performance_distribution_violin(summaries)
save_figure(fig, args.output_dir, "score_distribution", args.format, args.dpi)
elif plot_type == "heatmap":
fig = create_performance_heatmap(summaries, categories)
save_figure(fig, args.output_dir, "performance_heatmap", args.format, args.dpi)
elif plot_type == "dashboard":
fig = create_dashboard(summaries, categories)
save_figure(fig, args.output_dir, "evaluation_dashboard", args.format, args.dpi)
elif plot_type == "top_datasets":
fig = create_top_datasets_comparison(summaries, args.top_n, args.top_mode)
save_figure(fig, args.output_dir, f"top_{args.top_n}_{args.top_mode}_datasets", args.format, args.dpi)
else:
logger.warning(f"Unknown plot type: {plot_type}")
continue
# Show plot if requested
if not args.no_show:
plt.show()
else:
plt.close(fig)
except Exception as e:
logger.error(f"Error generating {plot_type} plot: {str(e)}")
if args.debug:
import traceback
traceback.print_exc()
logger.info(f"All visualizations saved to {args.output_dir}")
return 0
if __name__ == "__main__":
    import sys

    sys.exit(main())
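# Example invocation (paths are illustrative):
#
#   python visualize_results.py --results-dir results/ --output-dir visualizations --no-show
#
# With the default --plots all, this writes category_radar, overall_performance,
# score_distribution, performance_heatmap, evaluation_dashboard and
# top_15_hardest_datasets files (for the default --top-n/--top-mode) in the
# chosen --format to the output directory.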