Files
autothink/optillm/plugins/spl/evaluation.py
Asankhaya Sharma 6d6f8503cc fix edge cases
2025-05-16 10:05:18 +08:00

256 lines
11 KiB
Python

"""
Functions for evaluating strategies in the System Prompt Learning (SPL) plugin.
"""
import logging
from datetime import datetime
from typing import Dict, List, Optional, Tuple, Any
from optillm.plugins.spl.strategy import Strategy
from optillm.plugins.spl.utils import extract_thinking
from optillm.plugins.spl.prompts import (
STRATEGY_EVALUATION_PROMPT,
STRATEGY_REFINEMENT_PROMPT
)
from optillm.plugins.spl.config import (
DEFAULT_MAX_TOKENS,
MAX_STRATEGIES_FOR_INFERENCE,
MIN_SUCCESS_RATE_FOR_INFERENCE
)
# Setup logging
logger = logging.getLogger(__name__)
def select_relevant_strategies(query: str, problem_type: str, db: Any, learning_mode: bool = False, max_strategies: int = MAX_STRATEGIES_FOR_INFERENCE) -> List[Strategy]:
"""
Select the most relevant strategies for a given problem to be used during inference.
This controls how many strategies are included in the system prompt augmentation.
When in inference mode (not learning_mode), only strategies with:
- A matching problem type
- Success rate >= MIN_SUCCESS_RATE_FOR_INFERENCE
- At least 5 attempts
are selected.
In learning mode, strategies with fewer attempts are also considered.
Args:
query: The problem/query text
problem_type: The type of problem
db: Strategy database
learning_mode: Whether we're in learning mode (affects filtering criteria)
max_strategies: Maximum number of strategies to return
Returns:
List[Strategy]: The selected strategies (may be empty if none meet criteria)
"""
# First, get strategies specifically for this problem type
type_specific = db.get_strategies_for_problem(problem_type)
logger.info(f"Found {len(type_specific)} strategies for problem type '{problem_type}'")
# Filter strategies by minimum success rate and attempts
qualified_strategies = []
for strategy in type_specific:
# In learning mode, we're more lenient with new strategies
if learning_mode and strategy.total_attempts < 5:
logger.info(f"Strategy {strategy.strategy_id} included (learning mode - only {strategy.total_attempts} attempts so far)")
qualified_strategies.append(strategy)
# For inference or well-tested strategies, we require minimum success rate
elif strategy.success_rate >= MIN_SUCCESS_RATE_FOR_INFERENCE and strategy.total_attempts >= 5:
logger.info(f"Strategy {strategy.strategy_id} qualified - success rate {strategy.success_rate:.2f} >= minimum {MIN_SUCCESS_RATE_FOR_INFERENCE:.2f} with {strategy.total_attempts} attempts")
qualified_strategies.append(strategy)
else:
if strategy.total_attempts < 5:
logger.info(f"Strategy {strategy.strategy_id} skipped - insufficient attempts ({strategy.total_attempts} < 5) in inference mode")
else:
logger.info(f"Strategy {strategy.strategy_id} skipped - success rate {strategy.success_rate:.2f} < minimum {MIN_SUCCESS_RATE_FOR_INFERENCE:.2f}")
if not qualified_strategies:
logger.info(f"No strategies meet the minimum success rate threshold ({MIN_SUCCESS_RATE_FOR_INFERENCE:.2f}) for problem type '{problem_type}'")
return []
logger.info(f"Found {len(qualified_strategies)} strategies that meet minimum success rate requirement")
# If we have more qualified strategies than needed, sort and select the best ones
if len(qualified_strategies) > max_strategies:
# Score each strategy based on success rate and recency
scored_strategies = []
for strategy in qualified_strategies:
recency_score = 0
if strategy.last_used:
# Calculate days since last use
last_used = datetime.fromisoformat(strategy.last_used)
days_since = (datetime.now() - last_used).days
recency_score = max(0, 1.0 - min(1.0, days_since / 30.0)) # Higher for more recent
# Combined score with success rate weighing more
score = (0.7 * strategy.success_rate) + (0.3 * recency_score)
scored_strategies.append((strategy, score))
# Sort by score (descending) and take top strategies
scored_strategies.sort(key=lambda x: x[1], reverse=True)
selected = [s[0] for s in scored_strategies[:max_strategies]]
# Log which strategies we're using
for i, strategy in enumerate(selected, 1):
logger.info(f"Selected strategy {i}/{max_strategies} for inference: {strategy.strategy_id} (success rate: {strategy.success_rate:.2f})")
return selected
# If we have fewer or equal to the maximum, use all qualified strategies
for i, strategy in enumerate(qualified_strategies, 1):
logger.info(f"Selected strategy {i}/{len(qualified_strategies)} for inference: {strategy.strategy_id} (success rate: {strategy.success_rate:.2f})")
return qualified_strategies
def evaluate_strategy_effectiveness(response: str, thinking: Optional[str], selected_strategies: List[Strategy], client, model: str) -> Dict[str, bool]:
"""
Evaluate how effective each strategy was in generating the response.
Args:
response: The LLM's final response to the query
thinking: The LLM's reasoning process (if any)
selected_strategies: The strategies that were used
client: LLM client for making API calls
model: Model identifier
Returns:
Dict[str, bool]: Mapping from strategy ID to effectiveness (True/False)
"""
if not selected_strategies:
return {}
results = {}
try:
for strategy in selected_strategies:
# Include thinking in the evaluation if available
full_response = thinking + "\n\n" + response if thinking else response
messages = [
{
"role": "system",
"content": STRATEGY_EVALUATION_PROMPT
},
{
"role": "user",
"content": (
f"Strategy:\n{strategy.strategy_text}\n\n"
f"Response (including reasoning):\n{full_response}\n\n"
f"Does the response show clear evidence that the strategy was effectively applied? "
f"Answer with ONLY YES or NO."
)
}
]
eval_response = client.chat.completions.create(
model=model,
messages=messages,
temperature=0.1, # Low temperature for more deterministic output
max_tokens=DEFAULT_MAX_TOKENS # Increased token limit for reasoning LLMs
)
# Get the response and extract final answer (remove thinking blocks)
result_text = eval_response.choices[0].message.content
final_result, eval_thinking = extract_thinking(result_text)
# Clean up and normalize the result
final_result = final_result.strip().upper()
logger.debug(f"Strategy evaluation - raw response: '{result_text}'")
logger.debug(f"Strategy evaluation - final result after removing thinking: '{final_result}'")
# Check for YES in the final answer (not in thinking blocks)
is_effective = "YES" in final_result
results[strategy.strategy_id] = is_effective
logger.info(f"Strategy {strategy.strategy_id} evaluation: {final_result} -> {is_effective}")
except Exception as e:
logger.error(f"Error evaluating strategy effectiveness: {str(e)}")
# Default to neutral results if evaluation fails
for strategy in selected_strategies:
results[strategy.strategy_id] = False
return results
def refine_strategy(strategy: Strategy, problem: str, response: str, thinking: Optional[str], client, model: str) -> Strategy:
"""
Refine a strategy based on its application to a specific problem.
Args:
strategy: The strategy to refine
problem: The problem that was solved
response: The LLM's final response to the problem
thinking: The LLM's reasoning process (if any)
client: LLM client for making API calls
model: Model identifier
Returns:
Strategy: The refined strategy
"""
try:
# Include thinking in refinement if available
full_response = thinking + "\n\n" + response if thinking else response
messages = [
{
"role": "system",
"content": STRATEGY_REFINEMENT_PROMPT
},
{
"role": "user",
"content": (
f"Original strategy for {strategy.problem_type} problems:\n{strategy.strategy_text}\n\n"
f"New problem:\n{problem}\n\n"
f"Solution process (including reasoning):\n{full_response}\n\n"
f"Provide a refined version of the original strategy that incorporates "
f"any insights from this new example."
)
}
]
refine_response = client.chat.completions.create(
model=model,
messages=messages,
temperature=0.5,
max_tokens=DEFAULT_MAX_TOKENS # Increased token limit for reasoning LLMs
)
response_text = refine_response.choices[0].message.content
# Extract refined strategy and thinking
refined_text, refinement_thinking = extract_thinking(response_text)
if not refined_text.strip():
refined_text = response_text # Use full response if extraction failed
logger.debug(f"Strategy refinement - raw response: '{response_text}'")
logger.debug(f"Strategy refinement - final text after removing thinking: '{refined_text}'")
# Create a copy of the strategy with the refined text
refined_strategy = Strategy(
strategy_id=strategy.strategy_id,
problem_type=strategy.problem_type,
strategy_text=refined_text.strip(),
examples=strategy.examples + [problem],
success_count=strategy.success_count,
total_attempts=strategy.total_attempts,
created_at=strategy.created_at,
last_used=datetime.now().isoformat(),
last_updated=datetime.now().isoformat(),
confidence=strategy.confidence,
tags=strategy.tags,
reasoning_examples=strategy.reasoning_examples.copy()
)
# Add the refinement thinking if available
if refinement_thinking:
refined_strategy.add_reasoning_example(refinement_thinking)
return refined_strategy
except Exception as e:
logger.error(f"Error refining strategy: {str(e)}")
# Return the original strategy if refinement fails
return strategy