update evals

Asankhaya Sharma
2024-10-01 16:45:04 -07:00
parent e831db3c5b
commit 083a058b27
5 changed files with 52 additions and 37 deletions

View File

@@ -22,7 +22,7 @@ from optillm.moa import mixture_of_agents
 from optillm.rto import round_trip_optimization
 from optillm.self_consistency import advanced_self_consistency_approach
 from optillm.pvg import inference_time_pv_game
-from optillm.z3_solver import Z3SolverSystem
+from optillm.z3_solver import Z3SymPySolverSystem
 from optillm.rstar import RStar
 from optillm.cot_reflection import cot_reflection
 from optillm.plansearch import plansearch
@@ -147,7 +147,7 @@ def execute_single_approach(approach, system_prompt, initial_query, client, mode
     elif approach == 'rto':
         return round_trip_optimization(system_prompt, initial_query, client, model)
     elif approach == 'z3':
-        z3_solver = Z3SolverSystem(system_prompt, client, model)
+        z3_solver = Z3SymPySolverSystem(system_prompt, client, model)
         return z3_solver.process_query(initial_query)
     elif approach == "self_consistency":
         return advanced_self_consistency_approach(system_prompt, initial_query, client, model)
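
Note: for readers trying the renamed solver directly, here is a minimal sketch of how the z3 approach above is wired up; the client, model name, and query are placeholders, and any OpenAI-compatible client should work.

```python
from openai import OpenAI
from optillm.z3_solver import Z3SymPySolverSystem

client = OpenAI()  # placeholder: any OpenAI-compatible client
solver = Z3SymPySolverSystem("You are a helpful assistant.", client, "gpt-4o-mini")

# process_query returns a (response_text, completion_tokens) pair,
# the same tuple convention the other approaches follow.
response, completion_tokens = solver.process_query(
    "If x + y = 10 and x - y = 2, what are x and y?"
)
print(response, completion_tokens)
```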

View File

@@ -59,7 +59,7 @@ def round_trip_optimization(system_prompt: str, initial_query: str, client, mode
     c2 = extract_code_from_prompt(c2)
     if c1.strip() == c2.strip():
-        return c1
+        return c1, rto_completion_tokens
     messages = [{"role": "system", "content": system_prompt},
                 {"role": "user", "content": f"Initial query: {initial_query}\n\nFirst generated code (C1):\n{c1}\n\nSecond generated code (C2):\n{c2}\n\nBased on the initial query and these two different code implementations, generate a final, optimized version of the code. Only respond with the final code, do not return anything else."}]

View File

@@ -1,5 +1,6 @@
 from typing import Dict, Any
 from z3 import *
+import sympy
 import io
 import re
 import contextlib
@@ -52,6 +53,7 @@ def prepare_safe_globals():
 def execute_code_in_process(code: str):
     import z3
+    import sympy
     import math
     import itertools
     from fractions import Fraction
@@ -62,9 +64,14 @@ def execute_code_in_process(code: str):
     z3_whitelist = set(dir(z3))
     safe_globals.update({name: getattr(z3, name) for name in z3_whitelist})
-    # Ensure key Z3 components are available
+    # Add SymPy specific functions
+    sympy_whitelist = set(dir(sympy))
+    safe_globals.update({name: getattr(sympy, name) for name in sympy_whitelist})
+    # Ensure key Z3 and SymPy components are available
     safe_globals.update({
         'z3': z3,
+        'sympy': sympy,
         'Solver': z3.Solver,
         'solver': z3.Solver,
         'Optimize': z3.Optimize,
@@ -83,6 +90,15 @@ def execute_code_in_process(code: str):
         'ForAll': z3.ForAll,
         'Exists': z3.Exists,
         'model': z3.Model,
+        'Symbol': sympy.Symbol,
+        'solve': sympy.solve,
+        'simplify': sympy.simplify,
+        'expand': sympy.expand,
+        'factor': sympy.factor,
+        'diff': sympy.diff,
+        'integrate': sympy.integrate,
+        'limit': sympy.limit,
+        'series': sympy.series,
     })
     # Add custom functions
@@ -114,41 +130,38 @@ def execute_code_in_process(code: str):
return ("error", traceback.format_exc())
return ("success", output_buffer.getvalue())
class Z3SolverSystem:
class Z3SymPySolverSystem:
def __init__(self, system_prompt: str, client, model: str, timeout: int = 30):
self.system_prompt = system_prompt
self.model = model
self.client = client
self.timeout = timeout
self.z3_completion_tokens = 0
self.solver_completion_tokens = 0
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
def process_query(self, query: str) -> str:
try:
analysis = self.analyze_query(query)
# print("Analysis: "+ analysis)
if "SOLVER_CAN_BE_APPLIED: True" not in analysis:
return self.standard_llm_inference(query) , self.z3_completion_tokens
return self.standard_llm_inference(query), self.solver_completion_tokens
formulation = self.extract_and_validate_expressions(analysis)
# print("Formulation: "+ formulation)
solver_result = self.solve_with_z3(formulation)
# print(solver_result)
solver_result = self.solve_with_z3_sympy(formulation)
return self.generate_response(query, analysis, solver_result), self.z3_completion_tokens
return self.generate_response(query, analysis, solver_result), self.solver_completion_tokens
except Exception as e:
logging.error(f"An error occurred while processing the query with Z3, returning standard llm inference results: {str(e)}")
return self.standard_llm_inference(query), self.z3_completion_tokens
logging.error(f"An error occurred while processing the query with Z3 and SymPy, returning standard llm inference results: {str(e)}")
return self.standard_llm_inference(query), self.solver_completion_tokens
def analyze_query(self, query: str) -> str:
analysis_prompt = f"""Analyze the given query and determine if it can be solved using Z3:
analysis_prompt = f"""Analyze the given query and determine if it can be solved using Z3 or SymPy:
1. Identify variables, constraints, and objectives.
2. Determine the problem type (e.g., SAT, optimization).
3. Decide if Z3 is suitable.
2. Determine the problem type (e.g., SAT, optimization, symbolic manipulation).
3. Decide if Z3, SymPy, or a combination of both is suitable.
If Z3 can be applied, provide Python code using Z3 to solve the problem. Make sure you define any additional methods you need for solving the problem.
The code will be executed in an environment with only Z3 available, so do not include any other libraries or modules.
If Z3 or SymPy can be applied, provide Python code using the appropriate library (or both) to solve the problem. Make sure you define any additional methods you need for solving the problem.
The code will be executed in an environment with Z3 and SymPy available, so do not include any other libraries or modules.
Query: {query}
@@ -157,7 +170,7 @@ SOLVER_CAN_BE_APPLIED: [True/False]
 SOLVER_FORMULATION:
 ```python
-# Z3 code here
+# Z3 and/or SymPy code here
 ```
 Analysis:
@@ -174,7 +187,7 @@ Analysis:
             n=1,
             temperature=0.1
         )
-        self.z3_completion_tokens = analysis_response.usage.completion_tokens
+        self.solver_completion_tokens = analysis_response.usage.completion_tokens
         return analysis_response.choices[0].message.content
     def generate_response(self, query: str, analysis: str, solver_result: Dict[str, Any]) -> str:
@@ -202,7 +215,7 @@ Response:
             n=1,
             temperature=0.1
         )
-        self.z3_completion_tokens = response.usage.completion_tokens
+        self.solver_completion_tokens = response.usage.completion_tokens
         return response.choices[0].message.content
     def standard_llm_inference(self, query: str) -> str:
@@ -216,27 +229,27 @@ Response:
             n=1,
             temperature=0.1
         )
-        self.z3_completion_tokens = response.usage.completion_tokens
+        self.solver_completion_tokens = response.usage.completion_tokens
         return response.choices[0].message.content
     def extract_and_validate_expressions(self, analysis: str) -> str:
         formulation = re.search(r"```python\n([\s\S]+?)```", analysis)
         if formulation:
             return formulation.group(1).strip()
-        raise ValueError("No valid Z3 formulation found in the analysis.")
+        raise ValueError("No valid Z3 or SymPy formulation found in the analysis.")
-    def solve_with_z3(self, formulation: str, max_attempts: int = 3) -> Dict[str, Any]:
+    def solve_with_z3_sympy(self, formulation: str, max_attempts: int = 3) -> Dict[str, Any]:
         for attempt in range(max_attempts):
             output = self.execute_solver_code(formulation)
             if "Error:" not in output:
                 return {"status": "success", "output": output}
-            error_prompt = f"""Fix the Z3 code that resulted in an error. Follow these steps:
+            error_prompt = f"""Fix the Z3 or SymPy code that resulted in an error. Follow these steps:
 1. Review the original code and the error message carefully.
 2. Analyze the error and identify its root cause.
 3. Think through the necessary changes to fix the error.
-4. Generate a corrected version of the Z3 code.
+4. Generate a corrected version of the code.
 Original Code:
 {formulation}
@@ -247,9 +260,10 @@ Response:
 Step-by-Step Analysis:
 [Provide your step-by-step analysis here]
-Corrected Z3 Code:
+Corrected Z3 or SymPy Code:
 ```python
-# Corrected Z3 code here
+# Corrected code here
 ```
 """
             response = self.client.chat.completions.create(
                 model=self.model,
@@ -261,13 +275,13 @@ Response:
                 n=1,
                 temperature=0.1
             )
-            self.z3_completion_tokens = response.usage.completion_tokens
+            self.solver_completion_tokens = response.usage.completion_tokens
             formulation = self.extract_and_validate_expressions(response.choices[0].message.content)
         return {"status": "failed", "output": "Failed to solve after multiple attempts."}
     def execute_solver_code(self, code: str) -> str:
-        logging.info("Executing Z3 solver code")
+        logging.info("Executing Z3 and SymPy solver code")
         logging.info(f"Code: {code}")
         # Parse the code into an AST
@@ -292,5 +306,5 @@ Response:
logging.error(f"Execution error: {result}")
return f"Error: {result}"
logging.info("Z3 solver code executed successfully")
logging.info("Z3 and SymPy solver code executed successfully")
return result
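
Note: to make the combined sandbox concrete, here is a sketch of the kind of formulation the updated analysis prompt asks for, using only names exposed through the Z3 and SymPy whitelists above. It assumes print remains an allowed builtin, which the captured output buffer suggests.

```python
# Constraint part: handled by Z3 (all z3 names are whitelisted via dir(z3)).
x, y = Ints('x y')
s = Solver()
s.add(x + y == 10, x - y == 2)
if s.check() == sat:
    print("Z3 model:", s.model())

# Symbolic part: handled by SymPy via the explicit additions above.
t = Symbol('t')
print("Roots:", solve(t**2 - 12*t + 24, t))
print("Derivative:", diff(t**3 - 3*t, t))
```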

View File

@@ -137,9 +137,9 @@ def main(model: str):
print(f"Accuracy: {accuracy:.2%}")
# Print accuracy by reasoning type
reasoning_types = set(r['reasoning_types'] for r in results)
reasoning_types = set(r['reasoning_type'] for r in results)
for rt in reasoning_types:
rt_samples = [r for r in results if r['reasoning_types'] == rt]
rt_samples = [r for r in results if r['reasoning_type'] == rt]
rt_correct = sum(1 for r in rt_samples if r['evaluation_decision'] == 'TRUE')
rt_accuracy = rt_correct / len(rt_samples)
print(f"Accuracy for {rt}: {rt_accuracy:.2%}")

View File

@@ -26,7 +26,7 @@ async def generate_response(prompt: str, approach: str) -> Dict[str, Any]:
         }
     else:
         # Use OptILM with the specified approach
-        client = AsyncOpenAI(api_key="none", base_url="http://localhost:8000/v1")
+        client = AsyncOpenAI(api_key="none", base_url="http://localhost:8080/v1")
         response = await client.chat.completions.create(
             model=f"{approach}-gpt-4o-mini", # Assuming OptILM uses this naming convention
             messages=[{"role": "user", "content": prompt}],
@@ -48,7 +48,7 @@ async def rank_responses(prompt: str, responses: List[Dict[str, Any]]) -> List[i
     )
     ranking_str = ranking_response.choices[0].message.content.strip()
-    print(ranking_str)
+    print(f"Ranking str: {ranking_str}")
     return [int(idx) for idx in ranking_str.split(",")]
 async def process_sample(sample: Dict[str, Any]) -> Dict[str, Any]:
@@ -66,6 +66,7 @@ async def process_sample(sample: Dict[str, Any]) -> Dict[str, Any]:
     rankings = await rank_responses(prompt, results)
     # Add rankings to results
+    print(rankings)
     for rank, idx in enumerate(rankings):
         results[idx]["rank"] = rank
@@ -79,7 +80,7 @@ async def generate_dataset(num_samples: int, output_file: str):
     dataset = load_dataset("lmsys/arena-hard-auto-v0.1", split="train")
     with open(output_file, "w") as f:
-        for sample in tqdm(dataset.select(range(num_samples)), total=num_samples):
+        for sample in tqdm(dataset.select(range(29, 29 + num_samples)), total=num_samples):
             result = await process_sample(sample)
             f.write(json.dumps(result) + "\n")
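
Note: the select change shifts the sampling window rather than the sample count, presumably to skip prompts already processed in an earlier run. A standalone sketch of the same selection, assuming the datasets library and the same split:

```python
from datasets import load_dataset

num_samples = 10  # illustrative value
dataset = load_dataset("lmsys/arena-hard-auto-v0.1", split="train")

# Skip the first 29 prompts and take the next num_samples ones.
subset = dataset.select(range(29, 29 + num_samples))
print(len(subset))  # 10
```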