mirror of https://github.com/codelion/optillm.git (synced 2025-05-28 09:39:38 +03:00)

update evals
@@ -22,7 +22,7 @@ from optillm.moa import mixture_of_agents
 from optillm.rto import round_trip_optimization
 from optillm.self_consistency import advanced_self_consistency_approach
 from optillm.pvg import inference_time_pv_game
-from optillm.z3_solver import Z3SolverSystem
+from optillm.z3_solver import Z3SymPySolverSystem
 from optillm.rstar import RStar
 from optillm.cot_reflection import cot_reflection
 from optillm.plansearch import plansearch
@@ -147,7 +147,7 @@ def execute_single_approach(approach, system_prompt, initial_query, client, mode
     elif approach == 'rto':
         return round_trip_optimization(system_prompt, initial_query, client, model)
     elif approach == 'z3':
-        z3_solver = Z3SolverSystem(system_prompt, client, model)
+        z3_solver = Z3SymPySolverSystem(system_prompt, client, model)
         return z3_solver.process_query(initial_query)
     elif approach == "self_consistency":
         return advanced_self_consistency_approach(system_prompt, initial_query, client, model)
@@ -59,7 +59,7 @@ def round_trip_optimization(system_prompt: str, initial_query: str, client, mode
     c2 = extract_code_from_prompt(c2)

     if c1.strip() == c2.strip():
-        return c1
+        return c1, rto_completion_tokens

     messages = [{"role": "system", "content": system_prompt},
                 {"role": "user", "content": f"Initial query: {initial_query}\n\nFirst generated code (C1):\n{c1}\n\nSecond generated code (C2):\n{c2}\n\nBased on the initial query and these two different code implementations, generate a final, optimized version of the code. Only respond with the final code, do not return anything else."}]
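With this change the early-exit branch returns the same (code, completion_tokens) pair as the merge branch, so callers can unpack the result unconditionally. A minimal, self-contained sketch of that uniform return shape, using an illustrative stub rather than the real round_trip_optimization:

```python
# Illustrative stub only: mirrors the (code, completion_tokens) return shape
# that round_trip_optimization now uses on both branches.
def round_trip_stub(c1: str, c2: str, completion_tokens: int):
    if c1.strip() == c2.strip():
        # Early exit: identical code, but still return the same tuple shape.
        return c1, completion_tokens
    # Stand-in for the LLM merge step in the real implementation.
    merged = c1 + "\n# merged with second implementation\n" + c2
    return merged, completion_tokens

code, tokens = round_trip_stub("print('hi')", "print('hi')", 42)
print(code, tokens)  # print('hi') 42
```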
@@ -1,5 +1,6 @@
 from typing import Dict, Any
 from z3 import *
+import sympy
 import io
 import re
 import contextlib
@@ -52,6 +53,7 @@ def prepare_safe_globals():

 def execute_code_in_process(code: str):
     import z3
+    import sympy
     import math
     import itertools
     from fractions import Fraction
@@ -62,9 +64,14 @@ def execute_code_in_process(code: str):
     z3_whitelist = set(dir(z3))
     safe_globals.update({name: getattr(z3, name) for name in z3_whitelist})

-    # Ensure key Z3 components are available
+    # Add SymPy specific functions
+    sympy_whitelist = set(dir(sympy))
+    safe_globals.update({name: getattr(sympy, name) for name in sympy_whitelist})
+
+    # Ensure key Z3 and SymPy components are available
     safe_globals.update({
         'z3': z3,
+        'sympy': sympy,
         'Solver': z3.Solver,
         'solver': z3.Solver,
         'Optimize': z3.Optimize,
@@ -83,6 +90,15 @@ def execute_code_in_process(code: str):
         'ForAll': z3.ForAll,
         'Exists': z3.Exists,
         'model': z3.Model,
+        'Symbol': sympy.Symbol,
+        'solve': sympy.solve,
+        'simplify': sympy.simplify,
+        'expand': sympy.expand,
+        'factor': sympy.factor,
+        'diff': sympy.diff,
+        'integrate': sympy.integrate,
+        'limit': sympy.limit,
+        'series': sympy.series,
     })

     # Add custom functions
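The two dir()-based whitelists above expose the public Z3 and SymPy names to the sandboxed exec environment, with a handful of aliases pinned explicitly afterwards. A simplified, self-contained sketch of that pattern; it assumes the z3-solver and sympy packages are installed and is not the repository's execute_code_in_process:

```python
import contextlib
import io

import sympy
import z3

def build_safe_globals():
    # Restricted builtins plus the public names from z3 and sympy,
    # mirroring the dir()-based whitelists in the diff above.
    safe_globals = {"__builtins__": {"print": print, "range": range, "len": len}}
    safe_globals.update({name: getattr(z3, name) for name in dir(z3)
                         if not name.startswith("_")})
    safe_globals.update({name: getattr(sympy, name) for name in dir(sympy)
                         if not name.startswith("_")})
    # Pin the names generated solver code is most likely to use.
    safe_globals.update({"z3": z3, "sympy": sympy,
                         "Solver": z3.Solver, "Int": z3.Int,
                         "Symbol": sympy.Symbol, "solve": sympy.solve})
    return safe_globals

# Invented stand-in for model-generated solver code.
generated_code = (
    "x = Symbol('x')\n"
    "print(solve(x**2 - 4, x))\n"    # SymPy: [-2, 2]
    "s = Solver()\n"
    "y = Int('y')\n"
    "s.add(y > 3, y < 5)\n"
    "print(s.check(), s.model())\n"  # Z3: sat [y = 4]
)

buffer = io.StringIO()
with contextlib.redirect_stdout(buffer):
    exec(generated_code, build_safe_globals())
print(buffer.getvalue())
```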
@@ -114,41 +130,38 @@ def execute_code_in_process(code: str):
         return ("error", traceback.format_exc())
     return ("success", output_buffer.getvalue())

-class Z3SolverSystem:
+class Z3SymPySolverSystem:
     def __init__(self, system_prompt: str, client, model: str, timeout: int = 30):
         self.system_prompt = system_prompt
         self.model = model
         self.client = client
         self.timeout = timeout
-        self.z3_completion_tokens = 0
+        self.solver_completion_tokens = 0
         logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

     def process_query(self, query: str) -> str:
         try:
             analysis = self.analyze_query(query)
             # print("Analysis: "+ analysis)
             if "SOLVER_CAN_BE_APPLIED: True" not in analysis:
-                return self.standard_llm_inference(query) , self.z3_completion_tokens
+                return self.standard_llm_inference(query), self.solver_completion_tokens

             formulation = self.extract_and_validate_expressions(analysis)
             # print("Formulation: "+ formulation)
-            solver_result = self.solve_with_z3(formulation)
+            solver_result = self.solve_with_z3_sympy(formulation)
             # print(solver_result)

-            return self.generate_response(query, analysis, solver_result), self.z3_completion_tokens
+            return self.generate_response(query, analysis, solver_result), self.solver_completion_tokens
         except Exception as e:
-            logging.error(f"An error occurred while processing the query with Z3, returning standard llm inference results: {str(e)}")
-            return self.standard_llm_inference(query), self.z3_completion_tokens
+            logging.error(f"An error occurred while processing the query with Z3 and SymPy, returning standard llm inference results: {str(e)}")
+            return self.standard_llm_inference(query), self.solver_completion_tokens

     def analyze_query(self, query: str) -> str:
-        analysis_prompt = f"""Analyze the given query and determine if it can be solved using Z3:
+        analysis_prompt = f"""Analyze the given query and determine if it can be solved using Z3 or SymPy:

 1. Identify variables, constraints, and objectives.
-2. Determine the problem type (e.g., SAT, optimization).
-3. Decide if Z3 is suitable.
+2. Determine the problem type (e.g., SAT, optimization, symbolic manipulation).
+3. Decide if Z3, SymPy, or a combination of both is suitable.

-If Z3 can be applied, provide Python code using Z3 to solve the problem. Make sure you define any additional methods you need for solving the problem.
-The code will be executed in an environment with only Z3 available, so do not include any other libraries or modules.
+If Z3 or SymPy can be applied, provide Python code using the appropriate library (or both) to solve the problem. Make sure you define any additional methods you need for solving the problem.
+The code will be executed in an environment with Z3 and SymPy available, so do not include any other libraries or modules.

 Query: {query}
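process_query only attempts a solver run when the analysis contains the literal SOLVER_CAN_BE_APPLIED: True marker; anything else falls back to plain LLM inference. A tiny self-contained illustration of that gate (the sample analysis text is invented):

```python
# Invented analysis text; only the literal marker matters for the gate.
analysis = "SOLVER_CAN_BE_APPLIED: False\nThe query asks for an opinion, not a constraint problem."

if "SOLVER_CAN_BE_APPLIED: True" not in analysis:
    print("Falling back to standard LLM inference")
else:
    print("Extracting solver formulation and running Z3/SymPy")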
@@ -157,7 +170,7 @@ SOLVER_CAN_BE_APPLIED: [True/False]

 SOLVER_FORMULATION:
 ```python
-# Z3 code here
+# Z3 and/or SymPy code here
 ```

 Analysis:
@@ -174,7 +187,7 @@ Analysis:
             n=1,
             temperature=0.1
         )
-        self.z3_completion_tokens = analysis_response.usage.completion_tokens
+        self.solver_completion_tokens = analysis_response.usage.completion_tokens
         return analysis_response.choices[0].message.content

     def generate_response(self, query: str, analysis: str, solver_result: Dict[str, Any]) -> str:
@@ -202,7 +215,7 @@ Response:
             n=1,
             temperature=0.1
         )
-        self.z3_completion_tokens = response.usage.completion_tokens
+        self.solver_completion_tokens = response.usage.completion_tokens
         return response.choices[0].message.content

     def standard_llm_inference(self, query: str) -> str:
@@ -216,27 +229,27 @@ Response:
             n=1,
             temperature=0.1
         )
-        self.z3_completion_tokens = response.usage.completion_tokens
+        self.solver_completion_tokens = response.usage.completion_tokens
         return response.choices[0].message.content

     def extract_and_validate_expressions(self, analysis: str) -> str:
         formulation = re.search(r"```python\n([\s\S]+?)```", analysis)
         if formulation:
             return formulation.group(1).strip()
-        raise ValueError("No valid Z3 formulation found in the analysis.")
+        raise ValueError("No valid Z3 or SymPy formulation found in the analysis.")

-    def solve_with_z3(self, formulation: str, max_attempts: int = 3) -> Dict[str, Any]:
+    def solve_with_z3_sympy(self, formulation: str, max_attempts: int = 3) -> Dict[str, Any]:
         for attempt in range(max_attempts):
             output = self.execute_solver_code(formulation)
             if "Error:" not in output:
                 return {"status": "success", "output": output}

-            error_prompt = f"""Fix the Z3 code that resulted in an error. Follow these steps:
+            error_prompt = f"""Fix the Z3 or SymPy code that resulted in an error. Follow these steps:

 1. Review the original code and the error message carefully.
 2. Analyze the error and identify its root cause.
 3. Think through the necessary changes to fix the error.
-4. Generate a corrected version of the Z3 code.
+4. Generate a corrected version of the code.

 Original Code:
 {formulation}
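extract_and_validate_expressions pulls the first ```python fenced block out of the model's analysis with a non-greedy regex. A self-contained sketch of that extraction; the analysis string below is fabricated, and the fence characters are assembled with chr(96) purely so the example stays readable:

```python
import re

FENCE = chr(96) * 3  # a literal ``` sequence
analysis = (
    "SOLVER_CAN_BE_APPLIED: True\n\n"
    "SOLVER_FORMULATION:\n"
    f"{FENCE}python\n"
    "x = Symbol('x')\n"
    "print(solve(x**2 - 4, x))\n"
    f"{FENCE}\n"
)

# Same pattern as in the diff: capture everything between the python fence markers.
match = re.search(r"```python\n([\s\S]+?)```", analysis)
if match:
    formulation = match.group(1).strip()
    print(formulation)
else:
    raise ValueError("No valid Z3 or SymPy formulation found in the analysis.")
```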
@@ -247,9 +260,10 @@ Response:
 Step-by-Step Analysis:
 [Provide your step-by-step analysis here]

-Corrected Z3 Code:
+Corrected Z3 or SymPy Code:
 ```python
-# Corrected Z3 code here
+# Corrected code here
 ```
 """
             response = self.client.chat.completions.create(
                 model=self.model,
@@ -261,13 +275,13 @@ Response:
                 n=1,
                 temperature=0.1
             )
-            self.z3_completion_tokens = response.usage.completion_tokens
+            self.solver_completion_tokens = response.usage.completion_tokens
             formulation = self.extract_and_validate_expressions(response.choices[0].message.content)

         return {"status": "failed", "output": "Failed to solve after multiple attempts."}

     def execute_solver_code(self, code: str) -> str:
-        logging.info("Executing Z3 solver code")
+        logging.info("Executing Z3 and SymPy solver code")
         logging.info(f"Code: {code}")

         # Parse the code into an AST
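solve_with_z3_sympy wraps execution in a bounded retry loop: if the captured output contains an error, the formulation is repaired and the loop tries again, up to max_attempts. A self-contained stub of that control flow, where the executor and the correction step are invented placeholders for the real subprocess run and LLM call:

```python
# Placeholder executor: fails until the code carries its own import.
def execute_stub(code: str) -> str:
    if "import" not in code:
        return "Error: NameError: name 'Symbol' is not defined"
    return "[-2, 2]"

def solve_with_retries(formulation: str, max_attempts: int = 3):
    for attempt in range(max_attempts):
        output = execute_stub(formulation)
        if "Error:" not in output:
            return {"status": "success", "output": output}
        # Stand-in for asking the model to repair the failing code.
        formulation = "from sympy import Symbol, solve\n" + formulation
    return {"status": "failed", "output": "Failed to solve after multiple attempts."}

print(solve_with_retries("x = Symbol('x')\nprint(solve(x**2 - 4, x))"))
```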
@@ -292,5 +306,5 @@ Response:
             logging.error(f"Execution error: {result}")
             return f"Error: {result}"

-        logging.info("Z3 solver code executed successfully")
+        logging.info("Z3 and SymPy solver code executed successfully")
         return result
@@ -137,9 +137,9 @@ def main(model: str):
     print(f"Accuracy: {accuracy:.2%}")

     # Print accuracy by reasoning type
-    reasoning_types = set(r['reasoning_types'] for r in results)
+    reasoning_types = set(r['reasoning_type'] for r in results)
     for rt in reasoning_types:
-        rt_samples = [r for r in results if r['reasoning_types'] == rt]
+        rt_samples = [r for r in results if r['reasoning_type'] == rt]
         rt_correct = sum(1 for r in rt_samples if r['evaluation_decision'] == 'TRUE')
         rt_accuracy = rt_correct / len(rt_samples)
         print(f"Accuracy for {rt}: {rt_accuracy:.2%}")
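With the key corrected to reasoning_type, the per-category report groups results by that field and computes the share of TRUE evaluation decisions in each group. A standalone sketch with made-up records:

```python
# Made-up evaluation records; only the two fields used by the report are included.
results = [
    {"reasoning_type": "numerical", "evaluation_decision": "TRUE"},
    {"reasoning_type": "numerical", "evaluation_decision": "FALSE"},
    {"reasoning_type": "spatial", "evaluation_decision": "TRUE"},
    {"reasoning_type": "spatial", "evaluation_decision": "TRUE"},
]

accuracy = sum(r["evaluation_decision"] == "TRUE" for r in results) / len(results)
print(f"Accuracy: {accuracy:.2%}")

for rt in set(r["reasoning_type"] for r in results):
    rt_samples = [r for r in results if r["reasoning_type"] == rt]
    rt_correct = sum(1 for r in rt_samples if r["evaluation_decision"] == "TRUE")
    print(f"Accuracy for {rt}: {rt_correct / len(rt_samples):.2%}")
```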
@@ -26,7 +26,7 @@ async def generate_response(prompt: str, approach: str) -> Dict[str, Any]:
         }
     else:
         # Use OptILM with the specified approach
-        client = AsyncOpenAI(api_key="none", base_url="http://localhost:8000/v1")
+        client = AsyncOpenAI(api_key="none", base_url="http://localhost:8080/v1")
         response = await client.chat.completions.create(
             model=f"{approach}-gpt-4o-mini",  # Assuming OptILM uses this naming convention
             messages=[{"role": "user", "content": prompt}],
@@ -48,7 +48,7 @@ async def rank_responses(prompt: str, responses: List[Dict[str, Any]]) -> List[i
     )

     ranking_str = ranking_response.choices[0].message.content.strip()
-    print(ranking_str)
+    print(f"Ranking str: {ranking_str}")
     return [int(idx) for idx in ranking_str.split(",")]

 async def process_sample(sample: Dict[str, Any]) -> Dict[str, Any]:
@@ -66,6 +66,7 @@ async def process_sample(sample: Dict[str, Any]) -> Dict[str, Any]:
     rankings = await rank_responses(prompt, results)

     # Add rankings to results
+    print(rankings)
     for rank, idx in enumerate(rankings):
         results[idx]["rank"] = rank

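rank_responses returns the ranking model's output as a comma-separated list of indices, which process_sample then converts into a rank field on each response. A self-contained sketch of that parse-and-assign step (the responses and ranking string are invented):

```python
# Invented responses and ranking string, standing in for the model outputs.
results = [
    {"approach": "none", "content": "baseline answer"},
    {"approach": "moa", "content": "mixture-of-agents answer"},
    {"approach": "rto", "content": "round-trip-optimized answer"},
]
ranking_str = "2,0,1"  # best response first, by index into results

rankings = [int(idx) for idx in ranking_str.split(",")]
for rank, idx in enumerate(rankings):
    results[idx]["rank"] = rank

for r in results:
    print(r["approach"], "rank:", r["rank"])
```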
@@ -79,7 +80,7 @@ async def generate_dataset(num_samples: int, output_file: str):
     dataset = load_dataset("lmsys/arena-hard-auto-v0.1", split="train")

     with open(output_file, "w") as f:
-        for sample in tqdm(dataset.select(range(num_samples)), total=num_samples):
+        for sample in tqdm(dataset.select(range(29, 29 + num_samples)), total=num_samples):
            result = await process_sample(sample)
            f.write(json.dumps(result) + "\n")