update evals

Asankhaya Sharma
2024-10-01 16:45:04 -07:00
parent e831db3c5b
commit 083a058b27
5 changed files with 52 additions and 37 deletions

View File

@@ -22,7 +22,7 @@ from optillm.moa import mixture_of_agents
 from optillm.rto import round_trip_optimization
 from optillm.self_consistency import advanced_self_consistency_approach
 from optillm.pvg import inference_time_pv_game
-from optillm.z3_solver import Z3SolverSystem
+from optillm.z3_solver import Z3SymPySolverSystem
 from optillm.rstar import RStar
 from optillm.cot_reflection import cot_reflection
 from optillm.plansearch import plansearch
@@ -147,7 +147,7 @@ def execute_single_approach(approach, system_prompt, initial_query, client, mode
     elif approach == 'rto':
         return round_trip_optimization(system_prompt, initial_query, client, model)
     elif approach == 'z3':
-        z3_solver = Z3SolverSystem(system_prompt, client, model)
+        z3_solver = Z3SymPySolverSystem(system_prompt, client, model)
         return z3_solver.process_query(initial_query)
     elif approach == "self_consistency":
         return advanced_self_consistency_approach(system_prompt, initial_query, client, model)
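
Note: for readers trying the renamed solver directly, here is a minimal sketch of how the z3 approach above is wired up; the client, model name, and query are placeholders, and any OpenAI-compatible client should work.

```python
from openai import OpenAI
from optillm.z3_solver import Z3SymPySolverSystem

client = OpenAI()  # placeholder: any OpenAI-compatible client
solver = Z3SymPySolverSystem("You are a helpful assistant.", client, "gpt-4o-mini")

# process_query returns a (response_text, completion_tokens) pair,
# the same tuple convention the other approaches follow.
response, completion_tokens = solver.process_query(
    "If x + y = 10 and x - y = 2, what are x and y?"
)
print(response, completion_tokens)
```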

View File

@@ -59,7 +59,7 @@ def round_trip_optimization(system_prompt: str, initial_query: str, client, mode
     c2 = extract_code_from_prompt(c2)
     if c1.strip() == c2.strip():
-        return c1
+        return c1, rto_completion_tokens
     messages = [{"role": "system", "content": system_prompt},
                 {"role": "user", "content": f"Initial query: {initial_query}\n\nFirst generated code (C1):\n{c1}\n\nSecond generated code (C2):\n{c2}\n\nBased on the initial query and these two different code implementations, generate a final, optimized version of the code. Only respond with the final code, do not return anything else."}]

View File

@@ -1,5 +1,6 @@
 from typing import Dict, Any
 from z3 import *
+import sympy
 import io
 import re
 import contextlib
@@ -52,6 +53,7 @@ def prepare_safe_globals():
 def execute_code_in_process(code: str):
     import z3
+    import sympy
     import math
     import itertools
     from fractions import Fraction
@@ -62,9 +64,14 @@ def execute_code_in_process(code: str):
     z3_whitelist = set(dir(z3))
     safe_globals.update({name: getattr(z3, name) for name in z3_whitelist})
-    # Ensure key Z3 components are available
+    # Add SymPy specific functions
+    sympy_whitelist = set(dir(sympy))
+    safe_globals.update({name: getattr(sympy, name) for name in sympy_whitelist})
+    # Ensure key Z3 and SymPy components are available
     safe_globals.update({
         'z3': z3,
+        'sympy': sympy,
         'Solver': z3.Solver,
         'solver': z3.Solver,
         'Optimize': z3.Optimize,
@@ -83,6 +90,15 @@ def execute_code_in_process(code: str):
         'ForAll': z3.ForAll,
         'Exists': z3.Exists,
         'model': z3.Model,
+        'Symbol': sympy.Symbol,
+        'solve': sympy.solve,
+        'simplify': sympy.simplify,
+        'expand': sympy.expand,
+        'factor': sympy.factor,
+        'diff': sympy.diff,
+        'integrate': sympy.integrate,
+        'limit': sympy.limit,
+        'series': sympy.series,
     })
     # Add custom functions
@@ -114,41 +130,38 @@ def execute_code_in_process(code: str):
return ("error", traceback.format_exc())
return ("success", output_buffer.getvalue())
class Z3SolverSystem:
class Z3SymPySolverSystem:
def __init__(self, system_prompt: str, client, model: str, timeout: int = 30):
self.system_prompt = system_prompt
self.model = model
self.client = client
self.timeout = timeout
self.z3_completion_tokens = 0
self.solver_completion_tokens = 0
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
def process_query(self, query: str) -> str:
try:
analysis = self.analyze_query(query)
# print("Analysis: "+ analysis)
if "SOLVER_CAN_BE_APPLIED: True" not in analysis:
return self.standard_llm_inference(query) , self.z3_completion_tokens
return self.standard_llm_inference(query), self.solver_completion_tokens
formulation = self.extract_and_validate_expressions(analysis)
# print("Formulation: "+ formulation)
solver_result = self.solve_with_z3(formulation)
# print(solver_result)
solver_result = self.solve_with_z3_sympy(formulation)
return self.generate_response(query, analysis, solver_result), self.z3_completion_tokens
return self.generate_response(query, analysis, solver_result), self.solver_completion_tokens
except Exception as e:
logging.error(f"An error occurred while processing the query with Z3, returning standard llm inference results: {str(e)}")
return self.standard_llm_inference(query), self.z3_completion_tokens
logging.error(f"An error occurred while processing the query with Z3 and SymPy, returning standard llm inference results: {str(e)}")
return self.standard_llm_inference(query), self.solver_completion_tokens
def analyze_query(self, query: str) -> str:
analysis_prompt = f"""Analyze the given query and determine if it can be solved using Z3:
analysis_prompt = f"""Analyze the given query and determine if it can be solved using Z3 or SymPy:
1. Identify variables, constraints, and objectives.
2. Determine the problem type (e.g., SAT, optimization).
3. Decide if Z3 is suitable.
2. Determine the problem type (e.g., SAT, optimization, symbolic manipulation).
3. Decide if Z3, SymPy, or a combination of both is suitable.
If Z3 can be applied, provide Python code using Z3 to solve the problem. Make sure you define any additional methods you need for solving the problem.
The code will be executed in an environment with only Z3 available, so do not include any other libraries or modules.
If Z3 or SymPy can be applied, provide Python code using the appropriate library (or both) to solve the problem. Make sure you define any additional methods you need for solving the problem.
The code will be executed in an environment with Z3 and SymPy available, so do not include any other libraries or modules.
Query: {query}
@@ -157,7 +170,7 @@ SOLVER_CAN_BE_APPLIED: [True/False]
 SOLVER_FORMULATION:
 ```python
-# Z3 code here
+# Z3 and/or SymPy code here
 ```
 Analysis:
@@ -174,7 +187,7 @@ Analysis:
             n=1,
             temperature=0.1
         )
-        self.z3_completion_tokens = analysis_response.usage.completion_tokens
+        self.solver_completion_tokens = analysis_response.usage.completion_tokens
         return analysis_response.choices[0].message.content
     def generate_response(self, query: str, analysis: str, solver_result: Dict[str, Any]) -> str:
@@ -202,7 +215,7 @@ Response:
             n=1,
             temperature=0.1
         )
-        self.z3_completion_tokens = response.usage.completion_tokens
+        self.solver_completion_tokens = response.usage.completion_tokens
         return response.choices[0].message.content
     def standard_llm_inference(self, query: str) -> str:
@@ -216,27 +229,27 @@ Response:
             n=1,
             temperature=0.1
         )
-        self.z3_completion_tokens = response.usage.completion_tokens
+        self.solver_completion_tokens = response.usage.completion_tokens
         return response.choices[0].message.content
     def extract_and_validate_expressions(self, analysis: str) -> str:
         formulation = re.search(r"```python\n([\s\S]+?)```", analysis)
         if formulation:
             return formulation.group(1).strip()
-        raise ValueError("No valid Z3 formulation found in the analysis.")
+        raise ValueError("No valid Z3 or SymPy formulation found in the analysis.")
-    def solve_with_z3(self, formulation: str, max_attempts: int = 3) -> Dict[str, Any]:
+    def solve_with_z3_sympy(self, formulation: str, max_attempts: int = 3) -> Dict[str, Any]:
         for attempt in range(max_attempts):
             output = self.execute_solver_code(formulation)
             if "Error:" not in output:
                 return {"status": "success", "output": output}
-            error_prompt = f"""Fix the Z3 code that resulted in an error. Follow these steps:
+            error_prompt = f"""Fix the Z3 or SymPy code that resulted in an error. Follow these steps:
 1. Review the original code and the error message carefully.
 2. Analyze the error and identify its root cause.
 3. Think through the necessary changes to fix the error.
-4. Generate a corrected version of the Z3 code.
+4. Generate a corrected version of the code.
 Original Code:
 {formulation}
@@ -247,9 +260,10 @@ Response:
 Step-by-Step Analysis:
 [Provide your step-by-step analysis here]
-Corrected Z3 Code:
+Corrected Z3 or SymPy Code:
 ```python
-# Corrected Z3 code here
+# Corrected code here
 ```
 """
             response = self.client.chat.completions.create(
                 model=self.model,
@@ -261,13 +275,13 @@ Response:
                 n=1,
                 temperature=0.1
             )
-            self.z3_completion_tokens = response.usage.completion_tokens
+            self.solver_completion_tokens = response.usage.completion_tokens
             formulation = self.extract_and_validate_expressions(response.choices[0].message.content)
         return {"status": "failed", "output": "Failed to solve after multiple attempts."}
     def execute_solver_code(self, code: str) -> str:
-        logging.info("Executing Z3 solver code")
+        logging.info("Executing Z3 and SymPy solver code")
         logging.info(f"Code: {code}")
         # Parse the code into an AST
@@ -292,5 +306,5 @@ Response:
logging.error(f"Execution error: {result}")
return f"Error: {result}"
logging.info("Z3 solver code executed successfully")
logging.info("Z3 and SymPy solver code executed successfully")
return result
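
Note: to make the combined sandbox concrete, here is a sketch of the kind of formulation the updated analysis prompt asks for, using only names exposed through the Z3 and SymPy whitelists above. It assumes print remains an allowed builtin, which the captured output buffer suggests.

```python
# Constraint part: handled by Z3 (all z3 names are whitelisted via dir(z3)).
x, y = Ints('x y')
s = Solver()
s.add(x + y == 10, x - y == 2)
if s.check() == sat:
    print("Z3 model:", s.model())

# Symbolic part: handled by SymPy via the explicit additions above.
t = Symbol('t')
print("Roots:", solve(t**2 - 12*t + 24, t))
print("Derivative:", diff(t**3 - 3*t, t))
```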

View File

@@ -137,9 +137,9 @@ def main(model: str):
print(f"Accuracy: {accuracy:.2%}")
# Print accuracy by reasoning type
reasoning_types = set(r['reasoning_types'] for r in results)
reasoning_types = set(r['reasoning_type'] for r in results)
for rt in reasoning_types:
rt_samples = [r for r in results if r['reasoning_types'] == rt]
rt_samples = [r for r in results if r['reasoning_type'] == rt]
rt_correct = sum(1 for r in rt_samples if r['evaluation_decision'] == 'TRUE')
rt_accuracy = rt_correct / len(rt_samples)
print(f"Accuracy for {rt}: {rt_accuracy:.2%}")

View File

@@ -26,7 +26,7 @@ async def generate_response(prompt: str, approach: str) -> Dict[str, Any]:
         }
     else:
         # Use OptILM with the specified approach
-        client = AsyncOpenAI(api_key="none", base_url="http://localhost:8000/v1")
+        client = AsyncOpenAI(api_key="none", base_url="http://localhost:8080/v1")
         response = await client.chat.completions.create(
             model=f"{approach}-gpt-4o-mini", # Assuming OptILM uses this naming convention
             messages=[{"role": "user", "content": prompt}],
@@ -48,7 +48,7 @@ async def rank_responses(prompt: str, responses: List[Dict[str, Any]]) -> List[i
     )
     ranking_str = ranking_response.choices[0].message.content.strip()
-    print(ranking_str)
+    print(f"Ranking str: {ranking_str}")
     return [int(idx) for idx in ranking_str.split(",")]
 async def process_sample(sample: Dict[str, Any]) -> Dict[str, Any]:
@@ -66,6 +66,7 @@ async def process_sample(sample: Dict[str, Any]) -> Dict[str, Any]:
     rankings = await rank_responses(prompt, results)
     # Add rankings to results
+    print(rankings)
     for rank, idx in enumerate(rankings):
         results[idx]["rank"] = rank
@@ -79,7 +80,7 @@ async def generate_dataset(num_samples: int, output_file: str):
     dataset = load_dataset("lmsys/arena-hard-auto-v0.1", split="train")
     with open(output_file, "w") as f:
-        for sample in tqdm(dataset.select(range(num_samples)), total=num_samples):
+        for sample in tqdm(dataset.select(range(29, 29 + num_samples)), total=num_samples):
             result = await process_sample(sample)
             f.write(json.dumps(result) + "\n")
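
Note: the select change shifts the sampling window rather than the sample count, presumably to skip prompts already processed in an earlier run. A standalone sketch of the same selection, assuming the datasets library and the same split:

```python
from datasets import load_dataset

num_samples = 10  # illustrative value
dataset = load_dataset("lmsys/arena-hard-auto-v0.1", split="train")

# Skip the first 29 prompts and take the next num_samples ones.
subset = dataset.select(range(29, 29 + num_samples))
print(len(subset))  # 10
```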