Add reasoning API and cleanup documentation

This commit is contained in:
DebarghaG
2025-10-02 17:38:16 -04:00
parent 312e408f9e
commit 366d1b825e
27 changed files with 4345 additions and 584 deletions

17
.env.example Normal file
View File

@@ -0,0 +1,17 @@
# Azure OpenAI Configuration
# Copy this file to .env and fill in your actual values
# Azure OpenAI Endpoint
AZURE_OPENAI_ENDPOINT=https://your-resource.cognitiveservices.azure.com/
# Azure OpenAI API Key
AZURE_OPENAI_KEY=your-api-key-here
# Azure Deployment Name
AZURE_DEPLOYMENT_NAME=gpt-5
# Azure API Version
AZURE_API_VERSION=2024-12-01-preview
# Standard OpenAI API Key (optional, for non-Azure usage)
OPENAI_API_KEY=your-openai-api-key-here

1
.gitignore vendored
View File

@@ -149,6 +149,7 @@ activemq-data/
# Environments
.env
.env.local
.envrc
.venv
env/

258
README.md
View File

@@ -1,234 +1,48 @@
# Z3 DSL Interpreter
# ProofOfThought
A JSON-based Domain-Specific Language (DSL) for the Z3 theorem prover, providing a declarative interface for formal verification and optimization.
LLM-based reasoning using Z3 theorem proving.
## Project Structure
## Quick Start
```python
from openai import OpenAI
from z3dsl.reasoning import ProofOfThought
client = OpenAI(api_key="...")
pot = ProofOfThought(llm_client=client)
result = pot.query("Would Nancy Pelosi publicly denounce abortion?")
print(result.answer) # False
```
proofofthought/
├── z3dsl/ # Main package
│ ├── __init__.py # Package exports
│ ├── interpreter.py # Main interpreter orchestration
│ ├── cli.py # Command-line interface
│ │
│ ├── solvers/ # Solver abstractions
│ │ ├── __init__.py
│ │ ├── abstract.py # AbstractSolver interface
│ │ └── z3_solver.py # Z3Solver implementation
│ │
│ ├── security/ # Security validation
│ │ ├── __init__.py
│ │ └── validator.py # Expression validation (AST checks)
│ │
│ ├── dsl/ # DSL components
│ │ ├── __init__.py
│ │ ├── sorts.py # Sort creation & topological sorting
│ │ └── expressions.py # Expression parsing & evaluation
│ │
│ ├── verification/ # Verification logic
│ │ ├── __init__.py
│ │ └── verifier.py # Verification condition checking
│ │
│ └── optimization/ # Optimization logic
│ ├── __init__.py
│ └── optimizer.py # Optimization problem solving
├── tests/ # Test files
│ └── 3.json # Example JSON configuration
├── run_interpreter.py # Convenience script
├── main.py # Legacy monolithic version (deprecated)
└── README.md # This file
## Batch Evaluation
```python
from z3dsl.reasoning import EvaluationPipeline
evaluator = EvaluationPipeline(pot, output_dir="results/")
result = evaluator.evaluate(
dataset="strategyqa_train.json",
max_samples=10
)
print(f"Accuracy: {result.metrics.accuracy:.2%}")
```
## Installation
```bash
pip install z3-solver openai scikit-learn numpy
```
## Architecture
### Core Components
The system has two layers:
1. **Interpreter** (`interpreter.py`)
- Orchestrates the entire interpretation pipeline
- Coordinates between all sub-components
- Manages configuration loading and validation
1. **High-level API** (`z3dsl.reasoning`) - Simple Python interface for reasoning tasks
2. **Low-level DSL** (`z3dsl`) - JSON-based Z3 theorem prover interface
2. **Solvers** (`solvers/`)
- `AbstractSolver`: Interface for solver implementations
- `Z3Solver`: Z3-specific solver wrapper
- Allows pluggable solver backends
Most users should use the high-level API.
3. **Security** (`security/`)
- `ExpressionValidator`: AST-based security checks
- Prevents code injection via dunder attributes, imports, eval/exec
- Validates expressions before evaluation
## Examples
4. **DSL** (`dsl/`)
- `SortManager`: Creates and manages Z3 sorts with dependency resolution
- `ExpressionParser`: Parses expressions with context management
- Handles sorts, functions, constants, variables
5. **Verification** (`verification/`)
- `Verifier`: Manages verification conditions
- Supports ForAll, Exists, and constraint-based verification
- Checks satisfiability with timeout support
6. **Optimization** (`optimization/`)
- `OptimizerRunner`: Handles optimization problems
- Supports maximize/minimize objectives
- Separate from main solver for independent problems
## Features
### Security Enhancements
- ✅ AST-based expression validation (blocks imports, dunder access, eval/exec)
- ✅ Restricted builtin access
- ✅ Safe expression evaluation with whitelisted operators
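To illustrate the idea, here is a minimal conceptual sketch of AST-based blocking (not the package's actual `ExpressionValidator` API; the helper name and deny-list below are illustrative):
```python
import ast

BLOCKED_CALLS = {"eval", "exec", "compile", "__import__"}  # illustrative deny-list


def is_safe_expression(source: str) -> bool:
    """Reject expressions that access dunder attributes or call eval/exec-style builtins."""
    tree = ast.parse(source, mode="eval")
    for node in ast.walk(tree):
        # Block obj.__class__, obj.__bases__, etc.
        if isinstance(node, ast.Attribute) and node.attr.startswith("__"):
            return False
        # Block eval(...), exec(...), compile(...), __import__(...)
        if isinstance(node, ast.Call) and isinstance(node.func, ast.Name) and node.func.id in BLOCKED_CALLS:
            return False
    return True


assert is_safe_expression("f(const1) > 0")
assert not is_safe_expression("().__class__.__bases__[0]")
```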
### Correctness Fixes
- ✅ Topological sorting for sort dependencies
- ✅ Proper quantifier validation (no empty ForAll/Exists)
- ✅ Context caching with lazy initialization
- ✅ Type-safe expression parsing (returns ExprRef not BoolRef)
- ✅ BitVecSort size validation (0 < size <= 65536)
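For instance, the topological sort lets dependent sorts appear in any order in the config. A minimal sketch (hypothetical sort names; assumes sections omitted from the config default to empty, as the integration tests suggest):
```python
import json
import os
import tempfile

from z3dsl import Z3JSONInterpreter

# Hypothetical sort names: "MyArray" references "MySort" before it is declared,
# and the topological sort resolves the dependency so loading still succeeds.
config = {
    "sorts": [
        {"name": "MyArray", "type": "ArraySort(MySort, IntSort)"},
        {"name": "MySort", "type": "DeclareSort"},
    ],
    "actions": ["verify_conditions"],
}

with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
    json.dump(config, f)
try:
    Z3JSONInterpreter(f.name).run()
finally:
    os.unlink(f.name)
```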
### Code Quality
- Modular architecture with separation of concerns
- Explicit imports (no wildcard imports)
- Comprehensive logging
- Type hints throughout
- Proper error handling and messages
## Usage
### Command Line
```bash
# Basic usage
python run_interpreter.py tests/3.json
# With custom timeouts
python run_interpreter.py tests/3.json \
--verify-timeout 20000 \
--optimize-timeout 50000
# With debug logging
python run_interpreter.py tests/3.json --log-level DEBUG
```
### As a Library
```python
from z3dsl import Z3JSONInterpreter
interpreter = Z3JSONInterpreter(
"config.json",
verify_timeout=10000,
optimize_timeout=100000
)
interpreter.run()
```
### Custom Solver
```python
from z3dsl import Z3JSONInterpreter, AbstractSolver
class CustomSolver(AbstractSolver):
# Implement abstract methods
pass
interpreter = Z3JSONInterpreter(
"config.json",
solver=CustomSolver()
)
interpreter.run()
```
## JSON Configuration Format
```json
{
"sorts": [
{"name": "MySort", "type": "DeclareSort"},
{"name": "MyBitVec", "type": "BitVecSort(8)"}
],
"functions": [
{"name": "f", "domain": ["MySort"], "range": "IntSort"}
],
"constants": {
"category": {
"sort": "MySort",
"members": ["const1", "const2"]
}
},
"variables": [
{"name": "x", "sort": "IntSort"}
],
"knowledge_base": [
"f(const1) > 0",
{"assertion": "f(const2) < 100", "value": true}
],
"rules": [
{
"forall": [{"name": "y", "sort": "MySort"}],
"constraint": "f(y) >= 0"
}
],
"verifications": [
{
"name": "Check Property",
"constraint": "f(const1) > f(const2)"
}
],
"actions": ["verify_conditions"]
}
```
## Dependencies
- Python 3.7+
- z3-solver
Install with:
```bash
pip install z3-solver
```
## Bug Fixes from Original
The refactored version fixes 16 critical bugs:
1. Wildcard import pollution
2. Type safety violations
3. Context cache timing issues
4. Variable shadowing
5. Security sandbox bypasses
6. Empty quantifier handling
7. Sort dependency ordering
8. Constants dict semantics
9. Optimization context isolation
10. Verification isolation
11. Logging race conditions
12. BitVecSort validation
13. Implication rules requiring ForAll
14. eval/exec/compile blocking
15. Function definition blocking
16. Sort dependency validation
See commit history for detailed explanations of each fix.
## Testing
```bash
# Run example test
python run_interpreter.py tests/3.json
# Expected output:
# INFO: Starting interpretation of tests/3.json
# INFO: Executing action: verify_conditions
# INFO: Checking 1 verification condition(s)
# INFO: Compare Unemployment Rates: SAT
# INFO: Model: [...]
# INFO: Interpretation completed successfully
```
## License
[Your license here]
## Contributing
Please see CONTRIBUTING.md for guidelines.
See the `examples/` directory for complete examples, including Azure OpenAI support.

61
REASONING_API.md Normal file
View File

@@ -0,0 +1,61 @@
# Reasoning API
## Overview
Simple Python API for LLM-based reasoning with Z3 theorem proving. Inspired by DSPy.
## API
### ProofOfThought
```python
from openai import OpenAI
from z3dsl.reasoning import ProofOfThought
client = OpenAI(api_key="...")
pot = ProofOfThought(
llm_client=client,
model="gpt-4o",
max_attempts=3,
verify_timeout=10000
)
result = pot.query("Your question here")
print(result.answer) # True/False/None
```
### EvaluationPipeline
```python
from z3dsl.reasoning import EvaluationPipeline
evaluator = EvaluationPipeline(pot, output_dir="results/")
result = evaluator.evaluate(
dataset="data.json",
max_samples=100
)
print(f"Accuracy: {result.metrics.accuracy:.2%}")
print(f"F1: {result.metrics.f1_score:.4f}")
```
## Result Objects
**QueryResult:**
- `answer: bool | None` - The predicted answer (`None` if it could not be determined)
- `success: bool` - Query succeeded
- `num_attempts: int` - Attempts taken
- `error: str | None` - Error if failed
**EvaluationMetrics:**
- accuracy, precision, recall, f1_score
- tp, fp, tn, fn (confusion matrix)
- total_samples, correct_answers, wrong_answers, failed_answers
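A short usage sketch based on the fields listed above, reusing the `pot` instance from the example earlier in this document:
```python
result = pot.query("Can fish breathe underwater?")

if result.success:
    print(f"Answer: {result.answer} (attempts: {result.num_attempts})")
else:
    print(f"Query failed after {result.num_attempts} attempts: {result.error}")
```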
## Error Handling
Automatic retry with feedback for:
- JSON extraction failures
- Z3 execution errors
- Ambiguous results (SAT + UNSAT)
- LLM API errors

View File

@@ -1,359 +0,0 @@
# Testing Documentation
## Test Suite Overview
The Z3 DSL Interpreter has a comprehensive test suite with **109 tests** covering all components and bug fixes.
```
tests/
├── unit/ # Unit tests for individual components
│ ├── test_security_validator.py # 18 tests
│ ├── test_sort_manager.py # 29 tests
│ ├── test_expression_parser.py # 18 tests
│ ├── test_verifier.py # 12 tests
│ └── test_optimizer.py # 10 tests
├── integration/ # Integration tests
│ ├── test_interpreter.py # 16 tests
│ └── test_bug_fixes.py # 16 tests
└── fixtures/ # Test data
├── simple_test.json
├── bitvec_test.json
├── enum_test.json
└── 3.json (original test)
```
## Running Tests
```bash
# Run all tests
python run_tests.py
# Run specific test file
python -m unittest tests.unit.test_security_validator
# Run specific test case
python -m unittest tests.unit.test_security_validator.TestExpressionValidator.test_check_safe_ast_blocks_dunder_attributes
# Run with verbose output
python -m unittest discover -s tests -p "test_*.py" -v
```
## Test Categories
### 1. Security Validator Tests (18 tests)
Tests for AST-based expression validation:
- ✅ Valid expression parsing
- ✅ Dunder attribute blocking (`__class__`, `__bases__`, etc.)
- ✅ Import statement blocking
- ✅ `eval()`, `exec()`, `compile()`, `__import__()` blocking
- ✅ Built-in access prevention
- ✅ Context and safe_globals usage
- ✅ Error handling (syntax, name errors)
- ✅ Lambda and comprehension support
**Key Tests:**
- `test_check_safe_ast_blocks_dunder_attributes` - Prevents `obj.__class__` exploits
- `test_check_safe_ast_blocks_eval_call` - Blocks code injection via eval
- `test_safe_eval_blocks_builtins` - Ensures no file system access
### 2. Sort Manager Tests (29 tests)
Tests for Z3 sort creation and management:
- ✅ Built-in sort initialization
- ✅ DeclareSort, EnumSort, BitVecSort, ArraySort creation
- ✅ BitVecSort size validation (>0, <=65536)
- ✅ Topological sorting for dependencies
- ✅ Circular dependency detection
- ✅ Function creation with proper domains
- ✅ Constant creation (list and dict formats)
- ✅ Variable creation
- ✅ Undefined sort detection
**Key Tests:**
- `test_create_bitvec_sort_zero_size` - Validates BitVecSort(0) fails
- `test_topological_sort_chain` - Ensures dependency ordering works
- `test_create_array_sort_undefined_domain` - Catches undefined references
### 3. Expression Parser Tests (18 tests)
Tests for expression parsing and evaluation:
- ✅ Simple arithmetic parsing
- ✅ Function call parsing
- ✅ Z3 operator usage (And, Or, Not, etc.)
- ✅ Quantified variable handling
- ✅ Context caching
- ✅ Variable shadowing warnings
- ✅ Knowledge base assertion parsing
- ✅ Rule parsing (ForAll, Implies)
- ✅ Empty quantifier validation
- ✅ Error handling
**Key Tests:**
- `test_quantified_var_shadows_constant_warning` - Detects shadowing
- `test_add_rules_empty_forall_raises_error` - Prevents vacuous quantification
- `test_build_context_with_symbols_loaded` - Verifies caching works
### 4. Verifier Tests (12 tests)
Tests for verification condition handling:
- ✅ Simple constraint verification
- ✅ Existential quantification (Exists)
- ✅ Universal quantification (ForAll)
- ✅ Empty quantifier detection
- ✅ SAT/UNSAT result checking
- ✅ Timeout configuration
- ✅ Unnamed verification handling
- ✅ Undefined sort detection
**Key Tests:**
- `test_verify_conditions_sat` - Checks satisfiable conditions
- `test_verify_conditions_unsat` - Checks unsatisfiable conditions
- `test_add_verification_empty_exists_raises_error` - Validates quantifiers
### 5. Optimizer Tests (10 tests)
Tests for optimization problem solving:
- ✅ No configuration handling
- ✅ Maximize objectives
- ✅ Minimize objectives
- ✅ Multiple constraints
- ✅ Global constant reference
- ✅ Unknown objective type warnings
- ✅ Invalid constraint syntax detection
- ✅ Timeout configuration
**Key Tests:**
- `test_optimize_references_global_constants` - Ensures global context access
- `test_optimize_simple_maximize` - Basic optimization works
- `test_optimize_unknown_objective_type` - Handles invalid configs
### 6. Integration Tests (16 tests)
End-to-end tests for the full interpreter:
- ✅ Loading and running various configurations
- ✅ File not found handling
- ✅ Invalid JSON handling
- ✅ Custom timeout configuration
- ✅ Missing section defaults
- ✅ Invalid constants structure handling
- ✅ Unknown action warnings
- ✅ verify_conditions action
- ✅ optimize action
- ✅ Topological sort integration
**Key Tests:**
- `test_load_and_run_existing_test` - Original test still works
- `test_load_invalid_json` - Proper error for malformed JSON
- `test_topological_sort_of_sorts` - Dependencies resolved correctly
### 7. Bug Fix Verification Tests (16 tests)
Tests verifying all 16 critical bugs are fixed:
1. ✅ Wildcard import elimination
2. ✅ Type annotation correctness (ExprRef not BoolRef)
3. ✅ Context cache timing
4. ✅ Variable shadowing warnings
5. ✅ AST-based security checking
6. ✅ Empty quantifier validation
7. ✅ Topological sort implementation
8. ✅ Constants dict semantics
9. ✅ Optimization global context
10. ✅ Verification check semantics
11. ✅ Logging configuration location
12. ✅ BitVecSort validation
13. ✅ Implication requires ForAll
14. ✅ eval/exec/compile blocking
15. ✅ Function definition blocking
16. ✅ Sort dependency validation
**Key Tests:**
- `test_bug5_security_sandbox_ast_based` - Confirms AST checking works
- `test_bug7_topological_sort_implemented` - Dependency resolution
- `test_bug12_bitvec_validation` - Size bounds checking
## Test Coverage
### Component Coverage
| Component | Tests | Coverage |
|-----------|-------|----------|
| Security Validator | 18 | 100% |
| Sort Manager | 29 | 98% |
| Expression Parser | 18 | 95% |
| Verifier | 12 | 100% |
| Optimizer | 10 | 95% |
| Interpreter | 16 | 90% |
| Bug Fixes | 16 | 100% |
### Feature Coverage
- **Security**: Comprehensive (dunder, imports, eval, builtins)
- **Sort Types**: All types covered (Declare, Enum, BitVec, Array, built-ins)
- **Quantifiers**: ForAll, Exists, empty validation
- **Rules**: Implications, constraints, quantification
- **Verification**: SAT/UNSAT checking, timeouts
- **Optimization**: Maximize, minimize, constraints
- **Error Handling**: All error paths tested
## Test Patterns
### 1. Positive Tests
Test that valid inputs work correctly:
```python
def test_create_declare_sort(self):
sort_defs = [{'name': 'MySort', 'type': 'DeclareSort'}]
self.sort_manager.create_sorts(sort_defs)
self.assertIn('MySort', self.sort_manager.sorts)
```
### 2. Negative Tests
Test that invalid inputs raise appropriate errors:
```python
def test_create_bitvec_sort_zero_size(self):
sort_defs = [{'name': 'MyBV0', 'type': 'BitVecSort(0)'}]
with self.assertRaises(ValueError) as ctx:
self.sort_manager.create_sorts(sort_defs)
self.assertIn("must be positive", str(ctx.exception))
```
### 3. Log Verification
Test that warnings/errors are logged:
```python
def test_quantified_var_shadows_constant_warning(self):
shadow_var = Const('x', IntSort())
with self.assertLogs(level='WARNING') as cm:
context = self.parser.build_context([shadow_var])
self.assertTrue(any('shadows' in msg for msg in cm.output))
```
### 4. Integration Tests
Test complete workflows:
```python
def test_load_and_run_simple_config(self):
interpreter = Z3JSONInterpreter('tests/fixtures/simple_test.json')
interpreter.run() # Should not raise
```
## Common Test Utilities
### Temporary JSON Files
```python
with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
json.dump(config, f)
temp_file = f.name
try:
interpreter = Z3JSONInterpreter(temp_file)
interpreter.run()
finally:
os.unlink(temp_file)
```
### Log Capturing
```python
with self.assertLogs(level='INFO') as cm:
interpreter.run()
self.assertTrue(any('SAT' in msg for msg in cm.output))
```
### Exception Checking
```python
with self.assertRaises(ValueError) as ctx:
parser.parse_expression("invalid syntax +")
self.assertIn("Syntax error", str(ctx.exception))
```
## Continuous Testing
### Pre-commit Hook
Add to `.git/hooks/pre-commit`:
```bash
#!/bin/bash
python run_tests.py
if [ $? -ne 0 ]; then
echo "Tests failed. Commit aborted."
exit 1
fi
```
### CI/CD Integration
```yaml
# .github/workflows/test.yml
name: Tests
on: [push, pull_request]
jobs:
test:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: actions/setup-python@v2
with:
python-version: '3.9'
- run: pip install z3-solver
- run: python run_tests.py
```
## Test Results
```
Ran 109 tests in 0.055s
OK
```
**All tests passing! ✅**
- 0 failures
- 0 errors
- 109 successes
- 100% pass rate
## Adding New Tests
When adding new features:
1. **Add unit tests** for the component
2. **Add integration test** for end-to-end workflow
3. **Add fixture** if new JSON format needed
4. **Update this document** with test descriptions
Example:
```python
def test_new_feature(self):
"""Test description of what this verifies."""
# Arrange
setup_test_data()
# Act
result = component.new_method()
# Assert
self.assertEqual(result, expected)
```
## Known Limitations
- **Z3 Global State**: Enum sorts persist across tests (handled with unique names)
- **Timeout Tests**: Hard to test actual timeouts without long-running tests
- **Model Validation**: Can't easily validate specific model values, only SAT/UNSAT
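For the enum-sort limitation, the unique-name workaround can look like the following when calling the z3 API directly (a sketch; the suffix scheme here is illustrative, not necessarily what the test suite uses):
```python
import uuid

from z3 import EnumSort

# A fresh suffix per test avoids collisions in Z3's global enum registry
# when the same sort name is created across multiple test cases.
suffix = uuid.uuid4().hex[:8]
Color, (red, green, blue) = EnumSort(f"Color_{suffix}", ["red", "green", "blue"])
```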
## Conclusion
The test suite provides comprehensive coverage of:
- ✅ All 16 critical bug fixes
- ✅ Security validation
- ✅ Sort management
- ✅ Expression parsing
- ✅ Verification logic
- ✅ Optimization logic
- ✅ End-to-end workflows
- ✅ Error handling
**Total: 109 tests, 100% passing**

976
benchmark_pipeline.py Normal file
View File

@@ -0,0 +1,976 @@
import io
import json
import logging
import os
import re
import time
import traceback
from contextlib import redirect_stderr, redirect_stdout
from typing import Any
import numpy as np
from openai import OpenAI
from sklearn.metrics import (
accuracy_score,
confusion_matrix,
precision_score,
recall_score,
)
from z3dsl.interpreter import Z3JSONInterpreter
def calculate_metrics(y_true: list[Any], y_pred: list[Any]) -> dict[str, Any]:
y_true = np.array(y_true)
y_pred = np.array(y_pred)
# Handle the case where there's only one class
if len(np.unique(y_true)) == 1 or len(np.unique(y_pred)) == 1:
accuracy = accuracy_score(y_true, y_pred)
if np.array_equal(y_true, y_pred):
if y_true[0] == 1: # All positive
tp, fp, tn, fn = len(y_true), 0, 0, 0
else: # All negative
tp, fp, tn, fn = 0, 0, len(y_true), 0
else:
if y_true[0] == 1: # All true positive, some false negative
tp = np.sum(y_pred)
fn = len(y_true) - tp
fp, tn = 0, 0
else: # All true negative, some false positive
y_pred_arr = np.array(y_pred)
tn = int(np.sum(y_pred_arr == 0))
fp = len(y_true) - tn
tp, fn = 0, 0
precision = tp / (tp + fp) if (tp + fp) > 0 else 0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
false_positive_rate = fp / (fp + tn) if (fp + tn) > 0 else 0
false_negative_rate = fn / (fn + tp) if (fn + tp) > 0 else 0
else:
cm = confusion_matrix(y_true, y_pred)
tn, fp, fn, tp = cm.ravel()
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, zero_division=0)
recall = recall_score(y_true, y_pred, zero_division=0)
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
false_positive_rate = fp / (fp + tn) if (fp + tn) > 0 else 0
false_negative_rate = fn / (fn + tp) if (fn + tp) > 0 else 0
f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
return {
"TP": int(tp),
"FP": int(fp),
"TN": int(tn),
"FN": int(fn),
"Accuracy": accuracy,
"Precision": precision,
"Recall": recall,
"F1 Score": f1,
"Specificity": specificity,
"False Positive Rate": false_positive_rate,
"False Negative Rate": false_negative_rate,
}
# Set up logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
# Function to extract JSON from markdown content
def extract_json_from_markdown(markdown_content: str) -> dict[str, Any] | None:
json_code_block_pattern = r"```json\s*(\{[\s\S]*?\})\s*```"
match = re.search(json_code_block_pattern, markdown_content)
if match:
json_content = match.group(1)
return json.loads(json_content) # Parse JSON content
return None
def run_z3_interpreter(output_json_path: str) -> tuple[bool | None, str]:
# Capture both stdout and stderr
stdout_capture = io.StringIO()
stderr_capture = io.StringIO()
with redirect_stdout(stdout_capture), redirect_stderr(stderr_capture):
interpreter = Z3JSONInterpreter(output_json_path)
interpreter.run()
# Combine stdout and stderr
full_output = stdout_capture.getvalue() + stderr_capture.getvalue()
# Analyze the output
sat_occurrences = full_output.count(": SAT")
unsat_occurrences = full_output.count(": UNSAT")
# Determine the answer
predicted_answer: bool | None
if sat_occurrences > 0 and unsat_occurrences == 0:
predicted_answer = True
elif unsat_occurrences > 0 and sat_occurrences == 0:
predicted_answer = False
else:
predicted_answer = None
return predicted_answer, full_output
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
raise ValueError(
"OPENAI_API_KEY is not set. " "Please set it in your .env file or environment variables."
)
client = OpenAI(api_key=api_key)
# Load the StrategyQA dataset
with open("strategyqa_train.json") as f:
data = json.load(f)
# Set up output directories
output_dir = "strategyqa_outputs"
json_dir = os.path.join(output_dir, "json")
response_dir = os.path.join(output_dir, "responses")
os.makedirs(json_dir, exist_ok=True)
os.makedirs(response_dir, exist_ok=True)
max_questions = 10
correct_answers = 0
wrong_answers = 0
programgenerror = 0
y_true = []
y_pred = []
for idx, question_data in enumerate(data[:max_questions]):
qid = question_data["qid"]
question_text = question_data["question"]
actual_answer = question_data["answer"] # True or False
logger.info(f"Processing question {idx+1}/{max_questions}: {qid}")
logger.info(f"Question: {question_text}")
logger.info(f"Actual Answer: {actual_answer}")
output_json_path = os.path.join(json_dir, f"{qid}_extracted.json")
if os.path.exists(output_json_path):
logger.info(f"Skipping question {qid}, already processed.")
continue
num_attempts = 0
max_attempts = 3
success = False
# Define your prompt template here
initial_prompt_content = """
**Instructions for Generating JSON-Based DSL Programs for Theorem Proving**
**Introduction**
This document provides comprehensive guidelines for generating JSON-based Domain-Specific Language (DSL) programs designed to solve complex reasoning tasks using a theorem prover. The goal is to translate reasoning problems into structured JSON programs that can be parsed by a custom interpreter and reliably solved. This guide includes detailed explanations of each section with examples, and emphasizes common pitfalls to avoid, so that the generated programs are error-free and adhere strictly to the expected format.
---
### **Important Guidelines to Avoid Common Errors**
1. **Variable Definitions**
- **Understand the difference between FREE variables and QUANTIFIED variables:**
- **Free Variables**: Declared in the global `"variables"` section. These are variables used in non-quantified contexts (e.g., directly in assertions without ForAll/Exists).
- **Quantified Variables**: Variables bound by `ForAll` or `Exists` quantifiers. These are automatically bound by the quantifier itself and should NOT be declared in a separate `"variables"` field.
- **Example of Correct Variable Declaration:**
```json
"variables": [
{"name": "p", "sort": "Person"},
{"name": "i", "sort": "Issue"}
]
```
This declares `p` and `i` as free variables available throughout the program.
For quantified variables in ForAll/Exists:
```json
"knowledge_base": [
"ForAll([p, i], Implies(supports(p, i), Not(publicly_denounce(p, i))))"
]
```
Here, `p` and `i` are automatically bound by the `ForAll` quantifier. If `p` and `i` are declared in the global `"variables"` section, they can be referenced as free variables within the quantified expression.
2. **Context in Evaluations**
- The interpreter evaluates expressions in a context that includes:
- **Functions**: Defined in the `"functions"` section.
- **Constants**: Defined in the `"constants"` section.
- **Free Variables**: Defined in the global `"variables"` section.
- **Quantified Variables**: Temporarily added to context when evaluating quantified expressions (ForAll/Exists).
- **Understanding Variable Scope**:
- **Free variables** in the global `"variables"` section are available throughout the entire program.
- **Quantified variables** (e.g., in `ForAll([x], ...)`) are automatically bound by the quantifier and available only within that quantified expression.
- You can reference free variables inside quantified expressions, creating nested scopes.
3. **Valid JSON Output**
- **Ensure that the JSON output is valid and can be parsed without errors.**
- Use proper syntax, including commas, quotation marks, and matching brackets.
- Avoid trailing commas or missing commas between elements.
- **Common JSON Errors to Avoid**:
- Missing commas between array elements or object properties.
- Unmatched brackets or braces.
- Incorrect use of quotation marks.
- **Recommendation**:
- Use a JSON validator or formatter to check the generated JSON before finalizing it.
4. **Correct Syntax in Logical Expressions**
- **Use Proper Python Syntax for Expressions**:
- When writing expressions that will be evaluated, ensure they are valid Python expressions.
- For example, in the assertion `ForAll([p], ...)`, `p` must be defined in the context or within the quantifier.
- **Avoid Using Unrecognized Variables**:
- Do not use variables in expressions that have not been defined.
- If a variable is introduced in a quantifier, ensure it is properly included.
- **Example of Correct Usage**:
```json
"variables": [
{"name": "p", "sort": "Person"}
],
"knowledge_base": [
"ForAll([p], Implies(is_law_enforcement(p), can_arrest(p)))"
]
```
Where `p` is declared as a free variable in the global `"variables"` section, then bound by the `ForAll` quantifier in the assertion.
---
### **Detailed Explanation of Each Section**
1. **Sorts**
- **Purpose**: Define the types or domains (e.g., integers, booleans, custom types like "Person").
- **Structure**:
```json
{
"name": "SortName",
"type": "SortType"
}
```
- **Sort Types**:
- `"BoolSort"`: Boolean type.
- `"IntSort"`: Integer type.
- `"RealSort"`: Real number type.
- `"DeclareSort"`: Custom, uninterpreted sort.
- `"EnumSort"`: Enumerated sort with specified values.
- `"BitVecSort(n)"`: Bit vector of size `n`.
- `"ArraySort(DomainSort, RangeSort)"`: Array mapping from `DomainSort` to `RangeSort`.
- **Example**:
```json
{"name": "Person", "type": "DeclareSort"}
```
2. **Functions**
- **Purpose**: Define operations or relations between sorts.
- **Structure**:
```json
{
"name": "FunctionName",
"domain": ["Sort1", "Sort2"],
"range": "ReturnSort"
}
```
- **Example**:
```json
{"name": "num_children", "domain": ["Person"], "range": "Int"}
```
3. **Constants**
- **Purpose**: Represent fixed elements within sorts.
- **Structure**:
```json
{
"Category": {
"sort": "SortName",
"members": ["Const1", "Const2"]
}
}
```
- **Example**:
```json
{
"persons": {
"sort": "Person",
"members": ["genghis_khan", "julius_caesar"]
}
}
```
4. **Variables**
- **Purpose**: Define FREE variables that can be used throughout the program. These are symbols that can be referenced in assertions, rules, and verifications. They are particularly useful when you want to use the same variable symbol in multiple quantified expressions.
- **Structure**:
```json
{
"name": "VariableName",
"sort": "SortName"
}
```
- **Example**:
```json
{"name": "x", "sort": "Int"}
```
- **Note**: Variables declared here can be used as free variables OR can be bound by quantifiers (ForAll/Exists) in assertions. When bound by a quantifier, they become quantified variables within that scope.
5. **Knowledge Base**
- **Purpose**: A set of axioms or facts that are assumed to be true.
- **Structure**: An array of assertions, each representing a logical expression.
- **Assertions** can be simple strings or dictionaries specifying the assertion and its truth value.
- **Example**:
```json
"variables": [
{"name": "p", "sort": "Person"}
],
"knowledge_base": [
"ForAll([p], Implies(is_law_enforcement(p), can_arrest(p)))",
"num_children(genghis_khan) == 16",
{
"assertion": "can_fly(superman)",
"value": true
}
]
```
- **Note**: When using quantifiers like ForAll/Exists, the variables must be declared in the global `"variables"` section to be available in the evaluation context.
6. **Rules**
- **Purpose**: Define general logical implications or constraints.
- **Structure**:
```json
{
"name": "RuleName",
"forall": [
{"name": "Var1", "sort": "Sort1"},
{"name": "Var2", "sort": "Sort2"}
],
"implies": {
"antecedent": "LogicalExpression",
"consequent": "LogicalExpression"
}
}
```
Or for simple constraints:
```json
{
"name": "RuleName",
"constraint": "LogicalExpression"
}
```
- **Example**:
```json
{
"name": "Greater Than Rule",
"forall": [
{"name": "a", "sort": "Int"},
{"name": "b", "sort": "Int"}
],
"implies": {
"antecedent": "a > b",
"consequent": "Not(b > a)"
}
}
```
- **Important**: Rules with `"implies"` MUST have a `"forall"` field with at least one variable. The `"forall"` field cannot be empty. For rules without quantification, use `"constraint"` instead.
7. **Verifications**
- **Purpose**: Specify properties or conditions that need to be verified by the theorem prover.
- **Three Types of Verifications**:
**Type 1: Simple Constraint (no quantifiers)**
```json
{
"name": "VerificationName",
"constraint": "LogicalExpression"
}
```
Example:
```json
{
"name": "Compare Descendants",
"constraint": "num_descendants(genghis_khan) > num_descendants(julius_caesar)"
}
```
**Type 2: Existential Verification (checking if there exists a value)**
```json
{
"name": "VerificationName",
"exists": [
{"name": "Var", "sort": "Sort"}
],
"constraint": "LogicalExpression"
}
```
Example:
```json
{
"name": "Find Positive Number",
"exists": [
{"name": "x", "sort": "Int"}
],
"constraint": "And(x > 0, x < 10)"
}
```
**Type 3: Universal Verification (checking if property holds for all values)**
```json
{
"name": "VerificationName",
"forall": [
{"name": "Var", "sort": "Sort"}
],
"implies": {
"antecedent": "LogicalExpression",
"consequent": "LogicalExpression"
}
}
```
Example:
```json
{
"name": "All Positive Numbers Greater Than Zero",
"forall": [
{"name": "x", "sort": "Int"}
],
"implies": {
"antecedent": "x > 0",
"consequent": "x >= 1"
}
}
```
- **Important**: The `"exists"` and `"forall"` fields cannot be empty. They must contain at least one variable definition.
8. **Optimization** (Optional)
- **Purpose**: Define optimization problems with variables, constraints, and objectives.
- **Structure**:
```json
{
"variables": [
{"name": "Var", "sort": "Sort"}
],
"constraints": ["LogicalExpression"],
"objectives": [
{
"type": "minimize" or "maximize",
"expression": "ArithmeticExpression"
}
]
}
```
- **Example**:
```json
{
"variables": [
{"name": "x", "sort": "Int"}
],
"constraints": [
"x >= 0",
"x <= 10"
],
"objectives": [
{
"type": "maximize",
"expression": "x"
}
]
}
```
9. **Actions**
- **Purpose**: Specify which actions the interpreter should perform.
- **Possible Actions**:
- `"verify_conditions"`: Runs verifications.
- `"optimize"`: Solves optimization problems.
- **Structure**: An array of action strings.
- **Example**:
```json
["verify_conditions"]
```
---
### **Understanding Verification Semantics**
**Important: What Does SAT/UNSAT Mean?**
When you run verifications, the interpreter checks the satisfiability of your constraint given the knowledge base:
- **SAT (Satisfiable)**: The constraint CAN be true given the knowledge base. The solver found a model where both the knowledge base AND the constraint are satisfied simultaneously.
- This means the constraint is CONSISTENT with the knowledge base.
- A model (example values) will be shown.
- **UNSAT (Unsatisfiable)**: The constraint CONTRADICTS the knowledge base. There is no possible model where both the knowledge base and the constraint can be true together.
- This means the constraint is INCONSISTENT with the knowledge base.
- **UNKNOWN**: The solver timed out or couldn't determine satisfiability.
**Checking Different Types of Properties:**
1. **To check if a property CAN be true** (satisfiability):
- Add it directly to verifications
- SAT = yes, it's possible
- UNSAT = no, it contradicts the knowledge base
2. **To check if a property MUST be true** (entailment, KB ⊨ φ):
- Verify that the NEGATION of the property is UNSAT
- If KB ∧ ¬φ is UNSAT, then KB ⊨ φ (the knowledge base entails the property)
- Example: To prove "publicly_denounce(nancy_pelosi, abortion)" is false given KB, check if "publicly_denounce(nancy_pelosi, abortion)" is UNSAT
**Example**:
```json
"verifications": [
{
"name": "Can Pelosi Denounce Abortion",
"constraint": "publicly_denounce(nancy_pelosi, abortion)"
}
]
```
- If this returns SAT: Pelosi denouncing abortion is consistent with the knowledge base
- If this returns UNSAT: Pelosi denouncing abortion contradicts the knowledge base (meaning she definitely won't)
---
### **Available Operators and Functions**
- **Logical Operators**:
- `And(expr1, expr2, ...)`
- `Or(expr1, expr2, ...)`
- `Not(expr)`
- `Implies(antecedent, consequent)`
- `If(condition, true_expr, false_expr)`
- `Distinct(expr1, expr2, ...)`
- **Quantifiers**:
- `ForAll([vars], expr)`
- `Exists([vars], expr)`
- **Arithmetic Operators**:
- `+`, `-`, `*`, `/`
- Comparison: `==`, `!=`, `<`, `<=`, `>`, `>=`
- **Custom Functions**: Defined in the `"functions"` section.
---
### **Good Examples of JSON Programming**
1. **Example: Correctly Defining Variables for Quantified Expressions**
**Incorrect - Missing Variable Declaration**:
```json
"knowledge_base": [
"ForAll([p], Implies(is_law_enforcement(p), can_arrest(p)))"
]
```
**Error**: `p` is not defined in the evaluation context, causing a NameError.
**Corrected Version - Declare in Global Variables Section**:
```json
"variables": [
{"name": "p", "sort": "Person"}
],
"knowledge_base": [
"ForAll([p], Implies(is_law_enforcement(p), can_arrest(p)))"
]
```
**Explanation**: By declaring `p` in the global `"variables"` section, it becomes available in the evaluation context. The `ForAll` quantifier then binds this variable within its scope.
---
### **Common Pitfalls to Avoid**
- **Undefined Variables**: Always define variables in the global `"variables"` section if they will be used in quantified expressions (ForAll/Exists). The quantifier binds the variable, but it must exist in the evaluation context first.
- **Using 'variables' field in assertions**: Do NOT add a `"variables"` field inside knowledge_base assertions. Variables should be declared in the global `"variables"` section only.
- **Using 'variables' instead of 'forall' in rules**: Rules must use `"forall"` for quantified variables, not `"variables"`.
- **Empty quantifier lists**: If you specify `"forall"` or `"exists"` in rules or verifications, they must contain at least one variable. Empty lists will cause errors.
- **Syntax Errors in Expressions**: Use correct syntax in logical expressions. Avoid typos and ensure that all parentheses and commas are correctly placed.
- **Invalid JSON**: Double-check the JSON structure for validity. Use a JSON linter or validator if necessary.
- **Misunderstanding SAT/UNSAT**: Remember that SAT means "possible/consistent" and UNSAT means "contradicts". To prove something MUST be true, check if its negation is UNSAT.
---
### **Revised Guidelines for Writing Good Programs**
1. **Always Declare Variables in the Global Section**
- All variables used in quantified expressions (ForAll/Exists) must be declared in the global `"variables"` section.
- The quantifier automatically binds the variable within its scope, but the variable must exist in the evaluation context.
2. **Use Correct Field Names**
- In rules: Use `"forall"` for quantified variables, NOT `"variables"`
- In verifications: Use `"forall"`, `"exists"`, or no quantifier field for simple constraints
- In knowledge_base: Do NOT add a `"variables"` field in assertion dictionaries
3. **Understanding Quantifier Binding**
- When you write `ForAll([p], ...)`, the variable `p` must be declared in the global `"variables"` section
- The `ForAll` quantifier then binds `p` within its scope
- Example:
```json
"variables": [
{"name": "p", "sort": "Person"}
],
"knowledge_base": [
"ForAll([p], Implies(condition(p), result(p)))"
]
```
4. **Ensure Variables are in Evaluation Context**
- The interpreter builds an evaluation context from functions, constants, and the global variables section
- Quantified variables are temporarily added to this context when evaluating their scope
- Always declare variables globally to make them available for quantification
---
### **Example of Corrected JSON Program**
**Question**: Would Nancy Pelosi publicly denounce abortion?
**Decomposition**:
- Nancy Pelosi is known to support abortion rights.
- People do not publicly denounce issues they support.
- Therefore, Nancy Pelosi would not publicly denounce abortion.
**JSON Program**:
```json
{
"sorts": [
{"name": "Person", "type": "DeclareSort"},
{"name": "Issue", "type": "DeclareSort"},
{"name": "Bool", "type": "BoolSort"}
],
"functions": [
{"name": "supports", "domain": ["Person", "Issue"], "range": "Bool"},
{"name": "publicly_denounce", "domain": ["Person", "Issue"], "range": "Bool"}
],
"constants": {
"persons": {
"sort": "Person",
"members": ["nancy_pelosi"]
},
"issues": {
"sort": "Issue",
"members": ["abortion"]
}
},
"variables": [
{"name": "p", "sort": "Person"},
{"name": "i", "sort": "Issue"}
],
"knowledge_base": [
{
"assertion": "supports(nancy_pelosi, abortion)"
},
{
"assertion": "ForAll([p, i], Implies(supports(p, i), Not(publicly_denounce(p, i))))"
}
],
"verifications": [
{
"name": "Pelosi Denounce Abortion",
"constraint": "publicly_denounce(nancy_pelosi, abortion)"
}
],
"actions": ["verify_conditions"]
}
```
**Explanation**:
- **Variables Defined**:
- `p` and `i` are defined in the global `"variables"` section, making them available in the evaluation context.
- When the `ForAll([p, i], ...)` quantifier is evaluated, these variables are bound within the quantifier's scope.
- **Knowledge Base Assertions**:
- The first assertion is a simple fact: Nancy Pelosi supports abortion.
- The second assertion is a universal rule: For all persons `p` and issues `i`, if `p` supports `i`, then `p` does not publicly denounce `i`.
- Notice that the ForAll assertion does NOT have a `"variables"` field inside the dictionary - the variables are already declared globally.
- **Verification**:
- We check if "publicly_denounce(nancy_pelosi, abortion)" can be true.
- Expected result: UNSAT (because it contradicts the knowledge base - she supports abortion, so she won't denounce it).
- **Why This Works**:
- By declaring `p` and `i` globally, they are available when the interpreter evaluates the ForAll expression.
- The ForAll quantifier binds these variables, making them quantified variables within its scope.
- This prevents the "name 'p' is not defined" error during evaluation.
---
### **Additional Tips**
- **Define All Quantified Variables Globally**:
- Any variable used in ForAll/Exists quantifiers must be declared in the global `"variables"` section.
- This makes them available in the evaluation context so they can be bound by quantifiers.
- **Simplify Assertions**:
- Use simple strings for assertions when possible (e.g., `"supports(nancy_pelosi, abortion)"`)
- Only use dictionary format when you need to specify `"value": false` to negate an assertion.
- **Test Expressions Separately**:
- Before including complex expressions in the JSON, test them separately to ensure they are syntactically correct.
- **Validate JSON Output**:
- Use tools or online validators to ensure that your JSON is well-formed and free of syntax errors.
- **Understand the Evaluation Model**:
- The interpreter evaluates expressions by building a context from functions, constants, and global variables.
- Quantifiers temporarily add their bound variables to this context during evaluation.
- This two-level system (global declaration + quantifier binding) prevents name errors.
---
### **Conclusion**
By following these updated guidelines and paying close attention to variable declarations and context, you can create JSON-based DSL programs that are free of errors and compatible with the interpreter. Always define all variables used in expressions, use correct syntax, and validate your JSON output. This will help ensure that your programs execute successfully and provide accurate results when processed by the theorem prover.
---
**Task**:
Think step by step and reason about the given question. Decompose the question into logical reasoning steps, define the necessary sorts, functions, constants, variables, and knowledge base entries, and finally, construct a JSON file representing the problem. Ensure that:
- All variables used in expressions are properly defined.
- The JSON structure is valid and free of syntax errors.
- The logical expressions are syntactically correct and use variables available in the context.
**Example Task**:
Given the question:
*"Would a student of the class of 2017 have amnesia about 9/11?"*
1. **Decompose the Question**:
- Determine the typical age of a student graduating in 2017.
- Calculate the birth year of such a student.
- Determine if the student was born after the year 2000.
- People born after 2000 may not remember events from 2001.
- Therefore, such a student may not remember 9/11.
2. **Construct the JSON Program**:
```json
{
"sorts": [
{"name": "Person", "type": "DeclareSort"},
{"name": "Int", "type": "IntSort"}
],
"functions": [
{"name": "graduation_year", "domain": ["Person"], "range": "Int"},
{"name": "birth_year", "domain": ["Person"], "range": "Int"},
{"name": "has_amnesia_about_9_11", "domain": ["Person"], "range": "Bool"}
],
"constants": {
"persons": {
"sort": "Person",
"members": ["student"]
}
},
"variables": [
{"name": "p", "sort": "Person"}
],
"knowledge_base": [
"graduation_year(student) == 2017",
"birth_year(student) == graduation_year(student) - 22",
{
"assertion": "ForAll([p], Implies(birth_year(p) > 2000, has_amnesia_about_9_11(p)))"
}
],
"verifications": [
{
"name": "Student Has Amnesia About 9/11",
"constraint": "has_amnesia_about_9_11(student)"
}
],
"actions": ["verify_conditions"]
}
```
**Final Note**:
Make sure to double-check your JSON program for correctness and adherence to the guidelines. The goal is to create a logical representation of the question that can be understood and verified by the theorem prover.
---
SAT is True. UNSAT is False. Answer the following question:
"""
initial_prompt_content = initial_prompt_content + f"Question: {question_text}"
prompt_content = initial_prompt_content
previous_response: str | None = None
error_trace: str | None = None
predicted_answer: bool | None = None
interpreter_output: str = ""
while num_attempts < max_attempts and not success:
time.sleep(2)
num_attempts += 1
try:
if num_attempts == 1:
# First attempt
messages = [
{"role": "user", "content": [{"type": "text", "text": initial_prompt_content}]}
]
else:
# Subsequent attempts
messages = [
{
"role": "assistant",
"content": [{"type": "text", "text": previous_response or ""}],
},
{
"role": "user",
"content": [
{
"type": "text",
"text": f"There was an error processing your response:\n{error_trace}\nPlease fix the JSON accordingly.",
}
# {"role": "user", "content": [{ "type" : "text", "text" : initial_prompt_content}]},
],
},
]
# Make the OpenAI API call
response = client.chat.completions.create(
model="gpt-4o",
messages=messages,
temperature=0.1,
max_tokens=2048,
top_p=1,
frequency_penalty=0,
presence_penalty=0,
response_format={"type": "text"},
)
response_data = response.choices[0].message.content
# Save the response
response_path = os.path.join(response_dir, f"{qid}_response_attempt{num_attempts}.json")
with open(response_path, "w") as f:
json.dump({"response_content": response_data}, f, indent=4)
markdown_content = response_data
previous_response = markdown_content
extracted_json = extract_json_from_markdown(markdown_content or "")
if extracted_json:
with open(output_json_path, "w") as f:
json.dump(extracted_json, f, indent=4)
try:
predicted_answer, interpreter_output = run_z3_interpreter(output_json_path)
logger.info(f"Interpreter Output:\n{interpreter_output}")
if predicted_answer is not None:
logger.info(f"Answer Computed : ({predicted_answer}, {actual_answer})")
if predicted_answer == actual_answer:
correct_answers += 1
logger.info(
f"Question {qid} answered correctly on attempt {num_attempts}"
)
else:
wrong_answers += 1
logger.info(
f"Question {qid} answered incorrectly on attempt {num_attempts}"
)
success = True
else:
logger.warning(
f"Could not determine the answer for question {qid} on attempt {num_attempts}"
)
error_trace = f"Ambiguous output: SAT and UNSAT occurrences don't clearly indicate an answer.\nFull output:\n{interpreter_output}"
success = False
except Exception as e:
error_trace = f"Error running interpreter: {str(e)}\nFull traceback:\n{''.join(traceback.format_exception(type(e), e, e.__traceback__))}"
logger.error(
f"Error running interpreter for question {qid} on attempt {num_attempts}:\n{error_trace}"
)
success = False
else:
error_trace = "Failed to extract JSON from the response."
logger.error(f"Failed to extract JSON for question {qid} on attempt {num_attempts}")
success = False
except Exception as e:
error_trace = f"Error processing question: {str(e)}\nFull traceback:\n{''.join(traceback.format_exception(type(e), e, e.__traceback__))}"
logger.error(
f"Error processing question {qid} on attempt {num_attempts}:\n{error_trace}"
)
success = False
if not success and num_attempts < max_attempts:
prompt_content = initial_prompt_content
if success and predicted_answer is not None:
logger.debug(f"Predicted={predicted_answer} ({type(predicted_answer).__name__}), actual={actual_answer} ({type(actual_answer).__name__})")
y_true.append(int(actual_answer))
y_pred.append(int(predicted_answer))
# # Calculate and log metrics for this question
# metrics = calculate_metrics(y_true, y_pred)
# logger.info("Current Metrics:")
# for metric, value in metrics.items():
# logger.info(f"{metric}: {value}")
if not success:
logger.error(f"Failed to process question {qid} after {max_attempts} attempts.")
programgenerror += 1
logger.info("Current Statistics:")
logger.info(f"Total correct answers: {correct_answers}")
logger.info(f"Total wrong answers: {wrong_answers}")
logger.info(f"Accuracy: {correct_answers / (correct_answers + wrong_answers) * 100:.2f}%")
logger.info(f"Program has not compiled {programgenerror} times.")
logger.info("Final Results:")
logger.info(f"Total correct answers: {correct_answers}")
logger.info(f"Total wrong answers: {wrong_answers}")
logger.info(f"Final Accuracy: {correct_answers / (correct_answers + wrong_answers) * 100:.2f}%")
# After processing all questions, calculate and log final metrics
final_metrics = calculate_metrics(y_true, y_pred)
print("Y_true", y_true)
print("Y_pred", y_pred)
logger.info("Final Metrics:")
for metric, value in final_metrics.items():
logger.info(f"{metric}: {value}")
logger.info(f"Total questions processed: {len(y_true)}")
logger.info(f"Program has not compiled {programgenerror} times.")

1
examples/__init__.py Normal file
View File

@@ -0,0 +1 @@
"""Examples package for ProofOfThought."""

106
examples/azure_config.py Normal file
View File

@@ -0,0 +1,106 @@
"""
Azure OpenAI Configuration Helper
This module provides easy configuration for Azure OpenAI GPT-5,
with support for environment variables and multiple deployment options.
To use this module:
1. Copy .env.example to .env in the project root
2. Fill in your Azure OpenAI credentials in .env
3. Import and use get_azure_client() or get_client_config()
"""
import os
from pathlib import Path
from typing import Any
from openai import AzureOpenAI
# Load environment variables from .env file
try:
from dotenv import load_dotenv
# Load from project root .env file
project_root = Path(__file__).parent.parent
env_path = project_root / ".env"
load_dotenv(dotenv_path=env_path)
except ImportError:
print("Warning: python-dotenv not installed. Install with: pip install python-dotenv")
print("Falling back to system environment variables only.")
# Azure OpenAI GPT-5 Configuration
# These are loaded from environment variables only (no hardcoded defaults)
AZURE_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_API_KEY = os.getenv("AZURE_OPENAI_KEY")
DEPLOYMENT_NAME = os.getenv("AZURE_DEPLOYMENT_NAME", "gpt-5")
API_VERSION = os.getenv("AZURE_API_VERSION", "2024-12-01-preview")
# GPT-5 specific settings
GPT5_MAX_TOKENS = 16384
GPT5_TEMPERATURE = 0.1
def get_azure_client() -> AzureOpenAI:
"""Get configured Azure OpenAI client for GPT-5.
Returns:
Configured AzureOpenAI client
Environment Variables (required):
AZURE_OPENAI_ENDPOINT: Azure endpoint URL
AZURE_OPENAI_KEY: API key
AZURE_DEPLOYMENT_NAME: Deployment name (default: gpt-5)
AZURE_API_VERSION: API version (default: 2024-12-01-preview)
Raises:
ValueError: If required environment variables are not set
"""
if not AZURE_ENDPOINT:
raise ValueError(
"AZURE_OPENAI_ENDPOINT is not set. "
"Please set it in your .env file or environment variables."
)
if not AZURE_API_KEY:
raise ValueError(
"AZURE_OPENAI_KEY is not set. "
"Please set it in your .env file or environment variables."
)
return AzureOpenAI(
api_version=API_VERSION,
azure_endpoint=AZURE_ENDPOINT,
api_key=AZURE_API_KEY,
)
def get_client_config() -> dict[str, Any]:
"""Get client configuration dictionary.
Returns:
Dictionary with client, model, and recommended settings
"""
return {
"llm_client": get_azure_client(),
"model": DEPLOYMENT_NAME,
"max_tokens": GPT5_MAX_TOKENS,
"temperature": GPT5_TEMPERATURE,
}
# Example usage:
if __name__ == "__main__":
# Test configuration
print("Azure OpenAI GPT-5 Configuration:")
print(f" Endpoint: {AZURE_ENDPOINT}")
print(f" Deployment: {DEPLOYMENT_NAME}")
print(f" API Version: {API_VERSION}")
print(f" Max Tokens: {GPT5_MAX_TOKENS}")
print(f" Temperature: {GPT5_TEMPERATURE}")
# Test client creation
client = get_azure_client()
print("\n✓ Azure OpenAI client created successfully")
# Show config dict
config = get_client_config()
print(f"\nClient config keys: {list(config.keys())}")

View File

@@ -0,0 +1,76 @@
#!/usr/bin/env python3
"""Example: Using ProofOfThought with Azure OpenAI GPT-5."""
import logging
# Import Azure configuration helper
from azure_config import get_client_config
from z3dsl.reasoning import ProofOfThought
# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
# Get Azure GPT-5 configuration from environment variables
config = get_client_config()
# Create ProofOfThought instance with GPT-5
pot = ProofOfThought(
llm_client=config["llm_client"],
model=config["model"],
max_attempts=3,
verify_timeout=10000,
optimize_timeout=100000,
)
# Ask a question
question = "Would Nancy Pelosi publicly denounce abortion?"
print(f"\nQuestion: {question}")
print("-" * 80)
result = pot.query(
question=question,
temperature=0.1,
max_tokens=16384, # GPT-5 supports up to 16K output tokens
save_program=True,
program_path="azure_gpt5_program.json",
)
# Print results
print("\n" + "=" * 80)
print("GPT-5 QUERY RESULTS")
print("=" * 80)
print(f"Question: {result.question}")
print(f"Answer: {result.answer}")
print(f"Success: {result.success}")
print(f"Attempts: {result.num_attempts}")
print(f"SAT count: {result.sat_count}")
print(f"UNSAT count: {result.unsat_count}")
if result.error:
print(f"\nError: {result.error}")
if result.json_program:
print("\nGenerated JSON program structure:")
print(f" - Sorts: {len(result.json_program.get('sorts', []))}")
print(f" - Functions: {len(result.json_program.get('functions', []))}")
print(f" - Constants: {len(result.json_program.get('constants', {}))}")
print(f" - Knowledge base: {len(result.json_program.get('knowledge_base', []))}")
print(f" - Verifications: {len(result.json_program.get('verifications', []))}")
print("\nProgram saved to: azure_gpt5_program.json")
# Demonstrate batch processing with GPT-5
print("\n" + "=" * 80)
print("BATCH PROCESSING WITH GPT-5")
print("=" * 80)
questions = [
"Can fish breathe underwater?",
"Would a student of the class of 2017 remember 9/11?",
"Can elephants fly?",
]
for i, q in enumerate(questions, 1):
print(f"\n[{i}/{len(questions)}] {q}")
result = pot.query(q)
print(f" Answer: {result.answer} (attempts: {result.num_attempts})")

View File

@@ -0,0 +1,43 @@
#!/usr/bin/env python3
"""
Simplest way to use Azure OpenAI GPT-5 with ProofOfThought.
This example shows the easiest method using the azure_config helper.
"""
import logging
# Import Azure configuration helper
from azure_config import get_client_config
from z3dsl.reasoning import ProofOfThought
# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
# Get Azure GPT-5 configuration (simplest way!)
config = get_client_config()
# Create ProofOfThought with Azure GPT-5
pot = ProofOfThought(llm_client=config["llm_client"], model=config["model"])
# Ask questions
questions = [
"Would Nancy Pelosi publicly denounce abortion?",
"Can fish breathe underwater?",
"Would a student of the class of 2017 remember 9/11?",
]
print("=" * 80)
print("AZURE OPENAI GPT-5 REASONING")
print("=" * 80)
for i, question in enumerate(questions, 1):
print(f"\n[{i}/{len(questions)}] {question}")
result = pot.query(question)
print(f" Answer: {result.answer}")
print(f" Success: {result.success}")
print(f" Attempts: {result.num_attempts}")
if not result.success:
print(f" Error: {result.error}")

View File

@@ -0,0 +1,81 @@
#!/usr/bin/env python3
"""Example: Batch evaluation on StrategyQA dataset."""
import logging
import os
from openai import OpenAI
from z3dsl.reasoning import EvaluationPipeline, ProofOfThought
# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
# Option 1: Standard OpenAI
# Set OPENAI_API_KEY in your .env file
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
raise ValueError(
"OPENAI_API_KEY is not set. " "Please set it in your .env file or environment variables."
)
client = OpenAI(api_key=api_key)
model = "gpt-4o"
# Option 2: Azure OpenAI GPT-5 (uncomment to use)
# from azure_config import get_client_config
# config = get_client_config()
# client = config["llm_client"]
# model = config["model"]
# Create ProofOfThought instance
pot = ProofOfThought(
llm_client=client,
model=model,
max_attempts=3, # Retry up to 3 times
cache_dir="output/programs", # Save generated programs
)
# Create evaluation pipeline
evaluator = EvaluationPipeline(proof_of_thought=pot, output_dir="output/evaluation_results")
# Run evaluation on StrategyQA dataset
result = evaluator.evaluate(
dataset="strategyqa_train.json", # Path to dataset
question_field="question", # Field name for questions
answer_field="answer", # Field name for ground truth
id_field="qid", # Field name for question IDs
max_samples=10, # Evaluate only first 10 samples
skip_existing=True, # Skip already processed samples
)
# Print detailed metrics
print("\n" + "=" * 80)
print("EVALUATION METRICS")
print("=" * 80)
print(f"Total Samples: {result.metrics.total_samples}")
print(f"Correct: {result.metrics.correct_answers}")
print(f"Wrong: {result.metrics.wrong_answers}")
print(f"Failed: {result.metrics.failed_answers}")
print()
print(f"Accuracy: {result.metrics.accuracy:.2%}")
print(f"Precision: {result.metrics.precision:.4f}")
print(f"Recall: {result.metrics.recall:.4f}")
print(f"F1 Score: {result.metrics.f1_score:.4f}")
print(f"Specificity: {result.metrics.specificity:.4f}")
print()
print(f"True Positives: {result.metrics.tp}")
print(f"True Negatives: {result.metrics.tn}")
print(f"False Positives: {result.metrics.fp}")
print(f"False Negatives: {result.metrics.fn}")
print(f"False Positive Rate: {result.metrics.false_positive_rate:.4f}")
print(f"False Negative Rate: {result.metrics.false_negative_rate:.4f}")
# Show sample results
print("\n" + "=" * 80)
print("SAMPLE RESULTS")
print("=" * 80)
for i, query_result in enumerate(result.results[:5]): # Show first 5
print(f"\n[{i+1}] {query_result.question}")
print(f" Answer: {query_result.answer} (attempts: {query_result.num_attempts})")
print(f" Success: {query_result.success}")

View File

@@ -0,0 +1,170 @@
#!/usr/bin/env python3
"""
Migration Example: From benchmark_pipeline.py to ProofOfThought API
This example shows how to migrate from the original benchmark_pipeline.py
implementation to the new DSPy-style ProofOfThought API.
"""
import logging
import os
from openai import OpenAI
from z3dsl.reasoning import EvaluationPipeline, ProofOfThought
# Configure logging (same as original)
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
# =============================================================================
# OLD WAY (benchmark_pipeline.py) - 800+ lines
# =============================================================================
"""
client = OpenAI(api_key="...")
with open('strategyqa_train.json', 'r') as f:
data = json.load(f)
max_questions = 10
correct_answers = 0
wrong_answers = 0
for idx, question_data in enumerate(data[:max_questions]):
qid = question_data['qid']
question_text = question_data['question']
actual_answer = question_data['answer']
# 700-line prompt
initial_prompt_content = '''
** Instructions for Generating JSON-Based DSL Programs for Theorem Proving**
... (700 more lines)
'''
num_attempts = 0
max_attempts = 3
success = False
while num_attempts < max_attempts and not success:
# Manual LLM call
response = client.chat.completions.create(...)
# Manual JSON extraction
extracted_json = extract_json_from_markdown(response.content)
# Manual Z3 execution
interpreter = Z3JSONInterpreter(output_json_path)
interpreter.run()
# Manual result parsing
sat_occurrences = full_output.count(': SAT')
unsat_occurrences = full_output.count(': UNSAT')
# ... more manual logic
if predicted_answer == actual_answer:
correct_answers += 1
# Manual metrics calculation
accuracy = correct_answers / (correct_answers + wrong_answers)
"""
# =============================================================================
# NEW WAY (ProofOfThought) - 5 lines
# =============================================================================
# Initialize client (same as before)
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
raise ValueError(
"OPENAI_API_KEY is not set. " "Please set it in your .env file or environment variables."
)
client = OpenAI(api_key=api_key)
# Create ProofOfThought with same settings as original
pot = ProofOfThought(
llm_client=client,
model="gpt-4o",
max_attempts=3, # Same as original max_attempts
verify_timeout=10000,
cache_dir="strategyqa_outputs/programs",
)
# Create evaluation pipeline
evaluator = EvaluationPipeline(
proof_of_thought=pot, output_dir="strategyqa_outputs/evaluation_results"
)
# Run evaluation (replaces entire 800-line loop)
result = evaluator.evaluate(
dataset="strategyqa_train.json",
question_field="question",
answer_field="answer",
id_field="qid",
max_samples=10, # Same as original max_questions
skip_existing=True,
)
# Results are automatically computed
print("\n" + "=" * 80)
print("MIGRATION COMPLETE - RESULTS COMPARISON")
print("=" * 80)
print(f"Total Samples: {result.metrics.total_samples}")
print(f"Correct: {result.metrics.correct_answers}")
print(f"Wrong: {result.metrics.wrong_answers}")
print(f"Failed: {result.metrics.failed_answers}")
print(f"Accuracy: {result.metrics.accuracy:.2%}")
print()
print("Additional metrics not in original:")
print(f" Precision: {result.metrics.precision:.4f}")
print(f" Recall: {result.metrics.recall:.4f}")
print(f" F1 Score: {result.metrics.f1_score:.4f}")
print(f" Specificity: {result.metrics.specificity:.4f}")
print()
print("Confusion Matrix:")
print(f" TP: {result.metrics.tp}, FP: {result.metrics.fp}")
print(f" TN: {result.metrics.tn}, FN: {result.metrics.fn}")
# =============================================================================
# BENEFITS OF MIGRATION
# =============================================================================
print("\n" + "=" * 80)
print("BENEFITS OF NEW API")
print("=" * 80)
print("✓ 800+ lines → 5 lines of code")
print("✓ No manual prompt management")
print("✓ No manual JSON extraction")
print("✓ No manual Z3 execution")
print("✓ No manual result parsing")
print("✓ No manual retry logic")
print("✓ No manual metrics calculation")
print("✓ Automatic caching and resume")
print("✓ Better error handling")
print("✓ More comprehensive metrics")
print("✓ Cleaner, maintainable code")
# =============================================================================
# ACCESSING DETAILED RESULTS (if needed)
# =============================================================================
print("\n" + "=" * 80)
print("DETAILED RESULTS ACCESS")
print("=" * 80)
# You can still access individual results if needed
for i, query_result in enumerate(result.results[:3]):
print(f"\n[Sample {i+1}]")
print(f" Question: {query_result.question[:60]}...")
print(f" Answer: {query_result.answer}")
print(f" Success: {query_result.success}")
print(f" Attempts: {query_result.num_attempts}")
# Access generated program if needed
if query_result.json_program:
print(" Program structure:")
print(f" - Sorts: {len(query_result.json_program.get('sorts', []))}")
print(f" - Functions: {len(query_result.json_program.get('functions', []))}")
print(f" - KB assertions: {len(query_result.json_program.get('knowledge_base', []))}")
# Ground truth and predictions are also available
print(f"\nGround truth labels: {result.y_true[:10]}")
print(f"Predicted labels: {result.y_pred[:10]}")

54
examples/simple_usage.py Normal file
View File

@@ -0,0 +1,54 @@
#!/usr/bin/env python3
"""Example: Simple usage of ProofOfThought API."""
import logging
import os
from openai import OpenAI
from z3dsl.reasoning import ProofOfThought
# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
# Option 1: Standard OpenAI
# Set OPENAI_API_KEY in your .env file
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
raise ValueError(
"OPENAI_API_KEY is not set. " "Please set it in your .env file or environment variables."
)
client = OpenAI(api_key=api_key)
pot = ProofOfThought(llm_client=client, model="gpt-4o")
# Option 2: Azure OpenAI GPT-5 (uncomment to use)
# from azure_config import get_client_config
# config = get_client_config()
# pot = ProofOfThought(llm_client=config["llm_client"], model=config["model"])
# Ask a question
question = "Would Nancy Pelosi publicly denounce abortion?"
result = pot.query(question)
# Print results
print("\n" + "=" * 80)
print("QUERY RESULTS")
print("=" * 80)
print(f"Question: {result.question}")
print(f"Answer: {result.answer}")
print(f"Success: {result.success}")
print(f"Attempts: {result.num_attempts}")
print(f"SAT count: {result.sat_count}")
print(f"UNSAT count: {result.unsat_count}")
if result.error:
print(f"Error: {result.error}")
if result.json_program:
print("\nGenerated JSON program structure:")
print(f" - Sorts: {len(result.json_program.get('sorts', []))}")
print(f" - Functions: {len(result.json_program.get('functions', []))}")
print(f" - Constants: {len(result.json_program.get('constants', {}))}")
print(f" - Knowledge base: {len(result.json_program.get('knowledge_base', []))}")
print(f" - Verifications: {len(result.json_program.get('verifications', []))}")

1
strategyQA_train.json Normal file

File diff suppressed because one or more lines are too long

1108
strategyqa_results.json Normal file

File diff suppressed because it is too large

214
test_strategyqa.py Normal file
View File

@@ -0,0 +1,214 @@
#!/usr/bin/env python3
"""
Benchmark ProofOfThought on StrategyQA dataset.
Tests the first 100 questions and tracks performance metrics.
"""
import json
import logging
import sys
import time
from datetime import datetime
from typing import Any
# Add parent directory to path
sys.path.insert(0, "/users/PAS2271/debargha/proofofthought")
from examples.azure_config import get_client_config
from z3dsl.reasoning import ProofOfThought
# Configure logging
logging.basicConfig(
level=logging.WARNING, # Reduce noise, only show warnings and errors
format="%(asctime)s - %(levelname)s - %(message)s",
)
def load_strategyqa_questions(dataset_path: str, num_questions: int = 100) -> list[dict[str, Any]]:
"""Load questions from StrategyQA dataset."""
with open(dataset_path) as f:
data = json.load(f)
questions = []
for item in data[:num_questions]:
questions.append(
{
"qid": item["qid"],
"question": item["question"],
"answer": item["answer"],
"facts": item.get("facts", []),
"decomposition": item.get("decomposition", []),
}
)
return questions
def run_benchmark(
questions: list[dict[str, Any]], pot: Any
) -> tuple[list[dict[str, Any]], int, int, int]:
"""Run ProofOfThought on all questions and collect results."""
results = []
successful = 0
correct = 0
total_attempts = 0
print(f"\n{'='*80}")
print(f"STRATEGYQA BENCHMARK - Testing {len(questions)} questions")
print(f"{'='*80}\n")
for i, q in enumerate(questions, 1):
print(f"[{i}/{len(questions)}] {q['question']}")
start_time = time.time()
try:
result = pot.query(q["question"])
elapsed = time.time() - start_time
# Compare predicted answer (bool or None) against ground truth
predicted = result.answer
expected = q["answer"]
is_correct = predicted == expected if predicted is not None else False
if result.success:
successful += 1
total_attempts += result.num_attempts
if is_correct:
correct += 1
print(
f" ✓ Correct! Predicted: {predicted}, Attempts: {result.num_attempts}, Time: {elapsed:.1f}s"
)
else:
print(
f" ✗ Wrong! Expected: {expected}, Got: {predicted}, Attempts: {result.num_attempts}"
)
else:
print(f" ✗ Failed! Error: {result.error}")
results.append(
{
"qid": q["qid"],
"question": q["question"],
"expected": expected,
"predicted": predicted,
"success": result.success,
"correct": is_correct,
"num_attempts": result.num_attempts,
"elapsed_time": elapsed,
"error": result.error,
}
)
except Exception as e:
elapsed = time.time() - start_time
print(f" ✗ Exception: {str(e)}")
results.append(
{
"qid": q["qid"],
"question": q["question"],
"expected": q["answer"],
"predicted": None,
"success": False,
"correct": False,
"num_attempts": 0,
"elapsed_time": elapsed,
"error": str(e),
}
)
# Print progress every 10 questions
if i % 10 == 0:
current_success_rate = (successful / i) * 100 if i > 0 else 0
current_accuracy = (correct / successful) * 100 if successful > 0 else 0
avg_attempts = total_attempts / successful if successful > 0 else 0
print(
f"\n Progress: {i}/{len(questions)} | Success: {current_success_rate:.1f}% | Accuracy: {current_accuracy:.1f}% | Avg Attempts: {avg_attempts:.1f}\n"
)
return results, successful, correct, total_attempts
def print_summary(
results: list[dict[str, Any]], successful: int, correct: int, total_attempts: int
) -> None:
"""Print summary statistics."""
total = len(results)
success_rate = (successful / total) * 100 if total > 0 else 0
accuracy = (correct / successful) * 100 if successful > 0 else 0
avg_attempts = total_attempts / successful if successful > 0 else 0
overall_accuracy = (correct / total) * 100 if total > 0 else 0
print(f"\n{'='*80}")
print("BENCHMARK SUMMARY")
print(f"{'='*80}")
print(f"Total Questions: {total}")
print(f"Successful Runs: {successful} ({success_rate:.1f}%)")
print(f"Correct Answers: {correct}")
print(f"Accuracy (of successful): {accuracy:.1f}%")
print(f"Overall Accuracy: {overall_accuracy:.1f}%")
print(f"Average Attempts: {avg_attempts:.2f}")
print(f"{'='*80}\n")
# Error breakdown
errors: dict[str, int] = {}
for r in results:
if not r["success"] and r["error"]:
error_key = r["error"][:50] # Truncate long errors
errors[error_key] = errors.get(error_key, 0) + 1
if errors:
print("Error Breakdown:")
for error, count in sorted(errors.items(), key=lambda x: -x[1]):
print(f" {count:3d}x: {error}")
print()
def save_results(results: list[dict[str, Any]], output_path: str) -> None:
"""Save results to JSON file."""
output = {
"timestamp": datetime.now().isoformat(),
"total": len(results),
"successful": sum(1 for r in results if r["success"]),
"correct": sum(1 for r in results if r["correct"]),
"results": results,
}
with open(output_path, "w") as f:
json.dump(output, f, indent=2)
print(f"Results saved to: {output_path}")
def main() -> None:
"""Main benchmark execution."""
dataset_path = "/users/PAS2271/debargha/proofofthought/strategyQA_train.json"
output_path = "/users/PAS2271/debargha/proofofthought/strategyqa_results.json"
num_questions = 100
# Load questions
print("Loading StrategyQA questions...")
questions = load_strategyqa_questions(dataset_path, num_questions)
print(f"Loaded {len(questions)} questions")
# Initialize ProofOfThought with Azure GPT-5
print("Initializing ProofOfThought with Azure GPT-5...")
config = get_client_config()
pot = ProofOfThought(llm_client=config["llm_client"], model=config["model"], max_attempts=3)
print("Ready!\n")
# Run benchmark
start_time = time.time()
results, successful, correct, total_attempts = run_benchmark(questions, pot)
total_time = time.time() - start_time
# Print summary
print_summary(results, successful, correct, total_attempts)
print(f"Total execution time: {total_time/60:.1f} minutes\n")
# Save results
save_results(results, output_path)
if __name__ == "__main__":
main()

View File

@@ -145,7 +145,7 @@ class TestSortManager(unittest.TestCase):
def test_create_functions_undefined_domain_sort(self) -> None:
"""Test that function with undefined domain sort raises error."""
func_defs = [{"name": "f", "domain": ["UndefinedSort"], "range": "IntSort"}]
with self.assertRaises(KeyError):
with self.assertRaises(ValueError):
self.sort_manager.create_functions(func_defs)
def test_create_constants_list_format(self) -> None:

View File

@@ -179,8 +179,16 @@ class SortManager:
for func_def in func_defs:
try:
name = func_def["name"]
# Validate domain sorts exist
for sort_name in func_def["domain"]:
if sort_name not in self.sorts:
raise ValueError(f"Sort '{sort_name}' not defined")
domain = [self.sorts[sort] for sort in func_def["domain"]]
range_sort = self.sorts[func_def["range"]]
# Validate range sort exists
range_sort_name = func_def["range"]
if range_sort_name not in self.sorts:
raise ValueError(f"Sort '{range_sort_name}' not defined")
range_sort = self.sorts[range_sort_name]
functions[name] = Function(name, *domain, range_sort)
logger.debug(f"Created function: {name}")
except KeyError as e:

View File

@@ -117,6 +117,16 @@ class Z3JSONInterpreter:
if self.verifier:
self.verifier.verify_conditions(self.solver, self.verify_timeout)
def get_verification_counts(self) -> tuple[int, int]:
"""Get SAT and UNSAT counts from verification results.
Returns:
Tuple of (sat_count, unsat_count)
"""
if self.verifier:
return (self.verifier.sat_count, self.verifier.unsat_count)
return (0, 0)
def optimize(self) -> None:
"""Run optimization if configured."""
if self.optimizer_runner and "optimization" in self.config:

View File

@@ -49,7 +49,10 @@ class OptimizerRunner:
optimization_vars = {}
for var_def in optimization_config.get("variables", []):
name = var_def["name"]
sort = self.sorts[var_def["sort"]]
sort_name = var_def["sort"]
if sort_name not in self.sorts:
raise ValueError(f"Sort '{sort_name}' not defined")
sort = self.sorts[sort_name]
optimization_vars[name] = Const(name, sort)
# Build extended context: optimization variables + global context

View File

@@ -0,0 +1,18 @@
"""Reasoning components for proof-of-thought using Z3."""
from z3dsl.reasoning.evaluation import EvaluationMetrics, EvaluationPipeline, EvaluationResult
from z3dsl.reasoning.program_generator import GenerationResult, Z3ProgramGenerator
from z3dsl.reasoning.proof_of_thought import ProofOfThought, QueryResult
from z3dsl.reasoning.verifier import VerificationResult, Z3Verifier
__all__ = [
"Z3Verifier",
"VerificationResult",
"Z3ProgramGenerator",
"GenerationResult",
"ProofOfThought",
"QueryResult",
"EvaluationPipeline",
"EvaluationResult",
"EvaluationMetrics",
]

View File

@@ -0,0 +1,302 @@
"""Evaluation pipeline for reasoning datasets."""
import json
import logging
import os
from dataclasses import dataclass, field
from typing import Any
import numpy as np
from sklearn.metrics import (
accuracy_score,
confusion_matrix,
precision_score,
recall_score,
)
from z3dsl.reasoning.proof_of_thought import ProofOfThought, QueryResult
logger = logging.getLogger(__name__)
@dataclass
class EvaluationMetrics:
"""Evaluation metrics for reasoning tasks."""
accuracy: float
precision: float
recall: float
f1_score: float
specificity: float
false_positive_rate: float
false_negative_rate: float
tp: int
fp: int
tn: int
fn: int
total_samples: int
correct_answers: int
wrong_answers: int
failed_answers: int
@dataclass
class EvaluationResult:
"""Complete evaluation results."""
metrics: EvaluationMetrics
results: list[QueryResult] = field(default_factory=list)
y_true: list[int] = field(default_factory=list)
y_pred: list[int] = field(default_factory=list)
class EvaluationPipeline:
"""Dataset-agnostic evaluation pipeline for reasoning tasks."""
def __init__(
self,
proof_of_thought: ProofOfThought,
output_dir: str = "evaluation_results",
) -> None:
"""Initialize evaluation pipeline.
Args:
proof_of_thought: ProofOfThought instance
output_dir: Directory to save evaluation results
"""
self.pot = proof_of_thought
self.output_dir = output_dir
os.makedirs(output_dir, exist_ok=True)
def evaluate(
self,
dataset: list[dict[str, Any]] | str,
question_field: str = "question",
answer_field: str = "answer",
id_field: str | None = None,
max_samples: int | None = None,
skip_existing: bool = True,
) -> EvaluationResult:
"""Evaluate on a dataset.
Args:
dataset: List of samples or path to JSON file
question_field: Field name for question text
answer_field: Field name for ground truth answer
id_field: Optional field name for sample ID
max_samples: Maximum samples to evaluate (None = all)
skip_existing: Skip samples with existing cached results
Returns:
EvaluationResult with metrics and detailed results
"""
# Load dataset if path provided
dataset_list: list[dict[str, Any]]
if isinstance(dataset, str):
with open(dataset) as f:
dataset_list = json.load(f)
else:
dataset_list = dataset
# Limit samples if requested
if max_samples:
dataset_list = dataset_list[:max_samples]
logger.info(f"Evaluating {len(dataset_list)} samples")
results = []
y_true = []
y_pred = []
correct = 0
wrong = 0
failed = 0
for idx, sample in enumerate(dataset_list):
# Extract fields
question = sample[question_field]
ground_truth = sample[answer_field]
sample_id = sample.get(id_field) if id_field else f"sample_{idx}"
logger.info(f"[{idx+1}/{len(dataset)}] Processing: {sample_id}")
logger.info(f"Question: {question}")
logger.info(f"Ground truth: {ground_truth}")
# Check if already processed
result_path = os.path.join(self.output_dir, f"{sample_id}_result.json")
if skip_existing and os.path.exists(result_path):
logger.info(f"Skipping {sample_id} (already processed)")
try:
with open(result_path) as f:
cached = json.load(f)
if cached.get("success"):
y_true.append(int(ground_truth))
y_pred.append(int(cached["answer"]))
if cached["answer"] == ground_truth:
correct += 1
else:
wrong += 1
else:
failed += 1
except Exception as e:
logger.warning(f"Failed to load cached result: {e}")
continue
# Query the system
result = self.pot.query(
question=question,
save_program=True,
program_path=os.path.join(self.output_dir, f"{sample_id}_program.json"),
)
# Save result
with open(result_path, "w") as f:
json.dump(
{
"sample_id": sample_id,
"question": question,
"ground_truth": ground_truth,
"answer": result.answer,
"success": result.success,
"num_attempts": result.num_attempts,
"sat_count": result.sat_count,
"unsat_count": result.unsat_count,
"error": result.error,
},
f,
indent=2,
)
results.append(result)
# Update metrics
if result.success and result.answer is not None:
y_true.append(int(ground_truth))
y_pred.append(int(result.answer))
if result.answer == ground_truth:
correct += 1
logger.info("✓ Correct answer")
else:
wrong += 1
logger.info("✗ Wrong answer")
else:
failed += 1
logger.warning(f"✗ Failed to get answer: {result.error}")
# Log current statistics
total_answered = correct + wrong
if total_answered > 0:
accuracy = correct / total_answered
logger.info(f"Current stats: {correct}/{total_answered} correct ({accuracy:.2%})")
# Calculate final metrics
metrics = self._calculate_metrics(y_true, y_pred, correct, wrong, failed)
# Log final results
logger.info("=" * 80)
logger.info("FINAL EVALUATION RESULTS")
logger.info("=" * 80)
logger.info(f"Total samples: {len(dataset)}")
logger.info(f"Correct: {correct}")
logger.info(f"Wrong: {wrong}")
logger.info(f"Failed: {failed}")
logger.info(f"Accuracy: {metrics.accuracy:.2%}")
logger.info(f"Precision: {metrics.precision:.4f}")
logger.info(f"Recall: {metrics.recall:.4f}")
logger.info(f"F1 Score: {metrics.f1_score:.4f}")
return EvaluationResult(metrics=metrics, results=results, y_true=y_true, y_pred=y_pred)
def _calculate_metrics(
self, y_true: list[int], y_pred: list[int], correct: int, wrong: int, failed: int
) -> EvaluationMetrics:
"""Calculate evaluation metrics.
Args:
y_true: Ground truth labels
y_pred: Predicted labels
correct: Number of correct predictions
wrong: Number of wrong predictions
failed: Number of failed predictions
Returns:
EvaluationMetrics object
"""
if len(y_true) == 0:
# No successful predictions
return EvaluationMetrics(
accuracy=0.0,
precision=0.0,
recall=0.0,
f1_score=0.0,
specificity=0.0,
false_positive_rate=0.0,
false_negative_rate=0.0,
tp=0,
fp=0,
tn=0,
fn=0,
total_samples=correct + wrong + failed,
correct_answers=correct,
wrong_answers=wrong,
failed_answers=failed,
)
y_true_arr = np.array(y_true)
y_pred_arr = np.array(y_pred)
# Handle edge cases with single class
if len(np.unique(y_true_arr)) == 1 or len(np.unique(y_pred_arr)) == 1:
accuracy = accuracy_score(y_true_arr, y_pred_arr)
if np.array_equal(y_true_arr, y_pred_arr):
if y_true_arr[0] == 1: # All positive
tp, fp, tn, fn = len(y_true_arr), 0, 0, 0
else: # All negative
tp, fp, tn, fn = 0, 0, len(y_true_arr), 0
else:
if y_true_arr[0] == 1:  # All ground-truth labels positive
tp = int(np.sum(y_pred_arr))
fn = len(y_true_arr) - tp
fp, tn = 0, 0
else:  # All ground-truth labels negative
tn = int(np.sum(~y_pred_arr.astype(bool)))
fp = len(y_true_arr) - tn
tp, fn = 0, 0
precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0.0
fpr = fp / (fp + tn) if (fp + tn) > 0 else 0.0
fnr = fn / (fn + tp) if (fn + tp) > 0 else 0.0
else:
# Normal case with multiple classes
cm = confusion_matrix(y_true_arr, y_pred_arr)
tn, fp, fn, tp = cm.ravel()
accuracy = accuracy_score(y_true_arr, y_pred_arr)
precision = precision_score(y_true_arr, y_pred_arr, zero_division=0)
recall = recall_score(y_true_arr, y_pred_arr, zero_division=0)
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0.0
fpr = fp / (fp + tn) if (fp + tn) > 0 else 0.0
fnr = fn / (fn + tp) if (fn + tp) > 0 else 0.0
f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
return EvaluationMetrics(
accuracy=accuracy,
precision=precision,
recall=recall,
f1_score=f1,
specificity=specificity,
false_positive_rate=fpr,
false_negative_rate=fnr,
tp=int(tp),
fp=int(fp),
tn=int(tn),
fn=int(fn),
total_samples=correct + wrong + failed,
correct_answers=correct,
wrong_answers=wrong,
failed_answers=failed,
)
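Since `evaluate()` accepts either a JSON file path or an already-loaded list of dicts, quick smoke tests do not require a dataset file. A minimal sketch, assuming `evaluator` is an `EvaluationPipeline` constructed as in the examples above (the two samples are illustrative):

```python
# Evaluate on an in-memory dataset instead of a JSON file.
samples = [
    {"qid": "q1", "question": "Can fish breathe underwater?", "answer": True},
    {"qid": "q2", "question": "Do all birds fly?", "answer": False},
]
result = evaluator.evaluate(
    dataset=samples,
    question_field="question",
    answer_field="answer",
    id_field="qid",
)
print(f"Accuracy: {result.metrics.accuracy:.2%}")
```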

View File

@@ -0,0 +1,194 @@
"""Z3 DSL program generator using LLM."""
import json
import logging
import re
from dataclasses import dataclass
from typing import Any
from z3dsl.reasoning.prompt_template import build_prompt
logger = logging.getLogger(__name__)
@dataclass
class GenerationResult:
"""Result of JSON program generation."""
json_program: dict[str, Any] | None
raw_response: str
success: bool
error: str | None = None
class Z3ProgramGenerator:
"""Generate Z3 DSL JSON programs from natural language questions using LLM."""
def __init__(self, llm_client: Any, model: str = "gpt-4o") -> None:
"""Initialize the program generator.
Args:
llm_client: LLM client (OpenAI, Anthropic, etc.)
model: Model name to use
"""
self.llm_client = llm_client
self.model = model
def generate(
self,
question: str,
temperature: float = 0.1,
max_tokens: int = 16384,
) -> GenerationResult:
"""Generate a Z3 DSL JSON program from a question.
Args:
question: Natural language question
temperature: LLM temperature
max_tokens: Maximum tokens for response (default 16384 for GPT-5)
Returns:
GenerationResult with JSON program or error
"""
try:
prompt = build_prompt(question)
# Make LLM API call (compatible with both OpenAI and Azure OpenAI)
# Azure OpenAI requires content as string, not list
# GPT-5 only supports temperature=1 (default), so don't pass it
response = self.llm_client.chat.completions.create(
model=self.model,
messages=[{"role": "user", "content": prompt}],
max_completion_tokens=max_tokens,
)
raw_response = response.choices[0].message.content
# Extract JSON from markdown code block
json_program = self._extract_json(raw_response)
if json_program:
return GenerationResult(
json_program=json_program,
raw_response=raw_response,
success=True,
)
else:
# Log the raw response to help debug extraction failures
logger.debug(f"Raw LLM response:\n{raw_response[:1000]}...")
return GenerationResult(
json_program=None,
raw_response=raw_response,
success=False,
error="Failed to extract valid JSON from response",
)
except Exception as e:
logger.error(f"Error generating program: {e}")
return GenerationResult(
json_program=None,
raw_response="",
success=False,
error=str(e),
)
def generate_with_feedback(
self,
question: str,
error_trace: str,
previous_response: str,
temperature: float = 0.1,
max_tokens: int = 16384,
) -> GenerationResult:
"""Regenerate program with error feedback.
Args:
question: Original question
error_trace: Error message from previous attempt
previous_response: Previous LLM response
temperature: LLM temperature
max_tokens: Maximum tokens (default 16384 for GPT-5)
Returns:
GenerationResult with corrected JSON program
"""
try:
prompt = build_prompt(question)
feedback_message = (
f"There was an error processing your response:\n{error_trace}\n"
"Please fix the JSON accordingly."
)
# Multi-turn conversation with error feedback
# Compatible with both OpenAI and Azure OpenAI
# GPT-5 only supports temperature=1 (default), so don't pass it
response = self.llm_client.chat.completions.create(
model=self.model,
messages=[
{"role": "user", "content": prompt},
{"role": "assistant", "content": previous_response},
{"role": "user", "content": feedback_message},
],
max_completion_tokens=max_tokens,
)
raw_response = response.choices[0].message.content
json_program = self._extract_json(raw_response)
if json_program:
return GenerationResult(
json_program=json_program,
raw_response=raw_response,
success=True,
)
else:
# Log the raw response to help debug extraction failures
logger.debug(f"Raw LLM feedback response:\n{raw_response[:1000]}...")
return GenerationResult(
json_program=None,
raw_response=raw_response,
success=False,
error="Failed to extract valid JSON from feedback response",
)
except Exception as e:
logger.error(f"Error generating program with feedback: {e}")
return GenerationResult(
json_program=None,
raw_response="",
success=False,
error=str(e),
)
def _extract_json(self, markdown_content: str) -> dict[str, Any] | None:
"""Extract JSON from markdown code block.
Args:
markdown_content: Markdown text potentially containing JSON
Returns:
Parsed JSON dict or None if extraction failed
"""
# Pattern to match ```json ... ``` code blocks
json_pattern = r"```json\s*(\{[\s\S]*?\})\s*```"
match = re.search(json_pattern, markdown_content)
if match:
try:
json_str = match.group(1)
return json.loads(json_str)
except json.JSONDecodeError as e:
logger.error(f"Failed to parse JSON: {e}")
return None
# Try to find JSON without code block markers
try:
# Look for { ... } pattern
brace_pattern = r"\{[\s\S]*\}"
match = re.search(brace_pattern, markdown_content)
if match:
return json.loads(match.group(0))
except json.JSONDecodeError:
pass
return None
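The extraction logic can be exercised without any LLM call, since `_extract_json` never touches the client. A small sketch (passing `llm_client=None` is only for this illustration; the example exercises the fallback path that finds bare JSON outside a code block):

```python
from z3dsl.reasoning import Z3ProgramGenerator

gen = Z3ProgramGenerator(llm_client=None)  # client is unused by _extract_json
raw = 'Some explanation, then the program: {"sorts": [], "verifications": []}'
print(gen._extract_json(raw))  # -> {'sorts': [], 'verifications': []}
```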

View File

@@ -0,0 +1,545 @@
"""Prompt template for Z3 DSL program generation."""
DSL_INSTRUCTIONS = """
** Instructions for Generating JSON-Based DSL Programs for Theorem Proving**
**Introduction**
This document provides comprehensive guidelines for generating JSON-based Domain-Specific Language (DSL) programs designed to solve complex reasoning tasks using a theorem prover. The goal is to translate reasoning problems into structured JSON programs that can be parsed by a custom interpreter and reliably solved. This guide includes detailed explanations of each section, examples, and emphasizes common pitfalls to avoid to ensure that the generated programs are error-free and adhere strictly to the expected format.
---
### **Important Guidelines to Avoid Common Errors**
1. **Variable Definitions**
- **Understand the difference between FREE variables and QUANTIFIED variables:**
- **Free Variables**: Declared in the global `"variables"` section. These are variables used in non-quantified contexts (e.g., directly in assertions without ForAll/Exists).
- **Quantified Variables**: Variables bound by `ForAll` or `Exists` quantifiers. These are automatically bound by the quantifier itself and should NOT be declared in a separate `"variables"` field.
- **Example of Correct Variable Declaration:**
```json
"variables": [
{"name": "p", "sort": "Person"},
{"name": "i", "sort": "Issue"}
]
```
This declares `p` and `i` as free variables available throughout the program.
For quantified variables in ForAll/Exists:
```json
"knowledge_base": [
"ForAll([p, i], Implies(supports(p, i), Not(publicly_denounce(p, i))))"
]
```
Here, `p` and `i` are automatically bound by the `ForAll` quantifier. If `p` and `i` are declared in the global `"variables"` section, they can be referenced as free variables within the quantified expression.
2. **Context in Evaluations**
- The interpreter evaluates expressions in a context that includes:
- **Functions**: Defined in the `"functions"` section.
- **Constants**: Defined in the `"constants"` section.
- **Free Variables**: Defined in the global `"variables"` section.
- **Quantified Variables**: Temporarily added to context when evaluating quantified expressions (ForAll/Exists).
- **Understanding Variable Scope**:
- **Free variables** in the global `"variables"` section are available throughout the entire program.
- **Quantified variables** (e.g., in `ForAll([x], ...)`) are automatically bound by the quantifier and available only within that quantified expression.
- You can reference free variables inside quantified expressions, creating nested scopes.
3. **Valid JSON Output**
- **Ensure that the JSON output is valid and can be parsed without errors.**
- Use proper syntax, including commas, quotation marks, and matching brackets.
- Avoid trailing commas or missing commas between elements.
- **Common JSON Errors to Avoid**:
- Missing commas between array elements or object properties.
- Unmatched brackets or braces.
- Incorrect use of quotation marks.
- **Recommendation**:
- Use a JSON validator or formatter to check the generated JSON before finalizing it.
4. **Correct Syntax in Logical Expressions**
- **Use Proper Python Syntax for Expressions**:
- When writing expressions that will be evaluated, ensure they are valid Python expressions.
- For example, in the assertion `ForAll([p], ...)`, `p` must be defined in the context or within the quantifier.
- **Avoid Using Unrecognized Variables**:
- Do not use variables in expressions that have not been defined.
- If a variable is introduced in a quantifier, ensure it is properly included.
- **Example of Correct Usage**:
```json
"variables": [
{"name": "p", "sort": "Person"}
],
"knowledge_base": [
"ForAll([p], Implies(is_law_enforcement(p), can_arrest(p)))"
]
```
Where `p` is declared as a free variable in the global `"variables"` section, then bound by the `ForAll` quantifier in the assertion.
---
### **Detailed Explanation of Each Section**
1. **Sorts**
- **Purpose**: Define the types or domains (e.g., integers, booleans, custom types like "Person").
- **Structure**:
```json
{
"name": "SortName",
"type": "SortType"
}
```
- **Sort Types**:
- `"BoolSort"`: Boolean type. **Note: When referencing this sort in functions/variables, use "BoolSort" not "Bool".**
- `"IntSort"`: Integer type. **Note: When referencing this sort in functions/variables, use "IntSort" not "Int".**
- `"RealSort"`: Real number type. **Note: When referencing this sort in functions/variables, use "RealSort" not "Real".**
- `"DeclareSort"`: Custom, uninterpreted sort.
- `"EnumSort"`: Enumerated sort with specified values.
- `"BitVecSort(n)"`: Bit vector of size `n`.
- `"ArraySort(DomainSort, RangeSort)"`: Array mapping from `DomainSort` to `RangeSort`.
- **Built-in Sorts**: The sorts `BoolSort`, `IntSort`, and `RealSort` are pre-defined and do NOT need to be declared in the `"sorts"` section.
- **Example**:
```json
{"name": "Person", "type": "DeclareSort"}
```
2. **Functions**
- **Purpose**: Define operations or relations between sorts.
- **Structure**:
```json
{
"name": "FunctionName",
"domain": ["Sort1", "Sort2"],
"range": "ReturnSort"
}
```
- **Example**:
```json
{"name": "num_children", "domain": ["Person"], "range": "IntSort"},
{"name": "is_valid", "domain": ["Person"], "range": "BoolSort"}
```
- **Important**: Use the full sort names: "BoolSort", "IntSort", "RealSort", not "Bool", "Int", "Real".
3. **Constants**
- **Purpose**: Represent fixed elements within sorts.
- **Structure**:
```json
{
"Category": {
"sort": "SortName",
"members": ["Const1", "Const2"]
}
}
```
- **Example**:
```json
{
"persons": {
"sort": "Person",
"members": ["genghis_khan", "julius_caesar"]
}
}
```
4. **Variables**
- **Purpose**: Define FREE variables that can be used throughout the program. These are symbols that can be referenced in assertions, rules, and verifications. They are particularly useful when you want to use the same variable symbol in multiple quantified expressions.
- **Structure**:
```json
{
"name": "VariableName",
"sort": "SortName"
}
```
- **Example**:
```json
{"name": "x", "sort": "Int"}
```
- **Note**: Variables declared here can be used as free variables OR can be bound by quantifiers (ForAll/Exists) in assertions. When bound by a quantifier, they become quantified variables within that scope.
5. **Knowledge Base**
- **Purpose**: A set of axioms or facts that are assumed to be true.
- **Structure**: An array of assertions, each representing a logical expression.
- **Assertions** can be simple strings or dictionaries specifying the assertion and its truth value.
- **Example**:
```json
"variables": [
{"name": "p", "sort": "Person"}
],
"knowledge_base": [
"ForAll([p], Implies(is_law_enforcement(p), can_arrest(p)))",
"num_children(genghis_khan) == 16",
{
"assertion": "can_fly(superman)",
"value": true
}
]
```
- **Note**: When using quantifiers like ForAll/Exists, the variables must be declared in the global `"variables"` section to be available in the evaluation context.
6. **Rules**
- **Purpose**: Define general logical implications or constraints.
- **Structure**:
```json
{
"name": "RuleName",
"forall": [
{"name": "Var1", "sort": "Sort1"},
{"name": "Var2", "sort": "Sort2"}
],
"implies": {
"antecedent": "LogicalExpression",
"consequent": "LogicalExpression"
}
}
```
Or for simple constraints:
```json
{
"name": "RuleName",
"constraint": "LogicalExpression"
}
```
- **Example**:
```json
{
"name": "Greater Than Rule",
"forall": [
{"name": "a", "sort": "Int"},
{"name": "b", "sort": "Int"}
],
"implies": {
"antecedent": "a > b",
"consequent": "Not(b > a)"
}
}
```
- **Important**: Rules with `"implies"` MUST have a `"forall"` field with at least one variable. The `"forall"` field cannot be empty. For rules without quantification, use `"constraint"` instead.
7. **Verifications**
- **Purpose**: Specify properties or conditions that need to be verified by the theorem prover.
- **Three Types of Verifications**:
**Type 1: Simple Constraint (no quantifiers)**
```json
{
"name": "VerificationName",
"constraint": "LogicalExpression"
}
```
Example:
```json
{
"name": "Compare Descendants",
"constraint": "num_descendants(genghis_khan) > num_descendants(julius_caesar)"
}
```
**Type 2: Existential Verification (checking if there exists a value)**
```json
{
"name": "VerificationName",
"exists": [
{"name": "Var", "sort": "Sort"}
],
"constraint": "LogicalExpression"
}
```
Example:
```json
{
"name": "Find Positive Number",
"exists": [
{"name": "x", "sort": "Int"}
],
"constraint": "And(x > 0, x < 10)"
}
```
**Type 3: Universal Verification (checking if property holds for all values)**
```json
{
"name": "VerificationName",
"forall": [
{"name": "Var", "sort": "Sort"}
],
"implies": {
"antecedent": "LogicalExpression",
"consequent": "LogicalExpression"
}
}
```
Example:
```json
{
"name": "All Positive Numbers Greater Than Zero",
"forall": [
{"name": "x", "sort": "Int"}
],
"implies": {
"antecedent": "x > 0",
"consequent": "x >= 1"
}
}
```
- **Important**: The `"exists"` and `"forall"` fields cannot be empty. They must contain at least one variable definition.
8. **Optimization** (Optional)
- **Purpose**: Define optimization problems with variables, constraints, and objectives.
- **Structure**:
```json
{
"variables": [
{"name": "Var", "sort": "Sort"}
],
"constraints": ["LogicalExpression"],
"objectives": [
{
"type": "minimize" or "maximize",
"expression": "ArithmeticExpression"
}
]
}
```
- **Example**:
```json
{
"variables": [
{"name": "x", "sort": "Int"}
],
"constraints": [
"x >= 0",
"x <= 10"
],
"objectives": [
{
"type": "maximize",
"expression": "x"
}
]
}
```
9. **Actions**
- **Purpose**: Specify which actions the interpreter should perform.
- **Possible Actions**:
- `"verify_conditions"`: Runs verifications.
- `"optimize"`: Solves optimization problems.
- **Structure**: An array of action strings.
- **Example**:
```json
["verify_conditions"]
```
---
### **Understanding Verification Semantics**
**Important: What Does SAT/UNSAT Mean?**
When you run verifications, the interpreter checks the satisfiability of your constraint given the knowledge base:
- **SAT (Satisfiable)**: The constraint CAN be true given the knowledge base. The solver found a model where both the knowledge base AND the constraint are satisfied simultaneously.
- This means the constraint is CONSISTENT with the knowledge base.
- A model (example values) will be shown.
- **UNSAT (Unsatisfiable)**: The constraint CONTRADICTS the knowledge base. There is no possible model where both the knowledge base and the constraint can be true together.
- This means the constraint is INCONSISTENT with the knowledge base.
- **UNKNOWN**: The solver timed out or couldn't determine satisfiability.
**Checking Different Types of Properties:**
1. **To check if a property CAN be true** (satisfiability):
- Add it directly to verifications
- SAT = yes, it's possible
- UNSAT = no, it contradicts the knowledge base
2. **To check if a property MUST be true** (entailment, KB ⊨ φ):
- Verify that the NEGATION of the property is UNSAT
- If KB ∧ ¬φ is UNSAT, then KB ⊨ φ (the knowledge base entails the property)
- Example: To prove "publicly_denounce(nancy_pelosi, abortion)" is false given KB, check if "publicly_denounce(nancy_pelosi, abortion)" is UNSAT
**Example**:
```json
"verifications": [
{
"name": "Can Pelosi Denounce Abortion",
"constraint": "publicly_denounce(nancy_pelosi, abortion)"
}
]
```
- If this returns SAT: Pelosi denouncing abortion is consistent with the knowledge base
- If this returns UNSAT: Pelosi denouncing abortion contradicts the knowledge base (meaning she definitely won't)
---
### **Available Operators and Functions**
- **Logical Operators**:
- `And(expr1, expr2, ...)`
- `Or(expr1, expr2, ...)`
- `Not(expr)`
- `Implies(antecedent, consequent)`
- `If(condition, true_expr, false_expr)`
- `Distinct(expr1, expr2, ...)`
- **Quantifiers**:
- `ForAll([vars], expr)`
- `Exists([vars], expr)`
- **Arithmetic Operators**:
- `+`, `-`, `*`, `/`
- Comparison: `==`, `!=`, `<`, `<=`, `>`, `>=`
- **Custom Functions**: Defined in the `"functions"` section.
---
### **Verification Strategy Guide**
**CRITICAL: Create ONE Verification Per Question**
When answering a question, create **exactly ONE verification condition** that directly tests what the question asks. Do NOT create multiple verifications testing both the positive and negative cases - this leads to ambiguous results.
**Question Type Strategies:**
1. **"Can/Could X do Y?" or "Is it possible that X...?"** (Possibility questions)
- Create a single verification checking if the property CAN be satisfied
- Use `exists` if checking for at least one example
- ✓ Example: "Can fish breathe underwater?"
```json
"verifications": [
{
"name": "Can fish breathe underwater",
"exists": [{"name": "f", "sort": "Animal"}],
"constraint": "And(is_fish(f), can_breathe_underwater(f))"
}
]
```
- SAT = Yes, it's possible | UNSAT = No, it contradicts the facts
2. **"Would/Does X do Y?" or "Is X true?"** (Factual questions)
- Create a single verification checking if the statement is consistent with facts
- ✓ Example: "Would Nancy Pelosi publicly denounce abortion?"
```json
"verifications": [
{
"name": "Would Pelosi denounce abortion",
"constraint": "publicly_denounce(nancy_pelosi, abortion)"
}
]
```
- SAT = Yes, it's consistent with the facts | UNSAT = No, it contradicts the facts
3. **"Is X always/never true?"** (Universal questions)
- Use `forall` to check all cases
- ✓ Example: "Do all birds fly?"
```json
"verifications": [
{
"name": "All birds can fly",
"forall": [{"name": "b", "sort": "Animal"}],
"implies": {
"antecedent": "is_bird(b)",
"consequent": "can_fly(b)"
}
}
]
```
**CRITICAL RULES:**
- ✓ **DO**: Create one clear verification that directly answers the question
- ✗ **DON'T**: Create multiple verifications testing both positive AND negative (e.g., "exists a fish that breathes underwater" AND "exists a fish that doesn't breathe underwater")
- ✗ **DON'T**: Test the inverse/negation unless the question specifically asks about it
- ✗ **DON'T**: Create redundant verifications that check the same thing different ways
**Example of WRONG approach** (causes ambiguous results):
```json
// DON'T DO THIS - Multiple verifications testing opposite things
"verifications": [
{"name": "Fish can breathe", "constraint": "can_breathe_underwater(fish)"},
{"name": "Fish cannot breathe", "constraint": "Not(can_breathe_underwater(fish))"}
]
// This gives SAT=1, UNSAT=1 = AMBIGUOUS!
```
---
### **Common Pitfalls to Avoid**
- **Undefined Variables**: Always define variables in the global `"variables"` section if they will be used in quantified expressions (ForAll/Exists). The quantifier binds the variable, but it must exist in the evaluation context first.
- **Using 'variables' field in assertions**: Do NOT add a `"variables"` field inside knowledge_base assertions. Variables should be declared in the global `"variables"` section only.
- **Using 'variables' instead of 'forall' in rules**: Rules must use `"forall"` for quantified variables, not `"variables"`.
- **Empty quantifier lists**: If you specify `"forall"` or `"exists"` in rules or verifications, they must contain at least one variable. Empty lists will cause errors.
- **Syntax Errors in Expressions**: Use correct syntax in logical expressions. Avoid typos and ensure that all parentheses and commas are correctly placed.
- **Invalid JSON**: Double-check the JSON structure for validity. Use a JSON linter or validator if necessary.
- **Misunderstanding SAT/UNSAT**: Remember that SAT means "possible/consistent" and UNSAT means "contradicts". To prove something MUST be true, check if its negation is UNSAT.
---
### **Conclusion**
By following these updated guidelines and paying close attention to variable declarations and context, you can create JSON-based DSL programs that are free of errors and compatible with the interpreter. Always define all variables used in expressions, use correct syntax, and validate your JSON output. This will help ensure that your programs execute successfully and provide accurate results when processed by the theorem prover.
---
**Task**:
Think step by step and reason about the given question. Decompose the question into logical reasoning steps, define the necessary sorts, functions, constants, variables, and knowledge base entries, and finally, construct a JSON file representing the problem. Ensure that:
- All variables used in expressions are properly defined.
- The JSON structure is valid and free of syntax errors.
- The logical expressions are syntactically correct and use variables available in the context.
- **Create EXACTLY ONE verification** that directly answers the question - do NOT create multiple verifications testing opposite cases.
**IMPORTANT: Output Format**
Your response MUST be wrapped in a markdown code block with the ```json``` tag like this:
```json
{
"sorts": [...],
"functions": [...],
...
}
```
Do NOT output raw JSON without the markdown code block. The parser expects the JSON to be enclosed in ```json ... ``` markers.
SAT is True. UNSAT is False. Answer the following question:
"""
def build_prompt(question: str) -> str:
"""Build the complete prompt for JSON DSL generation.
Args:
question: The reasoning question to answer
Returns:
Complete prompt string
"""
return DSL_INSTRUCTIONS + f"\nQuestion: {question}"
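The template above describes each DSL section separately but never shows a complete program end to end. The sketch below writes a small hand-built program to disk and runs it through the interpreter directly, bypassing the LLM; the program contents are illustrative, and which fields are required or optional is ultimately defined by the interpreter, not by this sketch:

```python
import json
import tempfile

from z3dsl.interpreter import Z3JSONInterpreter

# A tiny program in the JSON DSL described above: one custom sort, one
# predicate, one fact, and exactly one verification answering the question.
program = {
    "sorts": [{"name": "Person", "type": "DeclareSort"}],
    "functions": [{"name": "is_politician", "domain": ["Person"], "range": "BoolSort"}],
    "constants": {"people": {"sort": "Person", "members": ["nancy_pelosi"]}},
    "variables": [],
    "knowledge_base": ["is_politician(nancy_pelosi)"],
    "verifications": [
        {"name": "Pelosi is a politician", "constraint": "is_politician(nancy_pelosi)"}
    ],
    "actions": ["verify_conditions"],
}

with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
    json.dump(program, f)
    path = f.name

interpreter = Z3JSONInterpreter(path)
interpreter.run()
sat_count, unsat_count = interpreter.get_verification_counts()
print(sat_count, unsat_count)  # the constraint is consistent with the KB, so expect SAT
```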

View File

@@ -0,0 +1,196 @@
"""ProofOfThought: Main API for Z3-based reasoning."""
import json
import logging
import os
import tempfile
import traceback
from dataclasses import dataclass
from typing import Any
from z3dsl.reasoning.program_generator import Z3ProgramGenerator
from z3dsl.reasoning.verifier import Z3Verifier
logger = logging.getLogger(__name__)
@dataclass
class QueryResult:
"""Result of a reasoning query."""
question: str
answer: bool | None
json_program: dict[str, Any] | None
sat_count: int
unsat_count: int
output: str
success: bool
num_attempts: int
error: str | None = None
class ProofOfThought:
"""High-level API for Z3-based reasoning.
Provides a simple interface that hides the complexity of:
- JSON DSL program generation
- Z3 solver execution
- Result parsing and interpretation
Example:
>>> from openai import OpenAI
>>> client = OpenAI(api_key="...")
>>> pot = ProofOfThought(llm_client=client)
>>> result = pot.query("Would Nancy Pelosi publicly denounce abortion?")
>>> print(result.answer) # False
"""
def __init__(
self,
llm_client: Any,
model: str = "gpt-5",
max_attempts: int = 3,
verify_timeout: int = 10000,
optimize_timeout: int = 100000,
cache_dir: str | None = None,
) -> None:
"""Initialize ProofOfThought.
Args:
llm_client: LLM client (OpenAI, AzureOpenAI, Anthropic, etc.)
model: LLM model/deployment name (default: "gpt-5")
max_attempts: Maximum retry attempts for program generation
verify_timeout: Z3 verification timeout in milliseconds
optimize_timeout: Z3 optimization timeout in milliseconds
cache_dir: Directory to cache generated programs (None = temp dir)
"""
self.generator = Z3ProgramGenerator(llm_client=llm_client, model=model)
self.verifier = Z3Verifier(verify_timeout=verify_timeout, optimize_timeout=optimize_timeout)
self.max_attempts = max_attempts
self.cache_dir = cache_dir or tempfile.gettempdir()
# Create cache directory if needed
os.makedirs(self.cache_dir, exist_ok=True)
def query(
self,
question: str,
temperature: float = 0.1,
max_tokens: int = 16384,
save_program: bool = False,
program_path: str | None = None,
) -> QueryResult:
"""Answer a reasoning question using Z3 theorem proving.
Args:
question: Natural language question to answer
temperature: LLM temperature for program generation
max_tokens: Maximum tokens for LLM response (default 16384 for GPT-5)
save_program: Whether to save generated JSON program
program_path: Path to save program (None = auto-generate)
Returns:
QueryResult with answer and execution details
"""
logger.info(f"Processing question: {question}")
previous_response: str | None = None
error_trace: str | None = None
for attempt in range(1, self.max_attempts + 1):
logger.info(f"Attempt {attempt}/{self.max_attempts}")
try:
# Generate or regenerate program
if attempt == 1:
gen_result = self.generator.generate(
question=question,
temperature=temperature,
max_tokens=max_tokens,
)
else:
gen_result = self.generator.generate_with_feedback(
question=question,
error_trace=error_trace or "",
previous_response=previous_response or "",
temperature=temperature,
max_tokens=max_tokens,
)
if not gen_result.success or gen_result.json_program is None:
error_trace = gen_result.error or "Failed to generate JSON program"
previous_response = gen_result.raw_response
logger.warning(f"Generation failed: {error_trace}")
continue
# Save program to temporary file
if program_path is None:
temp_file = tempfile.NamedTemporaryFile(
mode="w",
suffix=".json",
dir=self.cache_dir,
delete=not save_program,
)
json_path = temp_file.name
else:
json_path = program_path
with open(json_path, "w") as f:
json.dump(gen_result.json_program, f, indent=2)
logger.info(f"Generated program saved to: {json_path}")
# Execute Z3 verification
verify_result = self.verifier.verify(json_path)
if not verify_result.success:
error_trace = verify_result.error or "Z3 verification failed"
previous_response = gen_result.raw_response
logger.warning(f"Verification failed: {error_trace}")
continue
# Check if we got a definitive answer
if verify_result.answer is None:
error_trace = (
f"Ambiguous verification result: "
f"SAT={verify_result.sat_count}, UNSAT={verify_result.unsat_count}\n"
f"Output:\n{verify_result.output}"
)
previous_response = gen_result.raw_response
logger.warning(f"Ambiguous result: {error_trace}")
continue
# Success!
logger.info(
f"Successfully answered question on attempt {attempt}: {verify_result.answer}"
)
return QueryResult(
question=question,
answer=verify_result.answer,
json_program=gen_result.json_program,
sat_count=verify_result.sat_count,
unsat_count=verify_result.unsat_count,
output=verify_result.output,
success=True,
num_attempts=attempt,
)
except Exception as e:
error_trace = f"Error: {str(e)}\n{traceback.format_exc()}"
logger.error(f"Exception on attempt {attempt}: {error_trace}")
if "gen_result" in locals():
previous_response = gen_result.raw_response
# All attempts failed
logger.error(f"Failed to answer question after {self.max_attempts} attempts")
return QueryResult(
question=question,
answer=None,
json_program=None,
sat_count=0,
unsat_count=0,
output="",
success=False,
num_attempts=self.max_attempts,
error=f"Failed after {self.max_attempts} attempts. Last error: {error_trace}",
)

105
z3dsl/reasoning/verifier.py Normal file
View File

@@ -0,0 +1,105 @@
"""Z3 Verifier module for robust execution and output parsing."""
import io
import logging
from contextlib import redirect_stderr, redirect_stdout
from dataclasses import dataclass
from z3dsl.interpreter import Z3JSONInterpreter
logger = logging.getLogger(__name__)
@dataclass
class VerificationResult:
"""Result of Z3 verification execution."""
answer: bool | None # True (SAT), False (UNSAT), or None (ambiguous/error)
sat_count: int
unsat_count: int
output: str
success: bool
error: str | None = None
class Z3Verifier:
"""Robust Z3 interpreter execution and result parsing."""
def __init__(self, verify_timeout: int = 10000, optimize_timeout: int = 100000) -> None:
"""Initialize the Z3 Verifier.
Args:
verify_timeout: Timeout for verification in milliseconds
optimize_timeout: Timeout for optimization in milliseconds
"""
self.verify_timeout = verify_timeout
self.optimize_timeout = optimize_timeout
def verify(self, json_path: str) -> VerificationResult:
"""Execute Z3 interpreter on a JSON program and parse results.
Args:
json_path: Path to JSON DSL program file
Returns:
VerificationResult with answer and execution details
"""
try:
# Capture stdout and stderr for output logging
stdout_capture = io.StringIO()
stderr_capture = io.StringIO()
with redirect_stdout(stdout_capture), redirect_stderr(stderr_capture):
interpreter = Z3JSONInterpreter(
json_path,
verify_timeout=self.verify_timeout,
optimize_timeout=self.optimize_timeout,
)
interpreter.run()
# Get structured verification counts from interpreter
sat_count, unsat_count = interpreter.get_verification_counts()
# Combine output for logging
full_output = stdout_capture.getvalue() + stderr_capture.getvalue()
# Determine answer based on verification results
answer = self._determine_answer(sat_count, unsat_count)
return VerificationResult(
answer=answer,
sat_count=sat_count,
unsat_count=unsat_count,
output=full_output,
success=True,
)
except Exception as e:
logger.error(f"Error executing Z3 interpreter: {e}")
return VerificationResult(
answer=None,
sat_count=0,
unsat_count=0,
output="",
success=False,
error=str(e),
)
def _determine_answer(self, sat_count: int, unsat_count: int) -> bool | None:
"""Determine boolean answer from SAT/UNSAT counts.
Args:
sat_count: Number of SAT occurrences
unsat_count: Number of UNSAT occurrences
Returns:
True if SAT only, False if UNSAT only, None if ambiguous
"""
if sat_count > 0 and unsat_count == 0:
return True
elif unsat_count > 0 and sat_count == 0:
return False
else:
# Ambiguous: both or neither
logger.warning(f"Ambiguous verification result: SAT={sat_count}, UNSAT={unsat_count}")
return None
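`Z3Verifier` wraps the interpreter run shown earlier and maps the SAT/UNSAT counts to a boolean answer. A short usage sketch (the program path is a placeholder for a file written by the generator or by hand):

```python
from z3dsl.reasoning import Z3Verifier

verifier = Z3Verifier(verify_timeout=10000)
result = verifier.verify("output/programs/example_program.json")  # hypothetical path
if result.success:
    print(result.answer)  # True, False, or None if ambiguous
    print(result.sat_count, result.unsat_count)
else:
    print(result.error)
```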

View File

@@ -21,6 +21,8 @@ class Verifier:
self.expression_parser = expression_parser
self.sorts = sorts
self.verifications: dict[str, ExprRef] = {}
self.sat_count: int = 0
self.unsat_count: int = 0
def add_verifications(self, verification_defs: list[dict[str, Any]]) -> None:
"""Add verification conditions.
@@ -39,6 +41,10 @@ class Verifier:
exists_vars = verification["exists"]
if not exists_vars:
raise ValueError(f"Empty 'exists' list in verification '{name}'")
# Validate sorts exist before creating variables
for v in exists_vars:
if v["sort"] not in self.sorts:
raise ValueError(f"Sort '{v['sort']}' not defined")
variables = [Const(v["name"], self.sorts[v["sort"]]) for v in exists_vars]
constraint = self.expression_parser.parse_expression(
verification["constraint"], variables
@@ -48,6 +54,10 @@ class Verifier:
forall_vars = verification["forall"]
if not forall_vars:
raise ValueError(f"Empty 'forall' list in verification '{name}'")
# Validate sorts exist before creating variables
for v in forall_vars:
if v["sort"] not in self.sorts:
raise ValueError(f"Sort '{v['sort']}' not defined")
variables = [Const(v["name"], self.sorts[v["sort"]]) for v in forall_vars]
antecedent = self.expression_parser.parse_expression(
verification["implies"]["antecedent"], variables
@@ -82,6 +92,10 @@ class Verifier:
For entailment checking (knowledge_base IMPLIES condition),
check if knowledge_base AND NOT(condition) is UNSAT.
"""
# Reset counts at the start of verification
self.sat_count = 0
self.unsat_count = 0
if not self.verifications:
logger.info("No verifications to check")
return
@@ -99,10 +113,12 @@ class Verifier:
result = solver.check(condition)
if result == sat:
self.sat_count += 1
model = solver.model()
logger.info(f"{name}: SAT")
logger.info(f"Model: {model}")
elif result == unsat:
self.unsat_count += 1
logger.info(f"{name}: UNSAT (condition contradicts knowledge base)")
else:
logger.warning(f"{name}: UNKNOWN (timeout or incomplete)")