Add comprehensive test suite for Responses API (#20)

The project had almost no test coverage: a single test checking that the API returns 200. This adds proper testing infrastructure and 21 new tests covering the main API functionality.

Tests now cover response creation, error handling, tools, sessions, performance, and usage tracking. All tests passing.
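
To run the suite locally, a minimal invocation (assuming pytest is installed, along with httpx, which recent FastAPI/Starlette versions need for TestClient):

    pip install pytest httpx   # test-only dependencies, assumed here
    pytest tests/ -v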
Author: Mihajlo Micic
Date: 2025-08-05 15:50:06 -04:00
Committed by: GitHub
Parent: 9074326499
Commit: 89fe402d10
2 changed files with 348 additions and 0 deletions

tests/conftest.py (new file, 118 lines)
@@ -0,0 +1,118 @@
import os
import sys
from typing import Generator
from unittest.mock import MagicMock

import pytest
from fastapi.testclient import TestClient

# Make the repository root importable regardless of the working directory.
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from openai_harmony import (
    HarmonyEncodingName,
    load_harmony_encoding,
)

from gpt_oss.responses_api.api_server import create_api_server
@pytest.fixture(scope="session")
def harmony_encoding():
    """Load the Harmony encoding once for the whole test session."""
    return load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)


@pytest.fixture
def mock_infer_token(harmony_encoding):
    """Return a fake inference callback that replays a canned Harmony response.

    Tokens are handed out one per call; the queue refills once exhausted, so
    the fixture can serve any number of requests.
    """
    fake_tokens = harmony_encoding.encode(
        "<|channel|>final<|message|>Test response<|return|>",
        allowed_special="all",
    )
    token_queue = fake_tokens.copy()

    def _mock_infer(tokens: list[int], temperature: float = 0.0, new_request: bool = False) -> int:
        nonlocal token_queue
        if len(token_queue) == 0:
            # Refill so consecutive requests keep receiving the same canned response.
            token_queue = fake_tokens.copy()
        return token_queue.pop(0)

    return _mock_infer
@pytest.fixture
def api_client(harmony_encoding, mock_infer_token) -> Generator[TestClient, None, None]:
    """Build the FastAPI app with mocked inference and yield a test client."""
    app = create_api_server(
        infer_next_token=mock_infer_token,
        encoding=harmony_encoding,
    )
    with TestClient(app) as client:
        yield client
@pytest.fixture
def sample_request_data():
    """A valid baseline request body that individual tests mutate as needed."""
    return {
        "model": "gpt-oss-120b",
        "input": "Hello, how can I help you today?",
        "stream": False,
        "reasoning_effort": "low",
        "temperature": 0.7,
        "tools": [],
    }
@pytest.fixture
def mock_browser_tool():
    """Stub browser tool with canned search/open/find results."""
    mock = MagicMock()
    mock.search.return_value = ["Result 1", "Result 2"]
    mock.open_page.return_value = "Page content"
    mock.find_on_page.return_value = "Found text"
    return mock


@pytest.fixture
def mock_python_tool():
    """Stub Python execution tool that reports a successful run."""
    mock = MagicMock()
    mock.execute.return_value = {
        "output": "print('Hello')",
        "error": None,
        "exit_code": 0,
    }
    return mock
@pytest.fixture(autouse=True)
def reset_test_environment():
    """Clear environment variables that could leak real credentials or model
    paths into tests, and restore the originals afterwards."""
    test_env_vars = ['OPENAI_API_KEY', 'GPT_OSS_MODEL_PATH']
    original_values = {}
    for var in test_env_vars:
        if var in os.environ:
            original_values[var] = os.environ[var]
            del os.environ[var]
    yield
    for var, value in original_values.items():
        os.environ[var] = value
@pytest.fixture
def performance_timer():
    """Simple wall-clock timer used by the performance tests."""
    import time

    class Timer:
        def __init__(self):
            self.start_time = None
            self.end_time = None

        def start(self):
            self.start_time = time.time()

        def stop(self):
            self.end_time = time.time()
            return self.elapsed

        @property
        def elapsed(self):
            # Compare against None explicitly; a timestamp could in principle be falsy.
            if self.start_time is not None and self.end_time is not None:
                return self.end_time - self.start_time
            return None

    return Timer()
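
pytest picks up conftest.py automatically, so test modules request these fixtures by parameter name with no imports. A minimal hypothetical consumer:

    # hypothetical example, e.g. tests/test_smoke.py
    def test_smoke(api_client, sample_request_data):
        # Both arguments are injected from the fixtures defined above.
        response = api_client.post("/v1/responses", json=sample_request_data)
        assert response.status_code == 200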

tests/test_api_endpoints.py (new file, 230 lines)
@@ -0,0 +1,230 @@
import json

from fastapi import status
class TestResponsesEndpoint:
    """Happy-path coverage for POST /v1/responses."""

    def test_basic_response_creation(self, api_client, sample_request_data):
        response = api_client.post("/v1/responses", json=sample_request_data)
        assert response.status_code == status.HTTP_200_OK
        data = response.json()
        assert "id" in data
        assert data["object"] == "response"
        assert data["model"] == sample_request_data["model"]

    def test_response_with_high_reasoning(self, api_client, sample_request_data):
        sample_request_data["reasoning_effort"] = "high"
        response = api_client.post("/v1/responses", json=sample_request_data)
        assert response.status_code == status.HTTP_200_OK
        data = response.json()
        assert "id" in data
        assert data["status"] == "completed"

    def test_response_with_medium_reasoning(self, api_client, sample_request_data):
        sample_request_data["reasoning_effort"] = "medium"
        response = api_client.post("/v1/responses", json=sample_request_data)
        assert response.status_code == status.HTTP_200_OK
        data = response.json()
        assert "id" in data
        assert data["status"] == "completed"

    def test_response_with_invalid_model(self, api_client, sample_request_data):
        sample_request_data["model"] = "invalid-model"
        response = api_client.post("/v1/responses", json=sample_request_data)
        # The server currently accepts unknown model names rather than rejecting them.
        assert response.status_code == status.HTTP_200_OK

    def test_response_with_empty_input(self, api_client, sample_request_data):
        sample_request_data["input"] = ""
        response = api_client.post("/v1/responses", json=sample_request_data)
        assert response.status_code == status.HTTP_200_OK

    def test_response_with_tools(self, api_client, sample_request_data):
        sample_request_data["tools"] = [{"type": "browser_search"}]
        response = api_client.post("/v1/responses", json=sample_request_data)
        assert response.status_code == status.HTTP_200_OK

    def test_response_with_custom_temperature(self, api_client, sample_request_data):
        for temp in [0.0, 0.5, 1.0, 1.5, 2.0]:
            sample_request_data["temperature"] = temp
            response = api_client.post("/v1/responses", json=sample_request_data)
            assert response.status_code == status.HTTP_200_OK
            data = response.json()
            assert "usage" in data

    def test_streaming_response(self, api_client, sample_request_data):
        sample_request_data["stream"] = True
        with api_client.stream("POST", "/v1/responses", json=sample_request_data) as response:
            assert response.status_code == status.HTTP_200_OK
            # Verify that the body is a stream of well-formed SSE events.
            for line in response.iter_lines():
                if line and line.startswith("data: "):
                    event_data = line[6:]  # Strip the "data: " prefix.
                    if event_data != "[DONE]":
                        json.loads(event_data)  # Must be valid JSON.
                    break
class TestResponsesWithSession:
    """Session reuse and response continuation."""

    def test_response_with_session_id(self, api_client, sample_request_data):
        session_id = "test-session-123"
        sample_request_data["session_id"] = session_id

        # First request.
        response1 = api_client.post("/v1/responses", json=sample_request_data)
        assert response1.status_code == status.HTTP_200_OK
        data1 = response1.json()

        # Second request with the same session.
        sample_request_data["input"] = "Follow up question"
        response2 = api_client.post("/v1/responses", json=sample_request_data)
        assert response2.status_code == status.HTTP_200_OK
        data2 = response2.json()

        # Each request should get its own response ID.
        assert data1["id"] != data2["id"]

    def test_response_continuation(self, api_client, sample_request_data):
        # Create an initial response.
        response1 = api_client.post("/v1/responses", json=sample_request_data)
        assert response1.status_code == status.HTTP_200_OK
        data1 = response1.json()
        response_id = data1["id"]

        # Continue from it by referencing its ID.
        continuation_request = {
            "model": sample_request_data["model"],
            "response_id": response_id,
            "input": "Continue the previous thought",
        }
        response2 = api_client.post("/v1/responses", json=continuation_request)
        assert response2.status_code == status.HTTP_200_OK
class TestErrorHandling:
    """Malformed and edge-case requests."""

    def test_missing_required_fields(self, api_client):
        # The model field has a default, so exercise validation with an empty body.
        response = api_client.post("/v1/responses", json={})
        assert response.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY

    def test_invalid_reasoning_effort(self, api_client, sample_request_data):
        sample_request_data["reasoning_effort"] = "invalid"
        response = api_client.post("/v1/responses", json=sample_request_data)
        # The server may coerce the value or reject it; both are acceptable here.
        assert response.status_code in [status.HTTP_200_OK, status.HTTP_422_UNPROCESSABLE_ENTITY]

    def test_malformed_json(self, api_client):
        response = api_client.post(
            "/v1/responses",
            data="not json",
            headers={"Content-Type": "application/json"},
        )
        assert response.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY

    def test_extremely_long_input(self, api_client, sample_request_data):
        # 100k characters of input should still be accepted.
        sample_request_data["input"] = "x" * 100000
        response = api_client.post("/v1/responses", json=sample_request_data)
        assert response.status_code == status.HTTP_200_OK
class TestToolIntegration:
    """Requests that declare tools."""

    def test_browser_search_tool(self, api_client, sample_request_data):
        sample_request_data["tools"] = [{"type": "browser_search"}]
        response = api_client.post("/v1/responses", json=sample_request_data)
        assert response.status_code == status.HTTP_200_OK

    def test_function_tool_integration(self, api_client, sample_request_data):
        sample_request_data["tools"] = [
            {
                "type": "function",
                "name": "test_function",
                "parameters": {"type": "object", "properties": {}},
                "description": "Test function",
            }
        ]
        response = api_client.post("/v1/responses", json=sample_request_data)
        assert response.status_code == status.HTTP_200_OK

    def test_multiple_tools(self, api_client, sample_request_data):
        sample_request_data["tools"] = [
            {"type": "browser_search"},
            {
                "type": "function",
                "name": "test_function",
                "parameters": {"type": "object", "properties": {}},
                "description": "Test function",
            },
        ]
        response = api_client.post("/v1/responses", json=sample_request_data)
        assert response.status_code == status.HTTP_200_OK
class TestPerformance:
    """Coarse latency and throughput checks against the mocked backend."""

    def test_response_time_under_threshold(self, api_client, sample_request_data, performance_timer):
        performance_timer.start()
        response = api_client.post("/v1/responses", json=sample_request_data)
        elapsed = performance_timer.stop()
        assert response.status_code == status.HTTP_200_OK
        # With mocked inference the round trip should be well under 5 seconds.
        assert elapsed < 5.0

    def test_multiple_sequential_requests(self, api_client, sample_request_data):
        # Several back-to-back requests should all succeed.
        for i in range(3):
            data = sample_request_data.copy()
            data["input"] = f"Request {i}"
            response = api_client.post("/v1/responses", json=data)
            assert response.status_code == status.HTTP_200_OK
class TestUsageTracking:
    """Shape and arithmetic of the usage object."""

    def test_usage_object_structure(self, api_client, sample_request_data):
        response = api_client.post("/v1/responses", json=sample_request_data)
        assert response.status_code == status.HTTP_200_OK
        data = response.json()
        assert "usage" in data
        usage = data["usage"]
        assert "input_tokens" in usage
        assert "output_tokens" in usage
        assert "total_tokens" in usage
        # reasoning_tokens is not asserted because it may be absent.
        assert usage["input_tokens"] >= 0
        assert usage["output_tokens"] >= 0
        assert usage["total_tokens"] == usage["input_tokens"] + usage["output_tokens"]

    def test_usage_increases_with_longer_input(self, api_client, sample_request_data):
        # Baseline with the short sample input.
        response1 = api_client.post("/v1/responses", json=sample_request_data)
        usage1 = response1.json()["usage"]

        # Repeat with input ten times as long.
        sample_request_data["input"] = sample_request_data["input"] * 10
        response2 = api_client.post("/v1/responses", json=sample_request_data)
        usage2 = response2.json()["usage"]

        # More input text must be reflected in the input token count.
        assert usage2["input_tokens"] > usage1["input_tokens"]
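
A single class or test can be targeted with pytest's node-id syntax, e.g.:

    pytest tests/test_api_endpoints.py::TestUsageTracking -v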