Compare commits

...

5 Commits

5 changed files with 207 additions and 52 deletions

View File

@@ -1,12 +1,31 @@
## Notes to Developer
- If the current 0.X.Y version of the llm factory works fine and no change is needed, let's not bump the version unless there is a requirement such as security, etc., so we avoid breaking changes.
- e.g.: (tcell_llm_factory==0.2.10)
- When implementing a Query Engine, return "response" and "consumption" regardless of cloud or on-prem. For on-prem, "consumption" is returned as None (instead of not being returned at all); see the sketch below.
- For async runtimes, the LLMFactory & AsyncLLMFactory classes are used separately as needed. A QEngine missing on the async side can be implemented by the relevant developer.
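
A minimal sketch of this return contract (the engine class names and the consumption figures are hypothetical; only the two-tuple shape, the `total_cost`/`total_tokens` keys, and the `None` consumption for on-prem come from the notes and code in this repo):
```python
# Hypothetical engines illustrating the ask() contract: every engine returns
# (response, consumption); on-prem engines return consumption=None rather
# than omitting it.
class DummyOnPremEngine:
    def ask(self, messages):
        response = {"role": "assistant", "content": "stub answer"}
        return response, None  # on-prem: no cost/token accounting available

class DummyCloudEngine:
    def ask(self, messages):
        response = {"role": "assistant", "content": "stub answer"}
        return response, {"total_cost": 0.0012, "total_tokens": 85}  # example figures

for engine in (DummyOnPremEngine(), DummyCloudEngine()):
    response, consumption = engine.ask([{"role": "user", "content": "hi"}])
    print(response["content"], consumption)
```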
## Build
- python -m pip install -U wheel setuptools build
- increment project_id in pyproject.toml
- archrose (for mac users)
- python -m build
## Upload
* artifactory urls:
* local-pypi-dist-dev/com/turkcell/sensai/llm_factory
* https://artifactory.turkcell.com.tr/ui/repos/tree/General/local-pypi-dist-dev/com/turkcell/sensai/llm_factory/
* Click Deploy
* ![deploy.png](static/deploy.png)
* Upload the new build files by selecting Multiple Deploy
* ![deploy3.png](static/deploy3.png)
* update the target path:
* com/turkcell/sensai/llm_factory/0.2.0
---
## Improvement Notes (TODO):
- Under the azure provider, the Azure class is imported from langchain; should this be aligned with the different framework used for openai above?
- Each provider should manage its own LLM dev framework (langchain, etc.) internally; we should stay out of it. These should be removed as dependencies from LLMFactory's pyproject.toml.
- For partial pip installation (e.g. pip install llm_factory[ollama]), the dependency imports should also be dynamic
(an example implementation follows below, but let's run a best-practice literature survey in parallel)
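
A rough, self-contained sketch of this dynamic-import idea (the `_require` helper, its error message, and `create_ollama_client` are illustrative assumptions, not the project's actual code; the extras themselves would be declared under `[project.optional-dependencies]` in pyproject.toml):
```python
import importlib

def _require(module_name, extra):
    """Import an optional provider dependency lazily and point the user at the
    matching pip extra if it is missing (hypothetical helper)."""
    try:
        return importlib.import_module(module_name)
    except ImportError as exc:
        raise ImportError(
            f"'{module_name}' is not installed. "
            f"Install it with: pip install llm_factory[{extra}]"
        ) from exc

# Example: resolve the ollama client only when the ollama provider is actually used.
def create_ollama_client(base_url):
    ollama = _require("ollama", "ollama")
    return ollama.Client(host=base_url)
```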
```python
...
```

@@ -160,20 +179,3 @@ testing = ["pytest", "coverage"]
## Build
- python -m pip install -U wheel setuptools build
- increment project_id in pyproject.toml
- archrose (for mac users)
- python -m build
## Upload
* artifactory urls:
* local-pypi-dist-dev/com/turkcell/sensai/llm_factory
* https://artifactory.turkcell.com.tr/ui/repos/tree/General/local-pypi-dist-dev/com/turkcell/sensai/llm_factory/
* Click Deploy
* ![deploy.png](static/deploy.png)
* Upload the new build files by selecting Multiple Deploy
* ![deploy3.png](static/deploy3.png)
* update the target path:
* com/turkcell/sensai/llm_factory/0.2.0

View File

@@ -2,7 +2,7 @@ import datetime
from abc import ABC, abstractmethod
from loguru import logger
from ollama._types import ChatResponse
from ollama._types import ChatResponse, GenerateResponse
from langchain_community.callbacks import get_openai_callback
@@ -47,30 +47,42 @@ class BaseQueryEngine(ABC):
return messages["parser"].invoke(input=response.content)
return response.content
def get_stream(self, response):
def get_stream(self, response, response_type):
streamed_content = ""
model = None
created_at = datetime.datetime.now().isoformat() # Fallback timestamp
created_at = datetime.datetime.now().isoformat()
for chunk in response:
if model is None:
# Capture model and timestamp from the first chunk
model = chunk.get("model", "unknown_model")
created_at = chunk.get("created_at", created_at)
# Extract content (if there is no "message" key, fall back to the chunk itself and look for a "response" key)
content = chunk\
.get("message", chunk)\
.get("response",chunk["message"])\
.get("content", "") # required for model: llama3.3:70b-instruct-q8_0w
streamed_content += content
print(content, end="", flush=True) # Print streamed content
chunk_data = chunk.__dict__ if hasattr(chunk, '__dict__') else chunk
# Return a complete ChatResponse object
answer = ChatResponse(
model=model,
created_at=created_at,
done=True,
done_reason="streaming_done",
message={"role": "assistant", "content": streamed_content} # Combined content
)
return answer
if model is None:
model = chunk_data.get("model", "unknown_model")
created_at = chunk_data.get("created_at", created_at)
# Extract response content
if 'response' in chunk_data:
content = chunk_data.get('response', '')
else:
message_data = chunk_data.get("message", chunk_data)
response_data = message_data.get("response", chunk_data.get("message", {}))
content = response_data.get("content", "")
streamed_content += content
print(content, end="", flush=True)
if response_type == "chat":
return ChatResponse(
model=model,
created_at=created_at,
done=True,
done_reason="streaming_done",
message={"role": "assistant", "content": streamed_content}
)
else:
return GenerateResponse(
model=model,
created_at=created_at,
done=True,
done_reason="streaming_done",
response=streamed_content
)
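
For orientation, a minimal self-contained sketch of the accumulate-and-wrap pattern that get_stream follows (the `collect` helper, the literal chunk dicts, and the "demo" model name are hypothetical; the `ChatResponse`/`GenerateResponse` constructor calls mirror the ones in this diff):
```python
import datetime
from ollama._types import ChatResponse, GenerateResponse

def collect(chunks, response_type="chat"):
    # Concatenate text whether a chunk carries it under "response" (generate API)
    # or under message["content"] (chat API), then wrap it as get_stream does.
    text = ""
    for chunk in chunks:
        if "response" in chunk:
            text += chunk.get("response", "")
        else:
            text += chunk.get("message", {}).get("content", "")
    common = dict(model="demo", created_at=datetime.datetime.now().isoformat(),
                  done=True, done_reason="streaming_done")
    if response_type == "chat":
        return ChatResponse(message={"role": "assistant", "content": text}, **common)
    return GenerateResponse(response=text, **common)

print(collect([{"message": {"content": "2+2=4"}}]).message.content)           # chat-style chunks
print(collect([{"response": "2+2=4"}], response_type="completion").response)  # generate-style chunks
```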

View File

@@ -191,7 +191,7 @@ class OllamaCompletionText(BaseQueryEngine):
)
if self.output_stream:
# return answer in stream continuously
answer = self.get_stream(response)
answer = self.get_stream(response, response_type="completion")
return answer, None
else:
# return answer once generated
@@ -201,6 +201,8 @@ class OllamaChatCompletionText(BaseQueryEngine):
def __init__(self, model, provider):
super().__init__(model, provider)
self.model_name = os.environ.get("OLLAMA_MODEL_NAME")
# TODO:
# - enforce hard coded num_ctx based on base_url
self.options = Options(use_mmap=False, num_ctx=int(os.environ["OLLAMA_NUM_CTX"]))
self.output_stream = ast.literal_eval(os.environ.get("OLLAMA_OUTPUT_STREAM", "False"))
self.output_format = os.environ.get("OLLAMA_FORMAT", None)
@@ -232,7 +234,7 @@ class OllamaChatCompletionText(BaseQueryEngine):
)
if self.output_stream:
# return answer in stream continuously
answer = self.get_stream(response)
answer = self.get_stream(response, response_type="chat")
return answer, None
else:
# return answer once generated
@@ -242,6 +244,7 @@ class OllamaStructuredCompletionText(BaseQueryEngine):
def __init__(self, model, provider):
super().__init__(model, provider)
self.options = Options(use_mmap=False, num_ctx=int(os.getenv("OLLAMA_NUM_CTX", 40000)))
self.output_stream = ast.literal_eval(os.environ.get("OLLAMA_OUTPUT_STREAM", "False"))
self.model_name = os.environ.get("OLLAMA_MODEL_NAME")
def get_sample_message_content(self):
@@ -251,25 +254,42 @@ class OllamaStructuredCompletionText(BaseQueryEngine):
"""
def construct_message_content(self, messages):
#return Message(role="user", content=messages["content"])
return messages
if len(messages) == 1:
return messages[0]["content"]
elif len(messages) > 1:
logger.warning("Non-conv query engine can't handle conversation with multiple messages. Truncating previous and using only latest message.")
return messages[-1]["content"]
def calculate_consumption(self, cb):
return {"total_cost": 0, "total_tokens": 0}
def ask(self, pydantic_model, messages) -> dict:
response = self.model.chat(
messages=messages,
message_content = self.construct_message_content(messages)
response = self.model.generate(
prompt=message_content,
model=self.model_name,
format=pydantic_model.model_json_schema(),
options=self.options
stream=self.output_stream,
options=self.options,
keep_alive=-1,
)
return pydantic_model.model_validate_json(response.message.content).__dict__
if self.output_stream:
# return answer in stream continuously
answer = self.get_stream(response, response_type="completion")
#return pydantic_model.model_validate_json(answer.response).__dict__, None
return answer, None
else:
# return answer once generated
#return pydantic_model.model_validate_json(response.message.content).__dict__, None
return response, None
class OllamaStructuredChatCompletionText(BaseQueryEngine):
def __init__(self, model, provider):
super().__init__(model, provider)
self.options = Options(use_mmap=False, num_ctx=int(os.getenv("OLLAMA_NUM_CTX", 40000)))
# TODO:
# - the OLLAMA_MODEL_NAME parameter should alternatively be updatable at runtime
self.output_stream = ast.literal_eval(os.environ.get("OLLAMA_OUTPUT_STREAM", "False"))
self.model_name = os.environ.get("OLLAMA_MODEL_NAME")
def get_sample_message_content(self):
@@ -289,10 +309,16 @@ class OllamaStructuredChatCompletionText(BaseQueryEngine):
messages=messages,
model=self.model_name,
format=pydantic_model.model_json_schema(),
stream=self.output_stream,
keep_alive=-1,
options=self.options
)
messages.append(response.message.__dict__)
return messages
if self.output_stream:
answer = self.get_stream(response, response_type="chat")
return answer, None
else:
return response, None
# VLLM
class VLLMChatCompletionText(BaseQueryEngine):

Binary file not shown.

View File

@@ -0,0 +1,115 @@
import json
import os
import pytest
from pydantic import BaseModel, Field
from src.llm_factory_tcell.provider import llms
class MathOperation(BaseModel):
result: float = Field(..., description="result of the calculation")
class TestOllamaQueryEngines:
@pytest.fixture
def setup_environment(self):
test_env = {
'OLLAMA_MODEL_NAME': 'llama3.2:3b-instruct-fp16',
'OLLAMA_BASE_URL': 'http://192.168.1.210:33740/',
'OLLAMA_INFERENCE_TIMEOUT': "120",
"OLLAMA_NUM_CTX": "1024",
"OLLAMA_OUTPUT_STREAM": "True"
}
os.environ.update(test_env)
@pytest.fixture(params=[
pytest.param(
{"conversational": True, "structured": False, "pydantic_model": None},
id="chat-unstructured"
),
pytest.param(
{"conversational": True, "structured": True, "pydantic_model": MathOperation},
id="chat-structured"
),
pytest.param(
{"conversational": False, "structured": False, "pydantic_model": None},
id="completion-unstructured"
),
pytest.param(
{"conversational": False, "structured": True, "pydantic_model": MathOperation},
id="completion-structured"
)
])
def query_engine_config(self, request):
return request.param
@pytest.fixture
def query_engine(self,setup_environment, query_engine_config):
llm_factory = llms.LLMFactory()
return llm_factory.create_model(
provider="ollama",
model_name=os.environ["OLLAMA_MODEL_NAME"],
conversational=query_engine_config["conversational"],
structured=query_engine_config["structured"],
)
def test_query_engine(self, query_engine, query_engine_config):
messages = [{"role": "user", "content": "What is 2+2?"}]
initial_messages = messages.copy()
ask_kwargs = {"messages": messages} # equivalent of ".ask(messages=messages)"
if query_engine_config["structured"]:
ask_kwargs["pydantic_model"] = query_engine_config["pydantic_model"]
response, consumption = query_engine.ask(**ask_kwargs)
# Verify for conversational response
if query_engine_config["conversational"]:
assert hasattr(response, 'message'), "Response should have 'message' attribute"
assert hasattr(response.message, 'content'), "Message should have 'content' attribute"
assert hasattr(response.message, 'role'), "Message should have 'role' attribute"
assert response.message.role == "assistant"
assert messages == initial_messages # Verify llm response is not appended to conversation during .ask()
messages.append(response.message.__dict__) # Verify llm response can be added to conversation
assert len(messages) == 2
assert isinstance(messages[-1], dict)
assert 'role' in messages[-1]
assert 'content' in messages[-1]
# Verify for conversational but structured response
if query_engine_config["structured"]:
try:
json_response = json.loads(response.message.content)
assert isinstance(json_response, dict), "structured response should be a dictionary"
structured_response = MathOperation(**json_response)
self.verify_common_assertions_for_structured_response(structured_response)
except json.JSONDecodeError:
assert False, "Response content should be valid JSON"
# Verify for completion response
else:
assert hasattr(response, 'response'), "Response should have 'response' attribute"
# Verify for completion but structured response
if query_engine_config["structured"]:
json_response = json.loads(response.response)
assert isinstance(json_response, dict), "structured response should be a dictionary"
structured_response = MathOperation(**json_response)
self.verify_common_assertions_for_structured_response(structured_response)
# Verify common response structure
assert response is not None, "Response should not be None"
assert consumption is not None or consumption is None, "Consumption should either be None or have a value"
if consumption is not None:
assert isinstance(consumption, dict)
def verify_common_assertions_for_structured_response(self, structured_response):
assert isinstance(structured_response, MathOperation)
assert hasattr(structured_response, 'result')
assert isinstance(structured_response.result, float)
class TestAzureQueryEngines:
pass