Compare commits
5 Commits
9f211dfb6c ... 28cf74c738
| Author | SHA1 | Date |
|---|---|---|
| | 28cf74c738 | |
| | 6cc73ac439 | |
| | 4b74300f3c | |
| | fa3939786f | |
| | a853217a3b | |
40  README.md
@@ -1,12 +1,31 @@
## Notes to Developer

- If the current 0.X.Y version of the llm factory is working fine and no change is needed, let's not bump the version unless there is a requirement such as security, so that we avoid breaking changes.
  - e.g. (tcell_llm_factory==0.2.10)
- When implementing a Query Engine, return both "response" and "consumption", regardless of cloud or on-prem. For on-prem providers, "consumption" is returned as None (rather than not being returned at all). See the sketch below.
- For async runtimes, the LLMFactory & AsyncLLMFactory classes are used separately, as needed. If a QEngine is missing on the async side, the relevant developer can implement it.
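
A minimal sketch of the intended (response, consumption) contract, using a hypothetical on-prem engine (illustrative names only, not the repository's actual classes):

```python
class ExampleOnPremQueryEngine:
    """Hypothetical engine showing the (response, consumption) return contract."""

    def ask(self, messages):
        # A provider-specific call would go here; a canned reply keeps the sketch self-contained.
        response = {"role": "assistant", "content": f"echo: {messages[-1]['content']}"}
        consumption = None  # on-prem: return None explicitly instead of omitting consumption
        return response, consumption


response, consumption = ExampleOnPremQueryEngine().ask([{"role": "user", "content": "hi"}])
```
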
## Build
- python -m pip install -U wheel setuptools build
- increment project_id in pyproject.toml
- archrose (for mac users)
- python -m build

## Upload

* artifactory urls:
  * local-pypi-dist-dev/com/turkcell/sensai/llm_factory
  * https://artifactory.turkcell.com.tr/ui/repos/tree/General/local-pypi-dist-dev/com/turkcell/sensai/llm_factory/
* Click Deploy

  

* Select Multiple Deploy and upload the new build files

  

* Update the target path:
  * com/turkcell/sensai/llm_factory/0.2.0

---
## Improvement Notes (TODO):

- Under the azure provider, the azure class is imported from langchain; should the different frameworks used there and for openai above be unified?
- Each provider should manage its own LLM dev framework (langchain, etc.) internally; we should not get involved. Such frameworks should be removed as dependencies from LLMFactory's pyproject.toml.
- For partial pip installation (e.g. pip install llm_factory[ollama]), dependency imports should also be dynamic.

  (An example implementation is below; see also the sketch after it. In parallel, let's do a literature survey on best practices.)

```python
@@ -160,20 +179,3 @@ testing = ["pytest", "coverage"]
```
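
Complementing the partial-installation note above, a minimal sketch of lazy, extras-aware provider imports (hypothetical helper and module path, independent of the example implementation referenced above):

```python
import importlib


def load_provider(name: str):
    """Import a provider module only when it is requested, so optional extras stay optional."""
    try:
        # Module path is an assumption for illustration.
        return importlib.import_module(f"llm_factory_tcell.provider.{name}")
    except ModuleNotFoundError as err:
        raise ImportError(
            f"Provider '{name}' needs an optional dependency; "
            f"install it with: pip install llm_factory[{name}]"
        ) from err
```
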


## Build

- python -m pip install -U wheel setuptools build
- increment project_id in pyproject.toml
- archrose (for mac users)
- python -m build

## Upload

* artifactory urls:
  * local-pypi-dist-dev/com/turkcell/sensai/llm_factory
  * https://artifactory.turkcell.com.tr/ui/repos/tree/General/local-pypi-dist-dev/com/turkcell/sensai/llm_factory/
* Click Deploy

  

* Select Multiple Deploy and upload the new build files

  

* Update the target path:
  * com/turkcell/sensai/llm_factory/0.2.0

@@ -2,7 +2,7 @@ import datetime
 from abc import ABC, abstractmethod
 
 from loguru import logger
-from ollama._types import ChatResponse
+from ollama._types import ChatResponse, GenerateResponse
 from langchain_community.callbacks import get_openai_callback
 
 
@@ -47,30 +47,42 @@ class BaseQueryEngine(ABC):
             return messages["parser"].invoke(input=response.content)
         return response.content
 
-    def get_stream(self, response):
+    def get_stream(self, response, response_type):
         streamed_content = ""
         model = None
-        created_at = datetime.datetime.now().isoformat()  # Fallback timestamp
+        created_at = datetime.datetime.now().isoformat()
 
         for chunk in response:
-            if model is None:
-                # Capture model and timestamp from the first chunk
-                model = chunk.get("model", "unknown_model")
-                created_at = chunk.get("created_at", created_at)
-            # Extract content (if there is no "message" key, fall back to the chunk and look for the "response" key)
-            content = chunk\
-                .get("message", chunk)\
-                .get("response", chunk["message"])\
-                .get("content", "")  # required for model: llama3.3:70b-instruct-q8_0w
-            streamed_content += content
-            print(content, end="", flush=True)  # Print streamed content
-
-        # Return a complete ChatResponse object
-        answer = ChatResponse(
-            model=model,
-            created_at=created_at,
-            done=True,
-            done_reason="streaming_done",
-            message={"role": "assistant", "content": streamed_content}  # Combined content
-        )
-        return answer
+            chunk_data = chunk.__dict__ if hasattr(chunk, '__dict__') else chunk
+
+            if model is None:
+                model = chunk_data.get("model", "unknown_model")
+                created_at = chunk_data.get("created_at", created_at)
+
+            # Extract response content
+            if 'response' in chunk_data:
+                content = chunk_data.get('response', '')
+            else:
+                message_data = chunk_data.get("message", chunk_data)
+                response_data = message_data.get("response", chunk_data.get("message", {}))
+                content = response_data.get("content", "")
+
+            streamed_content += content
+            print(content, end="", flush=True)
+
+        if response_type == "chat":
+            return ChatResponse(
+                model=model,
+                created_at=created_at,
+                done=True,
+                done_reason="streaming_done",
+                message={"role": "assistant", "content": streamed_content}
+            )
+        else:
+            return GenerateResponse(
+                model=model,
+                created_at=created_at,
+                done=True,
+                done_reason="streaming_done",
+                response=streamed_content
+            )
@@ -191,7 +191,7 @@ class OllamaCompletionText(BaseQueryEngine):
         )
         if self.output_stream:
             # return answer in stream continuously
-            answer = self.get_stream(response)
+            answer = self.get_stream(response, response_type="completion")
             return answer, None
         else:
             # return answer once generated
@@ -201,6 +201,8 @@ class OllamaChatCompletionText(BaseQueryEngine):
     def __init__(self, model, provider):
         super().__init__(model, provider)
         self.model_name = os.environ.get("OLLAMA_MODEL_NAME")
+        # TODO:
+        # - enforce hard coded num_ctx based on base_url
         self.options = Options(use_mmap=False, num_ctx=int(os.environ["OLLAMA_NUM_CTX"]))
         self.output_stream = ast.literal_eval(os.environ.get("OLLAMA_OUTPUT_STREAM", "False"))
         self.output_format = os.environ.get("OLLAMA_FORMAT", None)
@@ -232,7 +234,7 @@ class OllamaChatCompletionText(BaseQueryEngine):
         )
         if self.output_stream:
             # return answer in stream continuously
-            answer = self.get_stream(response)
+            answer = self.get_stream(response, response_type="chat")
             return answer, None
         else:
             # return answer once generated
@@ -242,6 +244,7 @@ class OllamaStructuredCompletionText(BaseQueryEngine):
     def __init__(self, model, provider):
         super().__init__(model, provider)
         self.options = Options(use_mmap=False, num_ctx=int(os.getenv("OLLAMA_NUM_CTX", 40000)))
+        self.output_stream = ast.literal_eval(os.environ.get("OLLAMA_OUTPUT_STREAM", "False"))
         self.model_name = os.environ.get("OLLAMA_MODEL_NAME")
 
     def get_sample_message_content(self):
@@ -251,25 +254,42 @@ class OllamaStructuredCompletionText(BaseQueryEngine):
         """
 
     def construct_message_content(self, messages):
-        #return Message(role="user", content=messages["content"])
-        return messages
+        if len(messages) == 1:
+            return messages[0]["content"]
+        elif len(messages) > 1:
+            logger.warning("Non-conv query engine can't handle conversation with multiple messages. Truncating previous and using only latest message.")
+            return messages[-1]["content"]
 
     def calculate_consumption(self, cb):
         return {"total_cost": 0, "total_tokens": 0}
 
     def ask(self, pydantic_model, messages) -> dict:
-        response = self.model.chat(
-            messages=messages,
+        message_content = self.construct_message_content(messages)
+        response = self.model.generate(
+            prompt=message_content,
             model=self.model_name,
             format=pydantic_model.model_json_schema(),
-            options=self.options
+            stream=self.output_stream,
+            options=self.options,
+            keep_alive=-1,
         )
-        return pydantic_model.model_validate_json(response.message.content).__dict__
+        if self.output_stream:
+            # return answer in stream continuously
+            answer = self.get_stream(response, response_type="completion")
+            #return pydantic_model.model_validate_json(answer.response).__dict__, None
+            return answer, None
+        else:
+            # return answer once generated
+            #return pydantic_model.model_validate_json(response.message.content).__dict__, None
+            return response, None
 
 
 class OllamaStructuredChatCompletionText(BaseQueryEngine):
     def __init__(self, model, provider):
         super().__init__(model, provider)
         self.options = Options(use_mmap=False, num_ctx=int(os.getenv("OLLAMA_NUM_CTX", 40000)))
+        # TODO:
+        # - the OLLAMA_MODEL_NAME parameter should alternatively be updatable at runtime
+        self.output_stream = ast.literal_eval(os.environ.get("OLLAMA_OUTPUT_STREAM", "False"))
        self.model_name = os.environ.get("OLLAMA_MODEL_NAME")
 
     def get_sample_message_content(self):
@@ -289,10 +309,16 @@ class OllamaStructuredChatCompletionText(BaseQueryEngine):
             messages=messages,
             model=self.model_name,
             format=pydantic_model.model_json_schema(),
+            stream=self.output_stream,
+            keep_alive=-1,
             options=self.options
         )
-        messages.append(response.message.__dict__)
-        return messages
+        if self.output_stream:
+            answer = self.get_stream(response, response_type="chat")
+            return answer, None
+        else:
+            return response, None
+
 
 # VLLM
 class VLLMChatCompletionText(BaseQueryEngine):
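
For orientation, a minimal usage sketch of the reworked streaming path (illustrative only; it mirrors the integration test below and assumes the OLLAMA_* environment variables are already set as in the test fixture):

```python
import os

from ollama._types import ChatResponse

from src.llm_factory_tcell.provider import llms

os.environ["OLLAMA_OUTPUT_STREAM"] = "True"  # ask() then returns the object assembled by get_stream()

engine = llms.LLMFactory().create_model(
    provider="ollama",
    model_name=os.environ["OLLAMA_MODEL_NAME"],
    conversational=True,   # chat engines yield ChatResponse; completion engines yield GenerateResponse
    structured=False,
)
response, consumption = engine.ask(messages=[{"role": "user", "content": "What is 2+2?"}])

# Pull the text out of whichever response type came back; consumption is None for this on-prem provider.
text = response.message.content if isinstance(response, ChatResponse) else response.response
print(text, consumption)
```
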
BIN  src/llm_factory_tcell/tests/.coverage  (new file, binary content not shown)
115  src/llm_factory_tcell/tests/test_integration.py  (new file)
@@ -0,0 +1,115 @@
import json
import os

import pytest
from pydantic import BaseModel, Field

from src.llm_factory_tcell.provider import llms


class MathOperation(BaseModel):
    result: float = Field(..., description="result of the calculation")


class TestOllamaQueryEngines:
    @pytest.fixture
    def setup_environment(self):
        test_env = {
            'OLLAMA_MODEL_NAME': 'llama3.2:3b-instruct-fp16',
            'OLLAMA_BASE_URL': 'http://192.168.1.210:33740/',
            'OLLAMA_INFERENCE_TIMEOUT': "120",
            "OLLAMA_NUM_CTX": "1024",
            "OLLAMA_OUTPUT_STREAM": "True"
        }
        os.environ.update(test_env)

    @pytest.fixture(params=[
        pytest.param(
            {"conversational": True, "structured": False, "pydantic_model": None},
            id="chat-unstructured"
        ),
        pytest.param(
            {"conversational": True, "structured": True, "pydantic_model": MathOperation},
            id="chat-structured"
        ),
        pytest.param(
            {"conversational": False, "structured": False, "pydantic_model": None},
            id="completion-unstructured"
        ),
        pytest.param(
            {"conversational": False, "structured": True, "pydantic_model": MathOperation},
            id="completion-structured"
        )
    ])
    def query_engine_config(self, request):
        return request.param

    @pytest.fixture
    def query_engine(self, setup_environment, query_engine_config):
        llm_factory = llms.LLMFactory()
        return llm_factory.create_model(
            provider="ollama",
            model_name=os.environ["OLLAMA_MODEL_NAME"],
            conversational=query_engine_config["conversational"],
            structured=query_engine_config["structured"],
        )

    def test_query_engine(self, query_engine, query_engine_config):

        messages = [{"role": "user", "content": "What is 2+2?"}]
        initial_messages = messages.copy()
        ask_kwargs = {"messages": messages}  # equivalent of ".ask(messages=messages)"
        if query_engine_config["structured"]:
            ask_kwargs["pydantic_model"] = query_engine_config["pydantic_model"]

        response, consumption = query_engine.ask(**ask_kwargs)

        # Verify for conversational response
        if query_engine_config["conversational"]:
            assert hasattr(response, 'message'), "Response should have 'message' attribute"
            assert hasattr(response.message, 'content'), "Message should have 'content' attribute"
            assert hasattr(response.message, 'role'), "Message should have 'role' attribute"
            assert response.message.role == "assistant"
            assert messages == initial_messages  # Verify llm response is not appended to conversation during .ask()
            messages.append(response.message.__dict__)  # Verify llm response can be added to conversation
            assert len(messages) == 2
            assert isinstance(messages[-1], dict)
            assert 'role' in messages[-1]
            assert 'content' in messages[-1]

            # Verify for conversational but structured response
            if query_engine_config["structured"]:
                try:
                    json_response = json.loads(response.message.content)
                    assert isinstance(json_response, dict), "structured response should be a dictionary"
                    structured_response = MathOperation(**json_response)
                    self.verify_common_assertions_for_structured_response(structured_response)
                except json.JSONDecodeError:
                    assert False, "Response content should be valid JSON"

        # Verify for completion response
        else:
            assert hasattr(response, 'response'), "Response should have 'response' attribute"

            # Verify for completion but structured response
            if query_engine_config["structured"]:
                json_response = json.loads(response.response)
                assert isinstance(json_response, dict), "structured response should be a dictionary"
                structured_response = MathOperation(**json_response)
                self.verify_common_assertions_for_structured_response(structured_response)

        # Verify common response structure
        assert response is not None, "Response should not be None"
        assert consumption is not None or consumption is None, "Consumption should either be None or have a value"
        if consumption is not None:
            assert isinstance(consumption, dict)


    def verify_common_assertions_for_structured_response(self, structured_response):
        assert isinstance(structured_response, MathOperation)
        assert hasattr(structured_response, 'result')
        assert isinstance(structured_response.result, float)


class TestAzureQueryEngines:
    pass