diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index be98bee..b89bda5 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -586,6 +586,21 @@ class Llama:
         max_tokens: int = 128,
         repeat_penalty: float = 1.1,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
+        """Generate a chat completion from a list of messages.
+
+        Args:
+            messages: A list of messages to generate a response for.
+            temperature: The temperature to use for sampling.
+            top_p: The top-p value to use for sampling.
+            top_k: The top-k value to use for sampling.
+            stream: Whether to stream the results.
+            stop: A list of strings to stop generation when encountered.
+            max_tokens: The maximum number of tokens to generate.
+            repeat_penalty: The penalty to apply to repeated tokens.
+
+        Returns:
+            Generated chat completion or a stream of chat completion chunks.
+        """
         instructions = """Complete the following chat conversation between the user and the assistant. System messages should be strictly followed as additional instructions."""
         chat_history = "\n".join(
             f'{message["role"]} {message.get("user", "")}: {message["content"]}'
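
The docstring above documents the parameters of create_chat_completion. A minimal usage sketch of that call is shown below; the model path and parameter values are illustrative assumptions, not part of this diff.

# Minimal sketch of calling create_chat_completion, assuming a local model
# file at ./model.bin (hypothetical path) and the messages format used by
# the chat_history construction in this method (dicts with "role"/"content").
from llama_cpp import Llama

llm = Llama(model_path="./model.bin")

response = llm.create_chat_completion(
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Name the planets in the solar system."},
    ],
    temperature=0.8,
    max_tokens=128,
    stream=False,
)

# With stream=False the return value is a ChatCompletion dict; the generated
# text sits under choices[0]["message"]["content"].
print(response["choices"][0]["message"]["content"])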