Merge branch 'main' of github.com:abetlen/llama_cpp_python into Maximilian-Winter/main

2023-09-07 17:34:22 +03:00 · 2023-05-08 19:57:09 -04:00
parent f315b82832 7499fc1cbb
commit 93a9019bb1
18 changed files with 1120 additions and 578 deletions
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -5,7 +5,7 @@ import time
 import math
 import multiprocessing
 from typing import List, Optional, Union, Generator, Sequence, Iterator, Deque, Tuple
-from collections import deque
+from collections import deque, OrderedDict

 from . import llama_cpp
 from .llama_types import *
@@ -14,46 +14,59 @@ from .llama_types import *
 class LlamaCache:
    """Cache for a llama.cpp model."""

-    def __init__(self):
-        self.cache_state: Dict[Tuple[llama_cpp.llama_token, ...], "LlamaState"] = dict()
+    def __init__(self, capacity_bytes: int = (2 << 30)):  # default: 2 << 30 bytes == 2 GiB
+        self.cache_state: OrderedDict[
+            Tuple[llama_cpp.llama_token, ...], "LlamaState"
+        ] = OrderedDict()
+        self.capacity_bytes = capacity_bytes

-    def _sorted_keys(self) -> List[Tuple[llama_cpp.llama_token, ...]]:
-        return [
-            key
-            for _, key in sorted(
-                ((len(key), key) for key in self.cache_state.keys()), reverse=True
-            )
-        ]
+    @property
+    def cache_size(self) -> int:  # sum of llama_state_size over all cached states
+        return sum(state.llama_state_size for state in self.cache_state.values())

-    def _find_key(
-        self, key: Tuple[llama_cpp.llama_token, ...]
+    def _find_longest_prefix_key(
+        self,
+        key: Tuple[llama_cpp.llama_token, ...],
    ) -> Optional[Tuple[llama_cpp.llama_token, ...]]:
-        for k in self._sorted_keys():
-            if key[: len(k)] == k:
-                return k
-        return None
+        best_len = 0  # a cached key must share at least one leading token to match
+        best_key = None
+        prefix_lens = (
+            (k, Llama.longest_token_prefix(k, key)) for k in self.cache_state.keys()
+        )
+        for k, prefix_len in prefix_lens:
+            if prefix_len > best_len:  # strict: on ties the earliest-inserted key wins
+                best_len = prefix_len
+                best_key = k
+        return best_key

    def __getitem__(self, key: Sequence[llama_cpp.llama_token]) -> "LlamaState":
-        _key = self._find_key(tuple(key))
+        key = tuple(key)
+        _key = self._find_longest_prefix_key(key)
        if _key is None:
-            raise KeyError(f"Key not found: {key}")
-        return self.cache_state[_key]
+            raise KeyError("Key not found")
+        value = self.cache_state[_key]
+        self.cache_state.move_to_end(_key)  # mark the entry as most recently used
+        return value

    def __contains__(self, key: Sequence[llama_cpp.llama_token]) -> bool:
-        return self._find_key(tuple(key)) is not None
+        return self._find_longest_prefix_key(tuple(key)) is not None

    def __setitem__(self, key: Sequence[llama_cpp.llama_token], value: "LlamaState"):
-        self.cache_state = dict()  # NOTE: Currently limit to one cache entry.
-        self.cache_state[tuple(key)] = value
+        key = tuple(key)
+        if key in self.cache_state:
+            del self.cache_state[key]  # re-insert so the entry moves to the newest end
+        self.cache_state[key] = value
+        while self.cache_state and self.cache_size > self.capacity_bytes:
+            self.cache_state.popitem(last=False)  # evict the least recently used entry


 class LlamaState:
    def __init__(
        self,
        eval_tokens: Deque[llama_cpp.llama_token],
-        eval_logits: Deque[List[llama_cpp.c_float]],
+        eval_logits: Deque[List[float]],
        llama_state,  # type: llama_cpp.Array[llama_cpp.c_uint8]
-        llama_state_size: llama_cpp.c_size_t,
+        llama_state_size: int,
    ):
        self.eval_tokens = eval_tokens
        self.eval_logits = eval_logits
@@ -127,9 +140,7 @@ class Llama:
        self.last_n_tokens_size = last_n_tokens_size
        self.n_batch = min(n_ctx, n_batch)
        self.eval_tokens: Deque[llama_cpp.llama_token] = deque(maxlen=n_ctx)
-        self.eval_logits: Deque[List[float]] = deque(
-            maxlen=n_ctx if logits_all else 1
-        )
+        self.eval_logits: Deque[List[float]] = deque(maxlen=n_ctx if logits_all else 1)

        self.cache: Optional[LlamaCache] = None

@@ -250,7 +261,7 @@ class Llama:
            ]
            self.eval_logits.extend(logits)

-    def _sample_top_p_top_k(
+    def _sample(
        self,
        last_n_tokens_data,  # type: llama_cpp.Array[llama_cpp.llama_token]
        last_n_tokens_size: llama_cpp.c_int,
@@ -263,6 +274,8 @@ class Llama:
        mirostat_mu: llama_cpp.c_float,
        mirostat_m: llama_cpp.c_int,
        repeat_penalty: llama_cpp.c_float,
+        frequency_penalty: llama_cpp.c_float,
+        presence_penalty: llama_cpp.c_float,
    ):
        assert self.ctx is not None
        assert len(self.eval_logits) > 0
@@ -289,24 +302,24 @@ class Llama:
            ctx=self.ctx,
            last_tokens_data=last_n_tokens_data,
            last_tokens_size=last_n_tokens_size,
-            candidates=llama_cpp.ctypes.pointer(candidates),
+            candidates=llama_cpp.ctypes.byref(candidates),  # type: ignore
            penalty=repeat_penalty,
        )
-        if mirostat_mode == 1:
+        if mirostat_mode.value == 1:
            llama_cpp.llama_sample_temperature(
                ctx=self.ctx,
-                candidates=llama_cpp.ctypes.pointer(candidates),
+                candidates=llama_cpp.ctypes.byref(candidates), # type: ignore
                temp=temp,
            )
            llama_cpp.llama_sample_token_mirostat(
                ctx=self.ctx,
-                candidates=llama_cpp.ctypes.pointer(candidates),
+                candidates=llama_cpp.ctypes.byref(candidates), # type: ignore
                tau=mirostat_tau,
                eta=mirostat_eta,
-                mu=mirostat_mu,
+                mu=llama_cpp.ctypes.byref(mirostat_mu), # type: ignore
                m=mirostat_m
            )
-        elif mirostat_mode == 2:
+        elif mirostat_mode.value == 2:
            llama_cpp.llama_sample_temperature(
                ctx=self.ctx,
                candidates=llama_cpp.ctypes.pointer(candidates),
@@ -314,45 +327,57 @@ class Llama:
            )
            llama_cpp.llama_sample_token_mirostat_v2(
                ctx=self.ctx,
-                candidates=llama_cpp.ctypes.pointer(candidates),
+                candidates=llama_cpp.ctypes.byref(candidates), # type: ignore
                tau=mirostat_tau,
                eta=mirostat_eta,
-                mu=mirostat_mu
+                mu=llama_cpp.ctypes.byref(mirostat_mu) # type: ignore
            )
-        elif float(temp.value) == 0.0:
+        llama_cpp.llama_sample_frequency_and_presence_penalties(
+            ctx=self.ctx,
+            candidates=llama_cpp.ctypes.byref(candidates),  # type: ignore
+            last_tokens_data=last_n_tokens_data,
+            last_tokens_size=last_n_tokens_size,
+            alpha_frequency=frequency_penalty,
+            alpha_presence=presence_penalty,
+        )
+        if float(temp.value) == 0.0:
            return llama_cpp.llama_sample_token_greedy(
                ctx=self.ctx,
-                candidates=llama_cpp.ctypes.pointer(candidates),
+                candidates=llama_cpp.ctypes.byref(candidates),  # type: ignore
            )
        else:
            llama_cpp.llama_sample_top_k(
                ctx=self.ctx,
-                candidates=llama_cpp.ctypes.pointer(candidates),
+                candidates=llama_cpp.ctypes.byref(candidates),  # type: ignore
                k=top_k,
+                min_keep=llama_cpp.c_size_t(1),
            )
            llama_cpp.llama_sample_tail_free(
                ctx=self.ctx,
-                candidates=llama_cpp.ctypes.pointer(candidates),
+                candidates=llama_cpp.ctypes.byref(candidates),  # type: ignore
                z=llama_cpp.c_float(1.0),
+                min_keep=llama_cpp.c_size_t(1),
            )
            llama_cpp.llama_sample_typical(
                ctx=self.ctx,
-                candidates=llama_cpp.ctypes.pointer(candidates),
+                candidates=llama_cpp.ctypes.byref(candidates),  # type: ignore
                p=llama_cpp.c_float(1.0),
+                min_keep=llama_cpp.c_size_t(1),
            )
            llama_cpp.llama_sample_top_p(
                ctx=self.ctx,
-                candidates=llama_cpp.ctypes.pointer(candidates),
+                candidates=llama_cpp.ctypes.byref(candidates),  # type: ignore
                p=top_p,
+                min_keep=llama_cpp.c_size_t(1),
            )
            llama_cpp.llama_sample_temperature(
                ctx=self.ctx,
-                candidates=llama_cpp.ctypes.pointer(candidates),
+                candidates=llama_cpp.ctypes.byref(candidates),  # type: ignore
                temp=temp,
            )
            return llama_cpp.llama_sample_token(
                ctx=self.ctx,
-                candidates=llama_cpp.ctypes.pointer(candidates),
+                candidates=llama_cpp.ctypes.byref(candidates),  # type: ignore
            )

    def sample(
@@ -366,6 +391,8 @@ class Llama:
        mirostat_mu: float,
        mirostat_m: int,
        repeat_penalty: float,
+        frequency_penalty: float = 0.0,
+        presence_penalty: float = 0.0,
    ):
        """Sample a token from the model.

@@ -382,7 +409,7 @@ class Llama:
        last_n_tokens_data = [llama_cpp.llama_token(0)] * max(
            0, self.last_n_tokens_size - len(self.eval_tokens)
        ) + list(self.eval_tokens)[-self.last_n_tokens_size :]
-        return self._sample_top_p_top_k(
+        return self._sample(
            last_n_tokens_data=(llama_cpp.llama_token * self.last_n_tokens_size)(
                *last_n_tokens_data
            ),
@@ -396,6 +423,8 @@ class Llama:
            mirostat_eta=llama_cpp.c_float(mirostat_eta),
            mirostat_m=llama_cpp.c_int(mirostat_m),
            repeat_penalty=llama_cpp.c_float(repeat_penalty),
+            frequency_penalty=llama_cpp.c_float(frequency_penalty),
+            presence_penalty=llama_cpp.c_float(presence_penalty),
        )

    def generate(
@@ -410,6 +439,8 @@ class Llama:
        mirostat_mu: float,
        mirostat_m: int,
        repeat_penalty: float,
+        frequency_penalty: float = 0.0,
+        presence_penalty: float = 0.0,
        reset: bool = True,
    ) -> Generator[
        llama_cpp.llama_token, Optional[Sequence[llama_cpp.llama_token]], None
@@ -468,6 +499,8 @@ class Llama:
                mirostat_eta=mirostat_eta,
                mirostat_mu=mirostat_mu,
                mirostat_m=mirostat_m,
+                frequency_penalty=frequency_penalty,
+                presence_penalty=presence_penalty,
                repeat_penalty=repeat_penalty,
            )
            tokens_or_none = yield token
@@ -547,6 +580,8 @@ class Llama:
        logprobs: Optional[int] = None,
        echo: bool = False,
        stop: Optional[List[str]] = [],
+        frequency_penalty: float = 0.0,
+        presence_penalty: float = 0.0,
        repeat_penalty: float = 1.1,
        top_k: int = 40,
        stream: bool = False,
@@ -581,10 +616,22 @@ class Llama:
                "logprobs is not supported for models created with logits_all=False"
            )

-        if self.cache and prompt_tokens in self.cache:
-            if self.verbose:
-                print("Llama._create_completion: cache hit", file=sys.stderr)
-            self.load_state(self.cache[prompt_tokens])
+        if self.cache:
+            try:
+                cache_item = self.cache[prompt_tokens]
+                cache_prefix_len = Llama.longest_token_prefix(
+                    cache_item.eval_tokens, prompt_tokens
+                )
+                eval_prefix_len = Llama.longest_token_prefix(
+                    self.eval_tokens, prompt_tokens
+                )
+                if cache_prefix_len > eval_prefix_len:
+                    self.load_state(cache_item)
+                    if self.verbose:
+                        print("Llama._create_completion: cache hit", file=sys.stderr)
+            except KeyError:
+                if self.verbose:
+                    print("Llama._create_completion: cache miss", file=sys.stderr)

        finish_reason = "length"
        multibyte_fix = 0
@@ -598,6 +645,8 @@ class Llama:
            mirostat_eta=mirostat_eta,
            mirostat_mu=mirostat_mu,
            mirostat_m=mirostat_m,
+            frequency_penalty=frequency_penalty,
+            presence_penalty=presence_penalty,
            repeat_penalty=repeat_penalty,
        ):
            if token == llama_cpp.llama_token_eos():
@@ -605,12 +654,6 @@ class Llama:
                finish_reason = "stop"
                break

-            if self.cache and len(completion_tokens) == 0:
-                if prompt_tokens not in self.cache:
-                    if self.verbose:
-                        print("Llama._create_completion: cache miss", file=sys.stderr)
-                    self.cache[prompt_tokens] = self.save_state()
-
            completion_tokens.append(token)

            all_text = self.detokenize(completion_tokens)
@@ -669,6 +712,11 @@ class Llama:
                finish_reason = "length"
                break

+        if self.cache:
+            if self.verbose:
+                print("Llama._create_completion: cache save", file=sys.stderr)
+            self.cache[prompt_tokens + completion_tokens] = self.save_state()
+
        if stream:
            yield {
                "id": completion_id,
@@ -778,6 +826,8 @@ class Llama:
        logprobs: Optional[int] = None,
        echo: bool = False,
        stop: Optional[List[str]] = [],
+        frequency_penalty: float = 0.0,
+        presence_penalty: float = 0.0,
        repeat_penalty: float = 1.1,
        top_k: int = 40,
        stream: bool = False,
@@ -818,6 +868,8 @@ class Llama:
            logprobs=logprobs,
            echo=echo,
            stop=stop,
+            frequency_penalty=frequency_penalty,
+            presence_penalty=presence_penalty,
            repeat_penalty=repeat_penalty,
            top_k=top_k,
            stream=stream,
@@ -843,6 +895,8 @@ class Llama:
        logprobs: Optional[int] = None,
        echo: bool = False,
        stop: Optional[List[str]] = [],
+        frequency_penalty: float = 0.0,
+        presence_penalty: float = 0.0,
        repeat_penalty: float = 1.1,
        top_k: int = 40,
        stream: bool = False,
@@ -883,6 +937,8 @@ class Llama:
            logprobs=logprobs,
            echo=echo,
            stop=stop,
+            frequency_penalty=frequency_penalty,
+            presence_penalty=presence_penalty,
            repeat_penalty=repeat_penalty,
            top_k=top_k,
            stream=stream,
@@ -955,6 +1011,8 @@ class Llama:
        stream: bool = False,
        stop: Optional[List[str]] = [],
        max_tokens: int = 256,
+        presence_penalty: float = 0.0,
+        frequency_penalty: float = 0.0,
        repeat_penalty: float = 1.1,
    ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
        """Generate a chat completion from a list of messages.
@@ -988,6 +1046,8 @@ class Llama:
            stream=stream,
            max_tokens=max_tokens,
            repeat_penalty=repeat_penalty,
+            presence_penalty=presence_penalty,
+            frequency_penalty=frequency_penalty,
        )
        if stream:
            chunks: Iterator[CompletionChunk] = completion_or_chunks  # type: ignore
@@ -1085,3 +1145,15 @@ class Llama:
        exps = [math.exp(float(x)) for x in logits]
        sum_exps = sum(exps)
        return [math.log(x / sum_exps) for x in exps]
+
+    @staticmethod
+    def longest_token_prefix(
+        a: Sequence[llama_cpp.llama_token], b: Sequence[llama_cpp.llama_token]
+    ):
+        """Return the number of leading tokens that `a` and `b` have in common."""
+        matched = 0
+        for tok_a, tok_b in zip(a, b):
+            if tok_a != tok_b:
+                break
+            matched += 1
+        return matched
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -157,7 +157,7 @@ _lib.llama_context_default_params.argtypes = []
 _lib.llama_context_default_params.restype = llama_context_params


-def llama_mmap_supported() -> c_bool:
+def llama_mmap_supported() -> bool:  # restype is c_bool, which ctypes returns as a Python bool
     return _lib.llama_mmap_supported()


@@ -165,7 +165,7 @@ _lib.llama_mmap_supported.argtypes = []
 _lib.llama_mmap_supported.restype = c_bool


-def llama_mlock_supported() -> c_bool:
+def llama_mlock_supported() -> bool:
    return _lib.llama_mlock_supported()


@@ -260,7 +260,7 @@ _lib.llama_get_state_size.restype = c_size_t
 # Returns the number of bytes copied
 def llama_copy_state_data(
    ctx: llama_context_p, dest  # type: Array[c_uint8]
-) -> c_size_t:
+) -> int:
    return _lib.llama_copy_state_data(ctx, dest)


@@ -272,7 +272,7 @@ _lib.llama_copy_state_data.restype = c_size_t
 # Returns the number of bytes read
 def llama_set_state_data(
    ctx: llama_context_p, src  # type: Array[c_uint8]
-) -> c_size_t:
+) -> int:
    return _lib.llama_set_state_data(ctx, src)


@@ -387,7 +387,9 @@ _lib.llama_n_embd.restype = c_int
 # Can be mutated in order to change the probabilities of the next token
 # Rows: n_tokens
 # Cols: n_vocab
-def llama_get_logits(ctx: llama_context_p):  # type: (...) -> Array[float] # type: ignore
+def llama_get_logits(
+    ctx: llama_context_p,
+):  # type: (...) -> Array[float] # type: ignore
    return _lib.llama_get_logits(ctx)


@@ -397,7 +399,9 @@ _lib.llama_get_logits.restype = c_float_p

 # Get the embeddings for the input
 # shape: [n_embd] (1-dimensional)
-def llama_get_embeddings(ctx: llama_context_p):  # type: (...) -> Array[float] # type: ignore
+def llama_get_embeddings(
+    ctx: llama_context_p,
+):  # type: (...) -> Array[float] # type: ignore
    return _lib.llama_get_embeddings(ctx)


@@ -515,7 +519,7 @@ def llama_sample_top_k(
    ctx: llama_context_p,
    candidates,  # type: _Pointer[llama_token_data_array]
    k: c_int,
-    min_keep: c_size_t = c_size_t(1),
+    min_keep: c_size_t,
 ):
    return _lib.llama_sample_top_k(ctx, candidates, k, min_keep)

@@ -534,7 +538,7 @@ def llama_sample_top_p(
    ctx: llama_context_p,
    candidates,  # type: _Pointer[llama_token_data_array]
    p: c_float,
-    min_keep: c_size_t = c_size_t(1),
+    min_keep: c_size_t,
 ):
    return _lib.llama_sample_top_p(ctx, candidates, p, min_keep)

@@ -553,7 +557,7 @@ def llama_sample_tail_free(
    ctx: llama_context_p,
    candidates,  # type: _Pointer[llama_token_data_array]
    z: c_float,
-    min_keep: c_size_t = c_size_t(1),
+    min_keep: c_size_t,
 ):
    return _lib.llama_sample_tail_free(ctx, candidates, z, min_keep)

@@ -572,7 +576,7 @@ def llama_sample_typical(
    ctx: llama_context_p,
    candidates,  # type: _Pointer[llama_token_data_array]
    p: c_float,
-    min_keep: c_size_t = c_size_t(1),
+    min_keep: c_size_t,
 ):
    return _lib.llama_sample_typical(ctx, candidates, p, min_keep)

--- a/llama_cpp/llama_types.py
+++ b/llama_cpp/llama_types.py
@@ -58,7 +58,7 @@ class Completion(TypedDict):


 class ChatCompletionMessage(TypedDict):
-    role: Union[Literal["assistant"], Literal["user"], Literal["system"]]
+    role: Literal["assistant", "user", "system"]
    content: str
    user: NotRequired[str]

--- a/llama_cpp/server/main.py
+++ b/llama_cpp/server/main.py
@@ -31,16 +31,18 @@ from llama_cpp.server.app import create_app, Settings
 if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    for name, field in Settings.__fields__.items():
+        description = field.field_info.description
+        if field.default is not None and description is not None:
+            description += f" (default: {field.default})"
        parser.add_argument(
            f"--{name}",
            dest=name,
            type=field.type_,
-            default=field.default,
-            help=field.field_info.description,
+            help=description,
        )

    args = parser.parse_args()
-    settings = Settings(**vars(args))
+    settings = Settings(**{k: v for k, v in vars(args).items() if v is not None})
    app = create_app(settings=settings)

    uvicorn.run(
--- a/llama_cpp/server/app.py
+++ b/llama_cpp/server/app.py
@@ -1,8 +1,8 @@
-import os
 import json
+import multiprocessing
 from threading import Lock
 from typing import List, Optional, Union, Iterator, Dict
-from typing_extensions import TypedDict, Literal, Annotated
+from typing_extensions import TypedDict, Literal

 import llama_cpp

@@ -13,18 +13,48 @@ from sse_starlette.sse import EventSourceResponse


 class Settings(BaseSettings):
-    model: str
-    n_ctx: int = 2048
-    n_batch: int = 512
-    n_threads: int = max((os.cpu_count() or 2) // 2, 1)
-    f16_kv: bool = True
-    use_mlock: bool = False  # This causes a silent failure on platforms that don't support mlock (e.g. Windows) took forever to figure out...
-    use_mmap: bool = True
-    embedding: bool = True
-    last_n_tokens_size: int = 64
-    logits_all: bool = False
-    cache: bool = False  # WARNING: This is an experimental feature
-    vocab_only: bool = False
+    model: str = Field(
+        description="The path to the model to use for generating completions."
+    )
+    n_ctx: int = Field(default=2048, ge=1, description="The context size.")
+    n_batch: int = Field(
+        default=512, ge=1, description="The batch size to use per eval."
+    )
+    n_threads: int = Field(
+        default=max(multiprocessing.cpu_count() // 2, 1),  # half the CPUs, at least one
+        ge=1,
+        description="The number of threads to use.",
+    )
+    f16_kv: bool = Field(default=True, description="Whether to use f16 key/value.")
+    use_mlock: bool = Field(
+        default=llama_cpp.llama_mlock_supported(),  # evaluated once, at class creation
+        description="Use mlock.",
+    )
+    use_mmap: bool = Field(
+        default=llama_cpp.llama_mmap_supported(),  # evaluated once, at class creation
+        description="Use mmap.",
+    )
+    embedding: bool = Field(default=True, description="Whether to use embeddings.")
+    last_n_tokens_size: int = Field(
+        default=64,
+        ge=0,
+        description="Last n tokens to keep for repeat penalty calculation.",
+    )
+    logits_all: bool = Field(default=True, description="Whether to return logits.")
+    cache: bool = Field(
+        default=False,
+        description="Use a cache to reduce processing times for evaluated prompts.",
+    )
+    cache_size: int = Field(
+        default=2 << 30, ge=0,  # 2 GiB; a negative capacity would break cache eviction
+        description="The size of the cache in bytes. Only used if cache is True.",
+    )
+    vocab_only: bool = Field(
+        default=False, description="Whether to only return the vocabulary."
+    )
+    verbose: bool = Field(
+        default=True, description="Whether to print debug information."
+    )


 router = APIRouter()
@@ -60,9 +90,10 @@ def create_app(settings: Optional[Settings] = None):
        n_ctx=settings.n_ctx,
        last_n_tokens_size=settings.last_n_tokens_size,
        vocab_only=settings.vocab_only,
+        verbose=settings.verbose,
    )
    if settings.cache:
-        cache = llama_cpp.LlamaCache()
+        cache = llama_cpp.LlamaCache(capacity_bytes=settings.cache_size)
        llama.set_cache(cache)
    return app

@@ -75,18 +106,78 @@ def get_llama():
        yield llama


+model_field = Field(description="The model to use for generating completions.")
+
+max_tokens_field = Field(
+    default=16, ge=1, le=2048, description="The maximum number of tokens to generate."
+)
+
+temperature_field = Field(
+    default=0.8,
+    ge=0.0,
+    le=2.0,
+    description="Adjust the randomness of the generated text.\n\n"
+    + "Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.",
+)
+
+top_p_field = Field(
+    default=0.95,
+    ge=0.0,
+    le=1.0,
+    description="Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P.\n\n"
+    + "Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top_p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text.",
+)
+
+stop_field = Field(
+    default=None,
+    description="A list of tokens at which to stop generation. If None, no stop tokens are used.",
+)
+
+stream_field = Field(
+    default=False,
+    description="Whether to stream the results as they are generated. Useful for chatbots.",
+)
+
+top_k_field = Field(
+    default=40,
+    ge=0,
+    description="Limit the next token selection to the K most probable tokens.\n\n"
+    + "Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top_k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text.",
+)
+
+repeat_penalty_field = Field(
+    default=1.1,
+    ge=0.0,
+    description="A penalty applied to each token that is already generated. This helps prevent the model from repeating itself.\n\n"
+    + "Repeat penalty is a hyperparameter used to penalize the repetition of token sequences during text generation. It helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient.",
+)
+
+
 class CreateCompletionRequest(BaseModel):
-    prompt: Union[str, List[str]]
-    suffix: Optional[str] = Field(None)
-    max_tokens: int = 16
-    temperature: float = 0.8
-    top_p: float = 0.95
-    echo: bool = False
-    stop: Optional[List[str]] = []
-    stream: bool = False
+    prompt: Optional[str] = Field(
+        default="", description="The prompt to generate completions for."
+    )
+    suffix: Optional[str] = Field(
+        default=None,
+        description="A suffix to append to the generated text. If None, no suffix is appended. Useful for chatbots.",
+    )
+    max_tokens: int = max_tokens_field
+    temperature: float = temperature_field
+    top_p: float = top_p_field
+    echo: bool = Field(
+        default=False,
+        description="Whether to echo the prompt in the generated text. Useful for chatbots.",
+    )
+    stop: Optional[List[str]] = stop_field
+    stream: bool = stream_field
+    logprobs: Optional[int] = Field(
+        default=None,
+        ge=0,
+        description="The number of logprobs to generate. If None, no logprobs are generated.",
+    )

    # ignored or currently unsupported
-    model: Optional[str] = Field(None)
+    model: Optional[str] = model_field
    n: Optional[int] = 1
    logprobs: Optional[int] = Field(None)
    presence_penalty: Optional[float] = 0
@@ -96,8 +187,8 @@ class CreateCompletionRequest(BaseModel):
    user: Optional[str] = Field(None)

    # llama.cpp specific parameters
-    top_k: int = 40
-    repeat_penalty: float = 1.1
+    top_k: int = top_k_field
+    repeat_penalty: float = repeat_penalty_field

    class Config:
        schema_extra = {
@@ -118,16 +209,11 @@ CreateCompletionResponse = create_model_from_typeddict(llama_cpp.Completion)
 def create_completion(
    request: CreateCompletionRequest, llama: llama_cpp.Llama = Depends(get_llama)
 ):
-    if isinstance(request.prompt, list):
-        request.prompt = "".join(request.prompt)
-
    completion_or_chunks = llama(
        **request.dict(
            exclude={
                "model",
                "n",
-                "frequency_penalty",
-                "presence_penalty",
                "best_of",
                "logit_bias",
                "user",
@@ -142,8 +228,8 @@ def create_completion(


 class CreateEmbeddingRequest(BaseModel):
-    model: Optional[str]
-    input: str
+    model: Optional[str] = model_field
+    input: str = Field(description="The input to embed.")
    user: Optional[str]

    class Config:
@@ -168,22 +254,24 @@ def create_embedding(


 class ChatCompletionRequestMessage(BaseModel):
-    role: Union[Literal["system"], Literal["user"], Literal["assistant"]]
-    content: str
-    user: Optional[str] = None
+    role: Literal["system", "user", "assistant"] = Field(
+        default="user", description="The role of the message."
+    )
+    content: str = Field(default="", description="The content of the message.")


 class CreateChatCompletionRequest(BaseModel):
-    model: Optional[str]
-    messages: List[ChatCompletionRequestMessage]
-    temperature: float = 0.8
-    top_p: float = 0.95
-    stream: bool = False
-    stop: Optional[List[str]] = []
-    max_tokens: int = 128
+    messages: List[ChatCompletionRequestMessage] = Field(
+        default=[], description="A list of messages to generate completions for."
+    )
+    max_tokens: int = max_tokens_field
+    temperature: float = temperature_field
+    top_p: float = top_p_field
+    stop: Optional[List[str]] = stop_field
+    stream: bool = stream_field

    # ignored or currently unsupported
-    model: Optional[str] = Field(None)
+    model: Optional[str] = model_field
    n: Optional[int] = 1
    presence_penalty: Optional[float] = 0
    frequency_penalty: Optional[float] = 0
@@ -191,7 +279,8 @@ class CreateChatCompletionRequest(BaseModel):
    user: Optional[str] = Field(None)

    # llama.cpp specific parameters
-    repeat_penalty: float = 1.1
+    top_k: int = top_k_field
+    repeat_penalty: float = repeat_penalty_field

    class Config:
        schema_extra = {
@@ -224,8 +313,6 @@ def create_chat_completion(
            exclude={
                "model",
                "n",
-                "presence_penalty",
-                "frequency_penalty",
                "logit_bias",
                "user",
            }
@@ -266,7 +353,9 @@ GetModelResponse = create_model_from_typeddict(ModelList)


@router.get("/v1/models", response_model=GetModelResponse)
-def get_models() -> ModelList:
+def get_models(
+    llama: llama_cpp.Llama = Depends(get_llama),
+) -> ModelList:
    return {
        "object": "list",
        "data": [