Mirror of https://github.com/abetlen/llama-cpp-python.git (synced 2023-09-07 17:34:22 +03:00)
Add experimental cache
@@ -35,6 +35,7 @@ class Settings(BaseSettings):
     embedding: bool = True
     last_n_tokens_size: int = 64
     logits_all: bool = False
+    cache: bool = False  # WARNING: This is an experimental feature


 app = FastAPI(
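To make the new setting concrete, the following standalone sketch (not part of the commit) shows how a pydantic BaseSettings field like `cache` is typically toggled. It assumes pydantic v1 and the default settings behavior with no `env_prefix`, so the field is read from a `CACHE` environment variable; the trimmed field list simply mirrors the hunk above.

import os

# Hypothetical invocation: normally the variable would be exported in the
# shell before starting the server (e.g. `CACHE=true python -m ...`).
os.environ["CACHE"] = "true"

from pydantic import BaseSettings  # pydantic v1 style import


class Settings(BaseSettings):
    # Trimmed to the fields visible in the hunk above.
    embedding: bool = True
    last_n_tokens_size: int = 64
    logits_all: bool = False
    cache: bool = False  # WARNING: This is an experimental feature


settings = Settings()
print(settings.cache)  # True: pydantic parses the CACHE environment variable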
@@ -60,6 +61,9 @@ llama = llama_cpp.Llama(
     n_ctx=settings.n_ctx,
     last_n_tokens_size=settings.last_n_tokens_size,
 )
+if settings.cache:
+    cache = llama_cpp.LlamaCache()
+    llama.set_cache(cache)
 llama_lock = Lock()


@@ -68,7 +72,6 @@ def get_llama():
         yield llama
-


 class CreateCompletionRequest(BaseModel):
     prompt: Union[str, List[str]]
     suffix: Optional[str] = Field(None)
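For comparison, the same experimental cache can be enabled directly on a Llama instance outside the server. The sketch below is illustrative only: the model path, n_ctx, and last_n_tokens_size values are placeholders, and only LlamaCache() and set_cache() are taken from the diff itself.

import llama_cpp

# Placeholder model path and parameters; only LlamaCache()/set_cache() come
# from the commit above, everything else is illustrative.
llama = llama_cpp.Llama(
    model_path="./models/ggml-model.bin",
    n_ctx=2048,
    last_n_tokens_size=64,
)

# Mirror of the three lines added to the server: construct the experimental
# cache and attach it to the model instance.
cache = llama_cpp.LlamaCache()
llama.set_cache(cache)

The cache is meant to let a completion whose prompt shares a prefix with an earlier request reuse the saved model state instead of re-evaluating those tokens, which is why the server exposes it as an opt-in flag marked experimental.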