Mirror of https://github.com/abetlen/llama-cpp-python.git (synced 2023-09-07 17:34:22 +03:00)
Add experimental cache
@@ -35,6 +35,7 @@ class Settings(BaseSettings):
     embedding: bool = True
     last_n_tokens_size: int = 64
     logits_all: bool = False
+    cache: bool = False  # WARNING: This is an experimental feature


 app = FastAPI(
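To make the new setting concrete, the following standalone sketch (not part of the commit) shows how a pydantic BaseSettings field like `cache` is typically toggled. It assumes pydantic v1 and the default settings behavior with no `env_prefix`, so the field is read from a `CACHE` environment variable; the trimmed field list simply mirrors the hunk above.

import os

# Hypothetical invocation: normally the variable would be exported in the
# shell before starting the server (e.g. `CACHE=true python -m ...`).
os.environ["CACHE"] = "true"

from pydantic import BaseSettings  # pydantic v1 style import


class Settings(BaseSettings):
    # Trimmed to the fields visible in the hunk above.
    embedding: bool = True
    last_n_tokens_size: int = 64
    logits_all: bool = False
    cache: bool = False  # WARNING: This is an experimental feature


settings = Settings()
print(settings.cache)  # True: pydantic parses the CACHE environment variable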
@@ -60,6 +61,9 @@ llama = llama_cpp.Llama(
     n_ctx=settings.n_ctx,
     last_n_tokens_size=settings.last_n_tokens_size,
 )
+if settings.cache:
+    cache = llama_cpp.LlamaCache()
+    llama.set_cache(cache)
 llama_lock = Lock()


@@ -68,7 +72,6 @@ def get_llama():
         yield llama
-


 class CreateCompletionRequest(BaseModel):
     prompt: Union[str, List[str]]
     suffix: Optional[str] = Field(None)
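For comparison, the same experimental cache can be enabled directly on a Llama instance outside the server. The sketch below is illustrative only: the model path, n_ctx, and last_n_tokens_size values are placeholders, and only LlamaCache() and set_cache() are taken from the diff itself.

import llama_cpp

# Placeholder model path and parameters; only LlamaCache()/set_cache() come
# from the commit above, everything else is illustrative.
llama = llama_cpp.Llama(
    model_path="./models/ggml-model.bin",
    n_ctx=2048,
    last_n_tokens_size=64,
)

# Mirror of the three lines added to the server: construct the experimental
# cache and attach it to the model instance.
cache = llama_cpp.LlamaCache()
llama.set_cache(cache)

The cache is meant to let a completion whose prompt shares a prefix with an earlier request reuse the saved model state instead of re-evaluating those tokens, which is why the server exposes it as an opt-in flag marked experimental.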