Merge branch 'main' of https://github.com/abetlen/llama-cpp-python

2023-09-07 17:34:22 +03:00 · 2023-04-05 14:18:27 +02:00
parent c862e8bac5 6de2f24aca
commit e4c6f34d95
19 changed files with 6212 additions and 123 deletions
--- a/examples/fastapi_server.py
+++ b/examples/fastapi_server.py
@@ -1,97 +0,0 @@
-"""Example FastAPI server for llama.cpp.
-"""
-import json
-from typing import List, Optional, Iterator
-
-import llama_cpp
-
-from fastapi import FastAPI
-from fastapi.middleware.cors import CORSMiddleware
-from pydantic import BaseModel, BaseSettings, Field, create_model_from_typeddict
-from sse_starlette.sse import EventSourceResponse
-
-
-class Settings(BaseSettings):
-    model: str
-
-
-app = FastAPI(
-    title="🦙 llama.cpp Python API",
-    version="0.0.1",
-)
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
-settings = Settings()
-llama = llama_cpp.Llama(
-    settings.model,
-    f16_kv=True,
-    use_mlock=True,
-    embedding=True,
-    n_threads=6,
-    n_batch=2048,
-)
-
-
-class CreateCompletionRequest(BaseModel):
-    prompt: str
-    suffix: Optional[str] = Field(None)
-    max_tokens: int = 16
-    temperature: float = 0.8
-    top_p: float = 0.95
-    logprobs: Optional[int] = Field(None)
-    echo: bool = False
-    stop: List[str] = []
-    repeat_penalty: float = 1.1
-    top_k: int = 40
-    stream: bool = False
-
-    class Config:
-        schema_extra = {
-            "example": {
-                "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n",
-                "stop": ["\n", "###"],
-            }
-        }
-
-
-CreateCompletionResponse = create_model_from_typeddict(llama_cpp.Completion)
-
-
-@app.post(
-    "/v1/completions",
-    response_model=CreateCompletionResponse,
-)
-def create_completion(request: CreateCompletionRequest):
-    if request.stream:
-        chunks: Iterator[llama_cpp.CompletionChunk] = llama(**request.dict())  # type: ignore
-        return EventSourceResponse(dict(data=json.dumps(chunk)) for chunk in chunks)
-    return llama(**request.dict())
-
-
-class CreateEmbeddingRequest(BaseModel):
-    model: Optional[str]
-    input: str
-    user: Optional[str]
-
-    class Config:
-        schema_extra = {
-            "example": {
-                "input": "The food was delicious and the waiter...",
-            }
-        }
-
-
-CreateEmbeddingResponse = create_model_from_typeddict(llama_cpp.Embedding)
-
-
-@app.post(
-    "/v1/embeddings",
-    response_model=CreateEmbeddingResponse,
-)
-def create_embedding(request: CreateEmbeddingRequest):
-    return llama.create_embedding(request.input)
--- a/examples/high_level_api/fastapi_server.py
+++ b/examples/high_level_api/fastapi_server.py
@@ -0,0 +1,181 @@
+"""Example FastAPI server for llama.cpp.
+
+To run this example:
+
+```bash
+pip install fastapi uvicorn sse-starlette
+export MODEL=../models/7B/...
+uvicorn fastapi_server:app --reload
+```
+
+Then visit http://localhost:8000/docs to see the interactive API docs.
+
+"""
+import os
+import json
+from typing import List, Optional, Literal, Union, Iterator
+
+import llama_cpp
+
+from fastapi import FastAPI
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel, BaseSettings, Field, create_model_from_typeddict
+from sse_starlette.sse import EventSourceResponse
+
+
+class Settings(BaseSettings):  # server configuration; the module docstring supplies `model` via `export MODEL=...`
+    model: str  # required (no default); passed as the first positional argument to llama_cpp.Llama
+    n_ctx: int = 2048
+    n_batch: int = 2048
+    n_threads: int = os.cpu_count() or 1  # falls back to 1 when os.cpu_count() returns None
+    f16_kv: bool = True
+    use_mlock: bool = True
+    embedding: bool = True
+    last_n_tokens_size: int = 64
+
+
+app = FastAPI(
+    title="🦙 llama.cpp Python API",
+    version="0.0.1",
+)
+app.add_middleware(  # CORS policy below allows every origin, method and header
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+settings = Settings()  # built once, at import time
+llama = llama_cpp.Llama(  # one module-level instance, used by all three endpoints in this file
+    settings.model,
+    f16_kv=settings.f16_kv,
+    use_mlock=settings.use_mlock,
+    embedding=settings.embedding,
+    n_threads=settings.n_threads,
+    n_batch=settings.n_batch,
+    n_ctx=settings.n_ctx,
+    last_n_tokens_size=settings.last_n_tokens_size,
+)
+
+
+class CreateCompletionRequest(BaseModel):  # body of POST /v1/completions; the handler forwards every field via llama(**request.dict())
+    prompt: str
+    suffix: Optional[str] = Field(None)
+    max_tokens: int = 16
+    temperature: float = 0.8
+    top_p: float = 0.95
+    logprobs: Optional[int] = Field(None)
+    echo: bool = False
+    stop: List[str] = []
+    repeat_penalty: float = 1.1
+    top_k: int = 40
+    stream: bool = False  # when true the handler answers with server-sent events instead of a single result
+
+    class Config:
+        schema_extra = {  # sample payload; NOTE(review): presumably shown on the /docs page — confirm
+            "example": {
+                "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n",
+                "stop": ["\n", "###"],
+            }
+        }
+
+
+CreateCompletionResponse = create_model_from_typeddict(llama_cpp.Completion)
+
+
+@app.post(
+    "/v1/completions",
+    response_model=CreateCompletionResponse,
+)
+def create_completion(request: CreateCompletionRequest):  # forwards every request field as a keyword argument to llama(...)
+    if request.stream:  # streaming: each chunk becomes one JSON-encoded server-sent event
+        chunks: Iterator[llama_cpp.CompletionChunk] = llama(**request.dict())  # type: ignore
+        return EventSourceResponse(dict(data=json.dumps(chunk)) for chunk in chunks)  # NOTE(review): no final "[DONE]" event, unlike create_chat_completion — confirm intended
+    return llama(**request.dict())  # non-streaming: return the call's result directly
+
+
+class CreateEmbeddingRequest(BaseModel):  # body of POST /v1/embeddings
+    model: Optional[str]  # accepted in the body but excluded by the handler before calling the model
+    input: str  # the only field the handler forwards to llama.create_embedding
+    user: Optional[str]  # accepted in the body but excluded by the handler before calling the model
+
+    class Config:
+        schema_extra = {  # sample payload; NOTE(review): presumably shown on the /docs page — confirm
+            "example": {
+                "input": "The food was delicious and the waiter...",
+            }
+        }
+
+
+CreateEmbeddingResponse = create_model_from_typeddict(llama_cpp.Embedding)
+
+
+@app.post(
+    "/v1/embeddings",
+    response_model=CreateEmbeddingResponse,
+)
+def create_embedding(request: CreateEmbeddingRequest):  # handles POST /v1/embeddings using the module-level llama instance
+    return llama.create_embedding(**request.dict(exclude={"model", "user"}))  # `model` and `user` are dropped; the remaining field (`input`) is passed by keyword
+
+
+class ChatCompletionRequestMessage(BaseModel):  # element type of CreateChatCompletionRequest.messages
+    role: Union[Literal["system"], Literal["user"], Literal["assistant"]]  # restricted to these three literal values
+    content: str
+    user: Optional[str] = None
+
+
+class CreateChatCompletionRequest(BaseModel):  # body of POST /v1/chat/completions
+    model: Optional[str]  # accepted in the body but excluded by the handler before calling the model
+    messages: List[ChatCompletionRequestMessage]
+    temperature: float = 0.8
+    top_p: float = 0.95
+    stream: bool = False  # when true the handler answers with server-sent events
+    stop: List[str] = []
+    max_tokens: int = 128
+    repeat_penalty: float = 1.1
+
+    class Config:
+        schema_extra = {  # sample payload built from message model instances rather than plain dicts
+            "example": {
+                "messages": [
+                    ChatCompletionRequestMessage(
+                        role="system", content="You are a helpful assistant."
+                    ),
+                    ChatCompletionRequestMessage(
+                        role="user", content="What is the capital of France?"
+                    ),
+                ]
+            }
+        }
+
+
+CreateChatCompletionResponse = create_model_from_typeddict(llama_cpp.ChatCompletion)
+
+
+@app.post(
+    "/v1/chat/completions",
+    response_model=CreateChatCompletionResponse,
+)
+async def create_chat_completion(  # handles POST /v1/chat/completions using the module-level llama instance
+    request: CreateChatCompletionRequest,
+) -> Union[llama_cpp.ChatCompletion, EventSourceResponse]:
+    completion_or_chunks = llama.create_chat_completion(  # invoked before the stream check, for both modes
+        **request.dict(exclude={"model"}),  # `model` is dropped; every other field is passed by keyword
+    )
+
+    if request.stream:
+
+        async def server_sent_events(  # yields one JSON-encoded event per chunk, then a literal "[DONE]" event
+            chat_chunks: Iterator[llama_cpp.ChatCompletionChunk],
+        ):
+            for chat_chunk in chat_chunks:  # NOTE(review): plain synchronous loop inside an async generator — confirm blocking here is acceptable
+                yield dict(data=json.dumps(chat_chunk))
+            yield dict(data="[DONE]")
+
+        chunks: Iterator[llama_cpp.ChatCompletionChunk] = completion_or_chunks  # type: ignore
+
+        return EventSourceResponse(
+            server_sent_events(chunks),
+        )
+    completion: llama_cpp.ChatCompletion = completion_or_chunks  # type: ignore
+    return completion
--- a/examples/high_level_api/high_level_api_embedding.py
+++ b/examples/high_level_api/high_level_api_embedding.py
--- a/examples/high_level_api/high_level_api_inference.py
+++ b/examples/high_level_api/high_level_api_inference.py
@@ -11,7 +11,7 @@ llm = Llama(model_path=args.model)

 output = llm(
    "Question: What are the names of the planets in the solar system? Answer: ",
-    max_tokens=1,
+    max_tokens=48,
    stop=["Q:", "\n"],
    echo=True,
 )
--- a/examples/high_level_api/high_level_api_streaming.py
+++ b/examples/high_level_api/high_level_api_streaming.py
@@ -4,7 +4,7 @@ import argparse
 from llama_cpp import Llama

 parser = argparse.ArgumentParser()
-parser.add_argument("-m", "--model", type=str, default=".//models/...")
+parser.add_argument("-m", "--model", type=str, default="./models/...")
 args = parser.parse_args()

 llm = Llama(model_path=args.model)
--- a/examples/high_level_api/langchain_custom_llm.py
+++ b/examples/high_level_api/langchain_custom_llm.py
--- a/examples/low_level_api/low_level_api_llama_cpp.py
+++ b/examples/low_level_api/low_level_api_llama_cpp.py
--- a/examples/low_level_api/quantize.py
+++ b/examples/low_level_api/quantize.py
@@ -0,0 +1,25 @@
+import os
+import argparse
+import llama_cpp
+
+
+def main(args):  # args: parsed namespace providing fname_inp (str), fname_out (str) and itype (int)
+    """Quantize the model at args.fname_inp into args.fname_out; raise RuntimeError on any failure."""
+    if not os.path.exists(args.fname_inp):  # validate the str paths first; they are only encoded for the C call below
+        raise RuntimeError(f"Input file does not exist ({args.fname_inp})")
+    if os.path.exists(args.fname_out):  # refuse to overwrite an existing output file
+        raise RuntimeError(f"Output file already exists ({args.fname_out})")
+    fname_inp = args.fname_inp.encode("utf-8")
+    fname_out = args.fname_out.encode("utf-8")
+    return_code = llama_cpp.llama_model_quantize(fname_inp, fname_out, args.itype)
+    if return_code != 0:  # any non-zero return code is reported as a failure
+        raise RuntimeError("Failed to quantize model")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("fname_inp", type=str, help="Path to input model")
+    parser.add_argument("fname_out", type=str, help="Path to output model")
+    parser.add_argument("itype", type=int, help="Type of quantization (2: q4_0, 3: q4_1)")  # name must match args.itype read in main()
+    args = parser.parse_args()
+    main(args)
--- a/examples/notebooks/PerformanceTuning.ipynb
+++ b/examples/notebooks/PerformanceTuning.ipynb