From 6d3c20e39dbae1a0c89e1ce6d5bec076b102f2e6 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 3 May 2023 22:20:53 -0400 Subject: [PATCH 01/34] Add CUDA docker image build to github actions --- .github/workflows/build-docker.yaml | 38 +++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/.github/workflows/build-docker.yaml b/.github/workflows/build-docker.yaml index 44196f1..8ffa45f 100644 --- a/.github/workflows/build-docker.yaml +++ b/.github/workflows/build-docker.yaml @@ -37,3 +37,41 @@ jobs: pull: true # always fetch the latest base images platforms: linux/amd64,linux/arm64 # build for both amd64 and arm64 tags: ghcr.io/abetlen/llama-cpp-python:latest + + docker-cuda: + name: Build and push Docker CUDA image + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: "true" + + - name: Setup CUDA 12.1 + uses: Jimver/cuda-toolkit@v0.2.10 + id: cuda-toolkit + with: + cuda: '12.1.0' + + - name: Set up QEMU + uses: docker/setup-qemu-action@v2 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + + - name: Login to GitHub Container Registry + uses: docker/login-action@v2 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Build and push + uses: docker/build-push-action@v4 + with: + file: Dockerfile.cuda + context: . + push: true # push to registry + pull: true # always fetch the latest base images + platforms: linux/amd64,linux/arm64 # build for both amd64 and arm64 + tags: ghcr.io/abetlen/llama-cpp-python-cuda:latest From 0607f6578efe03c7b8894d2ed5f71eaf03473c55 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 3 May 2023 23:22:16 -0400 Subject: [PATCH 02/34] Use network installer for cuda --- .github/workflows/build-docker.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build-docker.yaml b/.github/workflows/build-docker.yaml index 8ffa45f..2ec5c0d 100644 --- a/.github/workflows/build-docker.yaml +++ b/.github/workflows/build-docker.yaml @@ -52,6 +52,7 @@ jobs: id: cuda-toolkit with: cuda: '12.1.0' + method: network - name: Set up QEMU uses: docker/setup-qemu-action@v2 From d594892fd425cb41b30e4cb31e3aa5ef1c16e681 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 4 May 2023 00:02:46 -0400 Subject: [PATCH 03/34] Remove Docker CUDA build job --- .github/workflows/build-docker.yaml | 41 +---------------------------- 1 file changed, 1 insertion(+), 40 deletions(-) diff --git a/.github/workflows/build-docker.yaml b/.github/workflows/build-docker.yaml index 2ec5c0d..16b00a2 100644 --- a/.github/workflows/build-docker.yaml +++ b/.github/workflows/build-docker.yaml @@ -36,43 +36,4 @@ jobs: push: true # push to registry pull: true # always fetch the latest base images platforms: linux/amd64,linux/arm64 # build for both amd64 and arm64 - tags: ghcr.io/abetlen/llama-cpp-python:latest - - docker-cuda: - name: Build and push Docker CUDA image - runs-on: ubuntu-latest - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: "true" - - - name: Setup CUDA 12.1 - uses: Jimver/cuda-toolkit@v0.2.10 - id: cuda-toolkit - with: - cuda: '12.1.0' - method: network - - - name: Set up QEMU - uses: docker/setup-qemu-action@v2 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 - - - name: Login to GitHub Container Registry - uses: docker/login-action@v2 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Build 
and push - uses: docker/build-push-action@v4 - with: - file: Dockerfile.cuda - context: . - push: true # push to registry - pull: true # always fetch the latest base images - platforms: linux/amd64,linux/arm64 # build for both amd64 and arm64 - tags: ghcr.io/abetlen/llama-cpp-python-cuda:latest + tags: ghcr.io/abetlen/llama-cpp-python:latest \ No newline at end of file From 329297fafb4916951cf1c3146505a9501e986d95 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 4 May 2023 12:18:40 -0400 Subject: [PATCH 04/34] Bugfix: Missing logits_to_logprobs --- llama_cpp/llama.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index fef7b3e..8cd77ee 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -639,7 +639,7 @@ class Llama: self.detokenize([token]).decode("utf-8", errors="ignore") for token in all_tokens ] - all_logprobs = [Llama._logits_to_logprobs(row) for row in self.eval_logits] + all_logprobs = [Llama.logits_to_logprobs(list(map(float, row))) for row in self.eval_logits] for token, token_str, logprobs_token in zip( all_tokens, all_token_strs, all_logprobs ): @@ -985,7 +985,7 @@ class Llama: return llama_cpp.llama_token_bos() @staticmethod - def logits_to_logprobs(logits: List[llama_cpp.c_float]) -> List[llama_cpp.c_float]: + def logits_to_logprobs(logits: List[float]) -> List[float]: exps = [math.exp(float(x)) for x in logits] sum_exps = sum(exps) - return [llama_cpp.c_float(math.log(x / sum_exps)) for x in exps] + return [math.log(x / sum_exps) for x in exps] From d78cec67df876221471782e7e1fbe62abf48ee25 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 4 May 2023 12:20:25 -0400 Subject: [PATCH 05/34] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index e216aa0..2edbdb0 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit e216aa04633892b972d013719e38b59fd4917341 +Subproject commit 2edbdb0f99336cb41f0995061c7602ed54beb863 From cabd8b8ed1ee45a19baa9436668898bbe9471492 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 4 May 2023 12:21:20 -0400 Subject: [PATCH 06/34] Bump version --- pyproject.toml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 64f7a0d..2dab374 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.41" +version = "0.1.42" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index f7f0fa4..0a52826 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.41", + version="0.1.42", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From 501321875f449594c249cdbbc9b48208fbce4bde Mon Sep 17 00:00:00 2001 From: Thomas Neu <81517187+th-neu@users.noreply.github.com> Date: Thu, 4 May 2023 21:03:19 +0200 Subject: [PATCH 07/34] Slim-Bullseye based docker image ends up at ~669MB --- Dockerfile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 14fb3be..f58506f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3-bullseye +FROM python:3-slim-bullseye # We need to set the host to 0.0.0.0 to allow outside access ENV HOST 0.0.0.0 
@@ -6,10 +6,10 @@ ENV HOST 0.0.0.0 COPY . . # Install the package -RUN apt update && apt install -y libopenblas-dev +RUN apt update && apt install -y libopenblas-dev ninja-build build-essential RUN python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette RUN LLAMA_OPENBLAS=1 python3 setup.py develop # Run the server -CMD python3 -m llama_cpp.server \ No newline at end of file +CMD python3 -m llama_cpp.server From 97c6372350c57a4fffb6072cb299e5a9bd8b38dc Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 4 May 2023 21:58:27 -0400 Subject: [PATCH 08/34] Rewind model to longest prefix. --- llama_cpp/llama.py | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 8cd77ee..7a8c25b 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -390,18 +390,28 @@ class Llama: """ assert self.ctx is not None - if ( - reset - and len(self.eval_tokens) > 0 - and tuple(self.eval_tokens) == tuple(tokens[: len(self.eval_tokens)]) - ): - if self.verbose: - print("Llama.generate: cache hit", file=sys.stderr) - reset = False - tokens = tokens[len(self.eval_tokens) :] + if reset and len(self.eval_tokens) > 0: + longest_prefix = 0 + for a, b in zip(self.eval_tokens, tokens[:-1]): + if a == b: + longest_prefix += 1 + else: + break + if longest_prefix > 0: + if self.verbose: + print("Llama.generate: prefix-match hit", file=sys.stderr) + reset = False + tokens = tokens[longest_prefix:] + for _ in range(len(self.eval_tokens) - longest_prefix): + self.eval_tokens.pop() + try: + self.eval_logits.pop() + except IndexError: + pass if reset: self.reset() + while True: self.eval(tokens) token = self.sample( From 853dc711cc5507ca119cb822f459cd16c9021f15 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 4 May 2023 21:58:36 -0400 Subject: [PATCH 09/34] Format --- llama_cpp/llama.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 7a8c25b..32d5424 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -649,7 +649,10 @@ class Llama: self.detokenize([token]).decode("utf-8", errors="ignore") for token in all_tokens ] - all_logprobs = [Llama.logits_to_logprobs(list(map(float, row))) for row in self.eval_logits] + all_logprobs = [ + Llama.logits_to_logprobs(list(map(float, row))) + for row in self.eval_logits + ] for token, token_str, logprobs_token in zip( all_tokens, all_token_strs, all_logprobs ): @@ -968,7 +971,10 @@ class Llama: llama_state_compact = (llama_cpp.c_uint8 * int(n_bytes))() llama_cpp.ctypes.memmove(llama_state_compact, llama_state, int(n_bytes)) if self.verbose: - print(f"Llama.save_state: saving {n_bytes} bytes of llama state", file=sys.stderr) + print( + f"Llama.save_state: saving {n_bytes} bytes of llama state", + file=sys.stderr, + ) return LlamaState( eval_tokens=self.eval_tokens.copy(), eval_logits=self.eval_logits.copy(), From 5c165a85da5a340aca85a44e2282db2e5f729463 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 4 May 2023 21:59:37 -0400 Subject: [PATCH 10/34] Bump version --- pyproject.toml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 2dab374..ca0346f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.42" +version = "0.1.43" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" 
diff --git a/setup.py b/setup.py index 0a52826..405886a 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.42", + version="0.1.43", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From 952ba9ecaf7a78be1844a1c533d6f6f580b92833 Mon Sep 17 00:00:00 2001 From: Thomas Neu <81517187+th-neu@users.noreply.github.com> Date: Fri, 5 May 2023 14:21:57 +0200 Subject: [PATCH 11/34] Update README.md add windows server commad --- README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.md b/README.md index a8afa67..ee6ec2d 100644 --- a/README.md +++ b/README.md @@ -64,12 +64,20 @@ This allows you to use llama.cpp compatible models with any OpenAI compatible cl To install the server package and get started: +Linux ```bash pip install llama-cpp-python[server] export MODEL=./models/7B/ggml-model.bin python3 -m llama_cpp.server ``` +Windows +```cmd +pip install llama-cpp-python[server] +SET MODEL=\models\7B\ggml-model.bin +python3 -m llama_cpp.server +``` + Navigate to [http://localhost:8000/docs](http://localhost:8000/docs) to see the OpenAPI documentation. ## Docker image From eb54e30f343251767ec0a2cb10da2684b896718f Mon Sep 17 00:00:00 2001 From: Thomas Neu <81517187+th-neu@users.noreply.github.com> Date: Fri, 5 May 2023 14:22:41 +0200 Subject: [PATCH 12/34] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ee6ec2d..d24bad5 100644 --- a/README.md +++ b/README.md @@ -74,7 +74,7 @@ python3 -m llama_cpp.server Windows ```cmd pip install llama-cpp-python[server] -SET MODEL=\models\7B\ggml-model.bin +SET MODEL=..\models\7B\ggml-model.bin python3 -m llama_cpp.server ``` From 24fc38754b6da802ae5b32fb301e957868ec5e86 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 5 May 2023 12:08:28 -0400 Subject: [PATCH 13/34] Add cli options to server. Closes #37 --- llama_cpp/server/__main__.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index 4fbee37..5c9598a 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -22,12 +22,26 @@ Then visit http://localhost:8000/docs to see the interactive API docs. 
""" import os +import argparse + import uvicorn -from llama_cpp.server.app import create_app +from llama_cpp.server.app import create_app, Settings if __name__ == "__main__": - app = create_app() + parser = argparse.ArgumentParser() + for name, field in Settings.__fields__.items(): + parser.add_argument( + f"--{name}", + dest=name, + type=field.type_, + default=field.default, + help=field.field_info.description, + ) + + args = parser.parse_args() + settings = Settings(**vars(args)) + app = create_app(settings=settings) uvicorn.run( app, host=os.getenv("HOST", "localhost"), port=int(os.getenv("PORT", 8000)) From 5be0efa5f8f98f4b889ca9869e5005ecb5f195d2 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 5 May 2023 12:21:49 -0400 Subject: [PATCH 14/34] Cache should raise KeyError when key is missing --- llama_cpp/llama.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 32d5424..4e03ed4 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -33,12 +33,10 @@ class LlamaCache: return k return None - def __getitem__( - self, key: Sequence[llama_cpp.llama_token] - ) -> Optional["LlamaState"]: + def __getitem__(self, key: Sequence[llama_cpp.llama_token]) -> "LlamaState": _key = self._find_key(tuple(key)) if _key is None: - return None + raise KeyError(f"Key not found: {key}") return self.cache_state[_key] def __contains__(self, key: Sequence[llama_cpp.llama_token]) -> bool: From b6a9a0b6ba74c8b539e98ec31fc6558563b20c96 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 5 May 2023 12:22:27 -0400 Subject: [PATCH 15/34] Add types for all low-level api functions --- llama_cpp/llama.py | 2 +- llama_cpp/llama_cpp.py | 81 +++++++++++++++++++++++++++++++----------- 2 files changed, 62 insertions(+), 21 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 4e03ed4..c1c8847 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -52,7 +52,7 @@ class LlamaState: self, eval_tokens: Deque[llama_cpp.llama_token], eval_logits: Deque[List[llama_cpp.c_float]], - llama_state, + llama_state, # type: llama_cpp.Array[llama_cpp.c_uint8] llama_state_size: llama_cpp.c_size_t, ): self.eval_tokens = eval_tokens diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 30414f5..0a35445 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -17,7 +17,7 @@ import pathlib # Load the library -def _load_shared_library(lib_base_name): +def _load_shared_library(lib_base_name: str): # Determine the file extension based on the platform if sys.platform.startswith("linux"): lib_ext = ".so" @@ -252,7 +252,9 @@ _lib.llama_get_state_size.restype = c_size_t # Copies the state to the specified destination address. # Destination needs to have allocated enough memory. 
# Returns the number of bytes copied -def llama_copy_state_data(ctx: llama_context_p, dest) -> c_size_t: +def llama_copy_state_data( + ctx: llama_context_p, dest # type: Array[c_uint8] +) -> c_size_t: return _lib.llama_copy_state_data(ctx, dest) @@ -262,7 +264,9 @@ _lib.llama_copy_state_data.restype = c_size_t # Set the state reading from the specified address # Returns the number of bytes read -def llama_set_state_data(ctx: llama_context_p, src) -> c_size_t: +def llama_set_state_data( + ctx: llama_context_p, src # type: Array[c_uint8] +) -> c_size_t: return _lib.llama_set_state_data(ctx, src) @@ -274,9 +278,9 @@ _lib.llama_set_state_data.restype = c_size_t def llama_load_session_file( ctx: llama_context_p, path_session: bytes, - tokens_out, + tokens_out, # type: Array[llama_token] n_token_capacity: c_size_t, - n_token_count_out, + n_token_count_out, # type: Array[c_size_t] ) -> c_size_t: return _lib.llama_load_session_file( ctx, path_session, tokens_out, n_token_capacity, n_token_count_out @@ -294,7 +298,10 @@ _lib.llama_load_session_file.restype = c_size_t def llama_save_session_file( - ctx: llama_context_p, path_session: bytes, tokens, n_token_count: c_size_t + ctx: llama_context_p, + path_session: bytes, + tokens, # type: Array[llama_token] + n_token_count: c_size_t, ) -> c_size_t: return _lib.llama_save_session_file(ctx, path_session, tokens, n_token_count) @@ -433,8 +440,8 @@ _lib.llama_token_nl.restype = llama_token # @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. def llama_sample_repetition_penalty( ctx: llama_context_p, - candidates, - last_tokens_data, + candidates, # type: Array[llama_token_data] + last_tokens_data, # type: Array[llama_token] last_tokens_size: c_int, penalty: c_float, ): @@ -456,8 +463,8 @@ _lib.llama_sample_repetition_penalty.restype = None # @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. def llama_sample_frequency_and_presence_penalties( ctx: llama_context_p, - candidates, - last_tokens_data, + candidates, # type: Array[llama_token_data] + last_tokens_data, # type: Array[llama_token] last_tokens_size: c_int, alpha_frequency: c_float, alpha_presence: c_float, @@ -484,7 +491,10 @@ _lib.llama_sample_frequency_and_presence_penalties.restype = None # @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. 
-def llama_sample_softmax(ctx: llama_context_p, candidates): +def llama_sample_softmax( + ctx: llama_context_p, + candidates # type: Array[llama_token_data] +): return _lib.llama_sample_softmax(ctx, candidates) @@ -497,7 +507,10 @@ _lib.llama_sample_softmax.restype = None # @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 def llama_sample_top_k( - ctx: llama_context_p, candidates, k: c_int, min_keep: c_size_t = c_size_t(1) + ctx: llama_context_p, + candidates, # type: Array[llama_token_data] + k: c_int, + min_keep: c_size_t = c_size_t(1) ): return _lib.llama_sample_top_k(ctx, candidates, k, min_keep) @@ -513,7 +526,10 @@ _lib.llama_sample_top_k.restype = None # @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 def llama_sample_top_p( - ctx: llama_context_p, candidates, p: c_float, min_keep: c_size_t = c_size_t(1) + ctx: llama_context_p, + candidates, # type: Array[llama_token_data] + p: c_float, + min_keep: c_size_t = c_size_t(1) ): return _lib.llama_sample_top_p(ctx, candidates, p, min_keep) @@ -529,7 +545,10 @@ _lib.llama_sample_top_p.restype = None # @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. def llama_sample_tail_free( - ctx: llama_context_p, candidates, z: c_float, min_keep: c_size_t = c_size_t(1) + ctx: llama_context_p, + candidates, # type: Array[llama_token_data] + z: c_float, + min_keep: c_size_t = c_size_t(1) ): return _lib.llama_sample_tail_free(ctx, candidates, z, min_keep) @@ -545,7 +564,10 @@ _lib.llama_sample_tail_free.restype = None # @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. def llama_sample_typical( - ctx: llama_context_p, candidates, p: c_float, min_keep: c_size_t = c_size_t(1) + ctx: llama_context_p, + candidates, # type: Array[llama_token_data] + p: c_float, + min_keep: c_size_t = c_size_t(1) ): return _lib.llama_sample_typical(ctx, candidates, p, min_keep) @@ -559,7 +581,11 @@ _lib.llama_sample_typical.argtypes = [ _lib.llama_sample_typical.restype = None -def llama_sample_temperature(ctx: llama_context_p, candidates, temp: c_float): +def llama_sample_temperature( + ctx: llama_context_p, + candidates, # type: Array[llama_token_data] + temp: c_float +): return _lib.llama_sample_temperature(ctx, candidates, temp) @@ -578,7 +604,12 @@ _lib.llama_sample_temperature.restype = None # @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm. # @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. 
def llama_sample_token_mirostat( - ctx: llama_context_p, candidates, tau: c_float, eta: c_float, m: c_int, mu + ctx: llama_context_p, + candidates, # type: Array[llama_token_data] + tau: c_float, + eta: c_float, + m: c_int, + mu # type: Array[c_float] ) -> llama_token: return _lib.llama_sample_token_mirostat(ctx, candidates, tau, eta, m, mu) @@ -600,7 +631,11 @@ _lib.llama_sample_token_mirostat.restype = llama_token # @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. # @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. def llama_sample_token_mirostat_v2( - ctx: llama_context_p, candidates, tau: c_float, eta: c_float, mu + ctx: llama_context_p, + candidates, # type: Array[llama_token_data] + tau: c_float, + eta: c_float, + mu # type: Array[c_float] ) -> llama_token: return _lib.llama_sample_token_mirostat_v2(ctx, candidates, tau, eta, mu) @@ -616,7 +651,10 @@ _lib.llama_sample_token_mirostat_v2.restype = llama_token # @details Selects the token with the highest probability. -def llama_sample_token_greedy(ctx: llama_context_p, candidates) -> llama_token: +def llama_sample_token_greedy( + ctx: llama_context_p, + candidates # type: Array[llama_token_data] +) -> llama_token: return _lib.llama_sample_token_greedy(ctx, candidates) @@ -628,7 +666,10 @@ _lib.llama_sample_token_greedy.restype = llama_token # @details Randomly selects a token from the candidates based on their probabilities. -def llama_sample_token(ctx: llama_context_p, candidates) -> llama_token: +def llama_sample_token( + ctx: llama_context_p, + candidates # type: Array[llama_token_data] +) -> llama_token: return _lib.llama_sample_token(ctx, candidates) From 22c3056b2a8d19f2c5ce9ab817e312da21e66d9c Mon Sep 17 00:00:00 2001 From: Thomas Neu <81517187+th-neu@users.noreply.github.com> Date: Fri, 5 May 2023 18:40:00 +0200 Subject: [PATCH 16/34] Update README.md added MacOS --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d24bad5..c46fa11 100644 --- a/README.md +++ b/README.md @@ -64,7 +64,7 @@ This allows you to use llama.cpp compatible models with any OpenAI compatible cl To install the server package and get started: -Linux +Linux/MacOS ```bash pip install llama-cpp-python[server] export MODEL=./models/7B/ggml-model.bin From 5e7ddfc3d6933471ba503477c0513a8987db4d9a Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 5 May 2023 13:54:22 -0400 Subject: [PATCH 17/34] Fix llama_cpp types --- llama_cpp/llama_cpp.py | 74 +++++++++++++++++++----------------------- 1 file changed, 33 insertions(+), 41 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 0a35445..87d9249 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -8,6 +8,7 @@ from ctypes import ( c_void_p, c_bool, POINTER, + _Pointer, # type: ignore Structure, Array, c_uint8, @@ -252,9 +253,7 @@ _lib.llama_get_state_size.restype = c_size_t # Copies the state to the specified destination address. # Destination needs to have allocated enough memory. 
# Returns the number of bytes copied -def llama_copy_state_data( - ctx: llama_context_p, dest # type: Array[c_uint8] -) -> c_size_t: +def llama_copy_state_data(ctx: llama_context_p, dest: Array[c_uint8]) -> c_size_t: return _lib.llama_copy_state_data(ctx, dest) @@ -278,9 +277,9 @@ _lib.llama_set_state_data.restype = c_size_t def llama_load_session_file( ctx: llama_context_p, path_session: bytes, - tokens_out, # type: Array[llama_token] + tokens_out: Array[llama_token], n_token_capacity: c_size_t, - n_token_count_out, # type: Array[c_size_t] + n_token_count_out: _Pointer[c_size_t], ) -> c_size_t: return _lib.llama_load_session_file( ctx, path_session, tokens_out, n_token_capacity, n_token_count_out @@ -300,7 +299,7 @@ _lib.llama_load_session_file.restype = c_size_t def llama_save_session_file( ctx: llama_context_p, path_session: bytes, - tokens, # type: Array[llama_token] + tokens: Array[llama_token], n_token_count: c_size_t, ) -> c_size_t: return _lib.llama_save_session_file(ctx, path_session, tokens, n_token_count) @@ -321,7 +320,7 @@ _lib.llama_save_session_file.restype = c_size_t # Returns 0 on success def llama_eval( ctx: llama_context_p, - tokens, # type: Array[llama_token] + tokens: Array[llama_token], n_tokens: c_int, n_past: c_int, n_threads: c_int, @@ -440,8 +439,8 @@ _lib.llama_token_nl.restype = llama_token # @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. def llama_sample_repetition_penalty( ctx: llama_context_p, - candidates, # type: Array[llama_token_data] - last_tokens_data, # type: Array[llama_token] + candidates: _Pointer[llama_token_data], + last_tokens_data: Array[llama_token], last_tokens_size: c_int, penalty: c_float, ): @@ -463,8 +462,8 @@ _lib.llama_sample_repetition_penalty.restype = None # @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. def llama_sample_frequency_and_presence_penalties( ctx: llama_context_p, - candidates, # type: Array[llama_token_data] - last_tokens_data, # type: Array[llama_token] + candidates: _Pointer[llama_token_data], + last_tokens_data: Array[llama_token], last_tokens_size: c_int, alpha_frequency: c_float, alpha_presence: c_float, @@ -491,10 +490,7 @@ _lib.llama_sample_frequency_and_presence_penalties.restype = None # @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. 
-def llama_sample_softmax( - ctx: llama_context_p, - candidates # type: Array[llama_token_data] -): +def llama_sample_softmax(ctx: llama_context_p, candidates: _Pointer[llama_token_data]): return _lib.llama_sample_softmax(ctx, candidates) @@ -507,10 +503,10 @@ _lib.llama_sample_softmax.restype = None # @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 def llama_sample_top_k( - ctx: llama_context_p, - candidates, # type: Array[llama_token_data] - k: c_int, - min_keep: c_size_t = c_size_t(1) + ctx: llama_context_p, + candidates: _Pointer[llama_token_data], + k: c_int, + min_keep: c_size_t = c_size_t(1), ): return _lib.llama_sample_top_k(ctx, candidates, k, min_keep) @@ -526,10 +522,10 @@ _lib.llama_sample_top_k.restype = None # @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 def llama_sample_top_p( - ctx: llama_context_p, - candidates, # type: Array[llama_token_data] - p: c_float, - min_keep: c_size_t = c_size_t(1) + ctx: llama_context_p, + candidates: _Pointer[llama_token_data], + p: c_float, + min_keep: c_size_t = c_size_t(1), ): return _lib.llama_sample_top_p(ctx, candidates, p, min_keep) @@ -546,9 +542,9 @@ _lib.llama_sample_top_p.restype = None # @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. def llama_sample_tail_free( ctx: llama_context_p, - candidates, # type: Array[llama_token_data] + candidates: _Pointer[llama_token_data], z: c_float, - min_keep: c_size_t = c_size_t(1) + min_keep: c_size_t = c_size_t(1), ): return _lib.llama_sample_tail_free(ctx, candidates, z, min_keep) @@ -565,9 +561,9 @@ _lib.llama_sample_tail_free.restype = None # @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. def llama_sample_typical( ctx: llama_context_p, - candidates, # type: Array[llama_token_data] - p: c_float, - min_keep: c_size_t = c_size_t(1) + candidates: _Pointer[llama_token_data], + p: c_float, + min_keep: c_size_t = c_size_t(1), ): return _lib.llama_sample_typical(ctx, candidates, p, min_keep) @@ -582,9 +578,7 @@ _lib.llama_sample_typical.restype = None def llama_sample_temperature( - ctx: llama_context_p, - candidates, # type: Array[llama_token_data] - temp: c_float + ctx: llama_context_p, candidates: _Pointer[llama_token_data], temp: c_float ): return _lib.llama_sample_temperature(ctx, candidates, temp) @@ -605,11 +599,11 @@ _lib.llama_sample_temperature.restype = None # @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. def llama_sample_token_mirostat( ctx: llama_context_p, - candidates, # type: Array[llama_token_data] + candidates: _Pointer[llama_token_data], tau: c_float, - eta: c_float, + eta: c_float, m: c_int, - mu # type: Array[c_float] + mu: _Pointer[c_float], ) -> llama_token: return _lib.llama_sample_token_mirostat(ctx, candidates, tau, eta, m, mu) @@ -632,10 +626,10 @@ _lib.llama_sample_token_mirostat.restype = llama_token # @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. 
def llama_sample_token_mirostat_v2( ctx: llama_context_p, - candidates, # type: Array[llama_token_data] - tau: c_float, + candidates: _Pointer[llama_token_data], + tau: c_float, eta: c_float, - mu # type: Array[c_float] + mu: _Pointer[c_float], ) -> llama_token: return _lib.llama_sample_token_mirostat_v2(ctx, candidates, tau, eta, mu) @@ -652,8 +646,7 @@ _lib.llama_sample_token_mirostat_v2.restype = llama_token # @details Selects the token with the highest probability. def llama_sample_token_greedy( - ctx: llama_context_p, - candidates # type: Array[llama_token_data] + ctx: llama_context_p, candidates: _Pointer[llama_token_data] ) -> llama_token: return _lib.llama_sample_token_greedy(ctx, candidates) @@ -667,8 +660,7 @@ _lib.llama_sample_token_greedy.restype = llama_token # @details Randomly selects a token from the candidates based on their probabilities. def llama_sample_token( - ctx: llama_context_p, - candidates # type: Array[llama_token_data] + ctx: llama_context_p, candidates: _Pointer[llama_token_data] ) -> llama_token: return _lib.llama_sample_token(ctx, candidates) From 6702d2abfdc313873931baa470b8b547dd825727 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 5 May 2023 14:00:30 -0400 Subject: [PATCH 18/34] Fix candidates type --- llama_cpp/llama_cpp.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 87d9249..61b40f8 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -439,7 +439,7 @@ _lib.llama_token_nl.restype = llama_token # @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. def llama_sample_repetition_penalty( ctx: llama_context_p, - candidates: _Pointer[llama_token_data], + candidates: _Pointer[llama_token_data_array], last_tokens_data: Array[llama_token], last_tokens_size: c_int, penalty: c_float, @@ -462,7 +462,7 @@ _lib.llama_sample_repetition_penalty.restype = None # @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. def llama_sample_frequency_and_presence_penalties( ctx: llama_context_p, - candidates: _Pointer[llama_token_data], + candidates: _Pointer[llama_token_data_array], last_tokens_data: Array[llama_token], last_tokens_size: c_int, alpha_frequency: c_float, @@ -504,7 +504,7 @@ _lib.llama_sample_softmax.restype = None # @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 def llama_sample_top_k( ctx: llama_context_p, - candidates: _Pointer[llama_token_data], + candidates: _Pointer[llama_token_data_array], k: c_int, min_keep: c_size_t = c_size_t(1), ): @@ -523,7 +523,7 @@ _lib.llama_sample_top_k.restype = None # @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 def llama_sample_top_p( ctx: llama_context_p, - candidates: _Pointer[llama_token_data], + candidates: _Pointer[llama_token_data_array], p: c_float, min_keep: c_size_t = c_size_t(1), ): @@ -542,7 +542,7 @@ _lib.llama_sample_top_p.restype = None # @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. 
def llama_sample_tail_free( ctx: llama_context_p, - candidates: _Pointer[llama_token_data], + candidates: _Pointer[llama_token_data_array], z: c_float, min_keep: c_size_t = c_size_t(1), ): @@ -561,7 +561,7 @@ _lib.llama_sample_tail_free.restype = None # @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. def llama_sample_typical( ctx: llama_context_p, - candidates: _Pointer[llama_token_data], + candidates: _Pointer[llama_token_data_array], p: c_float, min_keep: c_size_t = c_size_t(1), ): @@ -578,7 +578,7 @@ _lib.llama_sample_typical.restype = None def llama_sample_temperature( - ctx: llama_context_p, candidates: _Pointer[llama_token_data], temp: c_float + ctx: llama_context_p, candidates: _Pointer[llama_token_data_array], temp: c_float ): return _lib.llama_sample_temperature(ctx, candidates, temp) @@ -599,7 +599,7 @@ _lib.llama_sample_temperature.restype = None # @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. def llama_sample_token_mirostat( ctx: llama_context_p, - candidates: _Pointer[llama_token_data], + candidates: _Pointer[llama_token_data_array], tau: c_float, eta: c_float, m: c_int, @@ -626,7 +626,7 @@ _lib.llama_sample_token_mirostat.restype = llama_token # @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. def llama_sample_token_mirostat_v2( ctx: llama_context_p, - candidates: _Pointer[llama_token_data], + candidates: _Pointer[llama_token_data_array], tau: c_float, eta: c_float, mu: _Pointer[c_float], @@ -646,7 +646,7 @@ _lib.llama_sample_token_mirostat_v2.restype = llama_token # @details Selects the token with the highest probability. def llama_sample_token_greedy( - ctx: llama_context_p, candidates: _Pointer[llama_token_data] + ctx: llama_context_p, candidates: _Pointer[llama_token_data_array] ) -> llama_token: return _lib.llama_sample_token_greedy(ctx, candidates) @@ -660,7 +660,7 @@ _lib.llama_sample_token_greedy.restype = llama_token # @details Randomly selects a token from the candidates based on their probabilities. 
def llama_sample_token( - ctx: llama_context_p, candidates: _Pointer[llama_token_data] + ctx: llama_context_p, candidates: _Pointer[llama_token_data_array] ) -> llama_token: return _lib.llama_sample_token(ctx, candidates) From 66e28eb548974fe50aa80b8593f77cff651959c6 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 5 May 2023 14:00:41 -0400 Subject: [PATCH 19/34] Fix temperature bug --- llama_cpp/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index c1c8847..6cd65a4 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -287,7 +287,7 @@ class Llama: candidates=llama_cpp.ctypes.pointer(candidates), penalty=repeat_penalty, ) - if temp == 0.0: + if float(temp) == 0.0: return llama_cpp.llama_sample_token_greedy( ctx=self.ctx, candidates=llama_cpp.ctypes.pointer(candidates), From 40501435c12578fc0bc696c2bdc0bf63d0e15650 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 5 May 2023 14:04:12 -0400 Subject: [PATCH 20/34] Fix: types --- llama_cpp/llama_cpp.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 61b40f8..8ce3c89 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -141,6 +141,11 @@ LLAMA_FTYPE_MOSTLY_Q8_0 = ctypes.c_int(7) # except 1d tensors LLAMA_FTYPE_MOSTLY_Q5_0 = ctypes.c_int(8) # except 1d tensors LLAMA_FTYPE_MOSTLY_Q5_1 = ctypes.c_int(9) # except 1d tensors +# Misc +c_float_p = POINTER(c_float) +c_uint8_p = POINTER(c_uint8) +c_size_t_p = POINTER(c_size_t) + # Functions @@ -257,7 +262,7 @@ def llama_copy_state_data(ctx: llama_context_p, dest: Array[c_uint8]) -> c_size_ return _lib.llama_copy_state_data(ctx, dest) -_lib.llama_copy_state_data.argtypes = [llama_context_p, POINTER(c_uint8)] +_lib.llama_copy_state_data.argtypes = [llama_context_p, c_uint8_p] _lib.llama_copy_state_data.restype = c_size_t @@ -269,7 +274,7 @@ def llama_set_state_data( return _lib.llama_set_state_data(ctx, src) -_lib.llama_set_state_data.argtypes = [llama_context_p, POINTER(c_uint8)] +_lib.llama_set_state_data.argtypes = [llama_context_p, c_uint8_p] _lib.llama_set_state_data.restype = c_size_t @@ -291,7 +296,7 @@ _lib.llama_load_session_file.argtypes = [ c_char_p, llama_token_p, c_size_t, - POINTER(c_size_t), + c_size_t_p, ] _lib.llama_load_session_file.restype = c_size_t @@ -340,7 +345,7 @@ _lib.llama_eval.restype = c_int def llama_tokenize( ctx: llama_context_p, text: bytes, - tokens, # type: Array[llama_token] + tokens: Array[llama_token], n_max_tokens: c_int, add_bos: c_bool, ) -> c_int: @@ -385,7 +390,7 @@ def llama_get_logits(ctx: llama_context_p): _lib.llama_get_logits.argtypes = [llama_context_p] -_lib.llama_get_logits.restype = POINTER(c_float) +_lib.llama_get_logits.restype = c_float_p # Get the embeddings for the input @@ -395,7 +400,7 @@ def llama_get_embeddings(ctx: llama_context_p): _lib.llama_get_embeddings.argtypes = [llama_context_p] -_lib.llama_get_embeddings.restype = POINTER(c_float) +_lib.llama_get_embeddings.restype = c_float_p # Token Id -> String. 
Uses the vocabulary in the provided context @@ -614,7 +619,7 @@ _lib.llama_sample_token_mirostat.argtypes = [ c_float, c_float, c_int, - POINTER(c_float), + c_float_p, ] _lib.llama_sample_token_mirostat.restype = llama_token @@ -639,7 +644,7 @@ _lib.llama_sample_token_mirostat_v2.argtypes = [ llama_token_data_array_p, c_float, c_float, - POINTER(c_float), + c_float_p, ] _lib.llama_sample_token_mirostat_v2.restype = llama_token From e24c3d7447e158164397686bbecac2d22d8a75a1 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 5 May 2023 14:05:31 -0400 Subject: [PATCH 21/34] Prefer explicit imports --- llama_cpp/llama_cpp.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 8ce3c89..f6a71fa 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -68,11 +68,11 @@ _lib_base_name = "llama" _lib = _load_shared_library(_lib_base_name) # C types -LLAMA_FILE_VERSION = ctypes.c_int(1) +LLAMA_FILE_VERSION = c_int(1) LLAMA_FILE_MAGIC = b"ggjt" LLAMA_FILE_MAGIC_UNVERSIONED = b"ggml" LLAMA_SESSION_MAGIC = b"ggsn" -LLAMA_SESSION_VERSION = ctypes.c_int(1) +LLAMA_SESSION_VERSION = c_int(1) llama_context_p = c_void_p @@ -128,18 +128,18 @@ class llama_context_params(Structure): llama_context_params_p = POINTER(llama_context_params) -LLAMA_FTYPE_ALL_F32 = ctypes.c_int(0) -LLAMA_FTYPE_MOSTLY_F16 = ctypes.c_int(1) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q4_0 = ctypes.c_int(2) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q4_1 = ctypes.c_int(3) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = ctypes.c_int( +LLAMA_FTYPE_ALL_F32 = c_int(0) +LLAMA_FTYPE_MOSTLY_F16 = c_int(1) # except 1d tensors +LLAMA_FTYPE_MOSTLY_Q4_0 = c_int(2) # except 1d tensors +LLAMA_FTYPE_MOSTLY_Q4_1 = c_int(3) # except 1d tensors +LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = c_int( 4 ) # tok_embeddings.weight and output.weight are F16 -LLAMA_FTYPE_MOSTLY_Q4_2 = ctypes.c_int(5) # except 1d tensors -# LLAMA_FTYPE_MOSTYL_Q4_3 = ctypes.c_int(6) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q8_0 = ctypes.c_int(7) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q5_0 = ctypes.c_int(8) # except 1d tensors -LLAMA_FTYPE_MOSTLY_Q5_1 = ctypes.c_int(9) # except 1d tensors +LLAMA_FTYPE_MOSTLY_Q4_2 = c_int(5) # except 1d tensors +# LLAMA_FTYPE_MOSTYL_Q4_3 = c_int(6) # except 1d tensors +LLAMA_FTYPE_MOSTLY_Q8_0 = c_int(7) # except 1d tensors +LLAMA_FTYPE_MOSTLY_Q5_0 = c_int(8) # except 1d tensors +LLAMA_FTYPE_MOSTLY_Q5_1 = c_int(9) # except 1d tensors # Misc c_float_p = POINTER(c_float) @@ -216,8 +216,8 @@ _lib.llama_model_quantize.restype = c_int # Returns 0 on success def llama_apply_lora_from_file( ctx: llama_context_p, - path_lora: ctypes.c_char_p, - path_base_model: ctypes.c_char_p, + path_lora: c_char_p, + path_base_model: c_char_p, n_threads: c_int, ) -> c_int: return _lib.llama_apply_lora_from_file(ctx, path_lora, path_base_model, n_threads) From 3e28e0e50ccd7b579ae99b0fbe163fbed8888167 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 5 May 2023 14:12:26 -0400 Subject: [PATCH 22/34] Fix: runtime type errors --- llama_cpp/llama_cpp.py | 52 ++++++++++++++++++++++++------------------ 1 file changed, 30 insertions(+), 22 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index f6a71fa..3b1ac1e 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -258,7 +258,9 @@ _lib.llama_get_state_size.restype = c_size_t # Copies the state to the specified destination address. # Destination needs to have allocated enough memory. 
# Returns the number of bytes copied -def llama_copy_state_data(ctx: llama_context_p, dest: Array[c_uint8]) -> c_size_t: +def llama_copy_state_data( + ctx: llama_context_p, dest # type: Array[c_uint8] +) -> c_size_t: return _lib.llama_copy_state_data(ctx, dest) @@ -282,9 +284,9 @@ _lib.llama_set_state_data.restype = c_size_t def llama_load_session_file( ctx: llama_context_p, path_session: bytes, - tokens_out: Array[llama_token], + tokens_out, # type: Array[llama_token] n_token_capacity: c_size_t, - n_token_count_out: _Pointer[c_size_t], + n_token_count_out, # type: _Pointer[c_size_t] ) -> c_size_t: return _lib.llama_load_session_file( ctx, path_session, tokens_out, n_token_capacity, n_token_count_out @@ -304,7 +306,7 @@ _lib.llama_load_session_file.restype = c_size_t def llama_save_session_file( ctx: llama_context_p, path_session: bytes, - tokens: Array[llama_token], + tokens, # type: Array[llama_token] n_token_count: c_size_t, ) -> c_size_t: return _lib.llama_save_session_file(ctx, path_session, tokens, n_token_count) @@ -325,7 +327,7 @@ _lib.llama_save_session_file.restype = c_size_t # Returns 0 on success def llama_eval( ctx: llama_context_p, - tokens: Array[llama_token], + tokens, # type: Array[llama_token] n_tokens: c_int, n_past: c_int, n_threads: c_int, @@ -345,7 +347,7 @@ _lib.llama_eval.restype = c_int def llama_tokenize( ctx: llama_context_p, text: bytes, - tokens: Array[llama_token], + tokens, # type: Array[llama_token] n_max_tokens: c_int, add_bos: c_bool, ) -> c_int: @@ -444,8 +446,8 @@ _lib.llama_token_nl.restype = llama_token # @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. def llama_sample_repetition_penalty( ctx: llama_context_p, - candidates: _Pointer[llama_token_data_array], - last_tokens_data: Array[llama_token], + candidates, # type: _Pointer[llama_token_data_array] + last_tokens_data, # type: Array[llama_token] last_tokens_size: c_int, penalty: c_float, ): @@ -467,8 +469,8 @@ _lib.llama_sample_repetition_penalty.restype = None # @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. def llama_sample_frequency_and_presence_penalties( ctx: llama_context_p, - candidates: _Pointer[llama_token_data_array], - last_tokens_data: Array[llama_token], + candidates, # type: _Pointer[llama_token_data_array] + last_tokens_data, # type: Array[llama_token] last_tokens_size: c_int, alpha_frequency: c_float, alpha_presence: c_float, @@ -495,7 +497,9 @@ _lib.llama_sample_frequency_and_presence_penalties.restype = None # @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. 
-def llama_sample_softmax(ctx: llama_context_p, candidates: _Pointer[llama_token_data]): +def llama_sample_softmax( + ctx: llama_context_p, candidates # type: _Pointer[llama_token_data] +): return _lib.llama_sample_softmax(ctx, candidates) @@ -509,7 +513,7 @@ _lib.llama_sample_softmax.restype = None # @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 def llama_sample_top_k( ctx: llama_context_p, - candidates: _Pointer[llama_token_data_array], + candidates, # type: _Pointer[llama_token_data_array] k: c_int, min_keep: c_size_t = c_size_t(1), ): @@ -528,7 +532,7 @@ _lib.llama_sample_top_k.restype = None # @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 def llama_sample_top_p( ctx: llama_context_p, - candidates: _Pointer[llama_token_data_array], + candidates, # type: _Pointer[llama_token_data_array] p: c_float, min_keep: c_size_t = c_size_t(1), ): @@ -547,7 +551,7 @@ _lib.llama_sample_top_p.restype = None # @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. def llama_sample_tail_free( ctx: llama_context_p, - candidates: _Pointer[llama_token_data_array], + candidates, # type: _Pointer[llama_token_data_array] z: c_float, min_keep: c_size_t = c_size_t(1), ): @@ -566,7 +570,7 @@ _lib.llama_sample_tail_free.restype = None # @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. def llama_sample_typical( ctx: llama_context_p, - candidates: _Pointer[llama_token_data_array], + candidates, # type: _Pointer[llama_token_data_array] p: c_float, min_keep: c_size_t = c_size_t(1), ): @@ -583,7 +587,9 @@ _lib.llama_sample_typical.restype = None def llama_sample_temperature( - ctx: llama_context_p, candidates: _Pointer[llama_token_data_array], temp: c_float + ctx: llama_context_p, + candidates, # type: _Pointer[llama_token_data_array] + temp: c_float, ): return _lib.llama_sample_temperature(ctx, candidates, temp) @@ -604,11 +610,11 @@ _lib.llama_sample_temperature.restype = None # @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. def llama_sample_token_mirostat( ctx: llama_context_p, - candidates: _Pointer[llama_token_data_array], + candidates, # type: _Pointer[llama_token_data_array] tau: c_float, eta: c_float, m: c_int, - mu: _Pointer[c_float], + mu, # type: _Pointer[c_float] ) -> llama_token: return _lib.llama_sample_token_mirostat(ctx, candidates, tau, eta, m, mu) @@ -631,10 +637,10 @@ _lib.llama_sample_token_mirostat.restype = llama_token # @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. def llama_sample_token_mirostat_v2( ctx: llama_context_p, - candidates: _Pointer[llama_token_data_array], + candidates, # type: _Pointer[llama_token_data_array] tau: c_float, eta: c_float, - mu: _Pointer[c_float], + mu, # type: _Pointer[c_float] ) -> llama_token: return _lib.llama_sample_token_mirostat_v2(ctx, candidates, tau, eta, mu) @@ -651,7 +657,8 @@ _lib.llama_sample_token_mirostat_v2.restype = llama_token # @details Selects the token with the highest probability. 
def llama_sample_token_greedy( - ctx: llama_context_p, candidates: _Pointer[llama_token_data_array] + ctx: llama_context_p, + candidates, # type: _Pointer[llama_token_data_array] ) -> llama_token: return _lib.llama_sample_token_greedy(ctx, candidates) @@ -665,7 +672,8 @@ _lib.llama_sample_token_greedy.restype = llama_token # @details Randomly selects a token from the candidates based on their probabilities. def llama_sample_token( - ctx: llama_context_p, candidates: _Pointer[llama_token_data_array] + ctx: llama_context_p, + candidates, # type: _Pointer[llama_token_data_array] ) -> llama_token: return _lib.llama_sample_token(ctx, candidates) From b5f3e746275bf231df544c60f30b80f537195af7 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 5 May 2023 14:22:55 -0400 Subject: [PATCH 23/34] Add return type annotations for embeddings and logits --- llama_cpp/llama_cpp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 3b1ac1e..ccec12c 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -387,7 +387,7 @@ _lib.llama_n_embd.restype = c_int # Can be mutated in order to change the probabilities of the next token # Rows: n_tokens # Cols: n_vocab -def llama_get_logits(ctx: llama_context_p): +def llama_get_logits(ctx: llama_context_p): # type: (...) -> Array[float] # type: ignore return _lib.llama_get_logits(ctx) @@ -397,7 +397,7 @@ _lib.llama_get_logits.restype = c_float_p # Get the embeddings for the input # shape: [n_embd] (1-dimensional) -def llama_get_embeddings(ctx: llama_context_p): +def llama_get_embeddings(ctx: llama_context_p): # type: (...) -> Array[float] # type: ignore return _lib.llama_get_embeddings(ctx) From 98bbd1c6a8ea1f86c010583f6b1ab74996a1c751 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 5 May 2023 14:23:14 -0400 Subject: [PATCH 24/34] Fix eval logits type --- llama_cpp/llama.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 6cd65a4..a643f51 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -127,7 +127,7 @@ class Llama: self.last_n_tokens_size = last_n_tokens_size self.n_batch = min(n_ctx, n_batch) self.eval_tokens: Deque[llama_cpp.llama_token] = deque(maxlen=n_ctx) - self.eval_logits: Deque[List[llama_cpp.c_float]] = deque( + self.eval_logits: Deque[List[float]] = deque( maxlen=n_ctx if logits_all else 1 ) @@ -245,7 +245,7 @@ class Llama: n_vocab = llama_cpp.llama_n_vocab(self.ctx) cols = int(n_vocab) logits_view = llama_cpp.llama_get_logits(self.ctx) - logits: List[List[llama_cpp.c_float]] = [ + logits: List[List[float]] = [ [logits_view[i * cols + j] for j in range(cols)] for i in range(rows) ] self.eval_logits.extend(logits) @@ -287,7 +287,7 @@ class Llama: candidates=llama_cpp.ctypes.pointer(candidates), penalty=repeat_penalty, ) - if float(temp) == 0.0: + if float(temp.value) == 0.0: return llama_cpp.llama_sample_token_greedy( ctx=self.ctx, candidates=llama_cpp.ctypes.pointer(candidates), From 79d50a29f40c4b14cd56a329ee50f269e673f277 Mon Sep 17 00:00:00 2001 From: Thomas Neu <81517187+th-neu@users.noreply.github.com> Date: Sat, 6 May 2023 01:02:59 +0200 Subject: [PATCH 25/34] Create dependabot.yml --- .github/dependabot.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 .github/dependabot.yml diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..91abb11 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,11 @@ +# To get started 
with Dependabot version updates, you'll need to specify which +# package ecosystems to update and where the package manifests are located. +# Please see the documentation for all configuration options: +# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates + +version: 2 +updates: + - package-ecosystem: "pip" # See documentation for possible values + directory: "/" # Location of package manifests + schedule: + interval: "weekly" From c9bb602b2682ae12c5690829fee1635fcdfc707c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 5 May 2023 23:25:53 +0000 Subject: [PATCH 26/34] Bump black from 23.1.0 to 23.3.0 Bumps [black](https://github.com/psf/black) from 23.1.0 to 23.3.0. - [Release notes](https://github.com/psf/black/releases) - [Changelog](https://github.com/psf/black/blob/main/CHANGES.md) - [Commits](https://github.com/psf/black/compare/23.1.0...23.3.0) --- updated-dependencies: - dependency-name: black dependency-type: direct:development update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- poetry.lock | 56 +++++++++++++++++++++++++------------------------- pyproject.toml | 2 +- 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/poetry.lock b/poetry.lock index a505168..129f923 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand. +# This file is automatically @generated by Poetry and should not be changed by hand. [[package]] name = "anyio" @@ -42,37 +42,37 @@ tests-no-zope = ["cloudpickle", "cloudpickle", "hypothesis", "hypothesis", "mypy [[package]] name = "black" -version = "23.1.0" +version = "23.3.0" description = "The uncompromising code formatter." 
category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "black-23.1.0-cp310-cp310-macosx_10_16_arm64.whl", hash = "sha256:b6a92a41ee34b883b359998f0c8e6eb8e99803aa8bf3123bf2b2e6fec505a221"}, - {file = "black-23.1.0-cp310-cp310-macosx_10_16_universal2.whl", hash = "sha256:57c18c5165c1dbe291d5306e53fb3988122890e57bd9b3dcb75f967f13411a26"}, - {file = "black-23.1.0-cp310-cp310-macosx_10_16_x86_64.whl", hash = "sha256:9880d7d419bb7e709b37e28deb5e68a49227713b623c72b2b931028ea65f619b"}, - {file = "black-23.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e6663f91b6feca5d06f2ccd49a10f254f9298cc1f7f49c46e498a0771b507104"}, - {file = "black-23.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:9afd3f493666a0cd8f8df9a0200c6359ac53940cbde049dcb1a7eb6ee2dd7074"}, - {file = "black-23.1.0-cp311-cp311-macosx_10_16_arm64.whl", hash = "sha256:bfffba28dc52a58f04492181392ee380e95262af14ee01d4bc7bb1b1c6ca8d27"}, - {file = "black-23.1.0-cp311-cp311-macosx_10_16_universal2.whl", hash = "sha256:c1c476bc7b7d021321e7d93dc2cbd78ce103b84d5a4cf97ed535fbc0d6660648"}, - {file = "black-23.1.0-cp311-cp311-macosx_10_16_x86_64.whl", hash = "sha256:382998821f58e5c8238d3166c492139573325287820963d2f7de4d518bd76958"}, - {file = "black-23.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2bf649fda611c8550ca9d7592b69f0637218c2369b7744694c5e4902873b2f3a"}, - {file = "black-23.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:121ca7f10b4a01fd99951234abdbd97728e1240be89fde18480ffac16503d481"}, - {file = "black-23.1.0-cp37-cp37m-macosx_10_16_x86_64.whl", hash = "sha256:a8471939da5e824b891b25751955be52ee7f8a30a916d570a5ba8e0f2eb2ecad"}, - {file = "black-23.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8178318cb74f98bc571eef19068f6ab5613b3e59d4f47771582f04e175570ed8"}, - {file = "black-23.1.0-cp37-cp37m-win_amd64.whl", hash = "sha256:a436e7881d33acaf2536c46a454bb964a50eff59b21b51c6ccf5a40601fbef24"}, - {file = "black-23.1.0-cp38-cp38-macosx_10_16_arm64.whl", hash = "sha256:a59db0a2094d2259c554676403fa2fac3473ccf1354c1c63eccf7ae65aac8ab6"}, - {file = "black-23.1.0-cp38-cp38-macosx_10_16_universal2.whl", hash = "sha256:0052dba51dec07ed029ed61b18183942043e00008ec65d5028814afaab9a22fd"}, - {file = "black-23.1.0-cp38-cp38-macosx_10_16_x86_64.whl", hash = "sha256:49f7b39e30f326a34b5c9a4213213a6b221d7ae9d58ec70df1c4a307cf2a1580"}, - {file = "black-23.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:162e37d49e93bd6eb6f1afc3e17a3d23a823042530c37c3c42eeeaf026f38468"}, - {file = "black-23.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:8b70eb40a78dfac24842458476135f9b99ab952dd3f2dab738c1881a9b38b753"}, - {file = "black-23.1.0-cp39-cp39-macosx_10_16_arm64.whl", hash = "sha256:a29650759a6a0944e7cca036674655c2f0f63806ddecc45ed40b7b8aa314b651"}, - {file = "black-23.1.0-cp39-cp39-macosx_10_16_universal2.whl", hash = "sha256:bb460c8561c8c1bec7824ecbc3ce085eb50005883a6203dcfb0122e95797ee06"}, - {file = "black-23.1.0-cp39-cp39-macosx_10_16_x86_64.whl", hash = "sha256:c91dfc2c2a4e50df0026f88d2215e166616e0c80e86004d0003ece0488db2739"}, - {file = "black-23.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2a951cc83ab535d248c89f300eccbd625e80ab880fbcfb5ac8afb5f01a258ac9"}, - {file = "black-23.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:0680d4380db3719ebcfb2613f34e86c8e6d15ffeabcf8ec59355c5e7b85bb555"}, - {file = "black-23.1.0-py3-none-any.whl", hash = 
"sha256:7a0f701d314cfa0896b9001df70a530eb2472babb76086344e688829efd97d32"}, - {file = "black-23.1.0.tar.gz", hash = "sha256:b0bd97bea8903f5a2ba7219257a44e3f1f9d00073d6cc1add68f0beec69692ac"}, + {file = "black-23.3.0-cp310-cp310-macosx_10_16_arm64.whl", hash = "sha256:0945e13506be58bf7db93ee5853243eb368ace1c08a24c65ce108986eac65915"}, + {file = "black-23.3.0-cp310-cp310-macosx_10_16_universal2.whl", hash = "sha256:67de8d0c209eb5b330cce2469503de11bca4085880d62f1628bd9972cc3366b9"}, + {file = "black-23.3.0-cp310-cp310-macosx_10_16_x86_64.whl", hash = "sha256:7c3eb7cea23904399866c55826b31c1f55bbcd3890ce22ff70466b907b6775c2"}, + {file = "black-23.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:32daa9783106c28815d05b724238e30718f34155653d4d6e125dc7daec8e260c"}, + {file = "black-23.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:35d1381d7a22cc5b2be2f72c7dfdae4072a3336060635718cc7e1ede24221d6c"}, + {file = "black-23.3.0-cp311-cp311-macosx_10_16_arm64.whl", hash = "sha256:a8a968125d0a6a404842fa1bf0b349a568634f856aa08ffaff40ae0dfa52e7c6"}, + {file = "black-23.3.0-cp311-cp311-macosx_10_16_universal2.whl", hash = "sha256:c7ab5790333c448903c4b721b59c0d80b11fe5e9803d8703e84dcb8da56fec1b"}, + {file = "black-23.3.0-cp311-cp311-macosx_10_16_x86_64.whl", hash = "sha256:a6f6886c9869d4daae2d1715ce34a19bbc4b95006d20ed785ca00fa03cba312d"}, + {file = "black-23.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f3c333ea1dd6771b2d3777482429864f8e258899f6ff05826c3a4fcc5ce3f70"}, + {file = "black-23.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:11c410f71b876f961d1de77b9699ad19f939094c3a677323f43d7a29855fe326"}, + {file = "black-23.3.0-cp37-cp37m-macosx_10_16_x86_64.whl", hash = "sha256:1d06691f1eb8de91cd1b322f21e3bfc9efe0c7ca1f0e1eb1db44ea367dff656b"}, + {file = "black-23.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:50cb33cac881766a5cd9913e10ff75b1e8eb71babf4c7104f2e9c52da1fb7de2"}, + {file = "black-23.3.0-cp37-cp37m-win_amd64.whl", hash = "sha256:e114420bf26b90d4b9daa597351337762b63039752bdf72bf361364c1aa05925"}, + {file = "black-23.3.0-cp38-cp38-macosx_10_16_arm64.whl", hash = "sha256:48f9d345675bb7fbc3dd85821b12487e1b9a75242028adad0333ce36ed2a6d27"}, + {file = "black-23.3.0-cp38-cp38-macosx_10_16_universal2.whl", hash = "sha256:714290490c18fb0126baa0fca0a54ee795f7502b44177e1ce7624ba1c00f2331"}, + {file = "black-23.3.0-cp38-cp38-macosx_10_16_x86_64.whl", hash = "sha256:064101748afa12ad2291c2b91c960be28b817c0c7eaa35bec09cc63aa56493c5"}, + {file = "black-23.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:562bd3a70495facf56814293149e51aa1be9931567474993c7942ff7d3533961"}, + {file = "black-23.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:e198cf27888ad6f4ff331ca1c48ffc038848ea9f031a3b40ba36aced7e22f2c8"}, + {file = "black-23.3.0-cp39-cp39-macosx_10_16_arm64.whl", hash = "sha256:3238f2aacf827d18d26db07524e44741233ae09a584273aa059066d644ca7b30"}, + {file = "black-23.3.0-cp39-cp39-macosx_10_16_universal2.whl", hash = "sha256:f0bd2f4a58d6666500542b26354978218a9babcdc972722f4bf90779524515f3"}, + {file = "black-23.3.0-cp39-cp39-macosx_10_16_x86_64.whl", hash = "sha256:92c543f6854c28a3c7f39f4d9b7694f9a6eb9d3c5e2ece488c327b6e7ea9b266"}, + {file = "black-23.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a150542a204124ed00683f0db1f5cf1c2aaaa9cc3495b7a3b5976fb136090ab"}, + {file = "black-23.3.0-cp39-cp39-win_amd64.whl", hash = 
"sha256:6b39abdfb402002b8a7d030ccc85cf5afff64ee90fa4c5aebc531e3ad0175ddb"}, + {file = "black-23.3.0-py3-none-any.whl", hash = "sha256:ec751418022185b0c1bb7d7736e6933d40bbb14c14a0abcf9123d1b159f98dd4"}, + {file = "black-23.3.0.tar.gz", hash = "sha256:1c7b8d606e728a41ea1ccbd7264677e494e87cf630e399262ced92d4a8dac940"}, ] [package.dependencies] @@ -1458,4 +1458,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "aa15e57300668bd23c051b4cd87bec4c1a58dcccd2f2b4767579fea7f2c5fa41" +content-hash = "714083b7f30a677f9a358a9633970fb88b8198d50558a0b50bf311d4a209ed4c" diff --git a/pyproject.toml b/pyproject.toml index ca0346f..a164ef7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ typing-extensions = "^4.5.0" [tool.poetry.group.dev.dependencies] -black = "^23.1.0" +black = "^23.3.0" twine = "^4.0.2" mkdocs = "^1.4.2" mkdocstrings = {extras = ["python"], version = "^0.20.0"} From fdcab2286c8d9e91779590d6facb3aee34456169 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 6 May 2023 21:11:57 +0000 Subject: [PATCH 27/34] Bump mkdocs-material from 9.1.4 to 9.1.9 Bumps [mkdocs-material](https://github.com/squidfunk/mkdocs-material) from 9.1.4 to 9.1.9. - [Release notes](https://github.com/squidfunk/mkdocs-material/releases) - [Changelog](https://github.com/squidfunk/mkdocs-material/blob/master/CHANGELOG) - [Commits](https://github.com/squidfunk/mkdocs-material/compare/9.1.4...9.1.9) --- updated-dependencies: - dependency-name: mkdocs-material dependency-type: direct:development update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- poetry.lock | 8 ++++---- pyproject.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index 129f923..287d05e 100644 --- a/poetry.lock +++ b/poetry.lock @@ -792,14 +792,14 @@ mkdocs = ">=1.1" [[package]] name = "mkdocs-material" -version = "9.1.4" +version = "9.1.9" description = "Documentation that simply works" category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "mkdocs_material-9.1.4-py3-none-any.whl", hash = "sha256:4c92dcf9365068259bef3eed8e0dd5410056b6f7187bdea2d52848c0f94cd94c"}, - {file = "mkdocs_material-9.1.4.tar.gz", hash = "sha256:c3a8943e9e4a7d2624291da365bbccf0b9f88688aa6947a46260d8c165cd4389"}, + {file = "mkdocs_material-9.1.9-py3-none-any.whl", hash = "sha256:7db24261cb17400e132c46d17eea712bfe71056d892a9beba32cf68210297141"}, + {file = "mkdocs_material-9.1.9.tar.gz", hash = "sha256:74d8da1371ab3a326868fe47bae3cbc4aa22e93c048b4ca5117e6817b88bd734"}, ] [package.dependencies] @@ -1458,4 +1458,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "714083b7f30a677f9a358a9633970fb88b8198d50558a0b50bf311d4a209ed4c" +content-hash = "a921481e74f47e925f7ec2814fa0bc2e07707cb36fd12d9b33ecc6b0402a27c8" diff --git a/pyproject.toml b/pyproject.toml index a164ef7..55ca8ce 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,7 @@ black = "^23.3.0" twine = "^4.0.2" mkdocs = "^1.4.2" mkdocstrings = {extras = ["python"], version = "^0.20.0"} -mkdocs-material = "^9.1.4" +mkdocs-material = "^9.1.9" pytest = "^7.2.2" httpx = "^0.24.0" From 2a21b8f69e7049f03a4ab3e0b5ec51d81456a796 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: 
Sat, 6 May 2023 21:16:08 +0000 Subject: [PATCH 28/34] Bump mkdocs from 1.4.2 to 1.4.3 Bumps [mkdocs](https://github.com/mkdocs/mkdocs) from 1.4.2 to 1.4.3. - [Release notes](https://github.com/mkdocs/mkdocs/releases) - [Commits](https://github.com/mkdocs/mkdocs/compare/1.4.2...1.4.3) --- updated-dependencies: - dependency-name: mkdocs dependency-type: direct:development update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- poetry.lock | 8 ++++---- pyproject.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index 287d05e..d30dc8f 100644 --- a/poetry.lock +++ b/poetry.lock @@ -747,14 +747,14 @@ files = [ [[package]] name = "mkdocs" -version = "1.4.2" +version = "1.4.3" description = "Project documentation with Markdown." category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "mkdocs-1.4.2-py3-none-any.whl", hash = "sha256:c8856a832c1e56702577023cd64cc5f84948280c1c0fcc6af4cd39006ea6aa8c"}, - {file = "mkdocs-1.4.2.tar.gz", hash = "sha256:8947af423a6d0facf41ea1195b8e1e8c85ad94ac95ae307fe11232e0424b11c5"}, + {file = "mkdocs-1.4.3-py3-none-any.whl", hash = "sha256:6ee46d309bda331aac915cd24aab882c179a933bd9e77b80ce7d2eaaa3f689dd"}, + {file = "mkdocs-1.4.3.tar.gz", hash = "sha256:5955093bbd4dd2e9403c5afaf57324ad8b04f16886512a3ee6ef828956481c57"}, ] [package.dependencies] @@ -1458,4 +1458,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "a921481e74f47e925f7ec2814fa0bc2e07707cb36fd12d9b33ecc6b0402a27c8" +content-hash = "f2de41d10587a7f21e4891584de2c7152dfa6f75809144778b2dc34d93395abe" diff --git a/pyproject.toml b/pyproject.toml index 55ca8ce..1f79b74 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ typing-extensions = "^4.5.0" [tool.poetry.group.dev.dependencies] black = "^23.3.0" twine = "^4.0.2" -mkdocs = "^1.4.2" +mkdocs = "^1.4.3" mkdocstrings = {extras = ["python"], version = "^0.20.0"} mkdocs-material = "^9.1.9" pytest = "^7.2.2" From 33d41fb8f3f949e29d4038fdf542ee8445af190a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 7 May 2023 00:07:39 +0000 Subject: [PATCH 29/34] Bump pytest from 7.2.2 to 7.3.1 Bumps [pytest](https://github.com/pytest-dev/pytest) from 7.2.2 to 7.3.1. - [Release notes](https://github.com/pytest-dev/pytest/releases) - [Changelog](https://github.com/pytest-dev/pytest/blob/main/CHANGELOG.rst) - [Commits](https://github.com/pytest-dev/pytest/compare/7.2.2...7.3.1) --- updated-dependencies: - dependency-name: pytest dependency-type: direct:development update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- poetry.lock | 30 +++++------------------------- pyproject.toml | 2 +- 2 files changed, 6 insertions(+), 26 deletions(-) diff --git a/poetry.lock b/poetry.lock index d30dc8f..0bd08d5 100644 --- a/poetry.lock +++ b/poetry.lock @@ -21,25 +21,6 @@ doc = ["packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme"] test = ["contextlib2", "coverage[toml] (>=4.5)", "hypothesis (>=4.0)", "mock (>=4)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (<0.15)", "uvloop (>=0.15)"] trio = ["trio (>=0.16,<0.22)"] -[[package]] -name = "attrs" -version = "22.2.0" -description = "Classes Without Boilerplate" -category = "dev" -optional = false -python-versions = ">=3.6" -files = [ - {file = "attrs-22.2.0-py3-none-any.whl", hash = "sha256:29e95c7f6778868dbd49170f98f8818f78f3dc5e0e37c0b1f474e3561b240836"}, - {file = "attrs-22.2.0.tar.gz", hash = "sha256:c9227bfc2f01993c03f68db37d1d15c9690188323c067c641f1a35ca58185f99"}, -] - -[package.extras] -cov = ["attrs[tests]", "coverage-enable-subprocess", "coverage[toml] (>=5.3)"] -dev = ["attrs[docs,tests]"] -docs = ["furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier", "zope.interface"] -tests = ["attrs[tests-no-zope]", "zope.interface"] -tests-no-zope = ["cloudpickle", "cloudpickle", "hypothesis", "hypothesis", "mypy (>=0.971,<0.990)", "mypy (>=0.971,<0.990)", "pympler", "pympler", "pytest (>=4.3.0)", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-mypy-plugins", "pytest-xdist[psutil]", "pytest-xdist[psutil]"] - [[package]] name = "black" version = "23.3.0" @@ -1007,18 +988,17 @@ pyyaml = "*" [[package]] name = "pytest" -version = "7.2.2" +version = "7.3.1" description = "pytest: simple powerful testing with Python" category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "pytest-7.2.2-py3-none-any.whl", hash = "sha256:130328f552dcfac0b1cec75c12e3f005619dc5f874f0a06e8ff7263f0ee6225e"}, - {file = "pytest-7.2.2.tar.gz", hash = "sha256:c99ab0c73aceb050f68929bc93af19ab6db0558791c6a0715723abe9d0ade9d4"}, + {file = "pytest-7.3.1-py3-none-any.whl", hash = "sha256:3799fa815351fea3a5e96ac7e503a96fa51cc9942c3753cda7651b93c1cfa362"}, + {file = "pytest-7.3.1.tar.gz", hash = "sha256:434afafd78b1d78ed0addf160ad2b77a30d35d4bdf8af234fe621919d9ed15e3"}, ] [package.dependencies] -attrs = ">=19.2.0" colorama = {version = "*", markers = "sys_platform == \"win32\""} exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} iniconfig = "*" @@ -1027,7 +1007,7 @@ pluggy = ">=0.12,<2.0" tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} [package.extras] -testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"] +testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"] [[package]] name = "python-dateutil" @@ -1458,4 +1438,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "f2de41d10587a7f21e4891584de2c7152dfa6f75809144778b2dc34d93395abe" +content-hash = "e02fff3d4a50fbc9a89f6f001409a5f066c26a341c2d5f2dfbfb32f07e711eca" diff --git a/pyproject.toml b/pyproject.toml index 1f79b74..6f83611 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,7 +23,7 @@ twine = "^4.0.2" mkdocs = "^1.4.3" mkdocstrings = {extras = ["python"], version = "^0.20.0"} mkdocs-material = "^9.1.9" -pytest = "^7.2.2" 
+pytest = "^7.3.1" httpx = "^0.24.0" [build-system] From ae3c639764359890e692776cfb87ff84b911532f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 7 May 2023 00:16:31 +0000 Subject: [PATCH 30/34] Bump mkdocstrings from 0.20.0 to 0.21.2 Bumps [mkdocstrings](https://github.com/mkdocstrings/mkdocstrings) from 0.20.0 to 0.21.2. - [Release notes](https://github.com/mkdocstrings/mkdocstrings/releases) - [Changelog](https://github.com/mkdocstrings/mkdocstrings/blob/master/CHANGELOG.md) - [Commits](https://github.com/mkdocstrings/mkdocstrings/compare/0.20.0...0.21.2) --- updated-dependencies: - dependency-name: mkdocstrings dependency-type: direct:development update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- poetry.lock | 9 +++++---- pyproject.toml | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index 0bd08d5..5b364a7 100644 --- a/poetry.lock +++ b/poetry.lock @@ -808,14 +808,14 @@ files = [ [[package]] name = "mkdocstrings" -version = "0.20.0" +version = "0.21.2" description = "Automatic documentation from sources, for MkDocs." category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "mkdocstrings-0.20.0-py3-none-any.whl", hash = "sha256:f17fc2c4f760ec302b069075ef9e31045aa6372ca91d2f35ded3adba8e25a472"}, - {file = "mkdocstrings-0.20.0.tar.gz", hash = "sha256:c757f4f646d4f939491d6bc9256bfe33e36c5f8026392f49eaa351d241c838e5"}, + {file = "mkdocstrings-0.21.2-py3-none-any.whl", hash = "sha256:949ef8da92df9d692ca07be50616459a6b536083a25520fd54b00e8814ce019b"}, + {file = "mkdocstrings-0.21.2.tar.gz", hash = "sha256:304e56a2e90595708a38a13a278e538a67ad82052dd5c8b71f77a604a4f3d911"}, ] [package.dependencies] @@ -826,6 +826,7 @@ mkdocs = ">=1.2" mkdocs-autorefs = ">=0.3.1" mkdocstrings-python = {version = ">=0.5.2", optional = true, markers = "extra == \"python\""} pymdown-extensions = ">=6.3" +typing-extensions = {version = ">=4.1", markers = "python_version < \"3.10\""} [package.extras] crystal = ["mkdocstrings-crystal (>=0.3.4)"] @@ -1438,4 +1439,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "e02fff3d4a50fbc9a89f6f001409a5f066c26a341c2d5f2dfbfb32f07e711eca" +content-hash = "e87403dcd0a0b8484436b02c392326adfaf22b8d7e182d77e4a155c67a7435bc" diff --git a/pyproject.toml b/pyproject.toml index 6f83611..a11faef 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,7 +21,7 @@ typing-extensions = "^4.5.0" black = "^23.3.0" twine = "^4.0.2" mkdocs = "^1.4.3" -mkdocstrings = {extras = ["python"], version = "^0.20.0"} +mkdocstrings = {extras = ["python"], version = "^0.21.2"} mkdocs-material = "^9.1.9" pytest = "^7.3.1" httpx = "^0.24.0" From bc853e3742fd2a4718bd66bd501bdb5ede50f6d3 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 6 May 2023 21:32:50 -0400 Subject: [PATCH 31/34] Fix type for eval_logits in LlamaState object --- llama_cpp/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index a643f51..fc91ea4 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -51,7 +51,7 @@ class LlamaState: def __init__( self, eval_tokens: Deque[llama_cpp.llama_token], - eval_logits: Deque[List[llama_cpp.c_float]], + eval_logits: Deque[List[float]], llama_state, # type: llama_cpp.Array[llama_cpp.c_uint8] llama_state_size: llama_cpp.c_size_t, ): From 
c76e0913bbc6a039f5456ca44f4d84966e5c14fd Mon Sep 17 00:00:00 2001 From: Andrei Date: Sat, 6 May 2023 22:18:31 -0400 Subject: [PATCH 32/34] Update issue templates --- .github/ISSUE_TEMPLATE/bug_report.md | 80 +++++++++++++++++++++++ .github/ISSUE_TEMPLATE/feature_request.md | 20 ++++++ 2 files changed, 100 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/bug_report.md create mode 100644 .github/ISSUE_TEMPLATE/feature_request.md diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 0000000..b8e33e5 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,80 @@ +--- +name: Bug report +about: Create a report to help us improve +title: '' +labels: '' +assignees: '' + +--- + +# Prerequisites + +Please answer the following questions for yourself before submitting an issue. + +- [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now. +- [ ] I carefully followed the [README.md](https://github.com/abetlen/llama-cpp-python/blob/main/README.md). +- [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed). +- [ ] I reviewed the [Discussions](https://github.com/abetlen/llama-cpp-python/discussions), and have a new bug or useful enhancement to share. + +# Expected Behavior + +Please provide a detailed written description of what you were trying to do, and what you expected `llama-cpp-python` to do. + +# Current Behavior + +Please provide a detailed written description of what `llama-cpp-python` did, instead. + +# Environment and Context + +Please provide detailed information about your computer setup. This is important in case the issue is not reproducible except for under certain specific conditions. + +* Physical (or virtual) hardware you are using, e.g. for Linux: + +`$ lscpu` + +* Operating System, e.g. for Linux: + +`$ uname -a` + +* SDK version, e.g. for Linux: + +``` +$ python3 --version +$ make --version +$ g++ --version +``` + +# Failure Information (for bugs) + +Please help provide information about the failure if this is a bug. If it is not a bug, please remove the rest of this template. + +# Steps to Reproduce + +Please provide detailed steps for reproducing the issue. We are not sitting in front of your screen, so the more detail the better. + +1. step 1 +2. step 2 +3. step 3 +4. etc. + +**Note: Many issues seem to be regarding performance issues / differences with `llama.cpp`. In these cases we need to confirm that you're comparing against the version of `llama.cpp` that was built with your python package, and which parameters you're passing to the context.** + +# Failure Logs + +Please include any relevant log snippets or files. If it works under one configuration but not under another, please provide logs for both configurations and their corresponding outputs so it is easy to see where behavior changes. + +Also, please try to **avoid using screenshots** if at all possible. Instead, copy/paste the console output and use [Github's markdown](https://docs.github.com/en/get-started/writing-on-github/getting-started-with-writing-and-formatting-on-github/basic-writing-and-formatting-syntax) to cleanly format your logs for easy readability. 
+ +Example environment info: +``` +llama-cpp-python$ git log | head -1 +commit 47b0aa6e957b93dbe2c29d53af16fbae2dd628f2 + +llama-cpp-python$ python3 --version +Python 3.10.10 + +llama-cpp-python$ pip list | egrep "uvicorn|fastapi|sse-starlette" +fastapi 0.95.0 +sse-starlette 1.3.3 +uvicorn 0.21.1 +``` diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 0000000..bbcbbe7 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,20 @@ +--- +name: Feature request +about: Suggest an idea for this project +title: '' +labels: '' +assignees: '' + +--- + +**Is your feature request related to a problem? Please describe.** +A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] + +**Describe the solution you'd like** +A clear and concise description of what you want to happen. + +**Describe alternatives you've considered** +A clear and concise description of any alternative solutions or features you've considered. + +**Additional context** +Add any other context or screenshots about the feature request here. From 7c3743fe5f2781a8aab9ba8e15f4d250963747cf Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 7 May 2023 00:12:47 -0400 Subject: [PATCH 33/34] Update llama.cpp --- llama_cpp/llama.py | 4 ++++ llama_cpp/llama_cpp.py | 8 ++++---- vendor/llama.cpp | 2 +- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index fc91ea4..0db5c10 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -297,21 +297,25 @@ class Llama: ctx=self.ctx, candidates=llama_cpp.ctypes.pointer(candidates), k=top_k, + min_keep=llama_cpp.c_size_t(1), ) llama_cpp.llama_sample_tail_free( ctx=self.ctx, candidates=llama_cpp.ctypes.pointer(candidates), z=llama_cpp.c_float(1.0), + min_keep=llama_cpp.c_size_t(1), ) llama_cpp.llama_sample_typical( ctx=self.ctx, candidates=llama_cpp.ctypes.pointer(candidates), p=llama_cpp.c_float(1.0), + min_keep=llama_cpp.c_size_t(1), ) llama_cpp.llama_sample_top_p( ctx=self.ctx, candidates=llama_cpp.ctypes.pointer(candidates), p=top_p, + min_keep=llama_cpp.c_size_t(1), ) llama_cpp.llama_sample_temperature( ctx=self.ctx, diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index ccec12c..527ed7c 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -515,7 +515,7 @@ def llama_sample_top_k( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] k: c_int, - min_keep: c_size_t = c_size_t(1), + min_keep: c_size_t, ): return _lib.llama_sample_top_k(ctx, candidates, k, min_keep) @@ -534,7 +534,7 @@ def llama_sample_top_p( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] p: c_float, - min_keep: c_size_t = c_size_t(1), + min_keep: c_size_t, ): return _lib.llama_sample_top_p(ctx, candidates, p, min_keep) @@ -553,7 +553,7 @@ def llama_sample_tail_free( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] z: c_float, - min_keep: c_size_t = c_size_t(1), + min_keep: c_size_t, ): return _lib.llama_sample_tail_free(ctx, candidates, z, min_keep) @@ -572,7 +572,7 @@ def llama_sample_typical( ctx: llama_context_p, candidates, # type: _Pointer[llama_token_data_array] p: c_float, - min_keep: c_size_t = c_size_t(1), + min_keep: c_size_t, ): return _lib.llama_sample_typical(ctx, candidates, p, min_keep) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 2edbdb0..1b0fd45 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject 
commit 2edbdb0f99336cb41f0995061c7602ed54beb863
+Subproject commit 1b0fd454650ef4d68a980e3225488b79e6e9af25

From 397ae97f64bb235db5a773a63caaeea5b258a20c Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Sun, 7 May 2023 01:41:19 -0400
Subject: [PATCH 34/34] Update README

---
 README.md | 25 +++++++++++++++++++++++--
 1 file changed, 23 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index c46fa11..9daca60 100644
--- a/README.md
+++ b/README.md
@@ -31,6 +31,10 @@ You can force the use of `cmake` on Linux / MacOS setting the `FORCE_CMAKE=1` en
 
 ## High-level API
 
+The high-level API provides a simple managed interface through the `Llama` class.
+
+Below is a short example demonstrating how to use the high-level API to generate text:
+
 ```python
 >>> from llama_cpp import Llama
 >>> llm = Llama(model_path="./models/7B/ggml-model.bin")
@@ -90,8 +94,25 @@ docker run --rm -it -p8000:8000 -v /path/to/models:/models -eMODEL=/models/ggml-
 
 ## Low-level API
 
-The low-level API is a direct `ctypes` binding to the C API provided by `llama.cpp`.
-The entire API can be found in [llama_cpp/llama_cpp.py](https://github.com/abetlen/llama-cpp-python/blob/master/llama_cpp/llama_cpp.py) and should mirror [llama.h](https://github.com/ggerganov/llama.cpp/blob/master/llama.h).
+The low-level API is a direct [`ctypes`](https://docs.python.org/3/library/ctypes.html) binding to the C API provided by `llama.cpp`.
+The entire low-level API can be found in [llama_cpp/llama_cpp.py](https://github.com/abetlen/llama-cpp-python/blob/master/llama_cpp/llama_cpp.py) and directly mirrors the C API in [llama.h](https://github.com/ggerganov/llama.cpp/blob/master/llama.h).
+
+Below is a short example demonstrating how to use the low-level API to tokenize a prompt:
+
+```python
+>>> import llama_cpp
+>>> import ctypes
+>>> params = llama_cpp.llama_context_default_params()
+# use bytes for char * params
+>>> ctx = llama_cpp.llama_init_from_file(b"./models/7b/ggml-model.bin", params)
+>>> max_tokens = params.n_ctx
+# use ctypes arrays for array params
+>>> tokens = (llama_cpp.llama_token * int(max_tokens))()
+>>> n_tokens = llama_cpp.llama_tokenize(ctx, b"Q: Name the planets in the solar system? A: ", tokens, max_tokens, add_bos=llama_cpp.c_bool(True))
+>>> llama_cpp.llama_free(ctx)
+```
+
+Check out the [examples folder](examples/low_level_api) for more examples of using the low-level API.
 
 # Documentation