Remove unnecessary errors="ignore" arguments from UTF-8 encoding and add a UTF-8 test

2023-09-07 17:34:22 +03:00 · 2023-04-29 12:19:22 +02:00
parent b7d14efc8b
commit 18a0c10032
2 changed files with 39 additions and 5 deletions
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -358,7 +358,7 @@ class Llama:
        if self.verbose:
            llama_cpp.llama_reset_timings(self.ctx)

-        tokens = self.tokenize(input.encode("utf-8", errors="ignore"))
+        tokens = self.tokenize(input.encode("utf-8"))
        self.reset()
        self.eval(tokens)
        n_tokens = len(tokens)
@@ -416,7 +416,7 @@ class Llama:
        completion_tokens: List[llama_cpp.llama_token] = []
        # Add blank space to start of prompt to match OG llama tokenizer
        prompt_tokens: List[llama_cpp.llama_token] = self.tokenize(
-            b" " + prompt.encode("utf-8", errors="ignore")
+            b" " + prompt.encode("utf-8")
        )
        text: bytes = b""
        returned_characters: int = 0
@@ -431,7 +431,7 @@ class Llama:
            )

        if stop != []:
-            stop_sequences = [s.encode("utf-8", errors="ignore") for s in stop]
+            stop_sequences = [s.encode("utf-8") for s in stop]
        else:
            stop_sequences = []