Push to talk, basic UI and ollama response stream

M
2023-11-12 18:28:47 +01:00
parent b2f53b9075
commit 928471b50b
4 changed files with 165 additions and 41 deletions

assistant.png: new binary file (822 B image), not shown.


@@ -8,9 +8,18 @@ import requests
import json
import yaml
from yaml import Loader
import pygame, sys
import pygame.locals
BACK_COLOR = (0,0,0)
REC_COLOR = (255,0,0)
TEXT_COLOR = (255,255,255)
REC_SIZE = 80
FONT_SIZE = 24
WIDTH = 320
HEIGHT = 240
if sys.version_info[0:3] != (3, 9, 13):
print('Warning, it was only tested with python 3.9.13, it may fail')
INPUT_DEFAULT_DURATION_SECONDS = 5
INPUT_FORMAT = pyaudio.paInt16
@@ -25,18 +34,60 @@ class Assistant:
def __init__(self):
self.config = self.initConfig()
print("Loading Whisper model...")
programIcon = pygame.image.load('assistant.png')
self.clock = pygame.time.Clock()
pygame.display.set_icon(programIcon)
pygame.display.set_caption("Assistant")
self.windowSurface = pygame.display.set_mode((WIDTH, HEIGHT), 0, 32)
self.font = pygame.font.SysFont(None, FONT_SIZE)
self.audio = pyaudio.PyAudio()
try:
self.audio.open(format=INPUT_FORMAT,
channels=INPUT_CHANNELS,
rate=INPUT_RATE,
input=True,
frames_per_buffer=INPUT_CHUNK).close()
except:
self.wait_exit()
self.display_message(self.config.messages.loadingModel)
self.model = whisper.load_model(self.config.whisperRecognition.modelPath)
self.tts = pyttsx3.init()
self.audio = pyaudio.PyAudio()
self.conversation_history = [self.config.conversation.context+self.config.conversation.greeting+"\n"]
self.conversation_history = [self.config.conversation.context,
self.config.conversation.greeting]
self.context = []
self.display_ready()
self.text_to_speech(self.config.conversation.greeting)
def wait_exit(self):
while True:
self.display_message(self.config.messages.noAudioInput)
self.clock.tick(60)
for event in pygame.event.get():
if event.type == pygame.locals.QUIT:
self.shutdown()
def shutdown(self):
self.audio.terminate()
pygame.quit()
sys.exit()
def initConfig(self):
class Inst:
pass
config = Inst()
config.messages = Inst()
config.messages.pressSpace = "Pressez sur espace pour parler puis relachez."
config.messages.loadingModel = "Loading model..."
config.messages.noAudioInput = "Erreur: Pas d'entrée son"
config.whisperRecognition = Inst()
config.whisperRecognition.modelPath = "whisper/large-v3.pt"
config.whisperRecognition.lang = "fr"
@@ -54,72 +105,139 @@ class Assistant:
#dic depth 2: map values to attributes
def dic2Object(dic, object):
for key in dic:
setattr(object, key, dic[key])
if hasattr(object, key):
setattr(object, key, dic[key])
else:
print("Ignoring unknow setting ", key)
#dic depth 1: fill depth 2 attributes
for key in dic:
dic2Object(dic[key], getattr(config, key))
if hasattr(config, key):
dic2Object(dic[key], getattr(config, key))
else:
print("Ignoring unknow setting ", key)
return config
def waveform_from_mic(self, duration=INPUT_DEFAULT_DURATION_SECONDS) -> np.ndarray:
def display_rec_start(self):
self.windowSurface.fill(BACK_COLOR)
pygame.draw.circle(self.windowSurface, REC_COLOR, (WIDTH/2, HEIGHT/2), REC_SIZE)
pygame.display.flip()
stream = self.audio.open(format=INPUT_FORMAT, channels=INPUT_CHANNELS,
rate=INPUT_RATE, input=True,
def display_message(self, text):
self.windowSurface.fill(BACK_COLOR)
label = self.font.render(text, 1, TEXT_COLOR)
size = label.get_rect()[2:4]
self.windowSurface.blit(label, (WIDTH/2 - size[0]/2, HEIGHT/2 - size[1]/2))
pygame.display.flip()
def display_ready(self):
self.display_message(self.config.messages.pressSpace)
def waveform_from_mic(self, key = pygame.K_SPACE) -> np.ndarray:
self.display_rec_start()
stream = self.audio.open(format=INPUT_FORMAT,
channels=INPUT_CHANNELS,
rate=INPUT_RATE,
input=True,
frames_per_buffer=INPUT_CHUNK)
frames = []
for _ in range(0, int(INPUT_RATE / INPUT_CHUNK * duration)):
data = stream.read(INPUT_CHUNK)
frames.append(data)
while True:
pygame.event.pump() # process event queue
pressed = pygame.key.get_pressed()
if pressed[key]:
data = stream.read(INPUT_CHUNK)
frames.append(data)
else:
break
stream.stop_stream()
stream.close()
self.audio.terminate()
self.display_ready()
return np.frombuffer(b''.join(frames), np.int16).astype(np.float32) * (1 / 32768.0)
def speech_to_text(self, waveform):
print("Finished recording, converting to text...")
self.text_to_speech(self.config.conversation.recognitionWaitMsg)
transcript = self.model.transcribe(waveform, language = self.config.whisperRecognition.lang, fp16=torch.cuda.is_available())
return transcript["text"]
transcript = self.model.transcribe(waveform,
language = self.config.whisperRecognition.lang,
fp16=torch.cuda.is_available())
text = transcript["text"]
self.text_to_speech(text)
return text
def ask_ollama(self, prompt):
print("Sending: ", prompt)
self.text_to_speech(prompt+self.config.conversation.llmWaitMsg)
def ask_ollama(self, prompt, responseCallback):
self.text_to_speech(self.config.conversation.llmWaitMsg)
self.conversation_history.append(prompt)
full_prompt = "\n".join(self.conversation_history)
response = requests.post(self.config.ollama.url, json= {"model": self.config.ollama.model,"stream":False,"prompt":full_prompt}, headers=OLLAMA_REST_HEADERS)
if response.status_code == 200:
data = json.loads(response.text)
response_text = data["response"]
self.conversation_history.append(response_text)
print("Received: ", response_text)
return response_text
else:
return "Erreur: " + response.text
jsonParam= {"model": self.config.ollama.model,
"stream":True,
"context":self.context,
"prompt":full_prompt}
print(jsonParam)
response = requests.post(self.config.ollama.url,
json=jsonParam,
headers=OLLAMA_REST_HEADERS,
stream=True)
response.raise_for_status()
tokens = []
for line in response.iter_lines():
print(line)
body = json.loads(line)
token = body.get('response', '')
tokens.append(token)
# the response streams one token at a time, print that as we receive it
if token == "." or token == ":":
responseCallback("".join(tokens))
tokens = []
if 'error' in body:
responseCallback("Erreur: " + body['error'])
if body.get('done', False):
self.context = body['context']
def text_to_speech(self, text):
print(text)
self.tts.say(text)
self.tts.runAndWait()
def main():
if sys.version_info[0:3] != (3, 9, 13):
print('Warning, it was only tested with python 3.9.13, it may fail')
pygame.init()
ass = Assistant()
ass.text_to_speech(ass.config.conversation.greeting)
print("Recording...")
push_to_talk_key = pygame.K_SPACE
while True:
ass.clock.tick(60)
for event in pygame.event.get():
if event.type == pygame.KEYDOWN and event.key == push_to_talk_key:
print('Talk to me!')
speech = ass.waveform_from_mic(push_to_talk_key)
speech = ass.waveform_from_mic()
transcription = ass.speech_to_text(waveform=speech)
ass.ask_ollama(transcription, ass.text_to_speech)
print('Done')
transcription = ass.speech_to_text(waveform=speech)
response = ass.ask_ollama(transcription)
if event.type == pygame.locals.QUIT:
ass.shutdown()
ass.text_to_speech(text=response)
if __name__ == "__main__":
main()
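For reference, a minimal standalone sketch of the streaming call that the new ask_ollama method makes. With "stream": true, Ollama's generate endpoint returns one JSON object per line, each carrying a "response" token, and a final object with "done": true plus a "context" array to reuse on the next request. The URL and model below are placeholders; the assistant reads the real values from its yaml config.

# Sketch of the streaming request used by ask_ollama (URL and model are assumptions).
import json
import requests

OLLAMA_URL = "http://localhost:11434/api/generate"  # assumed default endpoint

def stream_ollama(prompt, context=None, model="mistral"):
    payload = {"model": model,
               "prompt": prompt,
               "stream": True,
               "context": context or []}
    resp = requests.post(OLLAMA_URL, json=payload, stream=True,
                         headers={"Content-Type": "application/json"})
    resp.raise_for_status()
    tokens = []
    for line in resp.iter_lines():
        if not line:
            continue
        body = json.loads(line)              # one JSON object per streamed line
        tokens.append(body.get("response", ""))
        if body.get("done", False):           # final object carries the context
            return "".join(tokens), body.get("context", [])
    return "".join(tokens), context or []

# Example: text, ctx = stream_ollama("Bonjour, qui es-tu ?")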


@@ -1,3 +1,8 @@
messages:
pressSpace: "Pressez sur espace pour parler puis relachez."
loadingModel: "Chargement du modèle..."
noAudioInput: "Erreur: Pas d'entrée son"
whisperRecognition:
modelPath: "whisper/large-v3.pt"
lang: "fr"
@@ -7,7 +12,7 @@ ollama:
model: "mistral"
conversation:
context: "This is a discussion in french.\\n"
greeting: " Je vous écoute."
recognitionWaitMsg: " J'interprète votre demande."
llmWaitMsg: " Laissez moi réfléchir."
context: "This is a discussion in french."
greeting: "Je vous écoute."
recognitionWaitMsg: "Oui."
llmWaitMsg: "Laissez moi réfléchir."
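A small sketch of how this yaml maps onto the defaults built in initConfig: yaml.load with the imported Loader yields a two-level dict, and a dic2Object-style pass copies each known key onto the matching attribute of the prefilled config object, warning on unknown keys. The file name below is an assumption.

# Sketch of the config loading path; the yaml file name is an assumption.
import yaml
from yaml import Loader

class Inst:
    pass

def apply_config(config, path="assistant.yaml"):
    with open(path) as f:
        dic = yaml.load(f, Loader=Loader)   # depth-2 dict: section -> settings
    for section, settings in dic.items():
        if not hasattr(config, section):
            print("Ignoring unknown setting ", section)
            continue
        target = getattr(config, section)
        for key, value in settings.items():
            if hasattr(target, key):
                setattr(target, key, value)
            else:
                print("Ignoring unknown setting ", key)
    return config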


@@ -9,4 +9,5 @@ openai==1.2.3
Wave==0.0.2
openai-whisper @ git+https://github.com/openai/whisper.git@fcfeaf1b61994c071bba62da47d7846933576ac9
PyAudio==0.2.14
pyyaml==6.0.1
pyyaml==6.0.1
pygame==2.5.2