First version

This commit is contained in:
M
2023-11-12 02:41:11 +01:00
parent e9e9efaf0a
commit 96c3b0feb9
5 changed files with 177 additions and 1 deletion

2
.gitignore vendored Normal file

@@ -0,0 +1,2 @@
pytorch_model_*.bin
whisper/**

README.md

@@ -1,2 +1,26 @@
# ollama-voice
Plug whisper audio transcription into a local ollama server and output TTS audio responses
This is just a simple combination of three tools in offline mode:
- Speech recognition: [whisper](https://github.com/openai/whisper) running local models in offline mode
- Large Language Model: [ollama](https://github.com/jmorganca/ollama) running local models in offline mode
- Offline Text To Speech: [pyttsx3](https://pypi.org/project/pyttsx3/)
## Prerequisites
Whisper dependencies are set up to run on the GPU, so install CUDA before running `pip install` (a quick check follows).
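A quick way to confirm that the CUDA build of torch is active before downloading models (just a sanity check, not part of the project):

```python
import torch

# With the pinned requirements the version string should end in "+cu121".
print(torch.__version__)
# True means whisper can transcribe on the GPU (fp16); False falls back to CPU.
print(torch.cuda.is_available())
```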
## Running
Install [ollama](https://ollama.ai/) and ensure the server is started locally first (under Windows, in WSL) (e.g. `curl https://ollama.ai/install.sh | sh`); a reachability check is sketched after these steps.
Download a [whisper](https://github.com/openai/whisper) [model](https://github.com/openai/whisper#available-models-and-languages) and place it in the `whisper` subfolder (e.g. https://openaipublic.azureedge.net/main/whisper/models/e5b1a55b89c1367dacf97e3e19bfd829a01529dbfdeefa8caeb59b3f1b81dadb/large-v3.pt)
Configure `assistant.yaml` settings. (It is set up to work in French with the ollama [mistral](https://ollama.ai/library/mistral) model by default...)
Run `assistant.py`
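Before launching the assistant you can verify the ollama server is reachable (this assumes the default port 11434; the root endpoint simply reports that the server is up):

```python
import requests

try:
    # Ollama's root endpoint replies with a short status message when the server is running.
    print(requests.get("http://localhost:11434", timeout=2).text)
except requests.exceptions.ConnectionError:
    print("Ollama server not reachable - start it first (e.g. with `ollama serve`).")
```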
## Todo
- Allow a full conversation with a "press to talk" function between requests
- Process ollama JSON responses in stream mode to generate voice at the end of each sentence (a rough sketch follows below).
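A possible sketch for the streaming item above, assuming Ollama's documented streaming format for `/api/generate` (one JSON object per line, each carrying a partial `response` and a `done` flag); the `speak` callback and the sentence splitting are illustrative, not part of the current code:

```python
import json
import requests

def ask_ollama_streaming(url, model, prompt, speak):
    """Hypothetical variant of ask_ollama: speak each sentence as soon as it is complete."""
    buffer = ""
    with requests.post(url, json={"model": model, "stream": True, "prompt": prompt},
                       stream=True) as response:
        for line in response.iter_lines():
            if not line:
                continue
            chunk = json.loads(line)
            buffer += chunk.get("response", "")
            # Speak everything up to the last sentence-ending punctuation seen so far.
            cut = max(buffer.rfind(p) for p in ".!?") + 1
            if cut > 0:
                speak(buffer[:cut])
                buffer = buffer[cut:]
            if chunk.get("done"):
                break
    if buffer.strip():
        speak(buffer)
```

Wired into the existing class, this could be called as `ask_ollama_streaming(self.config.ollama.url, self.config.ollama.model, full_prompt, self.text_to_speech)`.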

125
assistant.py Normal file

@@ -0,0 +1,125 @@
import pyttsx3
import numpy as np
import whisper
import pyaudio
import sys
import torch
import requests
import json
import yaml
from yaml import Loader

if sys.version_info[0:3] != (3, 9, 13):
    print('Warning, it was only tested with python 3.9.13, it may fail')

# Audio capture settings: 16 kHz mono 16-bit PCM, as expected by whisper.
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000
CHUNK = 1024

OLLAMA_REST_HEADERS = {'Content-Type': 'application/json'}
INPUT_CONFIG_PATH = "assistant.yaml"


class Assistant:

    def __init__(self):
        self.config = self.initConfig()
        self.audio = pyaudio.PyAudio()
        print("Loading Whisper model...")
        self.model = whisper.load_model(self.config.whisperRecognition.modelPath)
        self.tts = pyttsx3.init()
        self.conversation_history = [self.config.conversation.context
                                     + self.config.conversation.greeting + "\n"]

    def initConfig(self):
        # Built-in defaults, overridden below by the values in assistant.yaml.
        class Inst:
            pass
        config = Inst()
        config.whisperRecognition = Inst()
        config.whisperRecognition.modelPath = "whisper/large-v3.pt"
        config.whisperRecognition.lang = "fr"
        config.ollama = Inst()
        config.ollama.url = "http://localhost:11434/api/generate"
        config.ollama.model = 'mistral'
        config.conversation = Inst()
        config.conversation.context = "This is a discussion in french.\n"
        config.conversation.greeting = "Je vous écoute."  # "I'm listening."
        config.conversation.recognitionWaitMsg = "J'interprète votre demande."  # "I'm interpreting your request."
        config.conversation.llmWaitMsg = "Laissez moi réfléchir."  # "Let me think."

        stream = open(INPUT_CONFIG_PATH, 'r', encoding="utf-8")
        dic = yaml.load(stream, Loader=Loader)

        # dic depth 2: map values to attributes
        def dic2Object(dic, object):
            for key in dic:
                setattr(object, key, dic[key])

        # dic depth 1: fill depth 2 attributes
        for key in dic:
            dic2Object(dic[key], getattr(config, key))

        return config

    def waveform_from_mic(self, duration=5) -> np.ndarray:
        stream = self.audio.open(format=FORMAT, channels=CHANNELS,
                                 rate=RATE, input=True,
                                 frames_per_buffer=CHUNK)
        frames = []
        for _ in range(0, int(RATE / CHUNK * duration)):
            data = stream.read(CHUNK)
            frames.append(data)
        stream.stop_stream()
        stream.close()
        # PyAudio is terminated here, so only one recording is possible per run
        # (see the "full conversation" item in the README Todo).
        self.audio.terminate()

        # Convert 16-bit PCM to the float32 waveform in [-1, 1] expected by whisper.
        return np.frombuffer(b''.join(frames), np.int16).astype(np.float32) * (1 / 32768.0)

    def speech_to_text(self, waveform):
        print("Finished recording, converting to text...")
        self.text_to_speech(self.config.conversation.recognitionWaitMsg)
        transcript = self.model.transcribe(waveform,
                                           language=self.config.whisperRecognition.lang,
                                           fp16=torch.cuda.is_available())
        return transcript["text"]

    def ask_ollama(self, prompt):
        print("Sending: ", prompt)
        self.text_to_speech(prompt + self.config.conversation.llmWaitMsg)
        self.conversation_history.append(prompt)
        full_prompt = "\n".join(self.conversation_history)
        response = requests.post(self.config.ollama.url,
                                 json={"model": self.config.ollama.model,
                                       "stream": False,
                                       "prompt": full_prompt},
                                 headers=OLLAMA_REST_HEADERS)
        if response.status_code == 200:
            data = json.loads(response.text)
            response_text = data["response"]
            self.conversation_history.append(response_text)
            print("Received: ", response_text)
            return response_text
        else:
            return "Erreur: " + response.text  # "Error: ..."

    def text_to_speech(self, text):
        self.tts.say(text)
        self.tts.runAndWait()


def main():
    ass = Assistant()
    ass.text_to_speech(ass.config.conversation.greeting)
    print("Recording...")
    speech = ass.waveform_from_mic()
    transcription = ass.speech_to_text(waveform=speech)
    response = ass.ask_ollama(transcription)
    ass.text_to_speech(text=response)


if __name__ == "__main__":
    main()

13
assistant.yaml Normal file

@@ -0,0 +1,13 @@
whisperRecognition:
    modelPath: "whisper/large-v3.pt"
    lang: "fr"

ollama:
    url: "http://localhost:11434/api/generate"
    model: "mistral"

conversation:
    context: "This is a discussion in french.\\n"
    greeting: " Je vous écoute."
    recognitionWaitMsg: " J'interprète votre demande."
    llmWaitMsg: " Laissez moi réfléchir."

12
requirements.txt Normal file

@@ -0,0 +1,12 @@
cuda-python==12.3.0
--extra-index-url https://download.pytorch.org/whl/cu121
torch==2.1.0+cu121
torchvision==0.16.0+cu121
torchaudio==2.1.0+cu121
pyttsx3==2.90
blobfile==2.1.1
openai==1.2.3
Wave==0.0.2
openai-whisper @ git+https://github.com/openai/whisper.git@fcfeaf1b61994c071bba62da47d7846933576ac9
PyAudio==0.2.14
pyyaml==6.0.1