First version

This commit is contained in:
M
2023-11-12 02:41:11 +01:00
parent e9e9efaf0a
commit 96c3b0feb9
5 changed files with 177 additions and 1 deletion

2
.gitignore vendored Normal file

@@ -0,0 +1,2 @@
pytorch_model_*.bin
whisper/**

README.md

@@ -1,2 +1,26 @@
# ollama-voice
Plug whisper audio transcription into a local ollama server and output TTS audio responses
This is just a simple combination of three tools in offline mode:
- Speech recognition: [whisper](https://github.com/openai/whisper) running local models in offline mode
- Large Language Model: [ollama](https://github.com/jmorganca/ollama) running local models in offline mode
- Offline Text To Speech: [pyttsx3](https://pypi.org/project/pyttsx3/)
## Prerequisites
Whisper dependencies are set up to run on the GPU, so install CUDA before running `pip install` (a quick check follows).
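A quick way to confirm that the CUDA build of torch is active before downloading models (just a sanity check, not part of the project):

```python
import torch

# With the pinned requirements the version string should end in "+cu121".
print(torch.__version__)
# True means whisper can transcribe on the GPU (fp16); False falls back to CPU.
print(torch.cuda.is_available())
```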
## Running
Install [ollama](https://ollama.ai/) and ensure the server is started locally first (under Windows, in WSL) (e.g. `curl https://ollama.ai/install.sh | sh`); a reachability check is sketched after these steps.
Download a [whisper](https://github.com/openai/whisper) [model](https://github.com/openai/whisper#available-models-and-languages) and place it in the `whisper` subfolder (e.g. https://openaipublic.azureedge.net/main/whisper/models/e5b1a55b89c1367dacf97e3e19bfd829a01529dbfdeefa8caeb59b3f1b81dadb/large-v3.pt)
Configure `assistant.yaml` settings. (It is set up to work in French with the ollama [mistral](https://ollama.ai/library/mistral) model by default...)
Run `assistant.py`
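Before launching the assistant you can verify the ollama server is reachable (this assumes the default port 11434; the root endpoint simply reports that the server is up):

```python
import requests

try:
    # Ollama's root endpoint replies with a short status message when the server is running.
    print(requests.get("http://localhost:11434", timeout=2).text)
except requests.exceptions.ConnectionError:
    print("Ollama server not reachable - start it first (e.g. with `ollama serve`).")
```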
## Todo
- Allow a full conversation with a "press to talk" function between requests
- Process ollama JSON responses in stream mode to generate voice at the end of each sentence (a rough sketch follows below).
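A possible sketch for the streaming item above, assuming Ollama's documented streaming format for `/api/generate` (one JSON object per line, each carrying a partial `response` and a `done` flag); the `speak` callback and the sentence splitting are illustrative, not part of the current code:

```python
import json
import requests

def ask_ollama_streaming(url, model, prompt, speak):
    """Hypothetical variant of ask_ollama: speak each sentence as soon as it is complete."""
    buffer = ""
    with requests.post(url, json={"model": model, "stream": True, "prompt": prompt},
                       stream=True) as response:
        for line in response.iter_lines():
            if not line:
                continue
            chunk = json.loads(line)
            buffer += chunk.get("response", "")
            # Speak everything up to the last sentence-ending punctuation seen so far.
            cut = max(buffer.rfind(p) for p in ".!?") + 1
            if cut > 0:
                speak(buffer[:cut])
                buffer = buffer[cut:]
            if chunk.get("done"):
                break
    if buffer.strip():
        speak(buffer)
```

Wired into the existing class, this could be called as `ask_ollama_streaming(self.config.ollama.url, self.config.ollama.model, full_prompt, self.text_to_speech)`.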

125
assistant.py Normal file

@@ -0,0 +1,125 @@
import pyttsx3
import numpy as np
import whisper
import pyaudio
import sys
import torch
import requests
import json
import yaml
from yaml import Loader

if sys.version_info[0:3] != (3, 9, 13):
    print('Warning, it was only tested with python 3.9.13, it may fail')

# Audio capture settings: 16 kHz mono 16-bit PCM, as expected by whisper.
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000
CHUNK = 1024

OLLAMA_REST_HEADERS = {'Content-Type': 'application/json'}
INPUT_CONFIG_PATH = "assistant.yaml"


class Assistant:

    def __init__(self):
        self.config = self.initConfig()
        self.audio = pyaudio.PyAudio()
        print("Loading Whisper model...")
        self.model = whisper.load_model(self.config.whisperRecognition.modelPath)
        self.tts = pyttsx3.init()
        self.conversation_history = [self.config.conversation.context
                                     + self.config.conversation.greeting + "\n"]

    def initConfig(self):
        # Built-in defaults, overridden below by the values in assistant.yaml.
        class Inst:
            pass
        config = Inst()
        config.whisperRecognition = Inst()
        config.whisperRecognition.modelPath = "whisper/large-v3.pt"
        config.whisperRecognition.lang = "fr"
        config.ollama = Inst()
        config.ollama.url = "http://localhost:11434/api/generate"
        config.ollama.model = 'mistral'
        config.conversation = Inst()
        config.conversation.context = "This is a discussion in french.\n"
        config.conversation.greeting = "Je vous écoute."  # "I'm listening."
        config.conversation.recognitionWaitMsg = "J'interprète votre demande."  # "I'm interpreting your request."
        config.conversation.llmWaitMsg = "Laissez moi réfléchir."  # "Let me think."

        stream = open(INPUT_CONFIG_PATH, 'r', encoding="utf-8")
        dic = yaml.load(stream, Loader=Loader)

        # dic depth 2: map values to attributes
        def dic2Object(dic, object):
            for key in dic:
                setattr(object, key, dic[key])

        # dic depth 1: fill depth 2 attributes
        for key in dic:
            dic2Object(dic[key], getattr(config, key))

        return config

    def waveform_from_mic(self, duration=5) -> np.ndarray:
        stream = self.audio.open(format=FORMAT, channels=CHANNELS,
                                 rate=RATE, input=True,
                                 frames_per_buffer=CHUNK)
        frames = []
        for _ in range(0, int(RATE / CHUNK * duration)):
            data = stream.read(CHUNK)
            frames.append(data)
        stream.stop_stream()
        stream.close()
        # PyAudio is terminated here, so only one recording is possible per run
        # (see the "full conversation" item in the README Todo).
        self.audio.terminate()

        # Convert 16-bit PCM to the float32 waveform in [-1, 1] expected by whisper.
        return np.frombuffer(b''.join(frames), np.int16).astype(np.float32) * (1 / 32768.0)

    def speech_to_text(self, waveform):
        print("Finished recording, converting to text...")
        self.text_to_speech(self.config.conversation.recognitionWaitMsg)
        transcript = self.model.transcribe(waveform,
                                           language=self.config.whisperRecognition.lang,
                                           fp16=torch.cuda.is_available())
        return transcript["text"]

    def ask_ollama(self, prompt):
        print("Sending: ", prompt)
        self.text_to_speech(prompt + self.config.conversation.llmWaitMsg)
        self.conversation_history.append(prompt)
        full_prompt = "\n".join(self.conversation_history)
        response = requests.post(self.config.ollama.url,
                                 json={"model": self.config.ollama.model,
                                       "stream": False,
                                       "prompt": full_prompt},
                                 headers=OLLAMA_REST_HEADERS)
        if response.status_code == 200:
            data = json.loads(response.text)
            response_text = data["response"]
            self.conversation_history.append(response_text)
            print("Received: ", response_text)
            return response_text
        else:
            return "Erreur: " + response.text  # "Error: ..."

    def text_to_speech(self, text):
        self.tts.say(text)
        self.tts.runAndWait()


def main():
    ass = Assistant()
    ass.text_to_speech(ass.config.conversation.greeting)
    print("Recording...")
    speech = ass.waveform_from_mic()
    transcription = ass.speech_to_text(waveform=speech)
    response = ass.ask_ollama(transcription)
    ass.text_to_speech(text=response)


if __name__ == "__main__":
    main()

13
assistant.yaml Normal file

@@ -0,0 +1,13 @@
whisperRecognition:
    modelPath: "whisper/large-v3.pt"
    lang: "fr"

ollama:
    url: "http://localhost:11434/api/generate"
    model: "mistral"

conversation:
    context: "This is a discussion in french.\\n"
    greeting: " Je vous écoute."
    recognitionWaitMsg: " J'interprète votre demande."
    llmWaitMsg: " Laissez moi réfléchir."

12
requirements.txt Normal file

@@ -0,0 +1,12 @@
cuda-python==12.3.0
--extra-index-url https://download.pytorch.org/whl/cu121
torch==2.1.0+cu121
torchvision==0.16.0+cu121
torchaudio==2.1.0+cu121
pyttsx3==2.90
blobfile==2.1.1
openai==1.2.3
Wave==0.0.2
openai-whisper @ git+https://github.com/openai/whisper.git@fcfeaf1b61994c071bba62da47d7846933576ac9
PyAudio==0.2.14
pyyaml==6.0.1