mirror of https://github.com/maudoin/ollama-voice.git (synced 2024-04-20 16:48:11 +03:00)
First version
2 .gitignore vendored Normal file
@@ -0,0 +1,2 @@
pytorch_model_*.bin
whisper/**
26 README.md
@@ -1,2 +1,26 @@
# ollama-voice

Plug whisper audio transcription into a local ollama server and output TTS audio responses.

This is just a simple combination of three tools in offline mode:

- Speech recognition: [whisper](https://github.com/openai/whisper) running local models in offline mode
- Large Language Model: [ollama](https://github.com/jmorganca/ollama) running local models in offline mode
- Offline Text To Speech: [pyttsx3](https://pypi.org/project/pyttsx3/)
## Prerequisites

The whisper dependencies are set up to run on the GPU, so install CUDA before running `pip install`.
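As a quick sanity check (not part of this commit), you can verify that PyTorch actually sees the GPU before launching the assistant; `torch.cuda.is_available()` is the same test `assistant.py` passes to whisper's `transcribe(..., fp16=...)`:

```python
# Hypothetical check, assuming the torch build from requirements.txt is installed
import torch

if torch.cuda.is_available():
    print("CUDA available:", torch.cuda.get_device_name(0))
else:
    print("CUDA not available: whisper will fall back to fp32 on the CPU (much slower)")
```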
## Running

Install [ollama](https://ollama.ai/) and make sure the server is started locally first (under Windows, run it inside WSL), e.g. `curl https://ollama.ai/install.sh | sh`.
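To confirm the server is reachable before wiring the audio in, here is a minimal request (not part of this commit) against the same endpoint and payload shape that `assistant.py` uses, assuming the default URL and `mistral` model from `assistant.yaml`:

```python
import requests

# Same endpoint, headers and JSON body shape as Assistant.ask_ollama()
response = requests.post(
    "http://localhost:11434/api/generate",
    json={"model": "mistral", "stream": False, "prompt": "Bonjour"},
    headers={"Content-Type": "application/json"},
)
response.raise_for_status()
print(response.json()["response"])
```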
Download a [whisper](https://github.com/openai/whisper) [model](https://github.com/openai/whisper#available-models-and-languages) and place it in the `whisper` subfolder (e.g. https://openaipublic.azureedge.net/main/whisper/models/e5b1a55b89c1367dacf97e3e19bfd829a01529dbfdeefa8caeb59b3f1b81dadb/large-v3.pt).

Configure the settings in `assistant.yaml`. (It is set up to work in French with the ollama [mistral](https://ollama.ai/library/mistral) model by default.)

Run `assistant.py`.
## Todo

- Allow a full conversation with a "press to talk" function between requests
- Process ollama JSON responses in stream mode to generate voice at the end of each sentence (a possible sketch follows this list)
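A possible shape for that second item, sketched under the assumption that calling the same `/api/generate` endpoint with `"stream": true` yields one JSON object per line, each carrying a partial `response` field; the `say` callback and the sentence splitting are illustrative, not part of this commit:

```python
import json
import requests

def stream_and_speak(url, model, prompt, say):
    """Hypothetical streaming variant of Assistant.ask_ollama():
    speak each sentence as soon as it is complete instead of
    waiting for the full response."""
    buffer = ""
    with requests.post(url,
                       json={"model": model, "stream": True, "prompt": prompt},
                       headers={"Content-Type": "application/json"},
                       stream=True) as response:
        response.raise_for_status()
        for line in response.iter_lines():
            if not line:
                continue
            chunk = json.loads(line)
            buffer += chunk.get("response", "")
            # flush every finished sentence to the TTS callback
            while True:
                cuts = [buffer.index(c) for c in ".!?" if c in buffer]
                if not cuts:
                    break
                end = min(cuts) + 1
                say(buffer[:end].strip())
                buffer = buffer[end:]
    if buffer.strip():
        say(buffer.strip())
```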
125 assistant.py Normal file
@@ -0,0 +1,125 @@
import pyttsx3
import numpy as np
import whisper
import pyaudio
import sys
import torch
import requests
import json
import yaml
from yaml import Loader

if sys.version_info[0:3] != (3, 9, 13):
    print("Warning: only tested with Python 3.9.13, it may fail on other versions")

# Audio capture settings (16 kHz mono, 16-bit PCM, as expected by whisper)
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000
CHUNK = 1024
OLLAMA_REST_HEADERS = {"Content-Type": "application/json"}
INPUT_CONFIG_PATH = "assistant.yaml"


class Assistant:

    def __init__(self):
        self.config = self.initConfig()
        self.audio = pyaudio.PyAudio()
        print("Loading Whisper model...")
        self.model = whisper.load_model(self.config.whisperRecognition.modelPath)
        self.tts = pyttsx3.init()
        self.conversation_history = [self.config.conversation.context
                                     + self.config.conversation.greeting + "\n"]

    def initConfig(self):
        # Hard-coded defaults, overridden below by assistant.yaml
        class Inst:
            pass

        config = Inst()
        config.whisperRecognition = Inst()
        config.whisperRecognition.modelPath = "whisper/large-v3.pt"
        config.whisperRecognition.lang = "fr"
        config.ollama = Inst()
        config.ollama.url = "http://localhost:11434/api/generate"
        config.ollama.model = "mistral"
        config.conversation = Inst()
        config.conversation.context = "This is a discussion in french.\n"
        config.conversation.greeting = "Je vous écoute."
        config.conversation.recognitionWaitMsg = "J'interprète votre demande."
        config.conversation.llmWaitMsg = "Laissez moi réfléchir."

        with open(INPUT_CONFIG_PATH, "r", encoding="utf-8") as stream:
            dic = yaml.load(stream, Loader=Loader)

        # dic depth 2: map values to attributes
        def dic2Object(dic, obj):
            for key in dic:
                setattr(obj, key, dic[key])

        # dic depth 1: fill depth 2 attributes
        for key in dic:
            dic2Object(dic[key], getattr(config, key))

        return config

    def waveform_from_mic(self, duration=5) -> np.ndarray:
        # Record `duration` seconds from the default microphone
        stream = self.audio.open(format=FORMAT, channels=CHANNELS,
                                 rate=RATE, input=True,
                                 frames_per_buffer=CHUNK)
        frames = []

        for _ in range(0, int(RATE / CHUNK * duration)):
            data = stream.read(CHUNK)
            frames.append(data)

        stream.stop_stream()
        stream.close()
        # PyAudio is terminated here, so each Assistant instance records only once
        self.audio.terminate()

        # int16 PCM -> float32 in [-1, 1], the format whisper expects
        return np.frombuffer(b"".join(frames), np.int16).astype(np.float32) * (1 / 32768.0)

    def speech_to_text(self, waveform):
        print("Finished recording, converting to text...")
        self.text_to_speech(self.config.conversation.recognitionWaitMsg)

        transcript = self.model.transcribe(waveform,
                                           language=self.config.whisperRecognition.lang,
                                           fp16=torch.cuda.is_available())
        return transcript["text"]

    def ask_ollama(self, prompt):
        print("Sending: ", prompt)
        self.text_to_speech(prompt + self.config.conversation.llmWaitMsg)

        # Send the whole conversation so far so the model keeps some context
        self.conversation_history.append(prompt)
        full_prompt = "\n".join(self.conversation_history)
        response = requests.post(self.config.ollama.url,
                                 json={"model": self.config.ollama.model,
                                       "stream": False,
                                       "prompt": full_prompt},
                                 headers=OLLAMA_REST_HEADERS)
        if response.status_code == 200:
            data = json.loads(response.text)
            response_text = data["response"]
            self.conversation_history.append(response_text)
            print("Received: ", response_text)
            return response_text
        else:
            return "Erreur: " + response.text

    def text_to_speech(self, text):
        self.tts.say(text)
        self.tts.runAndWait()


def main():
    assistant = Assistant()

    assistant.text_to_speech(assistant.config.conversation.greeting)
    print("Recording...")

    speech = assistant.waveform_from_mic()
    transcription = assistant.speech_to_text(waveform=speech)
    response = assistant.ask_ollama(transcription)
    assistant.text_to_speech(text=response)


if __name__ == "__main__":
    main()
13 assistant.yaml Normal file
@@ -0,0 +1,13 @@
whisperRecognition:
  modelPath: "whisper/large-v3.pt"
  lang: "fr"

ollama:
  url: "http://localhost:11434/api/generate"
  model: "mistral"

conversation:
  context: "This is a discussion in french.\\n"
  greeting: " Je vous écoute."
  recognitionWaitMsg: " J'interprète votre demande."
  llmWaitMsg: " Laissez moi réfléchir."
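For reference, `Assistant.initConfig()` fills in hard-coded defaults first and then overlays this file: each top-level section (`whisperRecognition`, `ollama`, `conversation`) is looked up on the config object and every nested key is copied onto it with `setattr`. A minimal standalone sketch of that mapping (the `load_config` name and `Section` class are hypothetical, not part of this commit):

```python
import yaml
from yaml import Loader

def load_config(path="assistant.yaml"):
    """Hypothetical helper mirroring Assistant.initConfig():
    expose the two-level YAML as attribute-style objects."""
    class Section:
        pass

    with open(path, "r", encoding="utf-8") as stream:
        dic = yaml.load(stream, Loader=Loader)

    config = Section()
    for section_name, values in dic.items():   # e.g. "ollama"
        section = Section()
        for key, value in values.items():      # e.g. "url", "model"
            setattr(section, key, value)
        setattr(config, section_name, section)
    return config

# usage: cfg = load_config(); cfg.ollama.url, cfg.whisperRecognition.modelPath, ...
```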
12 requirements.txt Normal file
@@ -0,0 +1,12 @@
cuda-python==12.3.0
--extra-index-url https://download.pytorch.org/whl/cu121
torch==2.1.0+cu121
torchvision==0.16.0+cu121
torchaudio==2.1.0+cu121
pyttsx3==2.90
blobfile==2.1.1
openai==1.2.3
Wave==0.0.2
openai-whisper @ git+https://github.com/openai/whisper.git@fcfeaf1b61994c071bba62da47d7846933576ac9
PyAudio==0.2.14
pyyaml==6.0.1