Mirror of https://github.com/maudoin/ollama-voice.git, synced 2024-04-20 16:48:11 +03:00

Commit: Push to talk, basic UI and ollama response stream
assistant.png: new binary file (822 B, content not shown)

assistant.py: 190 changed lines
@@ -8,9 +8,18 @@ import requests
 import json
 import yaml
 from yaml import Loader
+import pygame, sys
+import pygame.locals
 
+BACK_COLOR = (0,0,0)
+REC_COLOR = (255,0,0)
+TEXT_COLOR = (255,255,255)
+REC_SIZE = 80
+FONT_SIZE = 24
+WIDTH = 320
+HEIGHT = 240
+
+
-if sys.version_info[0:3] != (3, 9, 13):
-    print('Warning, it was only tested with python 3.9.13, it may fail')
 
 INPUT_DEFAULT_DURATION_SECONDS = 5
 INPUT_FORMAT = pyaudio.paInt16
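The constants added above configure the small pygame window this commit introduces. A self-contained sketch of the recording indicator they drive, using the same drawing calls as the display_rec_start method further down (the window loop around it is illustrative, not part of the commit):

    import pygame, sys

    BACK_COLOR = (0, 0, 0)
    REC_COLOR = (255, 0, 0)
    REC_SIZE = 80
    WIDTH = 320
    HEIGHT = 240

    pygame.init()
    surface = pygame.display.set_mode((WIDTH, HEIGHT), 0, 32)

    # Same drawing sequence as display_rec_start(): clear, red circle, flip.
    surface.fill(BACK_COLOR)
    pygame.draw.circle(surface, REC_COLOR, (WIDTH / 2, HEIGHT / 2), REC_SIZE)
    pygame.display.flip()

    # Keep the window alive until it is closed.
    clock = pygame.time.Clock()
    while True:
        clock.tick(60)
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                pygame.quit()
                sys.exit()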
@@ -25,18 +34,60 @@ class Assistant:
 
 
     def __init__(self):
 
         self.config = self.initConfig()
-        print("Loading Whisper model...")
 
+        programIcon = pygame.image.load('assistant.png')
+
+        self.clock = pygame.time.Clock()
+        pygame.display.set_icon(programIcon)
+        pygame.display.set_caption("Assistant")
+
+        self.windowSurface = pygame.display.set_mode((WIDTH, HEIGHT), 0, 32)
+        self.font = pygame.font.SysFont(None, FONT_SIZE)
+
+        self.audio = pyaudio.PyAudio()
+        try:
+            self.audio.open(format=INPUT_FORMAT,
+                            channels=INPUT_CHANNELS,
+                            rate=INPUT_RATE,
+                            input=True,
+                            frames_per_buffer=INPUT_CHUNK).close()
+        except:
+            self.wait_exit()
+
+        self.display_message(self.config.messages.loadingModel)
         self.model = whisper.load_model(self.config.whisperRecognition.modelPath)
         self.tts = pyttsx3.init()
-        self.audio = pyaudio.PyAudio()
-        self.conversation_history = [self.config.conversation.context+self.config.conversation.greeting+"\n"]
+        self.conversation_history = [self.config.conversation.context,
+                                     self.config.conversation.greeting]
+        self.context = []
+
+        self.display_ready()
+
+        self.text_to_speech(self.config.conversation.greeting)
+
+    def wait_exit(self):
+        while True:
+            self.display_message(self.config.messages.noAudioInput)
+            self.clock.tick(60)
+            for event in pygame.event.get():
+                if event.type == pygame.locals.QUIT:
+                    self.shutdown()
+
+    def shutdown(self):
+        self.audio.terminate()
+        pygame.quit()
+        sys.exit()
 
     def initConfig(self):
         class Inst:
             pass
         config=Inst();
+        config.messages = Inst()
+        config.messages.pressSpace = "Pressez sur espace pour parler puis relachez."
+        config.messages.loadingModel = "Loading model..."
+        config.messages.noAudioInput = "Erreur: Pas d'entrée son"
         config.whisperRecognition = Inst()
         config.whisperRecognition.modelPath = "whisper/large-v3.pt"
         config.whisperRecognition.lang = "fr"
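__init__ now fails fast when no microphone is present: it opens and immediately closes a capture stream, and drops into wait_exit on failure. A standalone sketch of that probe, assuming plausible values for the INPUT_* constants not shown in this diff:

    import pyaudio

    # Assumed capture parameters; the commit defines its own INPUT_* constants.
    INPUT_FORMAT = pyaudio.paInt16
    INPUT_CHANNELS = 1
    INPUT_RATE = 16000
    INPUT_CHUNK = 1024

    def has_input_device(audio):
        """Return True if a default input stream can be opened."""
        try:
            # Opening and closing a stream verifies a usable capture device
            # exists before the heavy Whisper model is loaded.
            audio.open(format=INPUT_FORMAT,
                       channels=INPUT_CHANNELS,
                       rate=INPUT_RATE,
                       input=True,
                       frames_per_buffer=INPUT_CHUNK).close()
            return True
        except OSError:
            return False

    audio = pyaudio.PyAudio()
    print("microphone available:", has_input_device(audio))
    audio.terminate()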
@@ -54,72 +105,139 @@ class Assistant:
         #dic depth 2: map values to attributes
         def dic2Object(dic, object):
             for key in dic:
-                setattr(object, key, dic[key])
+                if hasattr(object, key):
+                    setattr(object, key, dic[key])
+                else:
+                    print("Ignoring unknown setting ", key)
         #dic depth 1: fill depth 2 attributes
         for key in dic:
-            dic2Object(dic[key], getattr(config, key))
+            if hasattr(config, key):
+                dic2Object(dic[key], getattr(config, key))
+            else:
+                print("Ignoring unknown setting ", key)
 
 
         return config
 
-    def waveform_from_mic(self, duration=INPUT_DEFAULT_DURATION_SECONDS) -> np.ndarray:
-
-        stream = self.audio.open(format=INPUT_FORMAT, channels=INPUT_CHANNELS,
-                                 rate=INPUT_RATE, input=True,
+    def display_rec_start(self):
+        self.windowSurface.fill(BACK_COLOR)
+        pygame.draw.circle(self.windowSurface, REC_COLOR, (WIDTH/2, HEIGHT/2), REC_SIZE)
+        pygame.display.flip()
+
+    def display_message(self, text):
+        self.windowSurface.fill(BACK_COLOR)
+
+        label = self.font.render(text, 1, TEXT_COLOR)
+        size = label.get_rect()[2:4]
+        self.windowSurface.blit(label, (WIDTH/2 - size[0]/2, HEIGHT/2 - size[1]/2))
+
+        pygame.display.flip()
+
+    def display_ready(self):
+        self.display_message(self.config.messages.pressSpace)
+
+    def waveform_from_mic(self, key = pygame.K_SPACE) -> np.ndarray:
+
+        self.display_rec_start()
+
+        stream = self.audio.open(format=INPUT_FORMAT,
+                                 channels=INPUT_CHANNELS,
+                                 rate=INPUT_RATE,
+                                 input=True,
                                  frames_per_buffer=INPUT_CHUNK)
         frames = []
 
-        for _ in range(0, int(INPUT_RATE / INPUT_CHUNK * duration)):
-            data = stream.read(INPUT_CHUNK)
-            frames.append(data)
+        while True:
+            pygame.event.pump() # process event queue
+            pressed = pygame.key.get_pressed()
+            if pressed[key]:
+                data = stream.read(INPUT_CHUNK)
+                frames.append(data)
+            else:
+                break
 
         stream.stop_stream()
         stream.close()
-        self.audio.terminate()
+
+        self.display_ready()
 
         return np.frombuffer(b''.join(frames), np.int16).astype(np.float32) * (1 / 32768.0)
 
     def speech_to_text(self, waveform):
-        print("Finished recording, converting to text...")
+        self.text_to_speech(self.config.conversation.recognitionWaitMsg)
 
-        transcript = self.model.transcribe(waveform, language = self.config.whisperRecognition.lang, fp16=torch.cuda.is_available())
-        return transcript["text"]
+        transcript = self.model.transcribe(waveform,
+                                           language = self.config.whisperRecognition.lang,
+                                           fp16=torch.cuda.is_available())
+        text = transcript["text"]
+        self.text_to_speech(text)
+        return text
 
-    def ask_ollama(self, prompt):
-        print("Sending: ", prompt)
-        self.text_to_speech(prompt+self.config.conversation.llmWaitMsg)
+    def ask_ollama(self, prompt, responseCallback):
+        self.text_to_speech(self.config.conversation.llmWaitMsg)
 
         self.conversation_history.append(prompt)
         full_prompt = "\n".join(self.conversation_history)
-        response = requests.post(self.config.ollama.url, json= {"model": self.config.ollama.model,"stream":False,"prompt":full_prompt}, headers=OLLAMA_REST_HEADERS)
-        if response.status_code == 200:
-            data = json.loads(response.text)
-            response_text = data["response"]
-            self.conversation_history.append(response_text)
-            print("Received: ", response_text)
-            return response_text
-        else:
-            return "Erreur: " + response.text
+
+        jsonParam = {"model": self.config.ollama.model,
+                     "stream": True,
+                     "context": self.context,
+                     "prompt": full_prompt}
+        print(jsonParam)
+        response = requests.post(self.config.ollama.url,
+                                 json=jsonParam,
+                                 headers=OLLAMA_REST_HEADERS,
+                                 stream=True)
+        response.raise_for_status()
+
+        tokens = []
+        for line in response.iter_lines():
+            print(line)
+            body = json.loads(line)
+            token = body.get('response', '')
+            tokens.append(token)
+            # the response streams one token at a time, flush as we receive it
+            if token == "." or token == ":":
+                responseCallback("".join(tokens))
+                tokens = []
+
+            if 'error' in body:
+                responseCallback("Erreur: " + body['error'])
+
+            if body.get('done', False):
+                self.context = body['context']
 
     def text_to_speech(self, text):
         print(text)
         self.tts.say(text)
         self.tts.runAndWait()
 
 def main():
+
+    if sys.version_info[0:3] != (3, 9, 13):
+        print('Warning, it was only tested with python 3.9.13, it may fail')
+
+    pygame.init()
+
     ass = Assistant()
 
-    ass.text_to_speech(ass.config.conversation.greeting)
-    print("Recording...")
+    push_to_talk_key = pygame.K_SPACE
 
-    speech = ass.waveform_from_mic()
-
-    transcription = ass.speech_to_text(waveform=speech)
-
-    response = ass.ask_ollama(transcription)
-
-    ass.text_to_speech(text=response)
+    while True:
+        ass.clock.tick(60)
+        for event in pygame.event.get():
+            if event.type == pygame.KEYDOWN and event.key == push_to_talk_key:
+                print('Talk to me!')
+                speech = ass.waveform_from_mic(push_to_talk_key)
+
+                transcription = ass.speech_to_text(waveform=speech)
+
+                ass.ask_ollama(transcription, ass.text_to_speech)
+                print('Done')
+
+            if event.type == pygame.locals.QUIT:
+                ass.shutdown()
 
 if __name__ == "__main__":
     main()
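The reworked ask_ollama is the heart of this commit: instead of one blocking request, it streams Ollama's generate endpoint, where each response line is a JSON object carrying a single token, and it flushes buffered tokens to the text-to-speech callback at "." or ":" so the assistant starts speaking before generation finishes. A minimal standalone sketch of that protocol, assuming Ollama's default local endpoint (the commit reads the URL and model from its config):

    import json
    import requests

    OLLAMA_URL = "http://localhost:11434/api/generate"  # assumed default endpoint

    payload = {"model": "mistral",      # model name as in the config diff below
               "stream": True,          # one JSON object per token
               "prompt": "Dis bonjour."}

    with requests.post(OLLAMA_URL, json=payload, stream=True) as response:
        response.raise_for_status()
        tokens = []
        for line in response.iter_lines():
            if not line:
                continue
            body = json.loads(line)
            tokens.append(body.get("response", ""))
            if body.get("done", False):
                # 'context' carries conversation state for the next request,
                # which is what the new self.context field stores.
                context = body.get("context", [])
                break
    print("".join(tokens))

Flushing on "." or ":" is a crude sentence boundary, but it keeps speech latency low without splitting words.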
assistant.yaml
@@ -1,3 +1,8 @@
+messages:
+  pressSpace: "Pressez sur espace pour parler puis relachez."
+  loadingModel: "Chargement du modèle..."
+  noAudioInput: "Erreur: Pas d'entrée son"
+
 whisperRecognition:
   modelPath: "whisper/large-v3.pt"
   lang: "fr"
@@ -7,7 +12,7 @@ ollama:
   model: "mistral"
 
 conversation:
-  context: "This is a discussion in french.\\n"
-  greeting: " Je vous écoute."
-  recognitionWaitMsg: " J'interprète votre demande."
-  llmWaitMsg: " Laissez moi réfléchir."
+  context: "This is a discussion in french."
+  greeting: "Je vous écoute."
+  recognitionWaitMsg: "Oui."
+  llmWaitMsg: "Laissez moi réfléchir."
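initConfig builds an object tree of defaults and dic2Object overlays the yaml values onto it, ignoring any key that does not match an existing attribute. A self-contained sketch of that overlay logic (the sample yaml and its unknownKey are made up for illustration):

    import textwrap
    import yaml
    from yaml import Loader

    class Inst:
        pass

    # Defaults in the spirit of initConfig(); only known keys get overridden.
    config = Inst()
    config.conversation = Inst()
    config.conversation.greeting = "Je vous écoute."

    SAMPLE = textwrap.dedent("""\
        conversation:
          greeting: "Bonjour."
          unknownKey: "this one is ignored"
        """)

    for section, values in yaml.load(SAMPLE, Loader=Loader).items():
        if hasattr(config, section):              # depth 1: config sections
            target = getattr(config, section)
            for key, value in values.items():     # depth 2: section attributes
                if hasattr(target, key):
                    setattr(target, key, value)
                else:
                    print("Ignoring unknown setting", key)
        else:
            print("Ignoring unknown setting", section)

    print(config.conversation.greeting)  # -> Bonjour.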
requirements.txt
@@ -9,4 +9,5 @@ openai==1.2.3
 Wave==0.0.2
 openai-whisper @ git+https://github.com/openai/whisper.git@fcfeaf1b61994c071bba62da47d7846933576ac9
 PyAudio==0.2.14
-pyyaml==6.0.1
+pyyaml==6.0.1
+pygame==2.5.2