Push to talk, basic UI and ollama response stream

M
2023-11-12 18:28:47 +01:00
parent b2f53b9075
commit 928471b50b
4 changed files with 165 additions and 41 deletions

assistant.png: new binary file (822 B image), not shown.


@@ -8,9 +8,18 @@ import requests
import json
import yaml
from yaml import Loader
import pygame, sys
import pygame.locals
BACK_COLOR = (0,0,0)
REC_COLOR = (255,0,0)
TEXT_COLOR = (255,255,255)
REC_SIZE = 80
FONT_SIZE = 24
WIDTH = 320
HEIGHT = 240
if sys.version_info[0:3] != (3, 9, 13):
print('Warning, it was only tested with python 3.9.13, it may fail')
INPUT_DEFAULT_DURATION_SECONDS = 5
INPUT_FORMAT = pyaudio.paInt16
@@ -25,18 +34,60 @@ class Assistant:
def __init__(self):
self.config = self.initConfig()
print("Loading Whisper model...")
programIcon = pygame.image.load('assistant.png')
self.clock = pygame.time.Clock()
pygame.display.set_icon(programIcon)
pygame.display.set_caption("Assistant")
self.windowSurface = pygame.display.set_mode((WIDTH, HEIGHT), 0, 32)
self.font = pygame.font.SysFont(None, FONT_SIZE)
self.audio = pyaudio.PyAudio()
try:
self.audio.open(format=INPUT_FORMAT,
channels=INPUT_CHANNELS,
rate=INPUT_RATE,
input=True,
frames_per_buffer=INPUT_CHUNK).close()
except:
self.wait_exit()
self.display_message(self.config.messages.loadingModel)
self.model = whisper.load_model(self.config.whisperRecognition.modelPath)
self.tts = pyttsx3.init()
self.audio = pyaudio.PyAudio()
self.conversation_history = [self.config.conversation.context+self.config.conversation.greeting+"\n"]
self.conversation_history = [self.config.conversation.context,
self.config.conversation.greeting]
self.context = []
self.display_ready()
self.text_to_speech(self.config.conversation.greeting)
def wait_exit(self):
while True:
self.display_message(self.config.messages.noAudioInput)
self.clock.tick(60)
for event in pygame.event.get():
if event.type == pygame.locals.QUIT:
self.shutdown()
def shutdown(self):
self.audio.terminate()
pygame.quit()
sys.exit()
def initConfig(self):
class Inst:
pass
config = Inst()
config.messages = Inst()
config.messages.pressSpace = "Pressez sur espace pour parler puis relachez."
config.messages.loadingModel = "Loading model..."
config.messages.noAudioInput = "Erreur: Pas d'entrée son"
config.whisperRecognition = Inst()
config.whisperRecognition.modelPath = "whisper/large-v3.pt"
config.whisperRecognition.lang = "fr"
@@ -54,72 +105,139 @@ class Assistant:
#dic depth 2: map values to attributes
def dic2Object(dic, object):
for key in dic:
setattr(object, key, dic[key])
if hasattr(object, key):
setattr(object, key, dic[key])
else:
print("Ignoring unknow setting ", key)
#dic depth 1: fill depth 2 attributes
for key in dic:
dic2Object(dic[key], getattr(config, key))
if hasattr(config, key):
dic2Object(dic[key], getattr(config, key))
else:
print("Ignoring unknow setting ", key)
return config
def waveform_from_mic(self, duration=INPUT_DEFAULT_DURATION_SECONDS) -> np.ndarray:
def display_rec_start(self):
self.windowSurface.fill(BACK_COLOR)
pygame.draw.circle(self.windowSurface, REC_COLOR, (WIDTH/2, HEIGHT/2), REC_SIZE)
pygame.display.flip()
stream = self.audio.open(format=INPUT_FORMAT, channels=INPUT_CHANNELS,
rate=INPUT_RATE, input=True,
def display_message(self, text):
self.windowSurface.fill(BACK_COLOR)
label = self.font.render(text, 1, TEXT_COLOR)
size = label.get_rect()[2:4]
self.windowSurface.blit(label, (WIDTH/2 - size[0]/2, HEIGHT/2 - size[1]/2))
pygame.display.flip()
def display_ready(self):
self.display_message(self.config.messages.pressSpace)
def waveform_from_mic(self, key = pygame.K_SPACE) -> np.ndarray:
self.display_rec_start()
stream = self.audio.open(format=INPUT_FORMAT,
channels=INPUT_CHANNELS,
rate=INPUT_RATE,
input=True,
frames_per_buffer=INPUT_CHUNK)
frames = []
for _ in range(0, int(INPUT_RATE / INPUT_CHUNK * duration)):
data = stream.read(INPUT_CHUNK)
frames.append(data)
while True:
pygame.event.pump() # process event queue
pressed = pygame.key.get_pressed()
if pressed[key]:
data = stream.read(INPUT_CHUNK)
frames.append(data)
else:
break
stream.stop_stream()
stream.close()
self.audio.terminate()
self.display_ready()
return np.frombuffer(b''.join(frames), np.int16).astype(np.float32) * (1 / 32768.0)
def speech_to_text(self, waveform):
print("Finished recording, converting to text...")
self.text_to_speech(self.config.conversation.recognitionWaitMsg)
transcript = self.model.transcribe(waveform, language = self.config.whisperRecognition.lang, fp16=torch.cuda.is_available())
return transcript["text"]
transcript = self.model.transcribe(waveform,
language = self.config.whisperRecognition.lang,
fp16=torch.cuda.is_available())
text = transcript["text"]
self.text_to_speech(text)
return text
def ask_ollama(self, prompt):
print("Sending: ", prompt)
self.text_to_speech(prompt+self.config.conversation.llmWaitMsg)
def ask_ollama(self, prompt, responseCallback):
self.text_to_speech(self.config.conversation.llmWaitMsg)
self.conversation_history.append(prompt)
full_prompt = "\n".join(self.conversation_history)
response = requests.post(self.config.ollama.url, json= {"model": self.config.ollama.model,"stream":False,"prompt":full_prompt}, headers=OLLAMA_REST_HEADERS)
if response.status_code == 200:
data = json.loads(response.text)
response_text = data["response"]
self.conversation_history.append(response_text)
print("Received: ", response_text)
return response_text
else:
return "Erreur: " + response.text
jsonParam= {"model": self.config.ollama.model,
"stream":True,
"context":self.context,
"prompt":full_prompt}
print(jsonParam)
response = requests.post(self.config.ollama.url,
json=jsonParam,
headers=OLLAMA_REST_HEADERS,
stream=True)
response.raise_for_status()
tokens = []
for line in response.iter_lines():
print(line)
body = json.loads(line)
token = body.get('response', '')
tokens.append(token)
# the response streams one token at a time, print that as we receive it
if token == "." or token == ":":
responseCallback("".join(tokens))
tokens = []
if 'error' in body:
responseCallback("Erreur: " + body['error'])
if body.get('done', False):
self.context = body['context']
def text_to_speech(self, text):
print(text)
self.tts.say(text)
self.tts.runAndWait()
def main():
if sys.version_info[0:3] != (3, 9, 13):
print('Warning, it was only tested with python 3.9.13, it may fail')
pygame.init()
ass = Assistant()
ass.text_to_speech(ass.config.conversation.greeting)
print("Recording...")
push_to_talk_key = pygame.K_SPACE
while True:
ass.clock.tick(60)
for event in pygame.event.get():
if event.type == pygame.KEYDOWN and event.key == push_to_talk_key:
print('Talk to me!')
speech = ass.waveform_from_mic(push_to_talk_key)
speech = ass.waveform_from_mic()
transcription = ass.speech_to_text(waveform=speech)
ass.ask_ollama(transcription, ass.text_to_speech)
print('Done')
transcription = ass.speech_to_text(waveform=speech)
response = ass.ask_ollama(transcription)
if event.type == pygame.locals.QUIT:
ass.shutdown()
ass.text_to_speech(text=response)
if __name__ == "__main__":
main()
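For reference, a minimal standalone sketch of the streaming call that the new ask_ollama method makes. With "stream": true, Ollama's generate endpoint returns one JSON object per line, each carrying a "response" token, and a final object with "done": true plus a "context" array to reuse on the next request. The URL and model below are placeholders; the assistant reads the real values from its yaml config.

# Sketch of the streaming request used by ask_ollama (URL and model are assumptions).
import json
import requests

OLLAMA_URL = "http://localhost:11434/api/generate"  # assumed default endpoint

def stream_ollama(prompt, context=None, model="mistral"):
    payload = {"model": model,
               "prompt": prompt,
               "stream": True,
               "context": context or []}
    resp = requests.post(OLLAMA_URL, json=payload, stream=True,
                         headers={"Content-Type": "application/json"})
    resp.raise_for_status()
    tokens = []
    for line in resp.iter_lines():
        if not line:
            continue
        body = json.loads(line)              # one JSON object per streamed line
        tokens.append(body.get("response", ""))
        if body.get("done", False):           # final object carries the context
            return "".join(tokens), body.get("context", [])
    return "".join(tokens), context or []

# Example: text, ctx = stream_ollama("Bonjour, qui es-tu ?")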


@@ -1,3 +1,8 @@
messages:
pressSpace: "Pressez sur espace pour parler puis relachez."
loadingModel: "Chargement du modèle..."
noAudioInput: "Erreur: Pas d'entrée son"
whisperRecognition:
modelPath: "whisper/large-v3.pt"
lang: "fr"
@@ -7,7 +12,7 @@ ollama:
model: "mistral"
conversation:
context: "This is a discussion in french.\\n"
greeting: " Je vous écoute."
recognitionWaitMsg: " J'interprète votre demande."
llmWaitMsg: " Laissez moi réfléchir."
context: "This is a discussion in french."
greeting: "Je vous écoute."
recognitionWaitMsg: "Oui."
llmWaitMsg: "Laissez moi réfléchir."
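A small sketch of how this yaml maps onto the defaults built in initConfig: yaml.load with the imported Loader yields a two-level dict, and a dic2Object-style pass copies each known key onto the matching attribute of the prefilled config object, warning on unknown keys. The file name below is an assumption.

# Sketch of the config loading path; the yaml file name is an assumption.
import yaml
from yaml import Loader

class Inst:
    pass

def apply_config(config, path="assistant.yaml"):
    with open(path) as f:
        dic = yaml.load(f, Loader=Loader)   # depth-2 dict: section -> settings
    for section, settings in dic.items():
        if not hasattr(config, section):
            print("Ignoring unknown setting ", section)
            continue
        target = getattr(config, section)
        for key, value in settings.items():
            if hasattr(target, key):
                setattr(target, key, value)
            else:
                print("Ignoring unknown setting ", key)
    return config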


@@ -9,4 +9,5 @@ openai==1.2.3
Wave==0.0.2
openai-whisper @ git+https://github.com/openai/whisper.git@fcfeaf1b61994c071bba62da47d7846933576ac9
PyAudio==0.2.14
pyyaml==6.0.1
pyyaml==6.0.1
pygame==2.5.2