mirror of
https://github.com/neuphonic/neutts-air.git
synced 2025-10-10 02:44:44 +03:00
working streaming example
This commit is contained in:
@@ -110,7 +110,7 @@ from neuttsair.neutts import NeuTTSAir
|
||||
import soundfile as sf
|
||||
|
||||
tts = NeuTTSAir(
|
||||
backbone_repo="neuphonic/neutts-air", # or 'neutts-air-q4-gguf' wit llama-cpp-python installed
|
||||
backbone_repo="neuphonic/neutts-air", # or 'neutts-air-q4-gguf' with llama-cpp-python installed
|
||||
backbone_device="cpu",
|
||||
codec_repo="neuphonic/neucodec",
|
||||
codec_device="cpu"
|
||||
|
||||
@@ -35,4 +35,16 @@ python -m examples.onnx_example \
|
||||
--ref_codes samples/dave.pt \
|
||||
--ref_text samples/dave.txt \
|
||||
--backbone neuphonic/neutts-air-q4-gguf
|
||||
```
|
||||
```
|
||||
|
||||
### Streaming Support
|
||||
|
||||
To stream the model output in chunks, try out the `onnx_example_streaming.py` example. For streaming, only the GGUF backends are currently supported. Ensure you have `llama-cpp-python`, `onnxruntime` and `pyaudio` installed to run this example.
|
||||
|
||||
```bash
|
||||
python -m examples.onnx_example_streaming \
|
||||
--input_text "My name is Dave, and um, I'm from London" \
|
||||
--ref_codes samples/dave.pt \
|
||||
--ref_text samples/dave.txt \
|
||||
--backbone neuphonic/neutts-air-q4-gguf
|
||||
```
|
||||
|
||||
@@ -5,6 +5,9 @@ import numpy as np
|
||||
from neuttsair.neutts import NeuTTSAir
|
||||
import pyaudio
|
||||
|
||||
from phonemizer.backend.espeak.wrapper import EspeakWrapper
|
||||
_ESPEAK_LIBRARY = '/opt/homebrew/Cellar/espeak/1.48.04_1/lib/libespeak.1.1.48.dylib' #use the Path to the library.
|
||||
EspeakWrapper.set_library(_ESPEAK_LIBRARY)
|
||||
|
||||
def main(input_text, ref_codes_path, ref_text, backbone):
|
||||
assert backbone in ["neuphonic/neutts-air-q4-gguf", "neuphonic/neutts-air-q8-gguf"], "Must be a GGUF ckpt as streaming only supported by llama-cpp."
|
||||
@@ -36,7 +39,7 @@ def main(input_text, ref_codes_path, ref_text, backbone):
|
||||
print("Streaming...")
|
||||
for chunk in tts.infer_stream(input_text, ref_codes, ref_text):
|
||||
audio = (chunk * 32767).astype(np.int16)
|
||||
print(audio)
|
||||
print(audio.shape)
|
||||
stream.write(audio.tobytes())
|
||||
|
||||
stream.stop_stream()
|
||||
|
||||
@@ -53,9 +53,9 @@ class NeuTTSAir:
|
||||
self.max_context = 2048
|
||||
self.hop_length = 480
|
||||
self.streaming_overlap_frames = 1
|
||||
self.streaming_frames_per_chunk = 15
|
||||
self.streaming_lookforward = 50
|
||||
self.streaming_lookback = 150
|
||||
self.streaming_frames_per_chunk = 25
|
||||
self.streaming_lookforward = 5
|
||||
self.streaming_lookback = 50
|
||||
self.streaming_stride_samples = self.streaming_frames_per_chunk * self.hop_length
|
||||
|
||||
# ggml & onnx flags
|
||||
|
||||
Reference in New Issue
Block a user