Enhance TTS functionality and improve voice management

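- Refactored the TTS generation process to build the model in-process once (lazily, on the first request), cache it in a module-level global instead of spawning a subprocess per request, and load voices on demand, improving efficiency and usability.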
- Introduced a new load_and_validate_voice function to ensure requested voices exist before loading, enhancing error handling.
- Updated generate_tts_with_logs to provide real-time logging during speech generation, including phoneme processing and audio saving.
- Improved audio conversion process with better error handling and temporary file management.
- Set default voice to 'af_bella' in the Gradio interface for improved user experience.
This commit is contained in:
Pierre Bruno
2025-01-16 17:03:54 +01:00
parent 3ae6e74c57
commit df828f0409
3 changed files with 116 additions and 48 deletions

View File

@@ -19,7 +19,6 @@ Dependencies:
""" """
import gradio as gr import gradio as gr
import subprocess
import os import os
import sys import sys
import platform import platform
@@ -28,11 +27,20 @@ import shutil
from pathlib import Path from pathlib import Path
import soundfile as sf import soundfile as sf
from pydub import AudioSegment from pydub import AudioSegment
from models import list_available_voices import torch
from models import (
list_available_voices, build_model, load_voice,
generate_speech, load_and_validate_voice
)
# Global configuration # Global configuration
CONFIG_FILE = "tts_config.json" # Stores user preferences and paths CONFIG_FILE = "tts_config.json" # Stores user preferences and paths
DEFAULT_OUTPUT_DIR = "outputs" # Directory for generated audio files DEFAULT_OUTPUT_DIR = "outputs" # Directory for generated audio files
SAMPLE_RATE = 22050
# Initialize model globally
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = None
def get_available_voices(): def get_available_voices():
"""Get list of available voice models.""" """Get list of available voice models."""
@@ -59,57 +67,55 @@ def convert_audio(input_path: str, output_path: str, format: str):
print(f"Error converting audio: {e}") print(f"Error converting audio: {e}")
return False return False
def generate_tts_with_logs(voice, text, format): def generate_tts_with_logs(voice_name, text, format):
"""Generate TTS audio with real-time logging and format conversion.""" """Generate TTS audio with real-time logging and format conversion."""
global model
if not text.strip(): if not text.strip():
return "❌ Error: Text required", None return "❌ Error: Text required", None
logs_text = "" logs_text = ""
try: try:
# Use sys.executable to ensure correct Python interpreter # Initialize model if not done yet
cmd = [sys.executable, "tts_demo.py", "--text", text, "--voice", voice] if model is None:
logs_text += "Loading model...\n"
model = build_model("kokoro-v0_19.pth", device)
# Use shell=True on Windows # Load voice
shell = platform.system().lower() == "windows" logs_text += f"Loading voice: {voice_name}\n"
yield logs_text, None
voice = load_and_validate_voice(voice_name, device)
process = subprocess.Popen( # Generate speech
cmd, logs_text += f"Generating speech for: '{text}'\n"
stdout=subprocess.PIPE, yield logs_text, None
stderr=subprocess.STDOUT, audio, phonemes = generate_speech(model, text, voice, lang='a', device=device)
universal_newlines=True,
shell=shell
)
while True: if audio is not None and phonemes:
output = process.stdout.readline() try:
if output == '' and process.poll() is not None: logs_text += f"Generated phonemes: {phonemes}\n"
break except UnicodeEncodeError:
if output: logs_text += "Generated phonemes: [Unicode display error]\n"
logs_text += output
yield logs_text, None
if process.returncode != 0:
logs_text += "❌ Generation failed\n"
yield logs_text, None
return
if not os.path.exists("output.wav"): # Save temporary WAV file
logs_text += "❌ No output generated\n" temp_wav = "output.wav"
yield logs_text, None sf.write(temp_wav, audio, SAMPLE_RATE)
return
# Convert to desired format
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"output_{timestamp}.{format}" filename = f"output_{timestamp}.{format}"
os.makedirs(DEFAULT_OUTPUT_DIR, exist_ok=True) os.makedirs(DEFAULT_OUTPUT_DIR, exist_ok=True)
output_path = Path(DEFAULT_OUTPUT_DIR) / filename output_path = Path(DEFAULT_OUTPUT_DIR) / filename
# Convert audio using pydub if convert_audio(temp_wav, str(output_path), format):
if convert_audio("output.wav", str(output_path), format): logs_text += f"✅ Saved: {output_path}\n"
logs_text += f"✅ Saved: {output_path}\n" os.remove(temp_wav)
os.remove("output.wav") yield logs_text, str(output_path)
yield logs_text, str(output_path) else:
logs_text += "❌ Audio conversion failed\n"
yield logs_text, None
else: else:
logs_text += "Audio conversion failed\n" logs_text += "Failed to generate audio\n"
yield logs_text, None yield logs_text, None
except Exception as e: except Exception as e:
@@ -180,7 +186,7 @@ def create_interface(server_name="0.0.0.0", server_port=7860):
voice = gr.Dropdown( voice = gr.Dropdown(
choices=get_available_voices(), choices=get_available_voices(),
label="🗣️ Select Voice", label="🗣️ Select Voice",
value=None value="af_bella"
) )
format = gr.Radio( format = gr.Radio(
choices=["wav", "mp3", "aac"], choices=["wav", "mp3", "aac"],

View File

@@ -15,13 +15,31 @@ warnings.filterwarnings("ignore", category=FutureWarning, module="torch.nn.utils
warnings.filterwarnings("ignore", category=UserWarning, module="torch.nn.modules.rnn") warnings.filterwarnings("ignore", category=UserWarning, module="torch.nn.modules.rnn")
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1" os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
__all__ = ['list_available_voices', 'build_model', 'load_voice', 'generate_speech'] __all__ = ['list_available_voices', 'build_model', 'load_voice', 'generate_speech', 'load_and_validate_voice']
def get_voices_path(): def get_voices_path():
"""Get the path where voice files are stored.""" """Get the path where voice files are stored."""
# Store voices in a 'voices' directory in the project root # Store voices in a 'voices' directory in the project root
return str(Path(__file__).parent / "voices") return str(Path(__file__).parent / "voices")
def load_and_validate_voice(voice_name: str, device: str) -> torch.Tensor:
    """Check that a voice is available, then load it.

    The name is compared against the result of ``list_available_voices()``
    before ``load_voice`` is called, so an unknown name fails with a message
    listing the valid choices.

    Args:
        voice_name: Name of the voice to load.
        device: Device to load the voice on ('cuda' or 'cpu'); passed
            through unchanged to ``load_voice``.

    Returns:
        The value returned by ``load_voice(voice_name, device)``.

    Raises:
        ValueError: If ``voice_name`` is not among the available voices.
    """
    known_voices = list_available_voices()

    # Happy path first: a recognised name is handed straight to the loader.
    if voice_name in known_voices:
        return load_voice(voice_name, device)

    raise ValueError(f"Voice '{voice_name}' not found. Available voices: {', '.join(known_voices)}")
def list_available_voices(): def list_available_voices():
"""List all available voices from the official voicepacks.""" """List all available voices from the official voicepacks."""
voices_path = get_voices_path() voices_path = get_voices_path()
@@ -109,6 +127,12 @@ def setup_espeak():
EspeakWrapper.set_library(lib_path) EspeakWrapper.set_library(lib_path)
EspeakWrapper.data_path = data_path EspeakWrapper.data_path = data_path
# Configure phonemizer for UTF-8
os.environ["PHONEMIZER_ESPEAK_LIBRARY"] = lib_path
os.environ["PHONEMIZER_ESPEAK_PATH"] = data_path
os.environ["PYTHONIOENCODING"] = "utf-8"
print("espeak-ng library paths set up successfully") print("espeak-ng library paths set up successfully")
except Exception as e: except Exception as e:
@@ -203,7 +227,37 @@ def generate_speech(model, text, voice=None, lang='a', device='cpu'):
kokoro_py = hf_hub_download(repo_id=repo_id, filename="kokoro.py") kokoro_py = hf_hub_download(repo_id=repo_id, filename="kokoro.py")
kokoro_module = import_module_from_path("kokoro", kokoro_py) kokoro_module = import_module_from_path("kokoro", kokoro_py)
# Generate speech
audio, phonemes = kokoro_module.generate(model, text, voice, lang=lang) audio, phonemes = kokoro_module.generate(model, text, voice, lang=lang)
# Handle phonemes encoding
if phonemes:
try:
# Debug info
print(f"Debug - Original phonemes type: {type(phonemes)}")
print(f"Debug - Original phonemes: {repr(phonemes)}")
# Convert to string if it's bytes
if isinstance(phonemes, bytes):
phonemes = phonemes.decode('utf-8', errors='replace')
# If it's a string, ensure it's valid UTF-8
elif isinstance(phonemes, str):
# Replace problematic characters with their ASCII approximations
replacements = {
'É™': 'ə',
'ÊŠ': 'ʊ',
'Ê': 'ʃ',
'æ': 'ae'
}
for old, new in replacements.items():
phonemes = phonemes.replace(old, new)
print(f"Debug - Processed phonemes: {repr(phonemes)}")
except Exception as e:
print(f"Debug - Encoding error: {str(e)}")
# Last resort: strip to ASCII
phonemes = ''.join(c for c in str(phonemes) if ord(c) < 128)
return audio, phonemes return audio, phonemes
except Exception as e: except Exception as e:
print(f"Error generating speech: {e}") print(f"Error generating speech: {e}")

View File

@@ -87,10 +87,18 @@ def main() -> None:
pbar.update(1) pbar.update(1)
if audio is not None: if audio is not None:
print(f"Generated phonemes: {phonemes}") try:
output_path = Path(args.output) if phonemes:
sf.write(output_path, audio, SAMPLE_RATE) try:
print(f"\nAudio saved to {output_path.absolute()}") print(f"Generated phonemes: {phonemes}")
except UnicodeEncodeError:
print("Generated phonemes: [Unicode display error - phonemes were generated but cannot be displayed]")
output_path = Path(args.output)
sf.write(output_path, audio, SAMPLE_RATE)
print(f"\nAudio saved to {output_path.absolute()}")
except Exception as e:
print(f"Error saving output: {e}")
print("Audio generation was successful, but saving failed.")
else: else:
print("Error: Failed to generate audio") print("Error: Failed to generate audio")