Files
Kokoro-TTS-Local/tts_demo.py
Pierre Bruno df828f0409 Enhance TTS functionality and improve voice management
- Refactored the TTS generation process to initialize the model globally and load voices dynamically, improving efficiency and usability.
- Introduced a new load_and_validate_voice function to ensure requested voices exist before loading, enhancing error handling.
- Updated generate_tts_with_logs to provide real-time logging during speech generation, including phoneme processing and audio saving.
- Improved audio conversion process with better error handling and temporary file management.
- Set default voice to 'af_bella' in the Gradio interface for improved user experience.
2025-01-16 17:03:54 +01:00

118 lines
4.5 KiB
Python

import torch
from typing import Optional, Tuple, List
from models import build_model, load_voice, generate_speech, list_available_voices
import argparse
from tqdm.auto import tqdm
import soundfile as sf
from pathlib import Path
# Constants
SAMPLE_RATE = 22050
DEFAULT_MODEL_PATH = 'kokoro-v0_19.pth'
DEFAULT_OUTPUT_FILE = 'output.wav'
DEFAULT_LANGUAGE = 'a' # TODO: Document why this is 'a' or make configurable
DEFAULT_TEXT = "Hello, welcome to this text-to-speech test."
# Configure tqdm for better Windows console support
tqdm.monitor_interval = 0 # Disable monitor thread to prevent encoding issues
def load_and_validate_voice(voice_name: str, device: str) -> torch.Tensor:
"""Load and validate the requested voice.
Args:
voice_name: Name of the voice to load
device: Device to load the voice on ('cuda' or 'cpu')
Returns:
Loaded voice tensor
Raises:
ValueError: If the requested voice doesn't exist
"""
available_voices = list_available_voices()
if voice_name not in available_voices:
raise ValueError(f"Voice '{voice_name}' not found. Available voices: {', '.join(available_voices)}")
return load_voice(voice_name, device)
def main() -> None:
try:
# Parse command line arguments
parser = argparse.ArgumentParser(description='Kokoro TTS Demo')
parser.add_argument('--text', type=str, help='Text to synthesize (optional)')
parser.add_argument('--voice', type=str, default='af_bella', help='Voice to use (default: af_bella)')
parser.add_argument('--list-voices', action='store_true', help='List all available voices')
parser.add_argument('--model', type=str, default=DEFAULT_MODEL_PATH, help=f'Path to model file (default: {DEFAULT_MODEL_PATH})')
parser.add_argument('--output', type=str, default=DEFAULT_OUTPUT_FILE, help=f'Output WAV file (default: {DEFAULT_OUTPUT_FILE})')
parser.add_argument('--lang', type=str, default=DEFAULT_LANGUAGE, help=f'Language code (default: {DEFAULT_LANGUAGE})')
args = parser.parse_args()
if args.list_voices:
voices = list_available_voices()
print("\nAvailable voices:")
for voice in voices:
print(f"- {voice}")
return
# Set up device
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")
# Build model and load voice with progress indication
print("\nLoading model...")
with tqdm(total=1, desc="Building model") as pbar:
model = build_model(args.model, device)
pbar.update(1)
print("\nLoading voice...")
with tqdm(total=1, desc="Loading voice") as pbar:
try:
voice = load_and_validate_voice(args.voice, device)
pbar.update(1)
except ValueError as e:
print(f"Error: {e}")
return
# Get text input
if args.text:
text = args.text
else:
print("\nEnter the text you want to convert to speech (or press Enter for default text):")
text = input("> ").strip()
if not text:
text = DEFAULT_TEXT
print(f"\nGenerating speech for: '{text}'")
with tqdm(total=1, desc="Generating speech") as pbar:
audio, phonemes = generate_speech(model, text, voice, lang=args.lang, device=device)
pbar.update(1)
if audio is not None:
try:
if phonemes:
try:
print(f"Generated phonemes: {phonemes}")
except UnicodeEncodeError:
print("Generated phonemes: [Unicode display error - phonemes were generated but cannot be displayed]")
output_path = Path(args.output)
sf.write(output_path, audio, SAMPLE_RATE)
print(f"\nAudio saved to {output_path.absolute()}")
except Exception as e:
print(f"Error saving output: {e}")
print("Audio generation was successful, but saving failed.")
else:
print("Error: Failed to generate audio")
except Exception as e:
print(f"Error in main: {e}")
import traceback
traceback.print_exc()
finally:
# Cleanup
if 'model' in locals():
del model
if 'voice' in locals():
del voice
torch.cuda.empty_cache()
if __name__ == "__main__":
main()