Files
Kokoro-TTS-Local/models.py
Pierre Bruno df828f0409 Enhance TTS functionality and improve voice management
- Refactored the TTS generation process to initialize the model globally and load voices dynamically, improving efficiency and usability.
- Introduced a new load_and_validate_voice function to ensure requested voices exist before loading, enhancing error handling.
- Updated generate_tts_with_logs to provide real-time logging during speech generation, including phoneme processing and audio saving.
- Improved audio conversion process with better error handling and temporary file management.
- Set default voice to 'af_bella' in the Gradio interface for improved user experience.
2025-01-16 17:03:54 +01:00

266 lines
11 KiB
Python

import torch
import os
import sys
import platform
import glob
import warnings
from huggingface_hub import hf_hub_download, list_repo_files
import espeakng_loader
from phonemizer.backend.espeak.wrapper import EspeakWrapper
from importlib.util import spec_from_file_location, module_from_spec
from pathlib import Path
# Filter out specific warnings
warnings.filterwarnings("ignore", category=FutureWarning, module="torch.nn.utils.weight_norm")
warnings.filterwarnings("ignore", category=UserWarning, module="torch.nn.modules.rnn")
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
__all__ = ['list_available_voices', 'build_model', 'load_voice', 'generate_speech', 'load_and_validate_voice']
def get_voices_path():
"""Get the path where voice files are stored."""
# Store voices in a 'voices' directory in the project root
return str(Path(__file__).parent / "voices")
def load_and_validate_voice(voice_name: str, device: str) -> torch.Tensor:
"""Load and validate the requested voice.
Args:
voice_name: Name of the voice to load
device: Device to load the voice on ('cuda' or 'cpu')
Returns:
Loaded voice tensor
Raises:
ValueError: If the requested voice doesn't exist
"""
available_voices = list_available_voices()
if voice_name not in available_voices:
raise ValueError(f"Voice '{voice_name}' not found. Available voices: {', '.join(available_voices)}")
return load_voice(voice_name, device)
def list_available_voices():
"""List all available voices from the official voicepacks."""
voices_path = get_voices_path()
try:
# Create voices directory if it doesn't exist
os.makedirs(voices_path, exist_ok=True)
# Download voices if they don't exist
if not any(f.endswith('.pt') for f in os.listdir(voices_path)):
print("Downloading voice files...")
repo_id = "hexgrad/Kokoro-82M"
repo_files = list_repo_files(repo_id)
voice_files = [f for f in repo_files if f.startswith('voices/') and f.endswith('.pt')]
for voice_file in voice_files:
try:
voice_name = os.path.splitext(os.path.basename(voice_file))[0]
target_path = os.path.join(voices_path, f"{voice_name}.pt")
if not os.path.exists(target_path):
print(f"Downloading voice: {voice_name}")
hf_hub_download(
repo_id=repo_id,
filename=voice_file,
local_dir=str(Path(voices_path).parent),
local_dir_use_symlinks=False
)
except Exception as e:
print(f"Error downloading voice {voice_file}: {e}")
# List available voice files
voices = [os.path.splitext(f)[0] for f in os.listdir(voices_path) if f.endswith('.pt')]
return sorted(voices)
except Exception as e:
print(f"Error accessing voices: {e}")
return ["af_bella", "af_nicole", "af_sarah", "af_sky", "am_adam", "am_michael",
"bf_emma", "bf_isabella", "bm_george", "bm_lewis"]
def get_platform_paths():
"""Get platform-specific paths for espeak-ng"""
system = platform.system().lower()
if system == "windows":
lib_path = os.path.join(os.getenv("ProgramFiles"), "eSpeak NG", "libespeak-ng.dll")
data_path = os.path.join(os.getenv("ProgramFiles"), "eSpeak NG", "espeak-ng-data")
elif system == "darwin": # macOS
lib_path = "/opt/homebrew/lib/libespeak-ng.dylib"
brew_data = "/opt/homebrew/share/espeak-ng-data"
sys_lib = "/usr/local/lib/libespeak-ng.dylib"
sys_data = "/usr/local/share/espeak-ng-data"
lib_path = lib_path if os.path.exists(lib_path) else sys_lib
data_path = brew_data if os.path.exists(brew_data) else sys_data
else: # Linux
data_path = "/usr/lib/x86_64-linux-gnu/espeak-ng-data"
lib_paths = [
"/lib/x86_64-linux-gnu/libespeak-ng.so.1",
"/usr/lib/x86_64-linux-gnu/libespeak-ng.so",
"/usr/lib/libespeak-ng.so",
"/usr/lib/x86_64-linux-gnu/libespeak-ng.so.1",
"/usr/lib/aarch64-linux-gnu/libespeak-ng.so",
"/usr/lib64/libespeak-ng.so"
]
lib_path = None
for path in lib_paths:
if os.path.exists(path):
lib_path = path
break
if lib_path is None:
lib_path = lib_paths[0] # Default for error message
return lib_path, data_path
def setup_espeak():
"""Set up espeak library paths for phonemizer."""
try:
lib_path, data_path = get_platform_paths()
if not os.path.exists(lib_path):
raise FileNotFoundError(f"espeak-ng library not found at {lib_path}")
if not os.path.exists(data_path):
raise FileNotFoundError(f"espeak-ng data not found at {data_path}")
EspeakWrapper.set_library(lib_path)
EspeakWrapper.data_path = data_path
# Configure phonemizer for UTF-8
os.environ["PHONEMIZER_ESPEAK_LIBRARY"] = lib_path
os.environ["PHONEMIZER_ESPEAK_PATH"] = data_path
os.environ["PYTHONIOENCODING"] = "utf-8"
print("espeak-ng library paths set up successfully")
except Exception as e:
print(f"Error setting up espeak: {e}")
print("\nPlease ensure espeak-ng is installed:")
print("- Windows: Download from https://github.com/espeak-ng/espeak-ng/releases")
print("- macOS: brew install espeak-ng")
print("- Linux: sudo apt install espeak-ng")
raise e
def import_module_from_path(module_name, module_path):
"""Import a module from file path."""
try:
spec = spec_from_file_location(module_name, module_path)
module = module_from_spec(spec)
sys.modules[module_name] = module
spec.loader.exec_module(module)
return module
except Exception as e:
print(f"Error importing module {module_name}: {e}")
raise e
def build_model(model_file, device='cpu'):
"""Build the Kokoro model following official implementation."""
try:
setup_espeak()
repo_id = "hexgrad/Kokoro-82M"
# Download all required files
model_path = hf_hub_download(repo_id=repo_id, filename="kokoro-v0_19.pth")
kokoro_py = hf_hub_download(repo_id=repo_id, filename="kokoro.py")
models_py = hf_hub_download(repo_id=repo_id, filename="models.py")
istftnet_py = hf_hub_download(repo_id=repo_id, filename="istftnet.py")
plbert_py = hf_hub_download(repo_id=repo_id, filename="plbert.py")
config_json = hf_hub_download(repo_id=repo_id, filename="config.json")
# Import required modules
print("Importing plbert module...")
plbert_module = import_module_from_path("plbert", plbert_py)
print("Importing istftnet module...")
istftnet_module = import_module_from_path("istftnet", istftnet_py)
print("Importing models module...")
models_module = import_module_from_path("models", models_py)
print("Importing kokoro module...")
kokoro_module = import_module_from_path("kokoro", kokoro_py)
# Test phonemizer
from phonemizer import phonemize
test_phonemes = phonemize("Hello")
print(f"Phonemizer test successful: 'Hello' -> {test_phonemes}")
# Build model
print("Building model...")
model = models_module.build_model(model_path, device)
print(f"Model loaded successfully on {device}")
return model
except Exception as e:
print(f"Error building model: {e}")
import traceback
traceback.print_exc()
raise e
def load_voice(voice_name='af_bella', device='cpu'):
"""Load a voice from the local voices directory."""
try:
voices_path = get_voices_path()
voice_path = os.path.join(voices_path, f"{voice_name}.pt")
# Download voice if it doesn't exist locally
if not os.path.exists(voice_path):
print(f"Downloading voice: {voice_name}")
repo_id = "hexgrad/Kokoro-82M"
hf_hub_download(
repo_id=repo_id,
filename=f"voices/{voice_name}.pt",
local_dir=str(Path(voices_path).parent),
local_dir_use_symlinks=False
)
voice = torch.load(voice_path, weights_only=True).to(device)
print(f"Loaded voice: {voice_name}")
return voice
except Exception as e:
print(f"Error loading voice: {e}")
raise e
def generate_speech(model, text, voice=None, lang='a', device='cpu'):
"""Generate speech using the Kokoro model."""
try:
repo_id = "hexgrad/Kokoro-82M"
kokoro_py = hf_hub_download(repo_id=repo_id, filename="kokoro.py")
kokoro_module = import_module_from_path("kokoro", kokoro_py)
# Generate speech
audio, phonemes = kokoro_module.generate(model, text, voice, lang=lang)
# Handle phonemes encoding
if phonemes:
try:
# Debug info
print(f"Debug - Original phonemes type: {type(phonemes)}")
print(f"Debug - Original phonemes: {repr(phonemes)}")
# Convert to string if it's bytes
if isinstance(phonemes, bytes):
phonemes = phonemes.decode('utf-8', errors='replace')
# If it's a string, ensure it's valid UTF-8
elif isinstance(phonemes, str):
# Replace problematic characters with their ASCII approximations
replacements = {
'É™': 'ə',
'ÊŠ': 'ʊ',
'Ê': 'ʃ',
'æ': 'ae'
}
for old, new in replacements.items():
phonemes = phonemes.replace(old, new)
print(f"Debug - Processed phonemes: {repr(phonemes)}")
except Exception as e:
print(f"Debug - Encoding error: {str(e)}")
# Last resort: strip to ASCII
phonemes = ''.join(c for c in str(phonemes) if ord(c) < 128)
return audio, phonemes
except Exception as e:
print(f"Error generating speech: {e}")
import traceback
traceback.print_exc()
return None, None