Mirror of https://github.com/PierrunoYT/Kokoro-TTS-Local.git (synced 2025-01-27 02:30:25 +03:00)
Enhance TTS functionality and improve voice management
- Refactored the TTS generation process to initialize the model globally and load voices dynamically, improving efficiency and usability.
- Introduced a new load_and_validate_voice function to ensure requested voices exist before loading, enhancing error handling.
- Updated generate_tts_with_logs to provide real-time logging during speech generation, including phoneme processing and audio saving.
- Improved audio conversion process with better error handling and temporary file management.
- Set default voice to 'af_bella' in the Gradio interface for improved user experience.
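For orientation, here is a minimal sketch of the call flow this commit introduces, using only names that appear in the diffs below (build_model, load_and_validate_voice, generate_speech, the kokoro-v0_19.pth checkpoint, the 'af_bella' voice, and the 22050 Hz sample rate). It illustrates how the refactored pieces fit together and is not code from the commit itself.

# Sketch only: intended usage of the refactored models API (names taken from the diffs below).
import torch
import soundfile as sf
from models import build_model, load_and_validate_voice, generate_speech

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = build_model("kokoro-v0_19.pth", device)        # load the checkpoint once, reuse for every request
voice = load_and_validate_voice("af_bella", device)    # raises ValueError if the voice is not installed
audio, phonemes = generate_speech(model, "Hello, world.", voice, lang='a', device=device)
if audio is not None:
    sf.write("output.wav", audio, 22050)               # 22050 matches SAMPLE_RATE in the interface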
@@ -19,7 +19,6 @@ Dependencies:
 """
 
 import gradio as gr
-import subprocess
 import os
 import sys
 import platform
@@ -28,11 +27,20 @@ import shutil
 from pathlib import Path
 import soundfile as sf
 from pydub import AudioSegment
-from models import list_available_voices
+import torch
+from models import (
+    list_available_voices, build_model, load_voice,
+    generate_speech, load_and_validate_voice
+)
 
 # Global configuration
 CONFIG_FILE = "tts_config.json"  # Stores user preferences and paths
 DEFAULT_OUTPUT_DIR = "outputs"  # Directory for generated audio files
+SAMPLE_RATE = 22050
 
+# Initialize model globally
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+model = None
+
 def get_available_voices():
     """Get list of available voice models."""
@@ -59,57 +67,55 @@ def convert_audio(input_path: str, output_path: str, format: str):
         print(f"Error converting audio: {e}")
         return False
 
-def generate_tts_with_logs(voice, text, format):
+def generate_tts_with_logs(voice_name, text, format):
     """Generate TTS audio with real-time logging and format conversion."""
+    global model
+
     if not text.strip():
         return "❌ Error: Text required", None
 
     logs_text = ""
     try:
-        # Use sys.executable to ensure correct Python interpreter
-        cmd = [sys.executable, "tts_demo.py", "--text", text, "--voice", voice]
-
-        # Use shell=True on Windows
-        shell = platform.system().lower() == "windows"
-
-        process = subprocess.Popen(
-            cmd,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.STDOUT,
-            universal_newlines=True,
-            shell=shell
-        )
-
-        while True:
-            output = process.stdout.readline()
-            if output == '' and process.poll() is not None:
-                break
-            if output:
-                logs_text += output
-                yield logs_text, None
-
-        if process.returncode != 0:
-            logs_text += "❌ Generation failed\n"
-            yield logs_text, None
-            return
-
-        if not os.path.exists("output.wav"):
-            logs_text += "❌ No output generated\n"
-            yield logs_text, None
-            return
-
-        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        filename = f"output_{timestamp}.{format}"
-        os.makedirs(DEFAULT_OUTPUT_DIR, exist_ok=True)
-        output_path = Path(DEFAULT_OUTPUT_DIR) / filename
-
-        # Convert audio using pydub
-        if convert_audio("output.wav", str(output_path), format):
-            logs_text += f"✅ Saved: {output_path}\n"
-            os.remove("output.wav")
-            yield logs_text, str(output_path)
-        else:
-            logs_text += "❌ Audio conversion failed\n"
-            yield logs_text, None
+        # Initialize model if not done yet
+        if model is None:
+            logs_text += "Loading model...\n"
+            model = build_model("kokoro-v0_19.pth", device)
+
+        # Load voice
+        logs_text += f"Loading voice: {voice_name}\n"
+        yield logs_text, None
+        voice = load_and_validate_voice(voice_name, device)
+
+        # Generate speech
+        logs_text += f"Generating speech for: '{text}'\n"
+        yield logs_text, None
+        audio, phonemes = generate_speech(model, text, voice, lang='a', device=device)
+
+        if audio is not None and phonemes:
+            try:
+                logs_text += f"Generated phonemes: {phonemes}\n"
+            except UnicodeEncodeError:
+                logs_text += "Generated phonemes: [Unicode display error]\n"
+
+            # Save temporary WAV file
+            temp_wav = "output.wav"
+            sf.write(temp_wav, audio, SAMPLE_RATE)
+
+            # Convert to desired format
+            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+            filename = f"output_{timestamp}.{format}"
+            os.makedirs(DEFAULT_OUTPUT_DIR, exist_ok=True)
+            output_path = Path(DEFAULT_OUTPUT_DIR) / filename
+
+            if convert_audio(temp_wav, str(output_path), format):
+                logs_text += f"✅ Saved: {output_path}\n"
+                os.remove(temp_wav)
+                yield logs_text, str(output_path)
+            else:
+                logs_text += "❌ Audio conversion failed\n"
+                yield logs_text, None
+        else:
+            logs_text += "❌ Failed to generate audio\n"
+            yield logs_text, None
 
     except Exception as e:
@@ -180,7 +186,7 @@ def create_interface(server_name="0.0.0.0", server_port=7860):
             voice = gr.Dropdown(
                 choices=get_available_voices(),
                 label="🗣️ Select Voice",
-                value=None
+                value="af_bella"
             )
             format = gr.Radio(
                 choices=["wav", "mp3", "aac"],
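A note on the streaming behaviour: because generate_tts_with_logs is now a generator that yields (logs, audio) tuples, Gradio streams each yield to the UI, which is what produces the real-time log view. The event wiring itself is outside the hunks shown here; a plausible hookup, with hypothetical component names generate_btn, text, logs, and audio_output (only voice and format appear in the diff above), might look like:

# Hypothetical wiring (not part of this diff): component names other than
# `voice` and `format` are assumptions for illustration.
generate_btn.click(
    fn=generate_tts_with_logs,
    inputs=[voice, text, format],      # order matches generate_tts_with_logs(voice_name, text, format)
    outputs=[logs, audio_output],      # each yield updates both outputs
)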
models.py (56 changed lines)
@@ -15,13 +15,31 @@ warnings.filterwarnings("ignore", category=FutureWarning, module="torch.nn.utils
 warnings.filterwarnings("ignore", category=UserWarning, module="torch.nn.modules.rnn")
 os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
 
-__all__ = ['list_available_voices', 'build_model', 'load_voice', 'generate_speech']
+__all__ = ['list_available_voices', 'build_model', 'load_voice', 'generate_speech', 'load_and_validate_voice']
 
 def get_voices_path():
     """Get the path where voice files are stored."""
     # Store voices in a 'voices' directory in the project root
     return str(Path(__file__).parent / "voices")
 
+def load_and_validate_voice(voice_name: str, device: str) -> torch.Tensor:
+    """Load and validate the requested voice.
+
+    Args:
+        voice_name: Name of the voice to load
+        device: Device to load the voice on ('cuda' or 'cpu')
+
+    Returns:
+        Loaded voice tensor
+
+    Raises:
+        ValueError: If the requested voice doesn't exist
+    """
+    available_voices = list_available_voices()
+    if voice_name not in available_voices:
+        raise ValueError(f"Voice '{voice_name}' not found. Available voices: {', '.join(available_voices)}")
+    return load_voice(voice_name, device)
+
 def list_available_voices():
     """List all available voices from the official voicepacks."""
     voices_path = get_voices_path()
@@ -109,6 +127,12 @@ def setup_espeak():
 
         EspeakWrapper.set_library(lib_path)
         EspeakWrapper.data_path = data_path
+
+        # Configure phonemizer for UTF-8
+        os.environ["PHONEMIZER_ESPEAK_LIBRARY"] = lib_path
+        os.environ["PHONEMIZER_ESPEAK_PATH"] = data_path
+        os.environ["PYTHONIOENCODING"] = "utf-8"
+
         print("espeak-ng library paths set up successfully")
 
     except Exception as e:
@@ -203,7 +227,37 @@ def generate_speech(model, text, voice=None, lang='a', device='cpu'):
         kokoro_py = hf_hub_download(repo_id=repo_id, filename="kokoro.py")
         kokoro_module = import_module_from_path("kokoro", kokoro_py)
 
+        # Generate speech
         audio, phonemes = kokoro_module.generate(model, text, voice, lang=lang)
+
+        # Handle phonemes encoding
+        if phonemes:
+            try:
+                # Debug info
+                print(f"Debug - Original phonemes type: {type(phonemes)}")
+                print(f"Debug - Original phonemes: {repr(phonemes)}")
+
+                # Convert to string if it's bytes
+                if isinstance(phonemes, bytes):
+                    phonemes = phonemes.decode('utf-8', errors='replace')
+                # If it's a string, ensure it's valid UTF-8
+                elif isinstance(phonemes, str):
+                    # Replace problematic characters with their ASCII approximations
+                    replacements = {
+                        'É™': 'ə',
+                        'ÊŠ': 'ʊ',
+                        'Ê': 'ʃ',
+                        'æ': 'ae'
+                    }
+                    for old, new in replacements.items():
+                        phonemes = phonemes.replace(old, new)
+
+                print(f"Debug - Processed phonemes: {repr(phonemes)}")
+            except Exception as e:
+                print(f"Debug - Encoding error: {str(e)}")
+                # Last resort: strip to ASCII
+                phonemes = ''.join(c for c in str(phonemes) if ord(c) < 128)
+
         return audio, phonemes
     except Exception as e:
         print(f"Error generating speech: {e}")
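A note on the replacements table added above: the keys appear to be cp1252 mojibake forms of the IPA symbols they map to (UTF-8 bytes mis-decoded as cp1252), so the table seems to undo that corruption rather than strip the symbols; only the 'æ' → 'ae' entry is a true ASCII approximation. A quick check of that reading of the keys:

# Assumption check: 'ə' encoded as UTF-8 and mis-decoded as cp1252 yields 'É™'.
assert 'ə'.encode('utf-8').decode('cp1252') == 'É™'
assert 'ʊ'.encode('utf-8').decode('cp1252') == 'ÊŠ'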
tts_demo.py (16 changed lines)
@@ -87,10 +87,18 @@ def main() -> None:
                 pbar.update(1)
 
         if audio is not None:
-            print(f"Generated phonemes: {phonemes}")
-            output_path = Path(args.output)
-            sf.write(output_path, audio, SAMPLE_RATE)
-            print(f"\nAudio saved to {output_path.absolute()}")
+            try:
+                if phonemes:
+                    try:
+                        print(f"Generated phonemes: {phonemes}")
+                    except UnicodeEncodeError:
+                        print("Generated phonemes: [Unicode display error - phonemes were generated but cannot be displayed]")
+                output_path = Path(args.output)
+                sf.write(output_path, audio, SAMPLE_RATE)
+                print(f"\nAudio saved to {output_path.absolute()}")
+            except Exception as e:
+                print(f"Error saving output: {e}")
+                print("Audio generation was successful, but saving failed.")
         else:
             print("Error: Failed to generate audio")
 