Enhance TTS functionality and improve voice management

- Refactored the TTS generation process to initialize the model globally and load voices dynamically, improving efficiency and usability.
- Introduced a new load_and_validate_voice function to ensure requested voices exist before loading, enhancing error handling.
- Updated generate_tts_with_logs to provide real-time logging during speech generation, including phoneme processing and audio saving.
- Improved audio conversion process with better error handling and temporary file management.
- Set default voice to 'af_bella' in the Gradio interface for improved user experience.
2025-01-27 02:30:25 +03:00 · 2025-01-16 17:03:54 +01:00
parent 3ae6e74c57
commit df828f0409
3 changed files with 116 additions and 48 deletions
--- a/gradio_interface.py
+++ b/gradio_interface.py
@@ -19,7 +19,6 @@ Dependencies:
 """
 import gradio as gr
 import subprocess
 import os
 import sys
 import platform
@@ -28,11 +27,20 @@ import shutil
 from pathlib import Path
 import soundfile as sf
 from pydub import AudioSegment
-from models import list_available_voices
+import torch
 from models import (
    list_available_voices, build_model, load_voice,
    generate_speech, load_and_validate_voice
 )
 # Global configuration
 CONFIG_FILE = "tts_config.json"  # Stores user preferences and paths
 DEFAULT_OUTPUT_DIR = "outputs"    # Directory for generated audio files
 SAMPLE_RATE = 22050
 # Initialize model globally
 device = 'cuda' if torch.cuda.is_available() else 'cpu'
 model = None
 def get_available_voices():
    """Get list of available voice models."""
@@ -59,57 +67,55 @@ def convert_audio(input_path: str, output_path: str, format: str):
        print(f"Error converting audio: {e}")
        return False
-def generate_tts_with_logs(voice, text, format):
+def generate_tts_with_logs(voice_name, text, format):
    """Generate TTS audio with real-time logging and format conversion."""
    global model
    if not text.strip():
        return "❌ Error: Text required", None
    logs_text = ""
    try:
-        # Use sys.executable to ensure correct Python interpreter
+        # Initialize model if not done yet
-        cmd = [sys.executable, "tts_demo.py", "--text", text, "--voice", voice]
+        if model is None:
            logs_text += "Loading model...\n"
            model = build_model("kokoro-v0_19.pth", device)
-        # Use shell=True on Windows
+        # Load voice
-        shell = platform.system().lower() == "windows"
+        logs_text += f"Loading voice: {voice_name}\n"
        yield logs_text, None
        voice = load_and_validate_voice(voice_name, device)
-        process = subprocess.Popen(
+        # Generate speech
-            cmd,
+        logs_text += f"Generating speech for: '{text}'\n"
-            stdout=subprocess.PIPE,
+        yield logs_text, None
-            stderr=subprocess.STDOUT,
+        audio, phonemes = generate_speech(model, text, voice, lang='a', device=device)
            universal_newlines=True,
            shell=shell
        )
-        while True:
+        if audio is not None and phonemes:
-            output = process.stdout.readline()
+            try:
-            if output == '' and process.poll() is not None:
+                logs_text += f"Generated phonemes: {phonemes}\n"
-                break
+            except UnicodeEncodeError:
-            if output:
+                logs_text += "Generated phonemes: [Unicode display error]\n"
                logs_text += output
                yield logs_text, None
        if process.returncode != 0:
            logs_text += "❌ Generation failed\n"
            yield logs_text, None
            return
-        if not os.path.exists("output.wav"):
+            # Save temporary WAV file
-            logs_text += "❌ No output generated\n"
+            temp_wav = "output.wav"
-            yield logs_text, None
+            sf.write(temp_wav, audio, SAMPLE_RATE)
-            return
+            
-
+            # Convert to desired format
-        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        filename = f"output_{timestamp}.{format}"
+            filename = f"output_{timestamp}.{format}"
-        os.makedirs(DEFAULT_OUTPUT_DIR, exist_ok=True)
+            os.makedirs(DEFAULT_OUTPUT_DIR, exist_ok=True)
-        output_path = Path(DEFAULT_OUTPUT_DIR) / filename
+            output_path = Path(DEFAULT_OUTPUT_DIR) / filename
-        
+            
-        # Convert audio using pydub
+            if convert_audio(temp_wav, str(output_path), format):
-        if convert_audio("output.wav", str(output_path), format):
+                logs_text += f"✅ Saved: {output_path}\n"
-            logs_text += f"✅ Saved: {output_path}\n"
+                os.remove(temp_wav)
-            os.remove("output.wav")
+                yield logs_text, str(output_path)
-            yield logs_text, str(output_path)
+            else:
                logs_text += "❌ Audio conversion failed\n"
                yield logs_text, None
        else:
-            logs_text += "❌ Audio conversion failed\n"
+            logs_text += "❌ Failed to generate audio\n"
            yield logs_text, None
    except Exception as e:
@@ -180,7 +186,7 @@ def create_interface(server_name="0.0.0.0", server_port=7860):
                    voice = gr.Dropdown(
                        choices=get_available_voices(),
                        label="🗣️ Select Voice",
-                        value=None
+                        value="af_bella"
                    )
                    format = gr.Radio(
                        choices=["wav", "mp3", "aac"],
--- a/models.py
+++ b/models.py
@@ -15,13 +15,31 @@ warnings.filterwarnings("ignore", category=FutureWarning, module="torch.nn.utils
 warnings.filterwarnings("ignore", category=UserWarning, module="torch.nn.modules.rnn")
 os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
-__all__ = ['list_available_voices', 'build_model', 'load_voice', 'generate_speech']
+__all__ = ['list_available_voices', 'build_model', 'load_voice', 'generate_speech', 'load_and_validate_voice']
 def get_voices_path():
     """Return, as a string, the 'voices' directory located next to this module."""
     # Voice files are kept in a 'voices' folder beside this file (the project root)
     module_dir = Path(__file__).parent
     voices_dir = module_dir / "voices"
     return str(voices_dir)
 def load_and_validate_voice(voice_name: str, device: str) -> torch.Tensor:
     """Confirm the requested voice is available, then load it.
     Args:
         voice_name: Name of the voice to load; must appear in list_available_voices()
         device: Device to load the voice on ('cuda' or 'cpu')
     Returns:
         The result of load_voice(voice_name, device)
     Raises:
         ValueError: If voice_name is not among the available voices
     """
     known_voices = list_available_voices()
     # Happy path first: a known voice is handed straight to the loader
     if voice_name in known_voices:
         return load_voice(voice_name, device)
     # Unknown voice: report what is actually available so the caller can correct it
     raise ValueError(f"Voice '{voice_name}' not found. Available voices: {', '.join(known_voices)}")
 def list_available_voices():
    """List all available voices from the official voicepacks."""
    voices_path = get_voices_path()
@@ -109,6 +127,12 @@ def setup_espeak():
        EspeakWrapper.set_library(lib_path)
        EspeakWrapper.data_path = data_path
        # Configure phonemizer for UTF-8
        os.environ["PHONEMIZER_ESPEAK_LIBRARY"] = lib_path
        os.environ["PHONEMIZER_ESPEAK_PATH"] = data_path
        os.environ["PYTHONIOENCODING"] = "utf-8"
        print("espeak-ng library paths set up successfully")
    except Exception as e:
@@ -203,7 +227,37 @@ def generate_speech(model, text, voice=None, lang='a', device='cpu'):
        kokoro_py = hf_hub_download(repo_id=repo_id, filename="kokoro.py")
        kokoro_module = import_module_from_path("kokoro", kokoro_py)
        # Generate speech
        audio, phonemes = kokoro_module.generate(model, text, voice, lang=lang)
        # Handle phonemes encoding
        if phonemes:
            try:
                # Debug info
                print(f"Debug - Original phonemes type: {type(phonemes)}")
                print(f"Debug - Original phonemes: {repr(phonemes)}")
                # Convert to string if it's bytes
                if isinstance(phonemes, bytes):
                    phonemes = phonemes.decode('utf-8', errors='replace')
                # If it's a string, ensure it's valid UTF-8
                elif isinstance(phonemes, str):
                    # Replace problematic characters with their ASCII approximations
                    replacements = {
                        'É™': 'ə',
                        'ÊŠ': 'ʊ',
                        'Ê': 'ʃ',
                        'æ': 'ae'
                    }
                    for old, new in replacements.items():
                        phonemes = phonemes.replace(old, new)
                print(f"Debug - Processed phonemes: {repr(phonemes)}")
            except Exception as e:
                print(f"Debug - Encoding error: {str(e)}")
                # Last resort: strip to ASCII
                phonemes = ''.join(c for c in str(phonemes) if ord(c) < 128)
        return audio, phonemes
    except Exception as e:
        print(f"Error generating speech: {e}")
--- a/tts_demo.py
+++ b/tts_demo.py
@@ -87,10 +87,18 @@ def main() -> None:
            pbar.update(1)
        if audio is not None:
-            print(f"Generated phonemes: {phonemes}")
+            try:
-            output_path = Path(args.output)
+                if phonemes:
-            sf.write(output_path, audio, SAMPLE_RATE)
+                    try:
-            print(f"\nAudio saved to {output_path.absolute()}")
+                        print(f"Generated phonemes: {phonemes}")
                    except UnicodeEncodeError:
                        print("Generated phonemes: [Unicode display error - phonemes were generated but cannot be displayed]")
                output_path = Path(args.output)
                sf.write(output_path, audio, SAMPLE_RATE)
                print(f"\nAudio saved to {output_path.absolute()}")
            except Exception as e:
                print(f"Error saving output: {e}")
                print("Audio generation was successful, but saving failed.")
        else:
            print("Error: Failed to generate audio")