diff --git a/README.md b/README.md index 7649fc9..cec2af4 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Kokoro TTS Local -A local implementation of the Kokoro Text-to-Speech model, featuring dynamic module loading and automatic dependency management. +A local implementation of the Kokoro Text-to-Speech model, featuring dynamic module loading, automatic dependency management, and a web interface. ## Current Status @@ -12,6 +12,7 @@ The project has been updated with: - Improved error handling and debugging - Interactive CLI interface - Cross-platform setup scripts +- Web interface with Gradio ## Features @@ -24,12 +25,24 @@ The project has been updated with: - Dynamic module loading from Hugging Face - Comprehensive error handling and logging - Cross-platform support (Windows, Linux, macOS) +- **NEW: Web Interface Features** + - Modern, user-friendly UI + - Real-time generation progress + - Multiple output formats (WAV, MP3, AAC) + - Network sharing capabilities + - Audio playback and download + - Voice selection dropdown + - Detailed process logging ## Prerequisites - Python 3.8 or higher - Git (for cloning the repository) - Internet connection (for initial model download) +- FFmpeg (required for MP3/AAC conversion): + - Windows: Install FFmpeg separately and add its `bin` folder to PATH (pydub does not bundle FFmpeg) + - Linux: `sudo apt-get install ffmpeg` + - macOS: `brew install ffmpeg` ## Dependencies @@ -42,21 +55,37 @@ munch soundfile huggingface-hub espeakng-loader +gradio>=4.0.0 +pydub # For audio format conversion ``` ## Setup ### Windows -Run the PowerShell setup script: ```powershell +# Clone the repository +git clone https://github.com/PierrunoYT/Kokoro-TTS-Local.git +cd Kokoro-TTS-Local + +# Run the setup script .\setup.ps1 ``` ### Linux/macOS -Run the bash setup script: ```bash +# Clone the repository +git clone https://github.com/PierrunoYT/Kokoro-TTS-Local.git +cd Kokoro-TTS-Local + +# Run the setup script chmod +x setup.sh ./setup.sh + +# Install FFmpeg (if needed) +# Linux: +sudo apt-get 
install ffmpeg +# macOS: +brew install ffmpeg ``` ### Manual Setup @@ -79,34 +108,37 @@ python -m pip install --upgrade pip pip install -r requirements.txt ``` +3. Install system dependencies: +```bash +# Windows +# Install FFmpeg separately and add it to PATH (pydub does not bundle FFmpeg) + +# Linux +sudo apt-get update +sudo apt-get install espeak-ng ffmpeg + +# macOS +brew install espeak ffmpeg +``` + ## Usage -### List Available Voices -To see all available voices from the Hugging Face repository: +### Web Interface ```bash -python tts_demo.py --list-voices +# Start the web interface +python gradio_interface.py ``` +This will: +1. Launch a web interface at http://localhost:7860 +2. Create a public share link (optional) +3. Allow you to: + - Input text to synthesize + - Select from available voices + - Choose output format (WAV/MP3/AAC) + - Monitor generation progress + - Play or download generated audio -### Basic Usage -Run the demo script with default text and voice: -```bash -python tts_demo.py -``` - -### Custom Text -Specify your own text: -```bash -python tts_demo.py --text "Your custom text here" -``` - -### Voice Selection -Choose a different voice (use --list-voices to see available options): -```bash -python tts_demo.py --voice "af" --text "Custom text with specific voice" -``` - -### Interactive Mode -If you run without any arguments, you'll be prompted to enter text interactively: +### Command Line Interface ```bash python tts_demo.py ``` @@ -133,6 +165,11 @@ The script will: - Interactive text input mode - Voice selection and listing - Error handling and user feedback +- `gradio_interface.py`: Web interface implementation + - Modern, responsive UI + - Real-time progress monitoring + - Multiple output formats + - Network sharing capabilities - `setup.ps1`: Windows PowerShell setup script - Environment creation - Dependency installation @@ -156,7 +193,7 @@ The project uses the Kokoro-82M model from Hugging Face: - Sample rate: 22050Hz - Input: Text in any language (English 
def convert_audio(input_path: str, output_path: str, format: str) -> bool:
    """Write the WAV file at ``input_path`` to ``output_path`` in ``format``.

    Args:
        input_path: Path to the source WAV file.
        output_path: Destination path; the caller chooses the file extension.
        format: Requested output format. ``"mp3"`` and ``"aac"`` are
            re-encoded through pydub at 192 kbit/s. Any other value
            (including ``"wav"``) is treated as WAV and the source file is
            copied unchanged. Matching is case-insensitive.

    Returns:
        True on success, False if the copy or the encode failed. The error
        is printed instead of raised so callers only need the boolean.
    """
    # pydub hands ``format`` to ffmpeg's ``-f`` option, which expects a muxer
    # name. ffmpeg has no "aac" muxer; a raw AAC stream is written with the
    # "adts" muxer, so "aac" has to be translated before exporting.
    ffmpeg_muxers = {"mp3": "mp3", "aac": "adts"}

    try:
        requested = (format or "wav").strip().lower()
        muxer = ffmpeg_muxers.get(requested)
        if muxer is None:
            # WAV output needs no decoding: copy the file (and its metadata).
            shutil.copy2(input_path, output_path)
        else:
            audio = AudioSegment.from_wav(input_path)
            audio.export(output_path, format=muxer, bitrate="192k")
        return True
    except Exception as e:
        # Deliberate best-effort boundary: report and signal failure.
        print(f"Error converting audio: {e}")
        return False
str(output_path) else: - data, samplerate = sf.read("output.wav") - sf.write(str(output_path), data, samplerate) - - os.remove("output.wav") - logs_text += f"✅ Saved: {output_path}\n" - yield logs_text, str(output_path) + logs_text += "❌ Audio conversion failed\n" + yield logs_text, None except Exception as e: logs_text += f"❌ Error: {str(e)}\n" diff --git a/requirements.txt b/requirements.txt index a401a01..91e283e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,4 +5,6 @@ scipy munch soundfile huggingface-hub -espeakng-loader \ No newline at end of file +espeakng-loader +gradio>=4.0.0 +pydub # For audio format conversion \ No newline at end of file