#!/usr/bin/env python3
"""
Faster Whisper-based Speech Recognition MCP Service

Provides high-performance audio transcription with batch-processing acceleration
and multiple output formats.
"""

import logging
from typing import Optional

from mcp.server.fastmcp import FastMCP

from model_manager import get_model_info
from transcriber import transcribe_audio, batch_transcribe

# Logging configuration
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Create the FastMCP server instance
mcp = FastMCP(
    name="fast-whisper-mcp-server",
    version="0.1.1",
    dependencies=[
        "faster-whisper>=0.9.0",
        "torch==2.6.0+cu126",
        "torchaudio==2.6.0+cu126",
        "numpy>=1.20.0",
    ],
)
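
# Note: `dependencies` is declarative metadata consumed by MCP tooling (e.g. when
# the server is installed via `mcp install`); it does not install packages at
# runtime. The torch/torchaudio pins assume a CUDA 12.6 (cu126) environment.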

@mcp.tool()
def get_model_info_api() -> str:
    """Get available Whisper model information."""
    return get_model_info()
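
# FastMCP builds each tool's input schema from the function's type hints and uses
# its docstring as the tool description, so the Args sections below also serve as
# client-facing documentation.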

@mcp.tool()
def transcribe(audio_path: str, model_name: str = "large-v3", device: str = "auto",
               compute_type: str = "auto", language: Optional[str] = None,
               output_format: str = "vtt", beam_size: int = 5, temperature: float = 0.0,
               initial_prompt: Optional[str] = None,
               output_directory: Optional[str] = None) -> str:
    """
    Transcribe an audio file using Faster Whisper.

    Args:
        audio_path: Path to the audio file
        model_name: Model name (tiny, base, small, medium, large-v1, large-v2, large-v3)
        device: Execution device (cpu, cuda, auto)
        compute_type: Computation type (float16, int8, auto)
        language: Language code (e.g. zh, en, ja); auto-detected by default
        output_format: Output format (vtt, srt, json, or txt)
        beam_size: Beam-search width; larger values may improve accuracy at the cost of speed
        temperature: Sampling temperature; 0 means greedy decoding
        initial_prompt: Initial prompt text that can help the model better understand the context
        output_directory: Output directory path; defaults to the audio file's directory

    Returns:
        str: Transcription result in the requested output format
    """
    return transcribe_audio(
        audio_path=audio_path,
        model_name=model_name,
        device=device,
        compute_type=compute_type,
        language=language,
        output_format=output_format,
        beam_size=beam_size,
        temperature=temperature,
        initial_prompt=initial_prompt,
        output_directory=output_directory,
    )
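
# Client-side sketch (hypothetical, for illustration only): invoking the
# `transcribe` tool over stdio with the official `mcp` Python client. The server
# script name below is an assumption, not part of this module.
#
#     from mcp import ClientSession, StdioServerParameters
#     from mcp.client.stdio import stdio_client
#
#     params = StdioServerParameters(command="python", args=["whisper_server.py"])
#     async with stdio_client(params) as (read, write):
#         async with ClientSession(read, write) as session:
#             await session.initialize()
#             result = await session.call_tool(
#                 "transcribe",
#                 {"audio_path": "/path/to/audio.wav", "output_format": "srt"},
#             )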

@mcp.tool()
def batch_transcribe_audio(audio_folder: str, output_folder: Optional[str] = None,
                           model_name: str = "large-v3", device: str = "auto",
                           compute_type: str = "auto", language: Optional[str] = None,
                           output_format: str = "vtt", beam_size: int = 5,
                           temperature: float = 0.0, initial_prompt: Optional[str] = None,
                           parallel_files: int = 1) -> str:
    """
    Batch-transcribe all audio files in a folder.

    Args:
        audio_folder: Path to the folder containing the audio files
        output_folder: Output folder path; defaults to a 'transcript' subfolder of audio_folder
        model_name: Model name (tiny, base, small, medium, large-v1, large-v2, large-v3)
        device: Execution device (cpu, cuda, auto)
        compute_type: Computation type (float16, int8, auto)
        language: Language code (e.g. zh, en, ja); auto-detected by default
        output_format: Output format (vtt, srt, json, or txt)
        beam_size: Beam-search width; larger values may improve accuracy at the cost of speed
        temperature: Sampling temperature; 0 means greedy decoding
        initial_prompt: Initial prompt text that can help the model better understand the context
        parallel_files: Number of files to process in parallel (only effective in CPU mode)

    Returns:
        str: Batch-processing summary, including processing time and success rate
    """
    return batch_transcribe(
        audio_folder=audio_folder,
        output_folder=output_folder,
        model_name=model_name,
        device=device,
        compute_type=compute_type,
        language=language,
        output_format=output_format,
        beam_size=beam_size,
        temperature=temperature,
        initial_prompt=initial_prompt,
        parallel_files=parallel_files,
    )
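
# Continuing the client sketch above, the batch tool is invoked the same way
# (hypothetical values):
#
#     result = await session.call_tool(
#         "batch_transcribe_audio",
#         {"audio_folder": "/data/recordings", "output_format": "txt",
#          "parallel_files": 4},
#     )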

if __name__ == "__main__":
    logger.info("Starting MCP server for Whisper speech-to-text transcription")
    mcp.run()