Implement audio downloader CLI tool for testing
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -1,5 +1,6 @@
|
||||
**/models
|
||||
**/static
|
||||
**/testdata
|
||||
__pycache__
|
||||
*.mp3
|
||||
*.wav
|
||||
|
||||
94
scripts/download_audio.py
Normal file
94
scripts/download_audio.py
Normal file
@@ -0,0 +1,94 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
YouTube Audio Downloader - Download audio from YouTube videos.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import argparse
|
||||
from loguru import logger
|
||||
import yt_dlp
|
||||
|
||||
# Configure loguru logger
|
||||
logger.remove()
|
||||
logger.add(sys.stderr, format="<level>{level: <8}</level> {message}")
|
||||
|
||||
|
||||
def download_youtube_audio(url, output_dir):
    """
    Download the best available audio stream of a YouTube video and convert it to MP3.

    The audio is extracted with the FFmpegExtractAudio post-processor using a
    preferred quality setting of 192. The file is named after the video title
    and written into ``output_dir``.

    This function does not return on failure: every error path logs a message
    and terminates the process with exit status 1.

    Args:
        url (str): YouTube video URL
        output_dir (str): Directory to save the audio file

    Returns:
        str: Path to the downloaded MP3 file
    """
    # Configure options for MP3 download
    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        'outtmpl': os.path.join(output_dir, "%(title)s.%(ext)s"),
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        try:
            info = ydl.extract_info(url, download=True)
        except yt_dlp.utils.DownloadError as exc:
            # Report download/extraction errors the same way as the other
            # failure paths instead of surfacing a raw traceback.
            logger.error(f"Download failed for {url}: {exc}")
            sys.exit(1)

        if not info or 'title' not in info:
            logger.error(f"Failed to extract info from: {url}")
            sys.exit(1)

        # prepare_filename() yields the name of the originally downloaded
        # stream; the post-processor replaces its extension with ".mp3".
        filename = ydl.prepare_filename(info)
        base, _ = os.path.splitext(filename)
        audio_file = f"{base}.mp3"

        if not os.path.exists(audio_file):
            logger.error(f"Failed to find downloaded file for: {url}")
            sys.exit(1)

        logger.success(f"Downloaded: {os.path.basename(audio_file)}")
        return audio_file
|
||||
|
||||
|
||||
def main():
    """
    Command-line entry point: download every URL given on the command line as MP3.

    Parses the positional ``urls`` and the required ``-o/--output-dir`` option,
    verifies that the output directory already exists, downloads each URL in
    order and logs a summary.

    Exits with status 1 when the output directory does not exist or when at
    least one download did not yield a file.
    """
    # Create argument parser
    parser = argparse.ArgumentParser(
        description="📱 DOWNLOAD_AUDIOS - Download YouTube videos as MP3 audio files",
        formatter_class=argparse.RawTextHelpFormatter
    )

    # Add arguments
    parser.add_argument('urls', nargs='+', help='One or more YouTube URLs')
    parser.add_argument('-o', '--output-dir', required=True,
                        help='Directory to save audio files (must exist)')

    # Parse arguments
    args = parser.parse_args()

    # Check if output directory exists
    if not os.path.isdir(args.output_dir):
        logger.error(f"Output directory '{args.output_dir}' does not exist.")
        sys.exit(1)

    logger.info("📱 DOWNLOAD_AUDIOS")
    logger.info("===================")

    # Download each URL; only count a download as successful when a file
    # path actually came back.
    successful = 0
    failed = []
    for url in args.urls:
        logger.info(f"▶ Processing: {url}")
        if download_youtube_audio(url, args.output_dir):
            successful += 1
        else:
            failed.append(url)

    # Print summary
    logger.info("===================")
    logger.success(f"Successfully downloaded: {successful}")
    if failed:
        logger.error(f"Failed downloads: {len(failed)}")
    logger.info(f"🔊 Audio files saved to: {args.output_dir}")

    # Signal partial failure to the calling shell.
    if failed:
        sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -6,6 +6,7 @@ BASE_DIR = Path(__file__).parent
|
||||
UPLOAD_DIR = BASE_DIR / "uploads"
|
||||
STATIC_DIR = BASE_DIR / "static"
|
||||
TEMPLATES_DIR = BASE_DIR / "templates"
|
||||
TRANSCRIPT_DIR = BASE_DIR / "transcripts"
|
||||
|
||||
# Configuration
|
||||
ALLOWED_EXTENSIONS = {"mp3", "wav"}
|
||||
@@ -13,3 +14,4 @@ ALLOWED_EXTENSIONS = {"mp3", "wav"}
|
||||
# Create required directories
|
||||
os.makedirs(UPLOAD_DIR, exist_ok=True)
|
||||
os.makedirs(STATIC_DIR, exist_ok=True)
|
||||
os.makedirs(TRANSCRIPT_DIR, exist_ok=True)
|
||||
39
src/main.py
39
src/main.py
@@ -8,7 +8,7 @@ from fastapi.responses import HTMLResponse
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
from fastapi.templating import Jinja2Templates
|
||||
|
||||
from config import UPLOAD_DIR, ALLOWED_EXTENSIONS, STATIC_DIR, TEMPLATES_DIR
|
||||
from config import UPLOAD_DIR, ALLOWED_EXTENSIONS, STATIC_DIR, TEMPLATES_DIR, TRANSCRIPT_DIR
|
||||
from worker import audio_processor, FileStatus
|
||||
|
||||
|
||||
@@ -67,15 +67,27 @@ async def upload_file(file: UploadFile = File(...)):
|
||||
|
||||
|
||||
def get_file_list():
    """Describe every uploaded file together with its processing state.

    Returns:
        list[dict]: One entry per regular file found in UPLOAD_DIR, with the
        keys "name", "size" (``st_size``), "created" (``st_ctime``),
        "status" (the value of the status reported by ``audio_processor``)
        and "has_transcript" (whether ``<stem>.txt`` exists in
        TRANSCRIPT_DIR).
    """
    def describe(entry):
        # Build the metadata record for a single uploaded file.
        stats = entry.stat()
        transcript = TRANSCRIPT_DIR / f"{entry.stem}.txt"
        return {
            "name": entry.name,
            "size": stats.st_size,
            "created": stats.st_ctime,
            "status": audio_processor.get_status(entry.name).value,
            "has_transcript": transcript.exists(),
        }

    return [describe(entry) for entry in UPLOAD_DIR.iterdir() if entry.is_file()]
|
||||
|
||||
@@ -86,6 +98,25 @@ async def list_files():
|
||||
return {"files": get_file_list()}
|
||||
|
||||
|
||||
@app.post("/process/{filename}")
async def process_file(filename: str):
    """API endpoint to manually trigger processing of an uploaded file.

    Args:
        filename: Name of a file stored directly inside UPLOAD_DIR.

    Returns:
        dict: ``{"filename": ..., "status": "processing_started"}``.

    Raises:
        HTTPException: 404 when ``filename`` does not name a regular file
            directly inside UPLOAD_DIR; 500 when the processor reports failure.
    """
    file_path = UPLOAD_DIR / filename

    # Accept only regular files that live directly in the upload directory.
    # A bare exists() check would also accept directories (e.g. "..") and
    # names that resolve to a location outside UPLOAD_DIR.
    if file_path.parent != UPLOAD_DIR or not file_path.is_file():
        raise HTTPException(status_code=404, detail=f"File '{filename}' not found")

    # Process the file
    # NOTE(review): this awaits the processor before responding, so
    # "processing_started" may be sent only after processing finished —
    # confirm against AudioProcessor.process_file.
    success = await audio_processor.process_file(filename)
    if not success:
        raise HTTPException(status_code=500, detail=f"Failed to process '{filename}'")

    # Broadcast updated status
    await broadcast_file_list()

    return {"filename": filename, "status": "processing_started"}
|
||||
|
||||
@app.websocket("/ws")
|
||||
async def websocket_endpoint(websocket: WebSocket):
|
||||
"""WebSocket endpoint for real-time file updates"""
|
||||
|
||||
@@ -101,6 +101,22 @@
|
||||
background-color: #fcf8e3;
|
||||
color: #8a6d3b;
|
||||
}
|
||||
.status-badge {
|
||||
display: inline-block;
|
||||
padding: 2px 6px;
|
||||
border-radius: 10px;
|
||||
font-size: 0.8em;
|
||||
margin-right: 8px;
|
||||
}
|
||||
.status-pending { background-color: #fcf8e3; color: #8a6d3b; }
|
||||
.status-processing { background-color: #d9edf7; color: #31708f; }
|
||||
.status-completed { background-color: #dff0d8; color: #3c763d; }
|
||||
.status-failed { background-color: #f2dede; color: #a94442; }
|
||||
.process-btn {
|
||||
background-color: #5bc0de;
|
||||
padding: 3px 8px;
|
||||
font-size: 0.8em;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
@@ -241,35 +257,73 @@
|
||||
}
|
||||
|
||||
/**
 * Render the list of uploaded files into the #fileList element.
 *
 * Each entry shows the file name, size, upload date, a status badge, a
 * transcript indicator and, unless the file is already completed or being
 * processed, a "Process" button.
 *
 * @param {Array<{name: string, size: number, created: number,
 *                status?: string, has_transcript?: boolean}>} files
 *        File descriptors as delivered by the server; `created` is in seconds.
 */
function updateFileList(files) {
    const fileListElement = document.getElementById('fileList');

    if (!files || files.length === 0) {
        fileListElement.innerHTML = '<p>No files uploaded yet.</p>';
        return;
    }

    // File names come from user uploads, so every value is escaped before it
    // is interpolated into markup assigned to innerHTML.
    const escapeHtml = (value) => String(value)
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;')
        .replace(/'/g, '&#39;');

    let html = '';
    files.forEach(file => {
        // Format the file size
        const fileSizeKB = Math.round(file.size / 1024);
        let fileSizeStr = fileSizeKB + ' KB';
        if (fileSizeKB >= 1024) {
            const fileSizeMB = (fileSizeKB / 1024).toFixed(1);
            fileSizeStr = fileSizeMB + ' MB';
        }

        // Format the date
        const date = new Date(file.created * 1000);
        const dateStr = date.toLocaleString();

        const safeName = escapeHtml(file.name);
        const status = file.status || 'pending';
        const safeStatus = escapeHtml(status);

        // Status badge
        const statusClass = `status-${safeStatus}`;

        // Process button (only show if not completed/processing).
        // The name travels in a data attribute rather than inside a JS string
        // literal, so quotes in a file name cannot break the handler.
        const processButton = status === 'completed' || status === 'processing'
            ? ''
            : `<button class="process-btn" data-filename="${safeName}" onclick="triggerProcessing(this.dataset.filename)">Process</button>`;

        // Transcript indicator
        const transcriptInfo = file.has_transcript
            ? '<span class="transcript-available">✓ Transcript</span>'
            : '';

        html += `<div class="file-item">
            <div>
                <div class="file-name">${safeName}</div>
                <div class="file-meta">Size: ${fileSizeStr} | Uploaded: ${escapeHtml(dateStr)}</div>
            </div>
            <div>
                <span class="status-badge ${statusClass}">${safeStatus}</span>
                ${transcriptInfo}
                ${processButton}
            </div>
        </div>`;
    });

    fileListElement.innerHTML = html;
}
|
||||
|
||||
// Add this function after updateFileList
|
||||
/**
 * Ask the server to process an uploaded file and report the outcome.
 *
 * Sends POST /process/<filename> and passes a success or error message to
 * showResult().
 *
 * @param {string} filename Name of the uploaded file to process.
 */
async function triggerProcessing(filename) {
    try {
        // Encode the name so spaces, '#', '?', '%' etc. stay part of the
        // path segment instead of altering the request URL.
        const response = await fetch(`/process/${encodeURIComponent(filename)}`, {
            method: 'POST'
        });

        if (response.ok) {
            showResult(`Processing started for "${filename}"`, true);
            return;
        }

        // Prefer the server-provided detail; fall back to the HTTP status
        // text when the error body is not JSON.
        let detail = response.statusText;
        try {
            const result = await response.json();
            if (result && result.detail) {
                detail = result.detail;
            }
        } catch (parseError) {
            // Non-JSON error body: keep the status text.
        }
        showResult(`Error: ${detail}`, false);
    } catch (error) {
        showResult(`Processing request failed: ${error.message}`, false);
    }
}
|
||||
|
||||
// Initialize WebSocket connection when page loads
|
||||
document.addEventListener('DOMContentLoaded', connectWebSocket);
|
||||
|
||||
@@ -7,7 +7,7 @@ from enum import Enum
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from config import UPLOAD_DIR
|
||||
from config import UPLOAD_DIR, TRANSCRIPT_DIR
|
||||
|
||||
|
||||
class FileStatus(Enum):
|
||||
@@ -61,8 +61,21 @@ class AudioProcessor:
|
||||
self.file_status[filename] = FileStatus.PROCESSING
|
||||
|
||||
try:
|
||||
# TODO: Implement actual processing logic here
|
||||
await asyncio.sleep(3) # For now, just simulate processing with a delay
|
||||
### mock starts here
|
||||
# TODO: implement real transcription logic here (as stt engine)
|
||||
# Mock transcription process by creating a text file with the same name
|
||||
base_name = Path(filename).stem # Get filename without extension
|
||||
transcript_path = TRANSCRIPT_DIR / f"{base_name}.txt"
|
||||
|
||||
# Create a simple mock transcript file
|
||||
with open(transcript_path, "w") as f:
|
||||
f.write(f"Mock transcript for {filename}\n")
|
||||
f.write("This is a placeholder for the actual transcription.\n")
|
||||
f.write(f"Generated at: {time.strftime('%Y-%m-%d %H:%M:%S')}")
|
||||
|
||||
# Simulate some processing time
|
||||
await asyncio.sleep(3)
|
||||
### mock ends here
|
||||
|
||||
# Mark as completed
|
||||
self.file_status[filename] = FileStatus.COMPLETED
|
||||
|
||||
Reference in New Issue
Block a user