Revert "Revert "feat: add transcript download functionality""

This commit is contained in:
Kevin Watt
2025-05-30 12:03:04 +08:00
committed by GitHub
parent 0de9308a41
commit f27d22eb81
5 changed files with 181 additions and 3 deletions

View File

@@ -71,6 +71,12 @@ pip install yt-dlp
* Inputs:
* `url` (string, required): URL of the video
* **download_transcript**
* Download and clean video subtitles to produce a plain text transcript without timestamps or formatting
* Inputs:
* `url` (string, required): URL of the video
* `language` (string, optional): Language code (e.g., 'en', 'zh-Hant', 'ja'). Defaults to the server's configured default subtitle language (normally 'en')
## Usage Examples
Ask your LLM to:
@@ -80,6 +86,8 @@ Ask your LLM to:
"Download Chinese subtitles from this video: https://youtube.com/watch?v=..."
"Download this video in 1080p: https://youtube.com/watch?v=..."
"Download audio from this YouTube video: https://youtube.com/watch?v=..."
"Get a clean transcript of this video: https://youtube.com/watch?v=..."
"Download a Spanish transcript from this video: https://youtube.com/watch?v=..."
```
## Manual Start

View File

@@ -3,7 +3,8 @@
import { describe, test, expect } from '@jest/globals';
import * as os from 'os';
import * as path from 'path';
import { listSubtitles, downloadSubtitles } from '../modules/subtitle.js';
import { listSubtitles, downloadSubtitles, downloadTranscript } from '../modules/subtitle.js';
import { cleanSubtitleToTranscript } from '../modules/utils.js';
import { CONFIG } from '../config.js';
import * as fs from 'fs';
@@ -51,4 +52,60 @@ describe('Subtitle Functions', () => {
.toThrow();
});
});
// Tests for downloadTranscript. `testUrl` and `testConfig` are defined
// outside this view (earlier in the test file).
describe('downloadTranscript', () => {
// Happy path: the call must resolve to a non-empty string that no longer
// contains subtitle-format artifacts.
test('downloads and cleans transcript successfully', async () => {
const result = await downloadTranscript(testUrl, 'en', testConfig);
expect(typeof result).toBe('string');
expect(result.length).toBeGreaterThan(0);
// No WebVTT header and no cue timing arrows may survive the cleaning step.
expect(result).not.toContain('WEBVTT');
expect(result).not.toContain('-->');
// With the `m` flag this rejects any line of the result that is digits only.
expect(result).not.toMatch(/^\d+$/m);
// Third argument to `test` is the Jest timeout in milliseconds.
}, 30000);
// Failure path: a string that is not a valid URL must make the promise reject.
test('handles invalid URL', async () => {
await expect(downloadTranscript('invalid-url', 'en', testConfig))
.rejects
.toThrow();
});
});
// Unit tests for cleanSubtitleToTranscript; these operate on inline SRT
// fixtures only.
describe('cleanSubtitleToTranscript', () => {
// Three cues with <i>/<b> markup: sequence numbers, timing lines and tags
// must be removed and the remaining text joined with single spaces.
test('cleans SRT content correctly', () => {
const srtContent = `1
00:00:01,000 --> 00:00:03,000
Hello <i>world</i>
2
00:00:04,000 --> 00:00:06,000
This is a test
3
00:00:07,000 --> 00:00:09,000
<b>Bold text</b> here`;
const result = cleanSubtitleToTranscript(srtContent);
expect(result).toBe('Hello world This is a test Bold text here');
});
// An empty input must produce an empty transcript.
test('handles empty content', () => {
const result = cleanSubtitleToTranscript('');
expect(result).toBe('');
});
// Two plain cues: verifies that neither timing fragments nor counters remain.
test('removes timestamps and sequence numbers', () => {
const srtContent = `1
00:00:01,000 --> 00:00:03,000
First line
2
00:00:04,000 --> 00:00:06,000
Second line`;
const result = cleanSubtitleToTranscript(srtContent);
expect(result).not.toContain('00:00');
// No `m` flag here, so this only rejects a result made up entirely of digits.
expect(result).not.toMatch(/^\d+$/);
expect(result).toBe('First line Second line');
});
});
});

View File

@@ -15,7 +15,7 @@ import { CONFIG } from "./config.js";
import { _spawnPromise, safeCleanup } from "./modules/utils.js";
import { downloadVideo } from "./modules/video.js";
import { downloadAudio } from "./modules/audio.js";
import { listSubtitles, downloadSubtitles } from "./modules/subtitle.js";
import { listSubtitles, downloadSubtitles, downloadTranscript } from "./modules/subtitle.js";
const VERSION = '0.6.26';
@@ -148,6 +148,18 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
required: ["url"],
},
},
{
name: "download_transcript",
description: "Download and clean video subtitles to produce a plain text transcript without timestamps or formatting.",
inputSchema: {
type: "object",
properties: {
url: { type: "string", description: "URL of the video" },
language: { type: "string", description: "Language code (e.g., 'en', 'zh-Hant', 'ja'). Defaults to 'en'" },
},
required: ["url"],
},
},
],
};
});
@@ -211,6 +223,11 @@ server.setRequestHandler(
() => downloadAudio(args.url, CONFIG),
"Error downloading audio"
);
} else if (toolName === "download_transcript") {
return handleToolExecution(
() => downloadTranscript(args.url, args.language || CONFIG.download.defaultSubtitleLanguage, CONFIG),
"Error downloading transcript"
);
} else {
return {
content: [{ type: "text", text: `Unknown tool: ${toolName}` }],

View File

@@ -2,7 +2,7 @@ import * as fs from "fs";
import * as path from "path";
import * as os from "os";
import type { Config } from '../config.js';
import { _spawnPromise, validateUrl } from "./utils.js";
import { _spawnPromise, validateUrl, cleanSubtitleToTranscript } from "./utils.js";
/**
* Lists all available subtitles for a video.
@@ -105,4 +105,65 @@ export async function downloadSubtitles(
} finally {
fs.rmSync(tempDir, { recursive: true, force: true });
}
}
/**
 * Fetches a video's subtitles with yt-dlp and returns them as plain text.
 *
 * yt-dlp is run with `--skip-download`, `--write-subs` and
 * `--write-auto-subs` for the requested language, asking for the `ttml`
 * format converted to `srt`. Output goes into a freshly created temporary
 * directory. Every `.srt` file found there is passed through
 * `cleanSubtitleToTranscript` and the results are joined with spaces. The
 * temporary directory is removed whether the call succeeds or fails.
 *
 * @param url - The URL of the video
 * @param language - Language code passed to yt-dlp's `--sub-lang` (e.g., 'en', 'zh-Hant', 'ja')
 * @param config - Configuration object; `config.file.tempDirPrefix` names the temporary directory
 * @returns Promise resolving to the cleaned transcript text
 * @throws {Error} When `validateUrl` rejects the URL, when yt-dlp produces no
 *   `.srt` file, or (presumably) when the yt-dlp process itself fails
 *
 * @example
 * ```typescript
 * try {
 *   const transcript = await downloadTranscript('https://youtube.com/watch?v=...', 'en', config);
 *   console.log('Transcript:', transcript);
 * } catch (error) {
 *   console.error('Failed to download transcript:', error);
 * }
 * ```
 */
export async function downloadTranscript(
  url: string,
  language: string,
  config: Config
): Promise<string> {
  if (!validateUrl(url)) {
    throw new Error('Invalid or unsupported URL format');
  }

  // Scratch directory that receives whatever subtitle files yt-dlp writes.
  const workDir = fs.mkdtempSync(path.join(os.tmpdir(), config.file.tempDirPrefix));

  try {
    const ytDlpArgs = [
      '--skip-download',
      '--write-subs',
      '--write-auto-subs',
      '--sub-lang', language,
      '--sub-format', 'ttml',
      '--convert-subs', 'srt',
      '--output', path.join(workDir, 'transcript.%(ext)s'),
      url
    ];
    await _spawnPromise('yt-dlp', ytDlpArgs);

    const subtitleFiles = fs
      .readdirSync(workDir)
      .filter(name => name.endsWith('.srt'));

    if (subtitleFiles.length === 0) {
      throw new Error('No subtitle files found for transcript generation');
    }

    // Clean each file in directory-listing order; each piece is followed by a
    // separating space and the final result is trimmed.
    const pieces = subtitleFiles.map(name => {
      const rawSrt = fs.readFileSync(path.join(workDir, name), 'utf8');
      return cleanSubtitleToTranscript(rawSrt) + ' ';
    });

    return pieces.join('').trim();
  } finally {
    fs.rmSync(workDir, { recursive: true, force: true });
  }
}

View File

@@ -145,4 +145,39 @@ export function generateRandomFilename(extension: string = 'mp4'): string {
const timestamp = getFormattedTimestamp();
const randomId = randomBytes(4).toString('hex');
return `${timestamp}_${randomId}.${extension}`;
}
/**
 * Cleans SRT subtitle content to produce a plain text transcript.
 *
 * Removes blank lines, cue timing lines, cue sequence numbers and HTML-style
 * tags, then joins the remaining text with single spaces.
 *
 * A digit-only line is treated as a sequence number only when the line right
 * after it is a cue timing line. Digit-only lines that are part of the spoken
 * text (for example "2024") are therefore kept in the transcript.
 *
 * @param srtContent - Raw SRT subtitle content (LF or CRLF line endings)
 * @returns Cleaned transcript text on a single line; empty string for empty input
 *
 * @example
 * ```typescript
 * const cleanedText = cleanSubtitleToTranscript(srtContent);
 * console.log(cleanedText); // 'Hello world this is a transcript...'
 * ```
 */
export function cleanSubtitleToTranscript(srtContent: string): string {
  // Cue timing line such as "00:00:01,000 --> 00:00:03,000". Hours may have
  // more than two digits, and anything after the end time (cue settings or
  // coordinates) is tolerated so the whole line is still discarded.
  const timingLine =
    /^\d{2,}:\d{2}:\d{2}[.,]\d{3}\s*-->\s*\d{2,}:\d{2}:\d{2}[.,]\d{3}(?:\s.*)?$/;
  const digitsOnly = /^\d+$/;

  // Trimming also strips the '\r' left behind by CRLF line endings.
  const lines = srtContent.split('\n').map(line => line.trim());
  const textParts: string[] = [];

  lines.forEach((line, index) => {
    if (!line) return;
    if (timingLine.test(line)) return;

    // Only a counter that directly precedes a timing line is a sequence
    // number; any other digit-only line is real subtitle text.
    if (digitsOnly.test(line)) {
      const next = lines[index + 1];
      if (next !== undefined && timingLine.test(next)) return;
    }

    // Strip HTML-style formatting tags such as <i>, <b> or <font ...>.
    textParts.push(line.replace(/<[^>]*>/g, ''));
  });

  return textParts.join(' ').replace(/\s+/g, ' ').trim();
}