Revert "Revert "feat: add transcript download functionality""
This commit is contained in:
@@ -71,6 +71,12 @@ pip install yt-dlp
|
||||
* Inputs:
|
||||
* `url` (string, required): URL of the video
|
||||
|
||||
* **download_transcript**
|
||||
* Download and clean video subtitles to produce a plain text transcript without timestamps or formatting
|
||||
* Inputs:
|
||||
* `url` (string, required): URL of the video
|
||||
* `language` (string, optional): Language code (e.g., 'en', 'zh-Hant', 'ja'). Defaults to 'en'
|
||||
|
||||
## Usage Examples
|
||||
|
||||
Ask your LLM to:
|
||||
@@ -80,6 +86,8 @@ Ask your LLM to:
|
||||
"Download Chinese subtitles from this video: https://youtube.com/watch?v=..."
|
||||
"Download this video in 1080p: https://youtube.com/watch?v=..."
|
||||
"Download audio from this YouTube video: https://youtube.com/watch?v=..."
|
||||
"Get a clean transcript of this video: https://youtube.com/watch?v=..."
|
||||
"Download Spanish transcript from this video: https://youtube.com/watch?v=..."
|
||||
```
|
||||
|
||||
## Manual Start
|
||||
|
||||
@@ -3,7 +3,8 @@
|
||||
import { describe, test, expect } from '@jest/globals';
|
||||
import * as os from 'os';
|
||||
import * as path from 'path';
|
||||
import { listSubtitles, downloadSubtitles } from '../modules/subtitle.js';
|
||||
import { listSubtitles, downloadSubtitles, downloadTranscript } from '../modules/subtitle.js';
|
||||
import { cleanSubtitleToTranscript } from '../modules/utils.js';
|
||||
import { CONFIG } from '../config.js';
|
||||
import * as fs from 'fs';
|
||||
|
||||
@@ -51,4 +52,60 @@ describe('Subtitle Functions', () => {
|
||||
.toThrow();
|
||||
});
|
||||
});
|
||||
|
||||
describe('downloadTranscript', () => {
  // Calls downloadTranscript with the suite's shared URL/config and checks that
  // the result is non-empty text with no subtitle-format residue.
  test('downloads and cleans transcript successfully', async () => {
    const transcript = await downloadTranscript(testUrl, 'en', testConfig);

    expect(typeof transcript).toBe('string');
    expect(transcript.length).toBeGreaterThan(0);

    // No container header, cue arrows, or digit-only lines may survive.
    expect(transcript).not.toContain('WEBVTT');
    expect(transcript).not.toContain('-->');
    expect(transcript).not.toMatch(/^\d+$/m);
  }, 30000);

  // An invalid URL must make the returned promise reject.
  test('handles invalid URL', async () => {
    const pending = downloadTranscript('invalid-url', 'en', testConfig);
    await expect(pending).rejects.toThrow();
  });
});
|
||||
|
||||
describe('cleanSubtitleToTranscript', () => {
  // Three cues, two of them carrying inline markup that must be stripped.
  test('cleans SRT content correctly', () => {
    const threeCueSrt = `1
00:00:01,000 --> 00:00:03,000
Hello <i>world</i>

2
00:00:04,000 --> 00:00:06,000
This is a test

3
00:00:07,000 --> 00:00:09,000
<b>Bold text</b> here`;

    expect(cleanSubtitleToTranscript(threeCueSrt)).toBe('Hello world This is a test Bold text here');
  });

  // Empty input yields an empty transcript.
  test('handles empty content', () => {
    expect(cleanSubtitleToTranscript('')).toBe('');
  });

  // Sequence numbers and timing lines are structure, not text.
  test('removes timestamps and sequence numbers', () => {
    const twoCueSrt = `1
00:00:01,000 --> 00:00:03,000
First line

2
00:00:04,000 --> 00:00:06,000
Second line`;

    const cleaned = cleanSubtitleToTranscript(twoCueSrt);

    expect(cleaned).not.toContain('00:00');
    expect(cleaned).not.toMatch(/^\d+$/);
    expect(cleaned).toBe('First line Second line');
  });
});
|
||||
});
|
||||
@@ -15,7 +15,7 @@ import { CONFIG } from "./config.js";
|
||||
import { _spawnPromise, safeCleanup } from "./modules/utils.js";
|
||||
import { downloadVideo } from "./modules/video.js";
|
||||
import { downloadAudio } from "./modules/audio.js";
|
||||
import { listSubtitles, downloadSubtitles } from "./modules/subtitle.js";
|
||||
import { listSubtitles, downloadSubtitles, downloadTranscript } from "./modules/subtitle.js";
|
||||
|
||||
const VERSION = '0.6.26';
|
||||
|
||||
@@ -148,6 +148,18 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
||||
required: ["url"],
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "download_transcript",
|
||||
description: "Download and clean video subtitles to produce a plain text transcript without timestamps or formatting.",
|
||||
inputSchema: {
|
||||
type: "object",
|
||||
properties: {
|
||||
url: { type: "string", description: "URL of the video" },
|
||||
language: { type: "string", description: "Language code (e.g., 'en', 'zh-Hant', 'ja'). Defaults to 'en'" },
|
||||
},
|
||||
required: ["url"],
|
||||
},
|
||||
},
|
||||
],
|
||||
};
|
||||
});
|
||||
@@ -211,6 +223,11 @@ server.setRequestHandler(
|
||||
() => downloadAudio(args.url, CONFIG),
|
||||
"Error downloading audio"
|
||||
);
|
||||
} else if (toolName === "download_transcript") {
|
||||
return handleToolExecution(
|
||||
() => downloadTranscript(args.url, args.language || CONFIG.download.defaultSubtitleLanguage, CONFIG),
|
||||
"Error downloading transcript"
|
||||
);
|
||||
} else {
|
||||
return {
|
||||
content: [{ type: "text", text: `Unknown tool: ${toolName}` }],
|
||||
|
||||
@@ -2,7 +2,7 @@ import * as fs from "fs";
|
||||
import * as path from "path";
|
||||
import * as os from "os";
|
||||
import type { Config } from '../config.js';
|
||||
import { _spawnPromise, validateUrl } from "./utils.js";
|
||||
import { _spawnPromise, validateUrl, cleanSubtitleToTranscript } from "./utils.js";
|
||||
|
||||
/**
|
||||
* Lists all available subtitles for a video.
|
||||
@@ -105,4 +105,65 @@ export async function downloadSubtitles(
|
||||
} finally {
|
||||
fs.rmSync(tempDir, { recursive: true, force: true });
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Fetches a video's subtitles with yt-dlp and reduces them to plain text.
 *
 * The subtitles are written as SRT into a freshly created temporary
 * directory, every `.srt` file found there is passed through
 * `cleanSubtitleToTranscript`, and the cleaned pieces are joined with spaces.
 * The temporary directory is removed whether the call succeeds or fails.
 *
 * @param url - The URL of the video
 * @param language - Language code (e.g., 'en', 'zh-Hant', 'ja')
 * @param config - Configuration object; `config.file.tempDirPrefix` names the temp directory
 * @returns Promise resolving to the cleaned transcript text
 * @throws {Error} When the URL fails `validateUrl`, when no `.srt` file was
 *   produced, or when the yt-dlp invocation rejects
 *
 * @example
 * ```typescript
 * try {
 *   const transcript = await downloadTranscript('https://youtube.com/watch?v=...', 'en', config);
 *   console.log('Transcript:', transcript);
 * } catch (error) {
 *   console.error('Failed to download transcript:', error);
 * }
 * ```
 */
export async function downloadTranscript(
  url: string,
  language: string,
  config: Config
): Promise<string> {
  if (!validateUrl(url)) {
    throw new Error('Invalid or unsupported URL format');
  }

  const workDir = fs.mkdtempSync(path.join(os.tmpdir(), config.file.tempDirPrefix));

  try {
    // Subtitles only: request both manual and automatic tracks, converted to SRT.
    const outputTemplate = path.join(workDir, 'transcript.%(ext)s');
    await _spawnPromise('yt-dlp', [
      '--skip-download',
      '--write-subs',
      '--write-auto-subs',
      '--sub-lang', language,
      '--sub-format', 'ttml',
      '--convert-subs', 'srt',
      '--output', outputTemplate,
      url
    ]);

    const subtitleFiles = fs.readdirSync(workDir).filter(name => name.endsWith('.srt'));
    if (subtitleFiles.length === 0) {
      throw new Error('No subtitle files found for transcript generation');
    }

    // Clean each file independently, then stitch the pieces together.
    return subtitleFiles
      .map(name => cleanSubtitleToTranscript(fs.readFileSync(path.join(workDir, name), 'utf8')))
      .join(' ')
      .trim();
  } finally {
    fs.rmSync(workDir, { recursive: true, force: true });
  }
}
|
||||
@@ -145,4 +145,39 @@ export function generateRandomFilename(extension: string = 'mp4'): string {
|
||||
const timestamp = getFormattedTimestamp();
|
||||
const randomId = randomBytes(4).toString('hex');
|
||||
return `${timestamp}_${randomId}.${extension}`;
|
||||
}
|
||||
|
||||
/**
 * Cleans SRT subtitle content to produce a plain text transcript.
 *
 * The input is processed line by line. Blank lines, digit-only lines (SRT
 * sequence numbers) and cue timing lines are dropped. Markup tags such as
 * `<i>` or `<b>` are stripped from the remaining lines, which are joined with
 * single spaces and trimmed.
 *
 * Timing lines are recognised with `,` or `.` as the fraction separator, with
 * or without an hours field, and with optional trailing cue settings or
 * coordinates (e.g. `align:start position:0%`), so none of those variants
 * leak into the transcript.
 *
 * Note: a caption line made up solely of digits cannot be told apart from a
 * sequence number by this line-based filter and is removed as well.
 *
 * @param srtContent - Raw SRT subtitle content (LF or CRLF line endings)
 * @returns Cleaned transcript text; an empty string when no caption text remains
 *
 * @example
 * ```typescript
 * const cleanedText = cleanSubtitleToTranscript(srtContent);
 * console.log(cleanedText); // 'Hello world this is a transcript...'
 * ```
 */
export function cleanSubtitleToTranscript(srtContent: string): string {
  // Built once per call rather than once per line.
  const sequenceLine = /^\d+$/;
  const timingLine =
    /^(?:\d+:)?\d{1,2}:\d{2}[.,]\d{1,3}\s*-->\s*(?:\d+:)?\d{1,2}:\d{2}[.,]\d{1,3}(?:\s.*)?$/;
  const markupTag = /<[^>]*>/g;

  const captionLines: string[] = [];
  for (const rawLine of srtContent.split('\n')) {
    // trim() also removes a trailing '\r' from CRLF input before matching.
    const trimmed = rawLine.trim();
    if (!trimmed) continue;
    if (sequenceLine.test(trimmed)) continue;
    if (timingLine.test(trimmed)) continue;
    captionLines.push(rawLine.replace(markupTag, ''));
  }

  // Collapsing whitespace runs also absorbs gaps left by removed tags.
  return captionLines.join(' ').replace(/\s+/g, ' ').trim();
}
|
||||
Reference in New Issue
Block a user