implement transcriber, downloader, data_handler
This commit is contained in:
6
.gitignore
vendored
Normal file
6
.gitignore
vendored
Normal file
@@ -0,0 +1,6 @@
|
||||
.DS_Store
|
||||
.idea
|
||||
audios
|
||||
models
|
||||
transcripts
|
||||
__pycache__
|
||||
18
datahandler.py
Normal file
18
datahandler.py
Normal file
@@ -0,0 +1,18 @@
|
||||
import os
|
||||
import glob
|
||||
import datetime
|
||||
|
||||
class DataHandler:
    """Locate downloaded audio files and persist transcripts on disk.

    All paths are resolved relative to the directory that contains this
    module: audio is looked up in ``audios/`` and transcripts are written
    to ``transcripts/``.
    """

    def __init__(self):
        # Start with an empty list so remove_audio_files() is safe to call
        # before get_audio_files() has populated it.
        self.audio_files = []

    def get_audio_files(self):
        """Collect the ``*.m4a`` files found in the ``audios`` folder.

        The result is stored on ``self.audio_files`` and also returned so
        callers do not have to reach into the attribute.

        Returns:
            A sorted list of matching file paths (empty if none exist).
        """
        audio_dir = os.path.join(os.path.dirname(__file__), 'audios')
        # Sorting gives a deterministic processing order; glob's is arbitrary.
        self.audio_files = sorted(glob.glob(os.path.join(audio_dir, '*.m4a')))
        return self.audio_files

    def remove_audio_files(self):
        """Delete every file listed in ``self.audio_files`` and clear the list."""
        for audio_file in self.audio_files:
            try:
                os.remove(audio_file)
            except FileNotFoundError:
                # Already gone: the goal (file absent) is achieved.
                pass
        self.audio_files = []

    def save_text_as_file(self, text):
        """Write ``text`` to a timestamped ``.txt`` file in ``transcripts``.

        The folder is created when missing, and a numeric suffix is appended
        if a file for the current second already exists, so an earlier
        transcript is never overwritten.

        Args:
            text: The transcript text to store.

        Returns:
            The path of the file that was written.
        """
        timestamp = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
        transcripts_dir = os.path.join(os.path.dirname(__file__), 'transcripts')
        os.makedirs(transcripts_dir, exist_ok=True)

        path = os.path.join(transcripts_dir, f"{timestamp}.txt")
        suffix = 0
        while os.path.exists(path):
            suffix += 1
            path = os.path.join(transcripts_dir, f"{timestamp}_{suffix}.txt")

        # Explicit UTF-8: transcripts may contain non-ASCII characters and
        # the platform default encoding is not guaranteed to handle them.
        with open(path, 'w', encoding='utf-8') as f:
            f.write(text)
        return path
|
||||
36
downloader.py
Normal file
36
downloader.py
Normal file
@@ -0,0 +1,36 @@
|
||||
from yt_dlp import YoutubeDL
|
||||
|
||||
|
||||
# url_list = ['https://www.youtube.com/watch?v=sw9s-jXEaOw']
|
||||
from datahandler import DataHandler
|
||||
|
||||
|
||||
class VideoToAudioDownloader():
    """Download the audio track of online videos as ``.m4a`` files via yt-dlp."""

    def __init__(self, params=None):
        """Set up the yt-dlp options.

        Args:
            params: Optional yt-dlp options dict. When omitted, a default
                configuration is used that extracts m4a audio into the
                ``audios`` folder next to this module.
        """
        if params is None:
            # Local import: this module does not import os at file level.
            import os

            # Anchor the output folder to this module's directory (not the
            # current working directory) so it matches where DataHandler
            # looks for audio files.
            audio_dir = os.path.join(
                os.path.dirname(os.path.abspath(__file__)), 'audios')
            self.yt_opts = {
                'format': 'm4a/bestaudio/best',
                'outtmpl': os.path.join(
                    audio_dir, '%(id)s_%(release_date)s.%(ext)s'),
                # See help(yt_dlp.postprocessor) for a list of available
                # Postprocessors and their arguments
                'postprocessors': [{
                    # Extract audio using ffmpeg
                    'key': 'FFmpegExtractAudio',
                    'preferredcodec': 'm4a',
                }],
            }
        else:
            self.yt_opts = params
        self.DataHandler = DataHandler()

    def download_audio_from_yt(self, url):
        """Download audio for one URL or for an iterable of URLs.

        Args:
            url: A single URL string, or an iterable of URL strings.

        Returns:
            Whatever ``YoutubeDL.download`` returns for the request.
        """
        # YoutubeDL.download expects a list of URLs; wrap a lone string so it
        # is never treated as a sequence of characters.
        url_list = [url] if isinstance(url, str) else list(url)
        with YoutubeDL(self.yt_opts) as ydl:
            return ydl.download(url_list)
|
||||
|
||||
|
||||
"""
|
||||
|
||||
"progress_hooks": [yt_dlp_monitor],
|
||||
|
||||
final_filename = None
|
||||
def yt_dlp_monitor(self, d):
|
||||
final_filename = d.get('info_dict').get('_filename')
|
||||
"""
|
||||
40
main.py
Normal file
40
main.py
Normal file
@@ -0,0 +1,40 @@
|
||||
import click
|
||||
|
||||
from downloader import VideoToAudioDownloader
|
||||
from transcriber import Transcriber
|
||||
|
||||
|
||||
@click.group()
def cli():
    """Download audio from online videos and transcribe it to text."""
|
||||
|
||||
|
||||
@cli.command()
@click.argument('urls', nargs=-1)
def download(urls):
    """Download the audio track of each given video URL."""
    downloader = VideoToAudioDownloader()
    for url in urls:
        downloader.download_audio_from_yt(url)
|
||||
|
||||
@cli.command()
def transcribe():
    """Transcribe every downloaded audio file, save the text, then delete the audio."""
    t = Transcriber()

    # The file list lives on the DataHandler, not on the Transcriber itself.
    t.DataHandler.get_audio_files()
    audio_files = t.DataHandler.audio_files
    if not audio_files:
        # Nothing to do; skip loading the model entirely.
        click.echo("No audio files to transcribe.")
        return

    t.load_model()
    for af in audio_files:
        t.transcribe(af)
        # Persist inside the loop so each file gets its own transcript rather
        # than only the last one surviving.
        t.persist()

    # Remove the audio only after every file was transcribed and saved; an
    # exception above leaves the audio in place for a retry.
    t.cleanup()
|
||||
|
||||
# Run the command group only when this file is executed as a script.
if __name__ == "__main__":
    cli()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
30
transcriber.py
Normal file
30
transcriber.py
Normal file
@@ -0,0 +1,30 @@
|
||||
import os
|
||||
import glob
|
||||
import datetime
|
||||
|
||||
import whisper
|
||||
|
||||
from datahandler import DataHandler
|
||||
|
||||
|
||||
class Transcriber:
    """Transcribe audio files with a Whisper model and persist the text."""

    def __init__(self, model_name="base"):
        """Create a transcriber.

        Args:
            model_name: Name of the Whisper model to load (passed to
                ``whisper.load_model``).
        """
        self.model_name = model_name
        self.DataHandler = DataHandler()
        # Populated by load_model() / transcribe(); initialised here so that
        # calling methods out of order fails with a clear message.
        self.model = None
        self.transcription = None
        self.raw_text = None

    @property
    def audio_files(self):
        """Audio file paths last collected by the underlying DataHandler."""
        return getattr(self.DataHandler, "audio_files", [])

    def load_model(self):
        """Load the Whisper model, storing weights in the ``models`` folder.

        The folder is resolved next to this module so it does not depend on
        the current working directory.
        """
        models_dir = os.path.join(os.path.dirname(__file__), "models")
        self.model = whisper.load_model(
            name=self.model_name, download_root=models_dir)

    def transcribe(self, audio_file):
        """Transcribe one audio file.

        The full result is kept on ``self.transcription`` and its text on
        ``self.raw_text``. The model is loaded on first use if needed.

        Args:
            audio_file: Path of the audio file to transcribe.

        Returns:
            The transcribed text.
        """
        if getattr(self, "model", None) is None:
            self.load_model()
        self.transcription = self.model.transcribe(
            audio=audio_file,
            verbose=True,
            fp16=False)
        self.raw_text = self.transcription["text"]
        return self.raw_text

    def persist(self):
        """Save the most recent transcript through the DataHandler.

        Returns:
            Whatever ``DataHandler.save_text_as_file`` returns.

        Raises:
            RuntimeError: If nothing has been transcribed yet.
        """
        text = getattr(self, "raw_text", None)
        if text is None:
            raise RuntimeError("nothing to persist: call transcribe() first")
        return self.DataHandler.save_text_as_file(text)

    def cleanup(self):
        """Delete all audio files currently present in the audio folder."""
        # Refresh the list first so files downloaded since the last scan are
        # removed as well.
        self.DataHandler.get_audio_files()
        self.DataHandler.remove_audio_files()
|
||||
Reference in New Issue
Block a user