implement transcriber, downloader, data_handler

This commit is contained in:
ALIHAN DIKEL
2023-04-26 23:31:32 +03:00
commit ca30f2432d
6 changed files with 133 additions and 0 deletions

6
.gitignore vendored Normal file
View File

@@ -0,0 +1,6 @@
.DS_Store
.idea
audios
models
transcripts
__pycache__

3
.gitkeep Normal file
View File

@@ -0,0 +1,3 @@
sounds
models
transcripts

18
datahandler.py Normal file
View File

@@ -0,0 +1,18 @@
import os
import glob
import datetime
class DataHandler:
    """Locate downloaded audio files and store transcripts on disk.

    The ``audios`` and ``transcripts`` directories are resolved relative to the
    directory that contains this module, not the current working directory.
    """

    def __init__(self):
        # Start with an empty list so remove_audio_files() is safe to call
        # even when get_audio_files() has not run yet.
        self.audio_files = []

    @staticmethod
    def _data_dir(name):
        """Return the absolute path of sub-directory *name* next to this module."""
        return os.path.join(os.path.dirname(os.path.abspath(__file__)), name)

    def get_audio_files(self):
        """Collect all ``*.m4a`` files from the ``audios`` directory.

        The result is stored in ``self.audio_files`` (sorted, so the order is
        deterministic) and also returned for convenience.
        """
        pattern = os.path.join(self._data_dir('audios'), '*.m4a')
        self.audio_files = sorted(glob.glob(pattern))
        return self.audio_files

    def remove_audio_files(self):
        """Delete every file listed in ``self.audio_files`` and clear the list."""
        for audio_file in self.audio_files:
            os.remove(audio_file)
        # Forget the removed paths so a repeated call does not fail on them.
        self.audio_files = []

    def save_text_as_file(self, text):
        """Write *text* to a new timestamped ``.txt`` file in ``transcripts``.

        The file name is ``YYYY_MM_DD_HH_MM_SS.txt``. If a file with that name
        already exists (two saves within the same second), a numeric suffix
        ``_1``, ``_2``, ... is appended instead of overwriting it.

        Returns:
            The path of the file that was written.
        """
        target_dir = self._data_dir('transcripts')
        # The directory is git-ignored, so it may not exist on a fresh checkout.
        os.makedirs(target_dir, exist_ok=True)

        timestamp = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
        path = os.path.join(target_dir, f"{timestamp}.txt")
        attempt = 0
        while True:
            try:
                # Mode 'x' fails instead of truncating an existing transcript.
                with open(path, 'x', encoding='utf-8') as f:
                    f.write(text)
                return path
            except FileExistsError:
                attempt += 1
                path = os.path.join(target_dir, f"{timestamp}_{attempt}.txt")

36
downloader.py Normal file
View File

@@ -0,0 +1,36 @@
import os

from yt_dlp import YoutubeDL

# url_list = ['https://www.youtube.com/watch?v=sw9s-jXEaOw']
from datahandler import DataHandler
class VideoToAudioDownloader():
    """Download the audio track of online videos through yt-dlp.

    Args:
        params: Optional yt-dlp options dict handed to ``YoutubeDL`` as-is.
            When omitted, defaults are used that prefer an m4a audio stream,
            extract the audio with ffmpeg, and write the result into the
            ``audios`` directory next to this module.
    """

    def __init__(self, params=None):
        if params is None:
            # Anchor the output directory to this module rather than the
            # current working directory, matching where DataHandler looks
            # for the ``*.m4a`` files.
            audio_dir = os.path.join(
                os.path.dirname(os.path.abspath(__file__)), 'audios')
            self.yt_opts = {
                'format': 'm4a/bestaudio/best',
                # '%' introduces a template field in outtmpl, so a literal
                # '%' in the directory path has to be doubled.
                'outtmpl': os.path.join(
                    audio_dir.replace('%', '%%'),
                    '%(id)s_%(release_date)s.%(ext)s'),
                # See help(yt_dlp.postprocessor) for a list of available
                # Postprocessors and their arguments
                'postprocessors': [{
                    # Extract audio using ffmpeg
                    'key': 'FFmpegExtractAudio',
                    'preferredcodec': 'm4a',
                }],
            }
        else:
            self.yt_opts = params
        self.DataHandler = DataHandler()

    def download_audio_from_yt(self, url):
        """Download the audio for *url* using the configured yt-dlp options.

        Args:
            url: A single video URL, or a list/tuple of URLs.
        """
        # YoutubeDL.download() is documented to take a list of URLs.
        url_list = [url] if isinstance(url, str) else list(url)
        with YoutubeDL(self.yt_opts) as ydl:
            ydl.download(url_list)


# Unused sketch kept from the original source: capturing the final file name
# through a yt-dlp progress hook.
#     "progress_hooks": [yt_dlp_monitor],
#     final_filename = None
#     def yt_dlp_monitor(self, d):
#         final_filename = d.get('info_dict').get('_filename')

40
main.py Normal file
View File

@@ -0,0 +1,40 @@
import click
from downloader import VideoToAudioDownloader
from transcriber import Transcriber
@click.group()
def cli():
    # Root click command group; the sub-commands below register themselves on
    # it via @cli.command(). Documented with comments on purpose: click would
    # turn a docstring into the group's --help text.
    return None
@cli.command()
@click.argument('urls', nargs=-1)
def download(urls):
    # Fetch the audio of every URL given on the command line, one at a time,
    # re-using a single downloader instance.
    downloader = VideoToAudioDownloader()
    for video_url in urls:
        downloader.download_audio_from_yt(video_url)
@cli.command()
def transcribe():
    """Transcribe every downloaded audio file, save the text, then delete the audio."""
    t = Transcriber()
    t.load_model()
    t.DataHandler.get_audio_files()
    # The file list is stored on the DataHandler, not on the Transcriber.
    for audio_file in t.DataHandler.audio_files:
        t.transcribe(audio_file)
        # Save inside the loop: each transcribe() call replaces the text held
        # by the Transcriber, so persisting once at the end would keep only
        # the last file's transcript.
        t.persist()
    # Remove the audio only after all transcripts have been written.
    t.cleanup()
# Script entry point: hand control to the click command group only when this
# file is executed directly, not when it is imported.
if __name__ == "__main__":
    cli()

30
transcriber.py Normal file
View File

@@ -0,0 +1,30 @@
import os
import glob
import datetime
import whisper
from datahandler import DataHandler
class Transcriber:
    """Transcribe audio files with a Whisper model and store the resulting text.

    Args:
        model_name: Model name passed to ``whisper.load_model``.
    """

    def __init__(self, model_name="base"):
        self.model_name = model_name
        self.DataHandler = DataHandler()
        # Initialise all state up front so the methods below can detect
        # "not loaded / nothing transcribed yet" instead of failing with an
        # AttributeError.
        self.model = None
        self.transcription = None
        self.raw_text = None

    def load_model(self):
        """Load the Whisper model named ``self.model_name`` into ``self.model``."""
        self.model = whisper.load_model(name=self.model_name, download_root="models/")

    def transcribe(self, audio_file):
        """Transcribe *audio_file* and keep the result on the instance.

        The full result is stored in ``self.transcription`` and its ``"text"``
        entry in ``self.raw_text``; the text is also returned.
        """
        if self.model is None:
            # Load on first use so callers may skip the explicit load_model().
            self.load_model()
        self.transcription = self.model.transcribe(
            audio=audio_file,
            verbose=True,
            fp16=False)
        self.raw_text = self.transcription["text"]
        return self.raw_text

    def persist(self):
        """Save the most recent transcription text through the DataHandler.

        Raises:
            RuntimeError: If nothing has been transcribed yet.
        """
        if self.raw_text is None:
            raise RuntimeError("nothing to persist: call transcribe() first")
        self.DataHandler.save_text_as_file(self.raw_text)

    def cleanup(self):
        """Re-scan the audio directory and delete every audio file found."""
        self.DataHandler.get_audio_files()
        self.DataHandler.remove_audio_files()