mirror of https://github.com/inzva/inzpeech.git synced 2021-06-01 09:25:07 +03:00

Files added.

This commit is contained in:
Bekci
2020-11-13 20:55:08 +03:00
parent 423fa43d61
commit 0fe0fe810f
37 changed files with 4228 additions and 48 deletions

152
.gitignore vendored Normal file

@@ -0,0 +1,152 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
*.out
outs/
txts/
saved-models/
*.pkl
*.txt
.DS_Store
*.DS_Store
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
*__pycache__/*
# Celery stuff
celerybeat-schedule
celerybeat.pid
*.pth
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
*src*
*.pth

58
README.md Normal file

@@ -0,0 +1,58 @@
# Speaker Identification
inzva AI Projects #5 - Speaker Identification
## Project Description
In this project we tried to solve the problem of Speaker Identification, the task of recognizing a person from a voice utterance. We implemented the methods proposed in the [Deep CNNs With Self-Attention for Speaker Identification](https://ieeexplore.ieee.org/document/8721628) paper in both TensorFlow-Keras and PyTorch.
## Dataset
We used the following datasets:
* [VCTK Corpus](https://datashare.is.ed.ac.uk/handle/10283/3443)
* [VoxCeleb](http://www.robots.ox.ac.uk/~vgg/data/voxceleb/)
The VCTK dataset is straightforward: no license agreement is required and it can be used directly after download.
For the VoxCeleb dataset, it is recommended to sign up on its website, where the download and conversion scripts for the dataset can be found.
The data [split text file](http://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/iden_split.txt) for identification will also be required.
The files under [dataloaders](dataloaders/) are used for loading the data with data generators in Keras and dataloaders in PyTorch. The scripts can either generate file paths at runtime or read them directly from a txt file. It is recommended to generate the txt files; check [this notebook](Save_VoxCelebTxts.ipynb) to generate such a file.
It is also recommended to extract the audio features into pickle files first and load those. Our data loaders work that way as well; check the scripts under the [utils](utils/) folder to create such files.
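A minimal sketch of how such a pickle can be consumed, assuming the list-of-`(label, features)` layout used by [ResNet/run_model_vctk.py](ResNet/run_model_vctk.py) (the class name below is only illustrative):
```python
import pickle
import numpy as np
from torch.utils.data import Dataset

class PickleFeatureDataset(Dataset):
    """Wraps a preprocessed feature pickle: a list of (label, features) pairs."""
    def __init__(self, pkl_path):
        with open(pkl_path, 'rb') as f:
            self.samples = pickle.load(f)

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        label, features = self.samples[idx]
        # add a channel dimension so the sample matches the (1, frames, n_mels) input shape
        return np.expand_dims(features, axis=0), label
```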
## Preprocess
Before feeding the audio files into our models, we extract filter bank coefficients from them. Check out [here](https://haythamfayek.com/2016/04/21/speech-processing-for-machine-learning.html) for the complete process. Our implementation is in [utils/preprocessed_feature_extraction.py](utils/preprocessed_feature_extraction.py).
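A minimal sketch of this step with Librosa, using 40 Mel bands to match the `(frames, 40)` features used elsewhere in the repo; the sample rate, FFT size and hop length below are illustrative assumptions rather than the exact values in our script:
```python
import librosa

def log_mel_filterbanks(wav_path, sr=16000, n_mels=40, n_fft=400, hop_length=160):
    signal, sr = librosa.load(wav_path, sr=sr)
    mel = librosa.feature.melspectrogram(y=signal, sr=sr, n_fft=n_fft,
                                         hop_length=hop_length, n_mels=n_mels)
    log_mel = librosa.power_to_db(mel)  # log-scale the filter bank energies
    return log_mel.T                    # shape: (num_frames, n_mels)
```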
## Models
We implemented the following architectures:
* [VGG-like CNN](models/model_keras.py)
* [ResNet18](models/resnet18_keras.py)
* [ResNet50](ResNet/model.py)
## Results
We achieved
## Nearest Neighbor Search
After training our models, we extracted embeddings with the trained model and used the k-nearest neighbors (kNN) algorithm to find the closest neighbors of the extracted embeddings. Such a system can be used to find the closest voice utterances, and their class labels, for a given audio signal.
Check out the [extract_embeds.py](extract_embeds.py) and [closest_celeb.py](closest_celeb.py) scripts for the implementation of this method.
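As a rough illustration of the search step (not the exact code in those scripts), a brute-force nearest-neighbor lookup over the extracted embeddings can be done with plain NumPy; `gallery` and `labels` below are placeholder names for the stored embeddings and their speaker ids:
```python
import numpy as np

def k_nearest(query, gallery, labels, k=5):
    """query: (d,) embedding; gallery: (n, d) embeddings; labels: (n,) array of speaker ids."""
    dists = np.linalg.norm(gallery - query, axis=1)  # Euclidean distance to every stored embedding
    nearest = np.argsort(dists)[:k]                  # indices of the k closest utterances
    return nearest, labels[nearest]
```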
## Project Dependencies
- Keras
- PyTorch
- Matplotlib
- TensorFlow
- Pickle
- NumPy
- Librosa

43
ResNet/create_charts.py Normal file

@@ -0,0 +1,43 @@
import re
import matplotlib.pyplot as plt
file_name = 'out_17_10_2020.txt'
# Save loss fig
pattern = re.compile(r".+loss: ([0-9]\.[0-9]+)")  # escape the dot so only decimal loss values match
loss_y = [re.match(pattern, line).group(1) if re.match(
pattern, line) else None for line in open(file_name)]
loss_y = [float(x) for x in loss_y if x is not None]
loss_x = [x for x in range(len(loss_y))]
plt.figure(figsize=(18, 12))
plt.plot(loss_x, loss_y)
plt.title('Model Train Loss Chart')
plt.xlabel('# of iterations')
plt.ylabel('Loss')
plt.savefig('loss_{}.png'.format(file_name))
# Save acc fig
val_pattern = re.compile(r"Val Acc: ([0-9]+\.[0-9]+)")
train_pattern = re.compile(r"Train Acc: ([0-9]+\.[0-9]+)")
train_acc_y = [re.match(train_pattern, line).group(1) if re.match(
train_pattern, line) else None for line in open(file_name)]
train_acc_y = [float(x) for x in train_acc_y if x is not None]
val_acc_y = [re.match(val_pattern, line).group(1) if re.match(
val_pattern, line) else None for line in open(file_name)]
val_acc_y = [float(x) for x in val_acc_y if x is not None]
acc_x = [x for x in range(len(train_acc_y))]
plt.figure(figsize=(18, 12))
plt.plot(acc_x, val_acc_y, label='Val')
plt.plot(acc_x, train_acc_y, label='Train')
plt.title('Model Train Accuracy Chart')
plt.xlabel('# of epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.savefig('acc_{}.png'.format(file_name))

222
ResNet/model.py Normal file

@@ -0,0 +1,222 @@
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
class SelfAttention(nn.Module):
def __init__(self, embed_size, heads):
super(SelfAttention, self).__init__()
self.embed_size = embed_size
self.heads = heads
self.head_dim = embed_size // heads
assert (
self.head_dim * heads == embed_size
), "Embedding size needs to be divisible by heads"
self.values = nn.Linear(self.head_dim, self.head_dim, bias=False)
self.keys = nn.Linear(self.head_dim, self.head_dim, bias=False)
self.queries = nn.Linear(self.head_dim, self.head_dim, bias=False)
self.fc_out = nn.Linear(heads * self.head_dim, embed_size)
def forward(self, values, keys, query, mask=None):
# Get number of training examples
N = query.shape[0]
value_len, key_len, query_len = values.shape[1], keys.shape[1], query.shape[1]
# Split the embedding into self.heads different pieces
values = values.reshape(N, value_len, self.heads, self.head_dim)
keys = keys.reshape(N, key_len, self.heads, self.head_dim)
query = query.reshape(N, query_len, self.heads, self.head_dim)
values = self.values(values) # (N, value_len, heads, head_dim)
keys = self.keys(keys) # (N, key_len, heads, head_dim)
queries = self.queries(query) # (N, query_len, heads, head_dim)
# Einsum computes the query-key dot products for every head and every
# training example; it is equivalent to a batched matrix multiplication (bmm)
energy = torch.einsum("nqhd,nkhd->nhqk", [queries, keys])
# queries shape: (N, query_len, heads, head_dim),
# keys shape: (N, key_len, heads, head_dim)
# energy: (N, heads, query_len, key_len)
# Mask padded indices so their weights become 0
if mask is not None:
energy = energy.masked_fill(mask == 0, float("-1e20"))
# Normalize energy values similarly to seq2seq + attention
# so that they sum to 1. Also divide by scaling factor for
# better stability
attention = torch.softmax(energy / (self.embed_size ** (1 / 2)), dim=3)
# attention shape: (N, heads, query_len, key_len)
out = torch.einsum("nhql,nlhd->nqhd", [attention, values]).reshape(
N, query_len, self.heads * self.head_dim
)
# attention shape: (N, heads, query_len, key_len)
# values shape: (N, value_len, heads, head_dim)
# out after matrix multiply: (N, query_len, heads, head_dim), then
# we reshape and flatten the last two dimensions.
out = self.fc_out(out)
# Linear layer doesn't modify the shape, final shape will be
# (N, query_len, embed_size)
return out
class block(nn.Module):
def __init__(
self, in_channels, intermediate_channels, out_channels, identity_downsample=None, stride=1
):
super(block, self).__init__()
self.conv1 = nn.Conv2d(
in_channels, intermediate_channels, kernel_size=1, stride=1, padding=0
)
self.bn1 = nn.BatchNorm2d(intermediate_channels)
self.conv2 = nn.Conv2d(
intermediate_channels,
intermediate_channels,
kernel_size=3,
stride=stride,
padding=1,
)
self.bn2 = nn.BatchNorm2d(intermediate_channels)
self.conv3 = nn.Conv2d(
intermediate_channels,
out_channels,
kernel_size=1,
stride=1,
padding=0,
)
self.bn3 = nn.BatchNorm2d(out_channels)
self.relu = nn.ReLU()
self.identity_downsample = identity_downsample
self.stride = stride
def forward(self, x):
identity = x.clone()
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.conv2(x)
x = self.bn2(x)
x = self.relu(x)
x = self.conv3(x)
x = self.bn3(x)
if self.identity_downsample is not None:
identity = self.identity_downsample(identity)
x += identity
x = self.relu(x)
return x
class Net(nn.Module):
def __init__(self, block, layers, image_channels, num_classes, expansion):
super(Net, self).__init__()
self.in_channels = 64
self.conv1 = nn.Conv2d(
image_channels, 64, kernel_size=7, stride=2, padding=3)
self.bn1 = nn.BatchNorm2d(64)
self.relu = nn.ReLU()
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
# Essentially the entire ResNet architecture is in the 4 layer definitions below
self.layer1 = self._make_layer(
block, layers[0], intermediate_channels=64, out_channels=64*expansion, stride=1
)
self.layer2 = self._make_layer(
block, layers[1], intermediate_channels=128, out_channels=128*expansion, stride=2
)
self.layer3 = self._make_layer(
block, layers[2], intermediate_channels=256, out_channels=256*expansion, stride=2
)
self.layer4 = self._make_layer(
block, layers[3], intermediate_channels=512, out_channels=512*expansion, stride=2
)
self.attention = SelfAttention(heads=4, embed_size=512*expansion)
self.avgpool = nn.AvgPool2d((20, 1))
self.fc1 = nn.Linear(512*expansion, 512*expansion//2)
self.fc2 = nn.Linear(512*expansion//2, 512*expansion//4)
self.fc3 = nn.Linear(512*expansion//4, num_classes)
def forward(self, x):
# ResNet layer
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
x = x.reshape(x.shape[0], x.shape[2] * x.shape[3], x.shape[1])
# Attention Layer
x = self.attention(x, x, x)
x = self.avgpool(x)
# FC Layer
x = x.reshape(x.shape[0], -1)
x = self.relu(self.fc1(x))
x = self.relu(self.fc2(x))
x = self.fc3(x)
return x
def _make_layer(self, block, num_residual_blocks, intermediate_channels, out_channels, stride):
identity_downsample = None
layers = []
# If we halve the spatial size (e.g. 56x56 -> 28x28 with stride=2) or the number of
# channels changes, we need to adapt the identity (skip connection) so that it can
# be added to the output of the block ahead
if stride != 1 or self.in_channels != out_channels:
identity_downsample = nn.Sequential(
nn.Conv2d(
self.in_channels,
out_channels,
kernel_size=1,
stride=stride,
),
nn.BatchNorm2d(out_channels),
)
layers.append(
block(self.in_channels, intermediate_channels,
out_channels, identity_downsample, stride)
)
self.in_channels = out_channels
# For example, in the first ResNet layer the 256 input channels are mapped down to 64
# intermediate channels and then back to 256, so no identity downsample is needed
# (stride = 1 and the channel count does not change).
for i in range(num_residual_blocks - 1):
layers.append(
block(self.in_channels, intermediate_channels, out_channels))
return nn.Sequential(*layers)
def Net_ResNet50(img_channel=3, num_classes=1000):
return Net(block, [3, 4, 6, 3], img_channel, num_classes, expansion=4)
def Net_ResNet101(img_channel=3, num_classes=1000):
return Net(block, [3, 4, 23, 3], img_channel, num_classes, expansion=4)
def Net_ResNet152(img_channel=3, num_classes=1000):
return Net(block, [3, 8, 36, 3], img_channel, num_classes, expansion=4)
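# --- Hedged usage sketch (not part of the training pipeline) ---
# Quick smoke test assuming single-channel log-mel inputs of 300 frames x 40 mel bins,
# matching the (batch, 300, 40) feature shapes used elsewhere in the repo;
# num_classes=109 is a placeholder value.
if __name__ == "__main__":
    model = Net_ResNet50(img_channel=1, num_classes=109)
    dummy = torch.randn(2, 1, 300, 40)  # (batch, channels, frames, mel bins)
    logits = model(dummy)
    print(logits.shape)  # expected: torch.Size([2, 109])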

162
ResNet/run_model_vctk.py Normal file

@@ -0,0 +1,162 @@
from torchsummary import summary
import os
import glob
import torch
import librosa
import pickle
import copy
import random
import numpy as np
import pandas as pd
import scipy.signal as signal
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from model import Net_ResNet50
from torch.utils.data import random_split, Dataset, DataLoader
from tqdm import tqdm
# Parameters
dataset_dir = '/home/bbekci/inzpeech/preprocessed_vctk.pkl'
max_epochs = 10
batch_size = 256
# CUDA for PyTorch
use_cuda = torch.cuda.is_available()
device = torch.device('cuda:0' if use_cuda else 'cpu')
torch.backends.cudnn.benchmark = True
def test_val_calculations(data_set_loader, _n_classes, _net):
class_correct = [0] * _n_classes
class_total = [0] * _n_classes
with torch.no_grad():
for data in data_set_loader:
inputs = data[0].to(device, dtype=torch.float)
labels = data[1].to(device)
outputs = _net(inputs)
_, predicted = torch.max(outputs, 1)
c = (predicted == labels)
for i in range(len(labels)):
label = labels[i]
class_correct[label] += c[i].item()
class_total[label] += 1
mean_acc = 0
div_count = 0
for i in range(_n_classes):
if class_total[i] != 0:
mean_acc += (100 * class_correct[i] / class_total[i])
div_count += 1
return mean_acc / div_count
class PreprocessedDataset(Dataset):
def __init__(self, file_dir):
self.file_dir = file_dir
self.lst = None
with open(file_dir, 'rb') as pickle_load:
self.lst = pickle.load(pickle_load)
random.shuffle(self.lst)
def __len__(self):
return len(self.lst)
def n_class(self):
return np.max(np.array([i[0] for i in self.lst])) + 1
def __getitem__(self, idx):
if torch.is_tensor(idx):
idx = idx.tolist()
sound_data = self.lst[idx][1]
label = self.lst[idx][0]
sound_data = np.expand_dims(sound_data, axis=0)
sample = (sound_data, label)
return sample
sound_data = PreprocessedDataset(file_dir=dataset_dir)
len_sound_data = len(sound_data)
n_classes = sound_data.n_class()
train_data_count = int(len_sound_data * 0.8)
val_data_count = int(len_sound_data * 0.1)
test_data_count = len_sound_data - val_data_count - train_data_count
train_data, val_data, test_data = random_split(sound_data,
[train_data_count,
val_data_count,
test_data_count]
)
train_dataset_loader = torch.utils.data.DataLoader(train_data,
batch_size=batch_size,
shuffle=True,
num_workers=16)
val_dataset_loader = torch.utils.data.DataLoader(val_data,
batch_size=batch_size,
shuffle=True,
num_workers=16)
test_dataset_loader = torch.utils.data.DataLoader(test_data,
batch_size=batch_size,
shuffle=True,
num_workers=16)
print('Test Data Size: %s' % len(test_dataset_loader.dataset))
print('Val Data Size: %s' % len(val_dataset_loader.dataset))
print('Train Data Size: %s' % len(train_dataset_loader.dataset))
net = Net_ResNet50(img_channel=1, num_classes=n_classes)
net.to(device)
# net.load_state_dict(torch.load('/home/bbekci/inzpeech/ResNet/model/mode.pth'))
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=1e-4)
for epoch in range(max_epochs): # loop over the dataset multiple times
correct_pred = 0
for i, data in enumerate(train_dataset_loader):
# get the inputs; data is a list of [inputs, labels]
inputs = data[0].to(device, dtype=torch.float)
labels = data[1].to(device)
# zero the parameter gradients
optimizer.zero_grad()
# forward + backward + optimize
output = net(inputs)
loss = criterion(output, labels)
loss.backward()
optimizer.step()
_, predicted = torch.max(output.data, 1)
correct_pred += (predicted == labels).float().sum()
print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, loss))
# Validation
val_acc = test_val_calculations(val_dataset_loader, n_classes, net)
print('Val Acc: %.6f' % val_acc)
# Calculate Train Accuracy
train_acc = 100 * correct_pred / len(train_data)
print('Train Acc: %.6f' % train_acc)
# torch.save(best_net.state_dict(), '/home/bbekci/inzpeech/ResNet/model/model.pth')
test_acc = test_val_calculations(test_dataset_loader, n_classes, net)
print('Test Acc: %.6f' % test_acc)

129
Save_VoxCelebTxts.ipynb Normal file

@@ -0,0 +1,129 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from dataloaders.DatagenVoxCeleb import get_pkl_paths"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Progress: 5993 / 5994\r"
]
}
],
"source": [
"data_dir = '/media/data/bbekci/voxceleb2/data/dev/pkls/'\n",
"num_audio_per_video=1e5\n",
"num_video_per_person=1e5\n",
"split_by='audio'\n",
"split_size=0.2\n",
"tr_paths, val_paths = get_pkl_paths(data_dir, num_video_per_person, num_audio_per_video, split_by, split_size)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"with open('tr_voxceleb_audio_pkl_paths.txt', 'w') as path_file:\n",
" for p in tr_paths:\n",
" path_file.write(p + '\\n')"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"with open('tr_voxceleb_audio_pkl_paths.txt', 'r') as path_file:\n",
" valps = path_file.readlines()\n",
" valps = [pr.strip() for pr in valps]"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Progress: 5993 / 5994\r"
]
}
],
"source": [
"data_dir = '/media/data/bbekci/voxceleb2/data/dev/pkls/'\n",
"num_audio_per_video=1e5\n",
"num_video_per_person=1e5\n",
"split_by='video'\n",
"split_size=0.2\n",
"vid_tr_paths, vid_val_paths = get_pkl_paths(data_dir, num_video_per_person, num_audio_per_video, split_by, split_size)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"with open('tr_voxceleb_video_pkl_paths.txt', 'w') as path_file:\n",
" for p in vid_tr_paths:\n",
" path_file.write(p + '\\n')"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"with open('val_voxceleb_video_pkl_paths.txt', 'w') as path_file:\n",
" for p in vid_val_paths:\n",
" path_file.write(p + '\\n')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

0
TrainVoxCeleb2.py Normal file

781
Train_VCTK_Keras.ipynb Normal file

@@ -0,0 +1,781 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"1\"\n",
"from tensorflow.keras.utils import Sequence, to_categorical\n",
"from tensorflow.keras.models import Model\n",
"from tensorflow.keras.optimizers import Adam\n",
"from models.vggish import VGGish\n",
"from utils import apply_melspectrogram_to_file\n",
"import math\n",
"import numpy as np\n",
"from models.model_keras_dropout import vgg_att\n",
"from dataloaders.DatagenVoxCeleb import get_keras_datagens"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"data_dir = '/media/data/bbekci/voxceleb2/data/dev/pkls/'\n",
"tr_txt = 'txts/tr_voxceleb_video_pkl_paths.txt'\n",
"val_txt = 'txts/val_voxceleb_video_pkl_paths.txt' \n",
"batch_size = 32"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"tr_gen, val_gen = get_keras_datagens(data_dir, batch_size, split_by='video', split_size=0.3, txt_dirs=[tr_txt, val_txt])"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"tags": [
"outputPrepend"
]
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
" 364 2552 4947 1090 2189 3961 4623\n",
" 4922 2577 964 2048 5547 1662 4686 3146 2605 2089 3819 5493 2437 1326\n",
" 5154 940 5694 3133]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[3636 1114 831 4209 866 529 3704 2501 2403 2645 1307 386 4005 4568\n",
" 5172 5787 4859 4257 547 1116 268 1585 354 1716 5165 1408 5708 4017\n",
" 3690 4062 4107 602]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[ 592 381 1795 1333 5038 3831 4154 5876 5845 168 5659 2375 4812 4068\n",
" 5806 1244 5672 2005 2416 3612 1101 4249 5884 3992 3385 4303 3103 5586\n",
" 4038 4148 139 5458]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[1374 5100 5892 2967 1174 3046 3818 4641 5891 4187 1212 4310 4210 2796\n",
" 592 3003 4590 1918 2457 3856 3692 4718 3635 3314 5029 4960 1683 10\n",
" 5867 2898 826 5025]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[5856 3013 3428 3755 3080 2667 5730 7 2841 3147 1519 793 3551 3098\n",
" 818 528 216 302 695 3115 1450 302 2475 4594 1595 5073 372 4836\n",
" 266 3012 2619 2905]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[3064 4663 3212 5901 1248 3894 116 3284 4043 388 5904 162 2889 2444\n",
" 3755 2 1685 1095 4169 5486 5113 2235 5558 460 3657 4148 2138 1702\n",
" 4362 1437 5170 5415]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[4789 1600 793 5460 5571 5074 2069 5383 5884 959 4637 3846 4748 3840\n",
" 3948 2562 3689 4302 4408 4558 3745 3000 3779 3155 3011 1656 3179 2235\n",
" 3748 2297 2176 5257]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[5165 3259 4202 3536 2545 5784 4765 2015 5280 4562 4448 2262 4377 100\n",
" 3864 4011 3787 4301 4307 1666 3583 4402 835 3339 3678 5789 1354 3211\n",
" 3092 5935 1736 249]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[4875 5844 4131 5100 3996 4222 2733 4666 5479 3199 2098 4869 4644 2420\n",
" 5384 4918 4804 403 1464 5059 2202 236 1814 5462 3696 3837 3095 4024\n",
" 4937 2809 5134 2014]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[3395 1061 2385 5608 1499 2550 3185 678 5415 4573 5590 4458 5584 3732\n",
" 278 114 1274 5152 2053 2122 4318 4040 5700 98 873 2555 2529 2796\n",
" 10 3617 3105 471]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[2438 3190 5925 5483 720 4630 690 1564 2772 5705 2125 4647 2519 476\n",
" 2053 2938 5409 1966 4315 4123 2011 3763 4064 1437 5818 2327 2844 4666\n",
" 2887 2753 3188 683]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[5392 1333 4178 73 2472 4321 134 975 1681 3818 2925 3211 3651 4321\n",
" 5447 1397 1379 1351 2433 5226 4275 1068 405 2933 4585 5839 3662 653\n",
" 1098 4337 4072 620]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[1219 3569 2549 1912 2066 2362 1887 781 2385 5730 41 5060 4376 449\n",
" 5133 1390 2098 5916 4656 2632 4011 5406 1589 1690 4212 2430 2176 3590\n",
" 2782 92 4893 5392]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[2599 1904 3732 5206 1007 2857 5600 33 2366 3077 4534 3807 2080 4197\n",
" 4808 5321 1007 4827 2450 200 4595 3039 4821 3987 3337 811 1276 2007\n",
" 2778 4318 5153 1554]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[1033 3157 2085 5760 3713 3213 4969 5806 4862 5022 2973 1291 2677 2504\n",
" 2289 2877 4660 3306 541 5117 3133 5695 1790 1451 4969 32 5864 290\n",
" 3394 3569 1638 4083]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[2716 5294 5461 2563 5825 4565 3402 3595 4267 2793 940 1629 271 1438\n",
" 1662 4781 2945 2450 1169 485 2951 3893 1075 4179 5392 3448 2163 2491\n",
" 434 2069 5912 3228]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[1770 4103 2267 1149 2558 4236 3083 2992 3634 5813 2096 4486 3847 1078\n",
" 569 5784 5426 4491 4588 5282 1661 3704 3324 1432 5430 554 1268 807\n",
" 1095 519 41 230]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[3992 2733 3214 1015 336 4270 2028 1015 1834 4893 5260 5323 2923 2258\n",
" 3590 5618 3996 5126 347 5925 3303 3718 2409 5452 3598 3260 3160 3926\n",
" 3563 4015 3095 2266]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[3098 57 5419 3739 1355 534 2082 1799 472 2165 2577 3207 164 2551\n",
" 3382 615 3227 4422 106 3430 2964 4773 3781 2192 5242 3675 5109 5941\n",
" 4839 2048 3490 1178]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[5757 2110 1124 489 5880 5022 2997 1110 4784 966 4305 1834 89 482\n",
" 3821 1352 5787 5843 5368 89 522 2075 4213 954 1205 3496 651 4050\n",
" 5217 2053 1044 5601]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[2610 5629 636 4937 4370 896 1974 452 2148 3729 3696 2475 5312 3359\n",
" 2533 3815 3595 5702 1089 217 3802 2670 1637 2063 4568 991 3089 5555\n",
" 2573 1935 5383 3129]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[5725 3669 1642 5205 151 4755 358 853 175 5951 690 5852 3632 5480\n",
" 326 508 5352 358 1529 4471 2718 615 2866 1358 2363 650 2915 3943\n",
" 933 3031 3652 1134]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[1753 1096 3334 5107 2915 3279 1839 146 4234 3248 1498 5680 4686 4358\n",
" 3629 2552 2396 890 843 433 1836 3430 899 1254 1531 2867 3848 2153\n",
" 1687 1060 4637 1528]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[ 611 1244 568 4955 4817 771 4255 3697 2721 2113 5453 3481 1376 1249\n",
" 3339 194 3569 5563 2811 697 5057 3178 4294 3133 2549 2858 2289 1778\n",
" 2819 5083 2305 20]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[1880 1977 1273 2573 1360 3319 4314 4147 1947 604 3236 1326 3393 1541\n",
" 2107 4877 5402 5634 2555 5353 926 2010 5411 4584 2895 2076 1502 4932\n",
" 4742 5272 5597 5514]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[2994 5286 2005 1647 4391 1695 5331 2463 3227 4466 5876 5636 5190 110\n",
" 4320 3740 5450 587 3401 1084 3481 3917 4244 3180 386 5608 5844 168\n",
" 3823 3707 1574 3287]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[5764 18 2731 2764 5685 2475 2877 1063 4875 3532 1076 114 3055 801\n",
" 59 1405 3230 2008 3012 3780 1615 3540 4775 434 4730 382 5704 870\n",
" 4739 92 5644 3452]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[3465 4347 3551 2249 4517 3608 5358 5392 1733 4231 1360 5825 404 363\n",
" 1385 3680 784 3152 3762 2003 2148 2994 2181 624 5565 2780 233 981\n",
" 3272 748 3917 4937]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[2381 2245 387 5353 436 3076 1023 3823 2271 3288 2244 5231 3844 1814\n",
" 826 781 294 3716 2293 3277 405 2162 1201 247 2096 2214 5935 5116\n",
" 1694 2692 1913 4377]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[3577 4489 3044 5818 1242 1884 2164 697 4562 5115 3755 3642 4209 4716\n",
" 3422 3819 1964 3456 2014 4555 2897 4685 5358 2925 1934 1054 4335 2805\n",
" 3493 121 3316 2205]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[4179 854 4138 4604 4641 1334 1933 1579 4199 139 2114 4948 1308 3449\n",
" 5758 516 275 1913 1574 3840 5607 1653 5126 1351 5379 3110 4614 1117\n",
" 4857 1182 3691 5971]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[3952 386 3601 4271 978 5932 2848 651 5198 1925 1685 1841 2208 3319\n",
" 4582 1964 1178 2250 4699 2838 526 5498 1913 105 2676 2069 4764 1685\n",
" 4193 128 456 4336]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[2829 2254 4013 4811 4914 1615 452 117 3547 2498 93 4491 4530 5288\n",
" 2008 2396 4957 2369 104 1407 2639 2755 4026 5196 3065 3326 411 2864\n",
" 3079 2593 3168 63]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[ 231 5621 115 3171 4265 3575 2118 1099 4461 2549 3925 3709 2597 11\n",
" 3157 5455 939 2233 553 5002 5028 3143 358 232 3821 1220 117 5547\n",
" 1999 4039 3232 3717]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[1615 4463 4923 1506 2533 939 132 836 4814 1761 4178 913 5740 4490\n",
" 2283 4601 5014 3450 4565 2517 4485 160 2488 3287 1736 2478 5554 4931\n",
" 4103 1529 2437 2084]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[1520 973 4257 1216 2312 3408 2449 5111 547 5488 3550 1279 3740 2423\n",
" 5008 2491 2465 2831 3795 5904 1553 3048 1913 2463 2795 380 117 3445\n",
" 4867 1689 3801 2113]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[ 137 2844 570 2796 690 5037 2371 3996 5165 5833 95 1827 2807 2236\n",
" 5558 800 2639 3486 1693 2287 2977 3957 1621 1307 5165 5518 3718 5503\n",
" 4822 1959 2713 4096]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[2745 5107 899 4387 2673 1007 1469 2851 4569 5099 5166 4345 4852 5164\n",
" 4463 117 476 1199 4257 1700 4322 4446 74 1721 5517 1942 4552 1430\n",
" 1254 2001 601 571]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[3835 4615 5057 5580 2819 3406 4867 2445 320 4680 4023 4492 43 996\n",
" 1023 4536 2858 2186 747 4301 2961 2654 1390 260 2639 1658 5460 3199\n",
" 4867 1665 5381 3394]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[ 177 2808 2297 4859 210 3111 1673 447 379 2751 4243 2148 4459 2918\n",
" 4271 4381 2549 3069 3773 663 1809 3084 1182 1517 1145 3025 3265 4709\n",
" 5089 1569 3446 3001]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[2090 4615 2621 3270 5562 4756 2698 5402 1219 3519 2220 2247 3765 5172\n",
" 5157 1937 3424 4266 2048 1881 1889 5048 4709 5771 406 5605 2231 288\n",
" 1397 2773 4422 3691]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[1950 204 2807 2342 1630 3668 2291 5729 655 4159 1595 5058 3147 5860\n",
" 4843 447 4197 5408 4201 4932 5833 1429 1595 3780 4481 4835 3025 196\n",
" 2042 52 4220 5153]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[3944 5481 5051 552 2237 892 4582 207 2608 5117 1060 2893 5458 677\n",
" 2600 254 3583 2784 1262 933 2611 5138 2680 5884 5876 3392 59 2541\n",
" 4217 4600 3576 5025]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[3996 5675 3663 4907 5074 4212 2023 1715 276 3200 2749 1713 1702 1420\n",
" 372 5375 244 3540 2056 5924 1015 1738 5589 2813 2508 5793 2971 4266\n",
" 4136 4750 1655 4702]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[ 816 5425 2929 1133 5366 2843 776 3606 1887 1465 2699 1095 5909 3459\n",
" 853 4306 1860 5559 5961 3259 5121 1012 5460 3868 264 4462 5207 3452\n",
" 4615 486 1169 2617]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[5853 3826 5584 5709 2878 4329 4806 2888 1889 4585 1940 3934 5699 4831\n",
" 2465 3391 893 5046 4870 4796 5793 4740 4805 167 2595 3298 1419 2133\n",
" 4831 24 1921 47]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[3961 3652 767 207 2715 2439 4639 773 4923 4767 3032 2679 3961 1498\n",
" 3735 624 1742 4734 3209 4427 2765 3790 5243 2445 1294 472 5594 4591\n",
" 378 4397 5825 5899]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[5971 2427 4750 4108 3527 5550 784 2599 2504 159 5631 133 3064 82\n",
" 1152 5444 3707 3077 3319 1461 2857 4834 2936 955 3048 1827 1584 2725\n",
" 3481 996 3069 1607]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[1772 1363 4558 930 2809 3763 4340 1638 3384 5153 3477 5060 4600 4379\n",
" 195 1335 2341 185 2362 5682 2889 400 724 190 4558 1839 225 1064\n",
" 742 3212 1963 1182]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[3763 1959 3106 3844 5783 5396 3076 3376 996 1767 5796 4955 5009 4600\n",
" 4199 2668 2113 1411 5090 4262 3768 3133 783 3718 2846 2831 5477 4826\n",
" 3011 2730 2219 3011]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[1548 3831 2412 5978 5846 4462 3247 2608 3463 2541 1092 5378 5693 591\n",
" 3362 5258 1064 2102 5758 1062 4756 2046 2104 4530 5283 4398 3623 2198\n",
" 2739 3267 245 1398]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[4512 4944 605 4131 5716 2395 4258 69 926 1425 164 5415 3831 5031\n",
" 3984 973 2827 3685 811 3235 3385 519 5842 3392 489 728 193 1341\n",
" 47 2745 1622 471]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[3649 4727 2864 3802 2547 2483 245 1070 4188 4775 4107 5304 4610 508\n",
" 4461 5012 1852 5878 1220 3046 2370 1551 3751 2210 4145 1092 3260 2933\n",
" 2007 5116 3092 2527]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[5034 3290 3567 4441 5354 772 571 387 4839 4330 4466 254 2118 3526\n",
" 1802 5455 2207 5598 649 377 4990 1832 1209 3540 3218 1576 5612 2312\n",
" 5352 932 170 5409]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[3436 1976 5818 4299 1113 1219 1657 2829 907 268 4111 2021 441 557\n",
" 1585 590 569 3471 1022 5713 4021 2712 40 3705 170 5787 5991 5740\n",
" 1968 2360 510 2343]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[2395 2091 598 2785 1135 3383 4215 5067 678 569 1382 2483 2381 3594\n",
" 2342 3788 982 1272 1556 5158 2502 5476 5354 720 3787 1096 508 5479\n",
" 5014 5878 4695 1652]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[4912 85 124 2164 3834 3402 3848 3961 2884 2485 4459 1852 5703 3415\n",
" 1963 2337 4894 2887 4062 5873 1088 1700 5504 3441 3256 2169 3929 5809\n",
" 698 5571 1836 2077]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[4610 3206 880 3766 1259 2532 2617 2369 966 2285 1558 3456 5895 3011\n",
" 3013 3331 1300 694 3966 3756 4825 2240 2508 3639 3408 4278 5580 4114\n",
" 285 2955 2102 1514]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[2317 1142 4614 1796 914 1904 4678 4154 3062 4209 1399 5068 5724 60\n",
" 2361 3242 2963 1785 5389 2966 4040 5166 2883 3791 4149 3072 4105 1698\n",
" 1925 220 5729 2492]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[4846 178 3133 3404 4569 2072 3721 2153 2448 1624 4271 5322 1736 4943\n",
" 381 2208 846 4300 4488 2098 2472 4580 4951 3083 2629 5591 943 5244\n",
" 5698 4063 3809 2125]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[2481 3857 5661 2854 3064 3517 5536 3649 5172 5040 5430 4986 3046 873\n",
" 2957 1103 5269 3359 5439 4176 1606 1171 4058 834 5873 1098 1793 4947\n",
" 3943 5725 4666 3416]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[ 783 2602 5707 4343 3087 796 5950 5853 3859 4082 5547 32 1165 1126\n",
" 4189 1452 4300 784 3988 3817 1975 5814 527 831 2034 150 1438 4212\n",
" 2931 1553 2150 4685]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[ 306 4197 4303 1605 1940 5686 3642 1700 1566 4067 2559 5815 4457 3114\n",
" 2199 1602 2507 4699 5501 4727 1195 712 2906 2153 2971 4972 381 1098\n",
" 5621 1407 3856 4521]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[3012 1075 2098 2387 380 5326 1034 3842 5009 4383 2747 931 1990 4450\n",
" 5785 322 3802 4421 2923 3634 778 3772 2483 4059 3939 58 456 4969\n",
" 1224 1959 4280 1522]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[4955 835 4182 1655 5618 3239 278 4285 317 5497 5784 406 3362 1774\n",
" 2662 4701 2667 5388 5747 2337 4379 2104 5622 1367 4937 2624 1178 2675\n",
" 3531 5474 2416 3361]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[3840 2339 3542 5905 3565 3078 2998 4945 1905 2933 3618 675 2574 5029\n",
" 400 2204 14 3861 5783 2249 2654 3709 766 3321 3122 1023 3797 4039\n",
" 5774 2838 4492 112]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[ 231 1083 607 3430 1239 2077 3640 2507 2829 3629 2771 4992 230 4355\n",
" 531 1500 339 175 1722 1964 5828 783 4606 3185 3170 1292 3518 4023\n",
" 167 2577 4862 3785]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[ 55 1685 2505 5068 4310 1499 3337 5400 1702 1702 2103 2433 5973 2858\n",
" 3957 3563 4763 2294 4042 2747 2861 764 5237 5292 2683 1276 2590 2109\n",
" 1244 4875 3230 2591]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[5343 2228 1178 2588 516 4469 4595 5551 5153 1832 1934 168 3704 1964\n",
" 3763 3042 2437 5884 3087 4928 3874 1654 3287 3659 4108 3191 624 1224\n",
" 1521 4728 5443 3875]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[2724 1740 2430 4303 1126 2611 3732 1281 5030 22 5216 4756 5570 1526\n",
" 2506 5698 5698 5356 3622 3087 1047 4584 1998 746 71 467 2590 1636\n",
" 4398 4131 1221 4635]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[ 159 2416 4508 3550 1631 1360 733 3558 1245 5240 1864 720 2773 3478\n",
" 1411 3461 3024 288 2148 53 2787 2931 649 2183 3961 916 2918 1015\n",
" 4446 2670 5536 5980]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[3247 1245 5389 3777 2077 5175 3618 3093 4346 3894 4582 3167 2437 382\n",
" 1749 4015 719 5623 4458 1934 4269 443 2524 2236 1367 1804 5476 362\n",
" 4635 4623 4557 18]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[4727 5732 5515 4599 4547 3958 2778 3421 918 2931 3865 3397 3714 462\n",
" 3691 3092 99 3095 1864 74 2304 3130 1089 621 5022 698 2239 5078\n",
" 5962 1521 1224 3939]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[2294 3948 4615 1574 895 1372 4011 953 4626 3281 5046 1608 5479 423\n",
" 5978 3912 3937 1279 1736 1776 199 2369 3253 3875 5321 5246 1519 4917\n",
" 476 4864 3615 2043]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[4786 2930 3706 708 2729 3284 2379 386 3209 4378 2932 1065 3705 4294\n",
" 2543 3480 3212 1096 1276 1647 296 3358 2869 452 5359 4246 217 3913\n",
" 2562 5458 722 271]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[5231 3077 638 4236 684 4811 2340 1711 1529 2670 5241 2441 4323 4454\n",
" 4932 949 3732 1162 2303 1735 1286 1550 1206 2597 5475 374 5359 4260\n",
" 2299 4582 5812 540]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[4385 2804 4568 1458 5795 5045 4435 3505 333 5505 2574 73 297 1435\n",
" 1235 145 3353 957 264 5515 4702 3003 4582 5405 4100 66 4318 5746\n",
" 2945 278 5961 5214]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[ 634 3600 150 4284 3940 5927 678 5864 2909 4071 75 321 2792 3570\n",
" 3684 873 935 399 2643 4663 778 14 792 1087 1635 3153 4206 4977\n",
" 2522 945 3775 200]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[2755 2963 722 4825 1496 913 3147 1605 5418 1351 1624 5974 666 5877\n",
" 752 4059 5890 2279 1292 1178 2138 444 818 302 5446 5079 178 3121\n",
" 651 4236 1654 1526]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[5991 2766 2676 5237 3394 2436 634 5548 4894 5152 4117 535 5859 3103\n",
" 3913 3568 728 2756 5001 634 5004 4125 5511 3099 4309 2374 3322 4704\n",
" 5078 1519 5433 5081]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[ 207 4008 4204 533 708 4729 2042 3947 2001 780 3408 1905 1655 4009\n",
" 920 3961 4301 3717 2997 4932 1188 5477 2312 638 3033 4710 1336 2665\n",
" 1087 387 628 3209]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[ 294 1534 3437 5799 3804 3799 4602 1545 3803 4847 4614 3918 4164 4580\n",
" 2542 2923 2213 1661 2209 1688 1096 2109 117 3110 4033 4147 4302 950\n",
" 2815 2250 4789 3708]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[1161 3240 17 2386 2645 5650 4380 2310 715 2690 5166 5120 825 204\n",
" 2843 705 3575 1159 4170 4507 4283 1458 4679 3134 4508 2498 2954 4007\n",
" 5312 3398 400 651]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[2091 2504 679 1083 1169 1774 245 4941 994 4524 5580 5154 2096 1770\n",
" 560 1398 5472 2084 4282 2772 89 4093 1736 1495 2608 4955 4424 4694\n",
" 5226 5059 2294 1366]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[1949 5378 1852 1306 137 301 1832 306 3095 5473 4356 603 2668 5400\n",
" 5638 5384 3041 5239 3065 2525 2403 4874 5704 4584 2285 3926 2450 4210\n",
" 4084 397 3326 3512]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[5076 3205 3806 1100 1439 2772 5558 4131 1599 4062 1426 5952 4425 5897\n",
" 4690 3607 5359 124 4867 3745 5046 3013 1577 1635 4146 3883 2050 2642\n",
" 1918 4083 440 3374]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[1526 2685 4633 1977 4800 2790 1291 1846 1243 2007 3848 220 2450 4702\n",
" 2639 95 213 5060 816 4132 5337 4509 501 2562 2667 404 3263 1041\n",
" 2433 5858 2522 3913]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[1263 5384 1220 2572 2878 3625 3303 3392 1713 1832 5525 5610 5274 1151\n",
" 5352 1865 5758 592 3065 2608 1597 2043 3905 4308 4283 32 3238 4852\n",
" 4580 3588 5489 3355]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[1913 1620 1081 3014 4811 3598 1037 5039 5480 5159 231 4908 2498 5356\n",
" 2129 2001 1670 519 1114 4448 3646 4617 4159 5020 464 1935 1172 3618\n",
" 1306 100 2857 2876]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[ 797 5504 581 2585 5700 5216 2883 5774 1407 5524 47 862 3713 4751\n",
" 931 1233 2951 5582 1456 935 5455 5280 2107 5899 18 2202 2406 639\n",
" 3184 4990 5241 3339]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[4773 1652 4685 5229 2739 2199 1695 1884 5921 5698 5504 3755 3831 2020\n",
" 3283 670 440 73 3157 5693 1072 3129 4806 410 2836 793 5782 913\n",
" 1064 2876 3490 533]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[5185 5325 2814 4384 5707 3024 3329 1425 4919 1300 3135 2488 5237 2524\n",
" 5584 121 5959 5688 2014 5647 2306 3430 3528 2784 5919 3557 3298 3728\n",
" 1892 3540 5409 3097]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[4264 1790 3400 4448 1256 1801 2454 907 3558 5730 4789 1964 2743 1048\n",
" 3636 2237 5208 4025 4627 2803 5702 2695 4396 3917 1614 1299 917 4422\n",
" 1514 5833 5366 4557]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[2687 3173 4197 1818 3587 1083 1832 2347 699 4943 1527 5828 5747 5891\n",
" 2769 2698 5971 1168 1942 1964 1548 4178 844 2871 5953 2573 4303 4620\n",
" 318 2994 4981 4862]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[5739 4362 4887 2920 3406 954 5682 3917 1435 1997 4612 2734 159 3988\n",
" 1793 3934 5872 5198 2568 2429 1473 2342 4048 4906 4461 3636 5210 921\n",
" 4164 5765 3560 1505]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[1113 914 859 80 4011 5355 3517 2715 661 838 381 5644 3031 4380\n",
" 4900 3031 942 3074 5990 3156 5288 472 3618 4135 4272 5738 553 1880\n",
" 254 4271 270 4595]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[4814 5383 1389 4446 3732 3259 5970 3419 940 949 3422 470 3326 11\n",
" 1721 5698 3083 3884 1936 4886 3788 5561 697 675 797 774 4246 560\n",
" 3235 3623 1122 5396]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[1764 91 1599 3841 1796 4898 2715 4650 264 2386 4843 1086 4804 1587\n",
" 1122 4197 2750 5339 5352 1087 3086 2626 1376 98 4176 3772 2142 4249\n",
" 1505 4016 5614 5115]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[4736 1702 5331 5438 1584 405 679 2042 4971 4350 5598 3359 3802 3396\n",
" 5856 1758 3144 89 5245 935 4617 3769 134 3994 3301 4351 3168 4347\n",
" 3645 5169 4322 3618]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[1194 2643 1737 3779 5839 3699 3964 4947 5717 2217 2054 5242 3171 5980\n",
" 5207 2905 4928 5332 2605 5565 5827 3207 4270 2708 245 4538 851 4841\n",
" 4041 2500 2654 1856]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[3499 4801 2870 1522 260 3407 3065 3908 1617 2104 352 2865 2579 2573\n",
" 2705 440 1667 3418 3798 1103 5168 4935 5237 3077 4595 796 5667 3105\n",
" 2606 3970 5452 1866]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[2713 5100 3346 1595 4766 2325 2235 5425 382 522 3493 2498 207 4207\n",
" 4511 4366 258 3071 465 3831 379 4868 2882 3592 4525 4145 2865 1425\n",
" 1915 2524 2803 973]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[1312 2971 4193 3001 978 1606 5458 3287 3080 690 5785 4748 2398 1774\n",
" 507 5034 5831 2547 885 480 5458 5093 2895 463 5617 4989 2876 3027\n",
" 1221 5716 2519 3929]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[3709 3976 1851 5014 5229 3248 4345 3909 2647 3675 5643 1986 2769 3988\n",
" 417 1914 5582 1062 400 767 5931 4063 706 4166 5237 3995 4264 5525\n",
" 4863 581 4080 5460]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[5439 5042 3780 2113 1602 3194 1802 3559 2772 5286 4725 2854 4078 4576\n",
" 3874 1929 3215 1703 2150 198 188 960 2109 855 2848 1995 364 1300\n",
" 5427 781 2688 1053]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[1421 3191 5780 139 1656 250 604 1664 1461 5768 5143 4828 2870 4326\n",
" 2613 5558 5955 1807 3874 5381 1666 1905 1074 1653 946 5034 5790 2093\n",
" 2007 1207 765 5736]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[4248 5104 2962 5350 5724 4081 2802 2034 2046 4208 5304 2531 1622 2976\n",
" 3034 3953 3812 1608 3929 4726 3576 1981 4666 1945 1227 3512 4494 406\n",
" 1009 1090 965 5459]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[5799 2311 5727 2747 89 5565 5349 4063 1261 105 4096 3069 22 2489\n",
" 881 2007 4607 522 4038 954 3147 5374 129 5176 5622 2898 5575 3141\n",
" 3230 836 4192 5644]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[1207 3663 3471 508 1999 5930 5584 2180 217 3548 1929 5857 472 4375\n",
" 4041 2186 4009 1861 950 633 2150 1371 3410 4343 3780 2638 1654 2118\n",
" 2021 1435 608 5680]\n"
]
},
{
"output_type": "error",
"ename": "KeyboardInterrupt",
"evalue": "",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-11-977ada5cc803>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0my\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtr_gen\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/keras/utils/data_utils.py\u001b[0m in \u001b[0;36m__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 469\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__iter__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 470\u001b[0m \u001b[0;34m\"\"\"Create a generator that iterate over the Sequence.\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 471\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mitem\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 472\u001b[0m \u001b[0;32myield\u001b[0m \u001b[0mitem\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 473\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/keras/utils/data_utils.py\u001b[0m in \u001b[0;36m<genexpr>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 469\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__iter__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 470\u001b[0m \u001b[0;34m\"\"\"Create a generator that iterate over the Sequence.\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 471\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mitem\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 472\u001b[0m \u001b[0;32myield\u001b[0m \u001b[0mitem\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 473\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/inzpeech/dataloaders/DatagenVoxCeleb.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, idx)\u001b[0m\n\u001b[1;32m 87\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 88\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__getitem__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0midx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 89\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdatagen\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_batch_sample\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0midx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbatch_size\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 90\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 91\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mon_epoch_end\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/inzpeech/dataloaders/DatagenVoxCeleb.py\u001b[0m in \u001b[0;36mget_batch_sample\u001b[0;34m(self, idx, batch_size)\u001b[0m\n\u001b[1;32m 49\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpp\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msample_pickle_path\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 50\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'rb'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mpickle_load\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 51\u001b[0;31m \u001b[0mloaded_sample\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpickle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpickle_load\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 52\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 53\u001b[0m \u001b[0midname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvideoname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfeatures\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mloaded_sample\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/miniconda3/envs/inzpeech/lib/python3.8/site-packages/numpy/core/numeric.py\u001b[0m in \u001b[0;36m_frombuffer\u001b[0;34m(buf, dtype, shape, order)\u001b[0m\n\u001b[1;32m 1810\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1811\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1812\u001b[0;31m \u001b[0;32mdef\u001b[0m \u001b[0m_frombuffer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbuf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mshape\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0morder\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1813\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mfrombuffer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbuf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreshape\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0morder\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0morder\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1814\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
]
}
],
"source": [
"for x,y in tr_gen:\n",
" print(x.shape)\n",
" print(y.shape)\n",
" print(y)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"sample_per_person = 300\n",
"batch_size = 10\n",
"num_class = 1251 #or 109\n",
"input_shape = (100, 40, 1)\n",
"persons_to_include = ['p228', 'p227', 'p225', 'p245', 'p247', 'p246', 'p228', 'p250', 'p251', 'p248', 'p231', 'p253']"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"'tr_gen, val_gen, te_gen = get_datagen(sample_per_person, batch_size, apply_melspectrogram_to_file, include_person=None)'"
]
},
"metadata": {},
"execution_count": 16
}
],
"source": [
"\"\"\"tr_gen, val_gen, te_gen = get_datagen(sample_per_person, batch_size, apply_melspectrogram_to_file, include_person=None)\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"'for x, y in tr_gen:\\n print(x.shape)\\n print(y.shape)\\n print(np.unique(np.argmax(y, axis=1)))\\n break'"
]
},
"metadata": {},
"execution_count": 17
}
],
"source": [
"\"\"\"for x, y in tr_gen:\n",
" print(x.shape)\n",
" print(y.shape)\n",
" print(np.unique(np.argmax(y, axis=1)))\n",
" break\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"(None, 150, 20, 64)\n(None, 75, 10, 128)\n(None, 38, 5, 256)\n(None, 19, 3, 512)\nafter reshape\n(None, 19, 1536)\nafter attention\n(None, 4, 1536)\nafter avgpool\n(None, 1, 1536)\nModel: \"model_1\"\n_________________________________________________________________\nLayer (type) Output Shape Param # \n=================================================================\ninput_2 (InputLayer) [(None, 300, 40, 1)] 0 \n_________________________________________________________________\nblock1_conv1 (Conv2D) (None, 300, 40, 64) 640 \n_________________________________________________________________\nblock1_conv2 (Conv2D) (None, 300, 40, 64) 36928 \n_________________________________________________________________\nbatch_normalization_4 (Batch (None, 300, 40, 64) 256 \n_________________________________________________________________\nmax_pooling2d_4 (MaxPooling2 (None, 150, 20, 64) 0 \n_________________________________________________________________\nblock2_conv1 (Conv2D) (None, 150, 20, 128) 73856 \n_________________________________________________________________\nblock2_conv2 (Conv2D) (None, 150, 20, 128) 147584 \n_________________________________________________________________\nbatch_normalization_5 (Batch (None, 150, 20, 128) 512 \n_________________________________________________________________\nmax_pooling2d_5 (MaxPooling2 (None, 75, 10, 128) 0 \n_________________________________________________________________\nblock3_conv1 (Conv2D) (None, 75, 10, 256) 295168 \n_________________________________________________________________\nblock3_conv2 (Conv2D) (None, 75, 10, 256) 590080 \n_________________________________________________________________\nbatch_normalization_6 (Batch (None, 75, 10, 256) 1024 \n_________________________________________________________________\nmax_pooling2d_6 (MaxPooling2 (None, 38, 5, 256) 0 \n_________________________________________________________________\nblock4_conv1 (Conv2D) (None, 38, 5, 512) 1180160 \n_________________________________________________________________\nblock4_conv2 (Conv2D) (None, 38, 5, 512) 2359808 \n_________________________________________________________________\nbatch_normalization_7 (Batch (None, 38, 5, 512) 2048 \n_________________________________________________________________\nmax_pooling2d_7 (MaxPooling2 (None, 19, 3, 512) 0 \n_________________________________________________________________\nreshape_1 (Reshape) (None, 19, 1536) 0 \n_________________________________________________________________\nself_attention_1 (SelfAttent (None, 4, 1536) 394240 \n_________________________________________________________________\naverage_pooling1d_1 (Average (None, 1, 1536) 0 \n_________________________________________________________________\nflatten_1 (Flatten) (None, 1536) 0 \n_________________________________________________________________\ndense_2 (Dense) (None, 256) 393472 \n_________________________________________________________________\ndense_3 (Dense) (None, 1251) 321507 \n=================================================================\nTotal params: 5,797,283\nTrainable params: 5,795,363\nNon-trainable params: 1,920\n_________________________________________________________________\n"
]
}
],
"source": [
"model = vgg_att(num_class)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"opt = Adam(lr=1e-3)\n",
"model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Epoch 1/20\n"
]
},
{
"output_type": "error",
"ename": "ValueError",
"evalue": "in user code:\n\n /home/bbekci/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:571 train_function *\n outputs = self.distribute_strategy.run(\n /home/bbekci/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:951 run **\n return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)\n /home/bbekci/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:2290 call_for_each_replica\n return self._call_for_each_replica(fn, args, kwargs)\n /home/bbekci/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:2649 _call_for_each_replica\n return fn(*args, **kwargs)\n /home/bbekci/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:532 train_step **\n loss = self.compiled_loss(\n /home/bbekci/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/keras/engine/compile_utils.py:205 __call__\n loss_value = loss_obj(y_t, y_p, sample_weight=sw)\n /home/bbekci/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/keras/losses.py:143 __call__\n losses = self.call(y_true, y_pred)\n /home/bbekci/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/keras/losses.py:246 call\n return self.fn(y_true, y_pred, **self._fn_kwargs)\n /home/bbekci/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/keras/losses.py:1527 categorical_crossentropy\n return K.categorical_crossentropy(y_true, y_pred, from_logits=from_logits)\n /home/bbekci/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/keras/backend.py:4561 categorical_crossentropy\n target.shape.assert_is_compatible_with(output.shape)\n /home/bbekci/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/framework/tensor_shape.py:1117 assert_is_compatible_with\n raise ValueError(\"Shapes %s and %s are incompatible\" % (self, other))\n\n ValueError: Shapes (None, 1) and (None, 1251) are incompatible\n",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-20-fc52f26b8f63>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtr_gen\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalidation_data\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mval_gen\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mepochs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m20\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m~/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py\u001b[0m in \u001b[0;36m_method_wrapper\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 64\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_method_wrapper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 65\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_in_multi_worker_mode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# pylint: disable=protected-access\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 66\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 67\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 68\u001b[0m \u001b[0;31m# Running inside `run_distribute_coordinator` already.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing)\u001b[0m\n\u001b[1;32m 846\u001b[0m batch_size=batch_size):\n\u001b[1;32m 847\u001b[0m \u001b[0mcallbacks\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mon_train_batch_begin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstep\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 848\u001b[0;31m \u001b[0mtmp_logs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtrain_function\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0miterator\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 849\u001b[0m \u001b[0;31m# Catch OutOfRangeError for Datasets of unknown size.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 850\u001b[0m \u001b[0;31m# This blocks until the batch has finished executing.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/eager/def_function.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *args, **kwds)\u001b[0m\n\u001b[1;32m 578\u001b[0m \u001b[0mxla_context\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mExit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 579\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 580\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_call\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 581\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 582\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mtracing_count\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_tracing_count\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/eager/def_function.py\u001b[0m in \u001b[0;36m_call\u001b[0;34m(self, *args, **kwds)\u001b[0m\n\u001b[1;32m 625\u001b[0m \u001b[0;31m# This is the first call of __call__, so we have to initialize.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 626\u001b[0m \u001b[0minitializers\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 627\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_initialize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0madd_initializers_to\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minitializers\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 628\u001b[0m \u001b[0;32mfinally\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 629\u001b[0m \u001b[0;31m# At this point we know that the initialization is complete (or less\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/eager/def_function.py\u001b[0m in \u001b[0;36m_initialize\u001b[0;34m(self, args, kwds, add_initializers_to)\u001b[0m\n\u001b[1;32m 503\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_graph_deleter\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mFunctionDeleter\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_lifted_initializer_graph\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 504\u001b[0m self._concrete_stateful_fn = (\n\u001b[0;32m--> 505\u001b[0;31m self._stateful_fn._get_concrete_function_internal_garbage_collected( # pylint: disable=protected-access\n\u001b[0m\u001b[1;32m 506\u001b[0m *args, **kwds))\n\u001b[1;32m 507\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/eager/function.py\u001b[0m in \u001b[0;36m_get_concrete_function_internal_garbage_collected\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 2444\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkwargs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2445\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_lock\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2446\u001b[0;31m \u001b[0mgraph_function\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_maybe_define_function\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2447\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mgraph_function\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2448\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/eager/function.py\u001b[0m in \u001b[0;36m_maybe_define_function\u001b[0;34m(self, args, kwargs)\u001b[0m\n\u001b[1;32m 2775\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2776\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_function_cache\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmissed\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0madd\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcall_context_key\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2777\u001b[0;31m \u001b[0mgraph_function\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_create_graph_function\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2778\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_function_cache\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprimary\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mcache_key\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgraph_function\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2779\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mgraph_function\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/eager/function.py\u001b[0m in \u001b[0;36m_create_graph_function\u001b[0;34m(self, args, kwargs, override_flat_arg_shapes)\u001b[0m\n\u001b[1;32m 2655\u001b[0m \u001b[0marg_names\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mbase_arg_names\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mmissing_arg_names\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2656\u001b[0m graph_function = ConcreteFunction(\n\u001b[0;32m-> 2657\u001b[0;31m func_graph_module.func_graph_from_py_func(\n\u001b[0m\u001b[1;32m 2658\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_name\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2659\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_python_function\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/framework/func_graph.py\u001b[0m in \u001b[0;36mfunc_graph_from_py_func\u001b[0;34m(name, python_func, args, kwargs, signature, func_graph, autograph, autograph_options, add_control_dependencies, arg_names, op_return_value, collections, capture_by_value, override_flat_arg_shapes)\u001b[0m\n\u001b[1;32m 979\u001b[0m \u001b[0m_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moriginal_func\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtf_decorator\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0munwrap\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpython_func\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 980\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 981\u001b[0;31m \u001b[0mfunc_outputs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpython_func\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0mfunc_args\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mfunc_kwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 982\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 983\u001b[0m \u001b[0;31m# invariant: `func_outputs` contains only Tensors, CompositeTensors,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/eager/def_function.py\u001b[0m in \u001b[0;36mwrapped_fn\u001b[0;34m(*args, **kwds)\u001b[0m\n\u001b[1;32m 439\u001b[0m \u001b[0;31m# __wrapped__ allows AutoGraph to swap in a converted function. We give\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 440\u001b[0m \u001b[0;31m# the function a weak reference to itself to avoid a reference cycle.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 441\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mweak_wrapped_fn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__wrapped__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 442\u001b[0m \u001b[0mweak_wrapped_fn\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mweakref\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mref\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mwrapped_fn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 443\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/framework/func_graph.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 966\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# pylint:disable=broad-except\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 967\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mhasattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"ag_error_metadata\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 968\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mag_error_metadata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_exception\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 969\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 970\u001b[0m \u001b[0;32mraise\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mValueError\u001b[0m: in user code:\n\n /home/bbekci/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:571 train_function *\n outputs = self.distribute_strategy.run(\n /home/bbekci/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:951 run **\n return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)\n /home/bbekci/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:2290 call_for_each_replica\n return self._call_for_each_replica(fn, args, kwargs)\n /home/bbekci/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:2649 _call_for_each_replica\n return fn(*args, **kwargs)\n /home/bbekci/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:532 train_step **\n loss = self.compiled_loss(\n /home/bbekci/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/keras/engine/compile_utils.py:205 __call__\n loss_value = loss_obj(y_t, y_p, sample_weight=sw)\n /home/bbekci/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/keras/losses.py:143 __call__\n losses = self.call(y_true, y_pred)\n /home/bbekci/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/keras/losses.py:246 call\n return self.fn(y_true, y_pred, **self._fn_kwargs)\n /home/bbekci/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/keras/losses.py:1527 categorical_crossentropy\n return K.categorical_crossentropy(y_true, y_pred, from_logits=from_logits)\n /home/bbekci/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/keras/backend.py:4561 categorical_crossentropy\n target.shape.assert_is_compatible_with(output.shape)\n /home/bbekci/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/framework/tensor_shape.py:1117 assert_is_compatible_with\n raise ValueError(\"Shapes %s and %s are incompatible\" % (self, other))\n\n ValueError: Shapes (None, 1) and (None, 1251) are incompatible\n"
]
}
],
"source": [
"model.fit(tr_gen, validation_data=val_gen, epochs=20)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"name": "Python 3.8.5 64-bit ('inzpeech': conda)",
"display_name": "Python 3.8.5 64-bit ('inzpeech': conda)",
"metadata": {
"interpreter": {
"hash": "fcc15a4440aa802b6aa76ba989d07fd1e1f9e303ad2563ebf174689c6e63879d"
}
}
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5-final"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
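The traceback above records targets of shape (None, 1) (integer class indices) meeting a softmax output of shape (None, 1251), so the generator and the loss disagree. A minimal sketch of the two usual fixes, assuming the generator yields integer speaker labels (y_int below is a hypothetical batch of such labels):

import numpy as np
from tensorflow.keras.utils import to_categorical

# Option 1: one-hot encode the integer labels so they match categorical_crossentropy
y_int = np.array([3, 17, 1250])                        # hypothetical integer speaker ids from the generator
y_onehot = to_categorical(y_int, num_classes=1251)     # shape (3, 1251), compatible with the (None, 1251) output

# Option 2: keep the integer labels and compile with the sparse loss instead
# model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy'])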

71
Train_VoxCeleb1_Class.py Normal file
View File

@@ -0,0 +1,71 @@
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Lambda, Dense
from tensorflow.keras.optimizers import Adam
import tensorflow.keras.backend as K
from models.vggish import VGGish
import math
import numpy as np
import tensorflow_addons as tfa
from dataloaders.DatagenVoxCeleb1 import get_keras_datagens
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from models.model_keras_dropout import vgg_att
import pickle
import tensorflow as tf
from models.resnet18_keras import resnet18
def focal_loss(y_true, y_pred):
# Define epsilon so that the backpropagation will not result in NaN
# for 0 divisor case
gamma=2.0
alpha=0.25
epsilon = K.epsilon()
# Add the epsilon to prediction value
#y_pred = y_pred + epsilon
# Clip the prediction value
y_pred = K.clip(y_pred, epsilon, 1.0-epsilon)
# Calculate cross entropy
cross_entropy = -y_true*K.log(y_pred)
# Calculate weight that consists of modulating factor and weighting factor
weight = alpha * y_true * K.pow((1-y_pred), gamma)
# Calculate focal loss
loss = weight * cross_entropy
# Sum the losses in mini_batch
loss = K.sum(loss, axis=1)
return loss
# FOR VOXCELEB
txt_dir = '/media/data/bbekci/voxceleb/iden_split.txt'
data_dir = '/media/data/bbekci/voxceleb/pkls_colwise_normed/'
batch_size = 128
input_shape = (300, 40, 1)
# VOX CELEB
tr_gen, val_gen, te_gen = get_keras_datagens(data_dir, txt_dir, batch_size, feature_len=300, ratios=[1.0, 1.0, 1.0], vid_per_person=200000)
n_class = tr_gen.datagen.get_n_class()
vgg_base_model = vgg_att(n_class)
#vgg_base_model = VGGish(input_shape, n_class, load_weight=True)
#resnet_model = resnet18(n_class, True)
opt = Adam(lr=2e-4)
vgg_base_model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
save_dir = os.path.join('saved-models', 'voxceleb1_attention_vgg_dropout_tfkeras_fullset.h5')
check = ModelCheckpoint(save_dir, verbose=True, save_best_only=True, monitor='val_accuracy')
reduceLR = ReduceLROnPlateau(factor=0.5, patience=10, verbose=True, monitor='val_accuracy')
earlyStop = EarlyStopping(patience=50, verbose=True, monitor='val_accuracy')
history = vgg_base_model.fit(tr_gen, epochs=3000, validation_data=val_gen, callbacks=[check, reduceLR, earlyStop])
te_loss, te_acc = vgg_base_model.evaluate(te_gen)
with open('saved-models/voxceleb1_tfkeras_vgg_att_dropout_full', 'wb') as file_pi:
pickle.dump(history.history, file_pi)
print("Test Loss: ", te_loss, " test acc: ", te_acc)

57
Train_VoxCeleb_Class.py Normal file
View File

@@ -0,0 +1,57 @@
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Lambda, Dense
from tensorflow.keras.optimizers import Adam
import tensorflow.keras.backend as K
from models.vggish import VGGish
import math
import numpy as np
import tensorflow_addons as tfa
from dataloaders.DatagenVoxCeleb import get_keras_datagens
#from dataloaders.DatagenVoxCeleb1 import get_keras_datagens
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from models.model_keras_dropout import vgg_att
# FOR VOXCELEB2
data_dir = '/media/data/bbekci/voxceleb2/data/dev/pkls/'
tr_txt = 'txts/tr_voxceleb_audio_pkl_paths.txt'
val_txt = 'txts/val_voxceleb_audio_pkl_paths.txt'
# FOR VOXCELEB
#txt_dir = '/media/data/bbekci/voxceleb/iden_split.txt'
#data_dir = '/media/data/bbekci/voxceleb/pkls/'
batch_size = 128
input_shape = (300, 40, 1)
# VOX CELEB
#tr_gen, val_gen, te_gen = get_keras_datagens(data_dir, txt_dir, batch_size, feature_len=300, ratios=[0.05, 0.1, 0.1])
# VOX CELEB2
tr_gen, val_gen = get_keras_datagens(data_dir, batch_size, txt_dirs=[tr_txt, val_txt], ratios=[0.25, 0.1])
"""
for x,y in tr_gen:
print(x.shape)
print(y.shape)
print(y)
break
"""
n_class = tr_gen.datagen.get_n_class()
print(n_class)
vgg_base_model = vgg_att(n_class)
opt = Adam(lr=2e-4)
# DataKerasVoxCeleb yields one-hot labels (see dataloaders/DatagenVoxCeleb.py), so the non-sparse loss is required here
vgg_base_model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
save_dir = os.path.join('saved-models', 'vggish_attention_voxceleb2_dropout.h5')
check = ModelCheckpoint(save_dir, verbose=True, save_best_only=True)
reduceLR = ReduceLROnPlateau(factor=0.5, patience=3, verbose=True)
earlyStop = EarlyStopping(patience=15, verbose=True)
vgg_base_model.evaluate(val_gen)
history = vgg_base_model.fit(tr_gen, epochs=45, validation_data=val_gen, callbacks=[check, reduceLR, earlyStop])
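Because the checkpoint written above contains the custom SelfAttention layer, reloading it later needs custom_objects (extract_embeds.py below uses the same pattern). A sketch, assuming SelfAttention is importable from models.model_keras_dropout:

import tensorflow as tf
from models.model_keras_dropout import SelfAttention

best = tf.keras.models.load_model('saved-models/vggish_attention_voxceleb2_dropout.h5',
                                  custom_objects={'SelfAttention': SelfAttention})
best.evaluate(val_gen)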

36
closest_celeb.py Normal file
View File

@@ -0,0 +1,36 @@
import os
import fnmatch
import pickle5 as pickle
import numpy as np
from scipy.spatial.distance import cdist
def load_embeds(embeding_main_path):
pkl_matches = []
for root, dirname, filenames in os.walk(embeding_main_path):
for filename in fnmatch.filter(filenames, '*.pkl'):
pkl_matches.append(os.path.join(root, filename))
return pkl_matches
class NearestNeighboor(object):
@classmethod
def init_neighbor(cls, data_path):
pkl_paths = load_embeds(data_path)
cls.labels = []
#self.embeds = np.zeros((len(pkl_paths), 256))
cls.embeds = np.zeros((len(pkl_paths), 256))
for i, p in enumerate(pkl_paths):
print("Progress: ", i , " / ", len(pkl_paths), end='\r')
with open(p, 'rb') as pfile:
loaded_pkl = pickle.load(pfile)
cls.labels.append(loaded_pkl[1])
cls.embeds[i] = loaded_pkl[0]
@classmethod
def closest_labels(cls, test_sample, k):
# Get squared Euclidean distances, flattened to a 1D array of length len(embeds)
dist = cdist(cls.embeds, test_sample, 'sqeuclidean').reshape(-1)
# Find the k smallest distances
indx = np.argpartition(dist, k)[: k]
return np.unique(np.array(cls.labels)[indx])
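A minimal usage sketch of the class above, assuming the per-identity embeddings written by extract_embeds.py and using a random vector as a stand-in for a real 256-dimensional query embedding:

import numpy as np

NearestNeighboor.init_neighbor('/media/data/bbekci/voxceleb_id_embeds_vgg/')  # directory produced by extract_embeds.py
query = np.random.rand(1, 256)                       # stand-in for an utterance embedding
print(NearestNeighboor.closest_labels(query, k=5))   # up to 5 candidate speaker ids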

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -0,0 +1,159 @@
#!/usr/bin/env python
# coding: utf-8
import os
import pickle
import numpy as np
from torch.utils.data import Dataset
from tensorflow.keras.utils import Sequence, to_categorical
import math
import random
from dataloaders.datautil import get_pkl_paths
class DataVoxCeleb():
def __init__(self, data_dir, pkl_paths, feature_len=300):
self.ids = [fname for fname in os.listdir(data_dir) if 'id' in fname]
self.id_2_labels = { k:i for i, k in enumerate(self.ids)}
self.feature_len = feature_len
self.pkl_paths = pkl_paths
self.shuffle_set()
def shuffle_set(self):
np.random.shuffle(self.pkl_paths)
def get_num_ex(self):
return len(self.pkl_paths)
def get_n_class(self):
return len(self.id_2_labels.keys())
def get_name_to_label(self, name):
return int(self.id_2_labels[name])
def get_sample(self, idx):
sample_pickle_path = self.pkl_paths[idx]
with open(sample_pickle_path, 'rb') as pickle_load:
loaded_sample = pickle.load(pickle_load)
idname, videoname, features = loaded_sample
feature_len = features.shape[0]
upper_limit = feature_len - self.feature_len
feature_start = random.randint(0, upper_limit)
return features[feature_start:(feature_start+self.feature_len), :], self.get_name_to_label(idname)
def get_batch_sample(self, idx, batch_size):
sample_pickle_path = self.pkl_paths[idx*batch_size:(idx+1)*batch_size]
num_example = len(sample_pickle_path)
batch_features = np.zeros((num_example, self.feature_len, 40))
batch_labels = np.zeros(num_example, dtype=int)
for i, pp in enumerate(sample_pickle_path):
with open(pp, 'rb') as pickle_load:
loaded_sample = pickle.load(pickle_load)
idname, videoname, features = loaded_sample
feature_len = features.shape[0]
upper_limit = feature_len - self.feature_len
feature_start = random.randint(0, upper_limit)
batch_features[i] = features[feature_start:(feature_start+self.feature_len), :].copy()
batch_labels[i] = self.get_name_to_label(idname)
return batch_features, batch_labels
class DataTorchVoxCeleb(Dataset):
def __init__(self, file_dir, pkl_paths, feature_len):
self.datagen = DataVoxCeleb(file_dir, pkl_paths, feature_len)
def __len__(self):
return self.datagen.get_num_ex()
def n_class(self):
return self.datagen.get_n_class()
def __getitem__(self, idx):
data, label = self.datagen.get_sample(idx)
data = np.expand_dims(data, axis=0)
return data, label
class DataKerasVoxCeleb(Sequence):
def __init__(self, file_dir, pkl_paths, feature_len, batch_size, shuffle=False):
self.datagen = DataVoxCeleb(file_dir, pkl_paths, feature_len)
self.shuffle = shuffle
self.batch_size = batch_size
def __len__(self):
return math.ceil(self.datagen.get_num_ex() / self.batch_size)
def n_class(self):
return self.datagen.get_n_class()
def __getitem__(self, idx):
#return self.datagen.get_batch_sample(idx, self.batch_size)
data, label = self.datagen.get_batch_sample(idx, self.batch_size)
return data, to_categorical(label, num_classes=self.n_class())
def on_epoch_end(self):
if self.shuffle == True:
self.datagen.shuffle_set()
def get_torch_datagens(data_dir, feature_len=300, num_video_per_person=1e4, num_audio_per_video=1e4, split_by='audio', split_size=0.2, txt_dirs=None, ratios=[1.0, 1.0]):
"""
Returns datagens for torch
Params:
data_dir: Parent directory for the pickle files. It is assumed that each person has a separate folder in data_dir,
each video has a separate folder inside the person's folder, and the pickle files sit inside the video folders.
feature_len: Number of feature frames per audio sample. Default is 300.
num_video_per_person: How many videos will be selected from each person's folder
num_audio_per_video: How many audio files will be selected from each video folder
split_by: One of "video" or "audio". If "video", all audio files from a single video go into either the
train or the validation set. If "audio", all pickle files are split into train and validation individually.
txt_dirs: Paths to the train and validation text files. Pass as [train_file_path, validation_file_path]
ratios: Fraction of the train and validation path lists to keep, as [train_ratio, val_ratio].
Returns:
train and validation torch datasets (DataTorchVoxCeleb instances)
"""
tr_paths, val_paths = get_pkl_paths(data_dir, num_video_per_person, num_audio_per_video, split_by, split_size, txt_dirs)
subset_tr = np.random.choice(tr_paths, size=math.ceil(len(tr_paths) * ratios[0]), replace=False)
subset_val = np.random.choice(val_paths, size=math.ceil(len(val_paths) * ratios[1]), replace=False)
tr_gen = DataTorchVoxCeleb(data_dir, subset_tr, feature_len)
val_gen = DataTorchVoxCeleb(data_dir, subset_val, feature_len)
return tr_gen, val_gen
def get_keras_datagens(data_dir, batch_size, feature_len=300, num_video_per_person=1e4, num_audio_per_video=1e4, split_by='audio', split_size=0.2, txt_dirs=None, ratios=[1.0, 1.0]):
"""
Returns datagens for keras
Params:
data_dir: Parent directory for the pickle files. It is assumed that each person has a separate folder in data_dir,
each video has a separate folder inside the person's folder, and the pickle files sit inside the video folders.
batch_size: Batch size.
feature_len: Number of feature frames per audio sample. Default is 300.
num_video_per_person: How many videos will be selected from each person's folder
num_audio_per_video: How many audio files will be selected from each video folder
split_by: One of "video" or "audio". If "video", all audio files from a single video go into either the
train or the validation set. If "audio", all pickle files are split into train and validation individually.
txt_dirs: Paths to the train and validation text files. Pass as [train_file_path, validation_file_path]
ratios: Fraction of the train and validation path lists to keep, as [train_ratio, val_ratio].
Returns:
train and validation Keras datagens (DataKerasVoxCeleb instances)
"""
tr_paths, val_paths = get_pkl_paths(data_dir, num_video_per_person, num_audio_per_video, split_by, split_size, txt_dirs)
subset_tr = np.random.choice(tr_paths, size=math.ceil(len(tr_paths) * ratios[0]), replace=False)
subset_val = np.random.choice(val_paths, size=math.ceil(len(val_paths) * ratios[1]), replace=False)
tr_gen = DataKerasVoxCeleb(data_dir, subset_tr, feature_len, batch_size, True)
val_gen = DataKerasVoxCeleb(data_dir, subset_val, feature_len, batch_size, False)
return tr_gen, val_gen
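DataTorchVoxCeleb is a plain torch Dataset, so it can be wrapped in a DataLoader directly. A sketch, reusing the pickle directory and txt files from Train_VoxCeleb_Class.py above (those paths are specific to that machine, not guaranteed elsewhere):

from torch.utils.data import DataLoader
from dataloaders.DatagenVoxCeleb import get_torch_datagens

tr_set, val_set = get_torch_datagens('/media/data/bbekci/voxceleb2/data/dev/pkls/',
                                     txt_dirs=['txts/tr_voxceleb_audio_pkl_paths.txt',
                                               'txts/val_voxceleb_audio_pkl_paths.txt'])
tr_loader = DataLoader(tr_set, batch_size=128, shuffle=True, num_workers=4)
for features, labels in tr_loader:
    print(features.shape, labels.shape)  # (128, 1, 300, 40) and (128,)
    break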

View File

@@ -0,0 +1,179 @@
import os
import math
import numpy as np
from dataloaders.DatagenVoxCeleb import DataTorchVoxCeleb, DataKerasVoxCeleb
class Video:
def __init__(self, name):
self.name = name
self.audios = []
def add_audio(self, audio_path):
self.audios.append(audio_path)
def __eq__(self, other):
if len(self.audios)==len(other.audios):
return True
return False
def __lt__(self, other):
if len(self.audios) < len(other.audios):
return True
return False
class ID:
def __init__(self, name):
self.name = name
self.videos = []
def add_audio(self, video_name, audio_path):
vid_found = False
for v in self.videos:
if v.name == video_name:
vid_found = True
v.add_audio(audio_path)
break
if not vid_found:
v = Video(video_name)
v.add_audio(audio_path)
self.videos.append(v)
def get_audio_count(self):
count = 0
for v in self.videos:
count += len(v.audios)
return count
def get_person_audio_paths(self):
paths = []
for v in self.videos:
paths.extend(v.audios)
return paths
class Dataset:
def __init__(self):
self.ids = []
def add_audio(self, id_name, video_name, audio_path):
id_found = False
for i in self.ids:
if i.name == id_name:
id_found = True
i.add_audio(video_name, audio_path)
break
if not id_found:
id = ID(id_name)
id.add_audio(video_name, audio_path)
self.ids.append(id)
def get_cleaned_paths(self, vid_per_person, return_max):
final_paths = []
for id in self.ids:
id.videos.sort()
if return_max:
for v in id.videos[-vid_per_person:]:
final_paths.extend(v.audios)
else:
for v in id.videos[:vid_per_person]:
final_paths.extend(v.audios)
return final_paths
def get_balanced_paths(self, sample_per_person):
final_paths = []
for pid in self.ids:
person_audio_paths = pid.get_person_audio_paths()
sampled_paths = np.random.choice(person_audio_paths, min(sample_per_person, len(person_audio_paths)), replace=False)
final_paths.extend(sampled_paths)
return final_paths
def clean_trainset(tr_paths, vid_per_person, return_max):
trset = Dataset()
for p in tr_paths:
sliced_path = p.split('/')
aud_name = sliced_path[-1]
vid_name = sliced_path[-2]
p_name = sliced_path[-3]
trset.add_audio(p_name, vid_name, p)
return trset.get_balanced_paths(70)
def get_voxceleb1_path(data_dir, txt_path, ratios, vid_per_person, return_max):
with open(txt_path, 'r') as identxt:
lines = identxt.readlines()
train_paths = []
test_paths = []
val_paths = []
for line in lines:
subset, path = line.strip().split(' ')
if subset == '1':
train_paths.append(os.path.join(data_dir, path.replace('.wav','.pkl')))
elif subset == '2':
val_paths.append(os.path.join(data_dir, path.replace('.wav','.pkl')))
elif subset == '3':
test_paths.append(os.path.join(data_dir, path.replace('.wav','.pkl')))
subset_tr = np.random.choice(train_paths, size=math.ceil(len(train_paths) * ratios[0]), replace=False)
#subset_tr = clean_trainset(train_paths, vid_per_person, return_max)
subset_val = np.random.choice(val_paths, size=math.ceil(len(val_paths) * ratios[1]), replace=False)
subset_te = np.random.choice(test_paths, size=math.ceil(len(test_paths) * ratios[2]), replace=False)
print("Original size of the training: {} size of the subset: {}".format(len(train_paths), len(subset_tr)))
print("Original size of the validation: {} size of the subset: {}".format(len(val_paths), len(subset_val)))
print("Original size of the testing: {} size of the subset: {}".format(len(test_paths), len(subset_te)))
return subset_tr, subset_val, subset_te
def get_torch_datagens(data_dir, txt_dir, feature_len=300, ratios=[1.0, 1.0, 1.0], vid_per_person=1, return_max=True):
"""
Returns datagens for torch
Params:
data_dir: Parent directory for the pickle files. It is assumed that each person has a separate folder in data_dir,
each video has a separate folder inside the person's folder, and the pickle files sit inside the video folders.
txt_dir: Path to the identification split text file (e.g. iden_split.txt).
feature_len: Number of feature frames per audio sample. Default is 300.
ratios: Fraction of each of the train, validation and test sets to keep. Can be used to work with a smaller dataset. Default is [1., 1., 1.] for [train, val, test]
vid_per_person: Number of videos per person to sample audio from.
return_max: Whether to select the videos containing the most audio samples.
Returns:
train, validation and test datagens
"""
tr_paths, val_paths, test_paths = get_voxceleb1_path(data_dir, txt_dir, ratios, vid_per_person, return_max)
tr_gen = DataTorchVoxCeleb(data_dir, tr_paths, feature_len)
val_gen = DataTorchVoxCeleb(data_dir, val_paths, feature_len)
test_gen = DataTorchVoxCeleb(data_dir, test_paths, feature_len)
return tr_gen, val_gen, test_gen
def get_keras_datagens(data_dir, txt_dir, batch_size, feature_len=300, ratios=[1.0, 1.0, 1.0], vid_per_person=1, return_max=True):
"""
Returns datagens for keras
Params:
data_dir: Parent directory for the pickle files. It is assumed that each person has a separate folder in data_dir,
each video has a separate folder inside the person's folder, and the pickle files sit inside the video folders.
txt_dir: Path to the identification split text file (e.g. iden_split.txt).
batch_size: Batch size to use.
feature_len: Number of feature frames per audio sample. Default is 300.
ratios: Fraction of each of the train, validation and test sets to keep. Can be used to work with a smaller dataset. Default is [1., 1., 1.] for [train, val, test]
vid_per_person: Number of videos per person to sample audio from.
return_max: Whether to select the videos containing the most audio samples.
Returns:
train, validation and test datagens
"""
tr_paths, val_paths, test_paths = get_voxceleb1_path(data_dir, txt_dir, ratios, vid_per_person, return_max)
tr_gen = DataKerasVoxCeleb(data_dir, tr_paths, feature_len, batch_size, True)
val_gen = DataKerasVoxCeleb(data_dir, val_paths, feature_len, batch_size, False)
te_gen = DataKerasVoxCeleb(data_dir, test_paths, feature_len, batch_size, False)
return tr_gen, val_gen, te_gen
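get_voxceleb1_path parses the VoxCeleb identification split file: each line is "<subset> <relative wav path>" with subset 1/2/3 for train/validation/test, and the .wav suffix is rewritten to .pkl. An illustrative check of that parsing (the id and file names below are hypothetical):

line = "1 id10001/Y8hIVOBuels/00001.wav"
subset, path = line.strip().split(' ')
print(subset, path.replace('.wav', '.pkl'))  # 1 id10001/Y8hIVOBuels/00001.pkl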

View File

@@ -0,0 +1,49 @@
import os
txt_dir = '/media/data/bbekci/voxceleb/iden_split.txt'
tr_idens = {}
val_idens = {}
te_idens = {}
with open(txt_dir, 'r') as identxt:
lines = identxt.readlines()
train_paths = []
test_paths = []
val_paths = []
for line in lines:
subset, path = line.strip().split(' ')
if subset == '1':
train_paths.append(path)
elif subset == '2':
val_paths.append(path)
elif subset == '3':
test_paths.append(path)
print(test_paths[:20])
for p in train_paths:
iden, vid, aud = p.split('/')
if iden not in tr_idens:
tr_idens[iden] = []
if vid not in tr_idens[iden]:
tr_idens[iden].append(vid)
for p in test_paths:
iden, vid, aud = p.split('/')
if iden not in te_idens:
te_idens[iden] = []
if vid not in te_idens[iden]:
te_idens[iden].append(vid)
for p in val_paths:
iden, vid, aud = p.split('/')
if iden not in val_idens:
val_idens[iden] = []
if vid not in val_idens[iden]:
val_idens[iden].append(vid)
for k in te_idens:
print("ID: ", k , " aud: ", len(te_idens[k]))

View File

@@ -0,0 +1,68 @@
#!/usr/bin/env python
# coding: utf-8
from tensorflow.keras.utils import Sequence, to_categorical
from load_vctk import get_model_data
import math
import numpy as np
import os
data_main_dir = os.path.join('..', 'datasets', 'vctk', 'wav48_silence_trimmed')
class VCTKDatagen(Sequence):
def __init__(self, audio_paths, labels, batch_size, num_class, audio_load_func, shuffle=False):
self.aud_paths = audio_paths
self.labels = labels
self.b_size = batch_size
self.num_class = num_class
self.audio_load_func = audio_load_func
self.shuffle = shuffle
def __len__(self):
return math.ceil( len( self.aud_paths) / self.b_size )
def __getitem__(self, idx):
# Get portion of data for batch
batch_paths = self.aud_paths[idx*self.b_size:(idx+1)*self.b_size]
batch_labels = self.labels[idx*self.b_size:(idx+1)*self.b_size]
model_in = np.array([self.audio_load_func(ap) for ap in batch_paths])
model_out = to_categorical(batch_labels, num_classes=self.num_class)
return np.expand_dims(model_in, axis=-1), model_out
def on_epoch_end(self):
if self.shuffle:
idx = np.arange(len(self.aud_paths))
np.random.shuffle(idx)
self.aud_paths = np.array(self.aud_paths)[idx].tolist()
self.labels = np.array(self.labels)[idx].tolist()
def get_datagen(sample_per_person, batch_size, audio_load_func, split=[0.1, 0.1], shuffle=True, mics=[1, 2], include_person=None):
"""
Get datagens for vctk dataset.
Params:
sample_per_person: Number of samples to select for each person.
batch_size: Batch size of the model
audio_load_func: Function that loads an audio file and returns the model input features
split: Ratios for the test and validation sets. Default values are 0.1 for test and 0.1 for validation.
shuffle: Whether to shuffle the paths and labels before returning them. If you pass this as false, consecutive audio files
will be obtained from the same person.
mics: Mic number(s) of the selected audio samples. Can be one of [1], [2] or [1, 2]. If both mics are included,
the same utterance may be returned twice, once per mic.
include_person: Persons to include in the data. Default is None, in which case audio is taken from all persons.
Returns:
Datagens for train, validation and test sets
"""
[tr_aud, tr_label], [val_aud, val_label], [te_aud, te_label] = get_model_data(data_main_dir , sample_per_person, split, shuffle, mics, include_person=include_person)
# exclude the non-speaker entries in the directory listing (e.g. log.txt) from the speaker count
n_person = len(os.listdir(data_main_dir)) - 1
if include_person:
n_person = len(include_person)
tr_gen = VCTKDatagen(tr_aud, tr_label, batch_size, n_person, audio_load_func, shuffle)
val_gen = VCTKDatagen(val_aud, val_label, batch_size, n_person, audio_load_func, shuffle)
te_gen = VCTKDatagen(te_aud, te_label, batch_size, n_person, audio_load_func, shuffle)
return tr_gen, val_gen, te_gen
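get_datagen delegates feature extraction to audio_load_func, and the models in this commit expect a (300, 40) log-mel style input. A sketch of one possible loader; the librosa parameters below are assumptions for illustration, not the project's actual front-end:

import numpy as np
import librosa

def load_logmel(audio_path, n_frames=300, n_mels=40):
    y, sr = librosa.load(audio_path, sr=16000)              # load and resample
    mel = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=400, hop_length=160, n_mels=n_mels)
    logmel = librosa.power_to_db(mel).T                     # (frames, n_mels)
    if logmel.shape[0] < n_frames:                          # pad short clips, crop long ones
        logmel = np.pad(logmel, ((0, n_frames - logmel.shape[0]), (0, 0)))
    return logmel[:n_frames]

# tr_gen, val_gen, te_gen = get_datagen(sample_per_person=100, batch_size=32, audio_load_func=load_logmel)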

67
dataloaders/datautil.py Normal file
View File

@@ -0,0 +1,67 @@
import os
import numpy as np
from sklearn.model_selection import train_test_split
def get_pkl_paths(data_dir, num_video_per_person, num_audio_per_video, split_by, split_size, txt_dirs=None):
"""
Returns pickle file paths from the given data directory.
Params:
data_dir: Parent directory for the pickle files. It is assumed that each person has a separate folder in data_dir,
each video has a separate folder inside the person's folder, and the pickle files sit inside the video folders.
num_video_per_person: How many videos will be selected from each person's folder
num_audio_per_video: How many audio files will be selected from each video folder
split_by: One of "video" or "audio". If "video", all audio files from a single video go into either the
train or the validation set. If "audio", all pickle files are split into train and validation individually.
txt_dirs: Paths to the train and validation text files. Pass as [train_file_path, validation_file_path]
Returns:
train and validation pickle paths
"""
tr_pkl_paths = []
val_pkl_paths = []
if txt_dirs is not None:
with open(txt_dirs[0], 'r') as path_file:
tr_pkl_paths = path_file.readlines()
tr_pkl_paths = [pr.strip() for pr in tr_pkl_paths]
with open(txt_dirs[1], 'r') as path_file:
val_pkl_paths = path_file.readlines()
val_pkl_paths = [pr.strip() for pr in val_pkl_paths]
return tr_pkl_paths, val_pkl_paths
ids = [fname for fname in os.listdir(data_dir) if 'id' in fname]
for i, nid in enumerate(ids):
print("Progress: {} / {}".format(i, len(ids)), end='\r')
idpath = os.path.join(data_dir, nid)
videos_names = os.listdir(idpath)
tr_video_names = np.random.choice(videos_names, size=min(num_video_per_person, len(videos_names)), replace=False)
val_video_names = []
if split_by == 'video':
tr_video_names, val_video_names = train_test_split(tr_video_names, random_state=42, test_size=split_size)
for vname in tr_video_names:
val_audio_names = []
vidpath = os.path.join(idpath, vname)
audio_names = os.listdir(vidpath)
tr_audio_names = np.random.choice(audio_names, size=min(num_audio_per_video, len(audio_names)), replace=False)
# There must be at least 1 audio file for validation
if split_by == 'audio' and (len(tr_audio_names) * split_size) > 0.99:
tr_audio_names, val_audio_names = train_test_split(tr_audio_names, random_state=42, test_size=split_size)
tr_pkl_paths = tr_pkl_paths + [os.path.join(vidpath, aud_name) for aud_name in tr_audio_names if '.pkl' in aud_name]
val_pkl_paths = val_pkl_paths + [os.path.join(vidpath, aud_name) for aud_name in val_audio_names if '.pkl' in aud_name]
for vname in val_video_names:
vidpath = os.path.join(idpath, vname)
audio_names = os.listdir(vidpath)
selected_audio_names = np.random.choice(audio_names, size=min(num_audio_per_video, len(audio_names)), replace=False)
val_pkl_paths = val_pkl_paths + [os.path.join(vidpath, aud_name) for aud_name in selected_audio_names if '.pkl' in aud_name]
return tr_pkl_paths, val_pkl_paths
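A hypothetical call of the helper above without txt files, sampling a few videos and audios per person and holding out 20% of each person's videos for validation (the data directory is the one used in Train_VoxCeleb_Class.py):

from dataloaders.datautil import get_pkl_paths

tr_paths, val_paths = get_pkl_paths('/media/data/bbekci/voxceleb2/data/dev/pkls/',
                                    num_video_per_person=5, num_audio_per_video=10,
                                    split_by='video', split_size=0.2)
print(len(tr_paths), len(val_paths))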

63
extract_embeds.py Normal file
View File

@@ -0,0 +1,63 @@
import os
os.environ['CUDA_VISIBLE_DEVICES'] = "1"
import pickle
import numpy as np
from tensorflow.keras.models import load_model, Model
import tensorflow as tf
from models.resnet18_keras import SelfAttention
model_dir = os.path.join('saved-models', 'voxceleb1_attention_vgg_dropout_keras_fullset.h5')
data_dir = '/media/data/bbekci/voxceleb/pkls_colwise_normed/'
embed_main_dir = '/media/data/bbekci/voxceleb_id_embeds_vgg/'
def load_audio_pickle(ppath):
with open(ppath, 'rb') as pickle_load:
loaded_sample = pickle.load(pickle_load)
idname, videoname, features = loaded_sample
feature_len = features.shape[0]
iter_count = feature_len // 300
sample_features = np.zeros((iter_count, 300, 40, 1))
for i in range(iter_count):
feature_start = i * 300
feature_end = (i+1) * 300
sample_features[i] = np.expand_dims(features[feature_start:feature_end], axis=-1)
return sample_features
model = tf.keras.models.load_model(model_dir, custom_objects={'SelfAttention': SelfAttention, 'GlorotUniform': tf.keras.initializers.GlorotUniform()})
model.summary()
saved_model = Model(model.input, model.get_layer('dense').output)
pids = os.listdir(data_dir)
for pid in pids:
pid_path = os.path.join(data_dir, pid)
p_embed = np.zeros((1, 256))
total_audios = 0
video_names = os.listdir(pid_path)
for video_name in video_names:
video_path = os.path.join(pid_path, video_name)
audio_names = os.listdir(video_path)
for audio_name in audio_names:
total_audios += 1
audio_path = os.path.join(video_path, audio_name)
# load the pickled feature matrix for this audio file
loaded_wav = load_audio_pickle(audio_path)
preds = saved_model.predict(loaded_wav)
#mean_embeds = np.mean(preds, axis=0)
p_embed += np.sum(preds, axis=0)
dest_path = os.path.join(embed_main_dir, pid)
os.makedirs(dest_path, exist_ok=True)
dest_pickle_path = os.path.join(dest_path, audio_name)
mean_embeds = p_embed / total_audios
with open(dest_pickle_path, 'wb') as pickle_file:
pickle.dump([mean_embeds, pid], pickle_file, protocol=pickle.HIGHEST_PROTOCOL)
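The same truncated model can embed a single utterance; a sketch reusing load_audio_pickle and saved_model from the script above (the pickle path below is hypothetical):

feats = load_audio_pickle('/media/data/bbekci/voxceleb/pkls_colwise_normed/id10001/clip/00001.pkl')
utt_embed = saved_model.predict(feats).mean(axis=0)  # average the per-window embeddings -> (256,)
print(utt_embed.shape)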

112
load_vctk.py Normal file
View File

@@ -0,0 +1,112 @@
#!/usr/bin/env python
# coding: utf-8
import os
import numpy as np
from sklearn.model_selection import train_test_split
def get_person_label(pname):
return int(pname.replace('p', ''))
def get_samples_from_person(person_path, sample_count, mics):
"""
Return path of audio samples selected from a person folder.
Params:
person_path: Path for the person
sample_count: Number of samples to select
mics: Mic number(s) of the selected audio samples. Can be one of [1], [2] or [1, 2]. If both mics are included,
the same utterance may be returned twice, once per mic.
Returns:
audio_paths: Relative path of the audio samples
"""
audio_files = os.listdir(person_path)
mic_string = ['mic'+ str(n) for n in mics ]
audio_files = [af for af in audio_files if af.split('.')[0].split('_')[-1] in mic_string]
sample_count = min(len(audio_files), sample_count)
audio_paths = [os.path.join(person_path, af) for af in audio_files]
return np.random.choice(audio_paths, sample_count, replace=False).tolist()
def get_model_data(data_main_dir, sample_per_person, split=[0.1, 0.1], shuffle=True, mics=[1,2], include_person=None):
"""
Return audio file paths and corresponding labels.
Params:
data_main_dir: Parent directory for the dataset
sample_per_person: Number of samples to select
split: Ratios for the test and validation sets. Default values are 0.1 for test and 0.1 for validation.
shuffle: Whether to shuffle the paths and labels before returning them. If you pass this as false, consecutive audio files
will be obtained from the same person.
mics: Mic number(s) of the selected audio samples. Can be one of [1], [2] or [1, 2]. If both mics are included,
the same utterance may be returned twice, once per mic.
include_person: Persons to include in the data. Default is None, in which case audio is taken from all persons.
Returns:
audio paths and labels for each subset. Audio paths and labels are given as a single list for each subset
"""
all_audio_paths = []
labels = []
person_names = [pname for pname in os.listdir(data_main_dir) if 'p' in pname]
if include_person:
person_names = [pname for pname in person_names if pname in include_person]
person_paths = [os.path.join(data_main_dir, p) for p in person_names]
for i, ppath in enumerate(person_paths):
audio_paths = get_samples_from_person(ppath, sample_per_person, mics)
labels = labels + len(audio_paths) * [i]
all_audio_paths = all_audio_paths + audio_paths
if shuffle:
idx = np.arange(len(labels))
np.random.shuffle(idx)
labels = np.array(labels)[idx].tolist()
all_audio_paths = np.array(all_audio_paths)[idx].tolist()
tr_val_audio, test_audio, tr_val_labels, te_labels = train_test_split(all_audio_paths, labels, test_size=split[0], random_state=42)
tr_audio, val_audio, tr_labels, val_labels = train_test_split(tr_val_audio, tr_val_labels, test_size=split[1], random_state=42)
return [tr_audio, tr_labels], [val_audio, val_labels], [test_audio, te_labels]
def get_model_data_for_batch(data_main_dir, sample_per_person, person_count_per_batch , shuffle=True, mics=[1,2]):
"""
Return audio file paths and corresponding labels for a batch.
Params:
data_main_dir: Parent directory for the dataset
sample_per_person: Number of samples to select
person_count_per_batch: Number of persons included in each batch. Note that the batch size will be equal to
sample_per_person * person_count_per_batch
shuffle: Whether to shuffle the paths and labels before returning them. If you pass this as false, consecutive audio files
will be obtained from the same person.
mics: Mic number(s) of the selected audio samples. Can be one of [1], [2] or [1, 2]. If both mics are included,
the same utterance may be returned twice, once per mic.
Returns:
audio paths and the corresponding labels for the sampled batch
"""
all_audio_paths = []
labels = []
person_names = [pname for pname in os.listdir(data_main_dir) if 'p' in pname]
person_paths = [os.path.join(data_main_dir, p) for p in person_names]
person_labels = [get_person_label(pname) for pname in person_names]
# Sample persons
idx = np.arange(len(person_paths))
selected_idx = np.random.choice(idx, person_count_per_batch, replace=False)
# Select person names, paths and corresponding labels
person_names = np.array(person_names)[selected_idx].tolist()
person_paths = np.array(person_paths)[selected_idx].tolist()
for i, ppath in enumerate(person_paths):
audio_paths = get_samples_from_person(ppath, sample_per_person, mics)
labels = labels + len(audio_paths) * [i]
all_audio_paths = all_audio_paths + audio_paths
if shuffle:
idx = np.arange(len(labels))
np.random.shuffle(idx)
labels = np.array(labels)[idx].tolist()
all_audio_paths = np.array(all_audio_paths)[idx].tolist()
return all_audio_paths, labels
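A minimal usage sketch of get_model_data above, using the project's default VCTK directory and an arbitrary sample count:

import os
from load_vctk import get_model_data

data_main_dir = os.path.join('..', 'datasets', 'vctk', 'wav48_silence_trimmed')
[tr_aud, tr_lab], [val_aud, val_lab], [te_aud, te_lab] = get_model_data(data_main_dir, sample_per_person=50)
print(len(tr_aud), len(val_aud), len(te_aud))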

File diff suppressed because one or more lines are too long

150
models/model_keras.py Normal file
View File

@@ -0,0 +1,150 @@
import keras
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Flatten, Conv2D, MaxPooling2D, ZeroPadding2D, AveragePooling1D, BatchNormalization ,Reshape
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.layers import Activation, Layer
import tensorflow.keras.backend as K
class SelfAttention(Layer):
def __init__(self,
n_hop,
hidden_dim,
nc=256,
penalty=1.0,
return_attention=False,
kernel_initializer='glorot_uniform',
kernel_regularizer=None,
kernel_constraint=None,
**kwargs):
self.n_hop = n_hop
self.hidden_dim = hidden_dim
self.nc=nc
self.penalty = penalty
self.kernel_initializer = keras.initializers.get(kernel_initializer)
self.kernel_regularizer = keras.regularizers.get(kernel_regularizer)
self.kernel_constraint = keras.constraints.get(kernel_constraint)
self.return_attention = return_attention
super(SelfAttention, self).__init__(**kwargs)
def build(self, input_shape):
# input_shape: (None, Sequence_size, Sequence_hidden_dim)
assert len(input_shape) >= 3
batch_size, T, nh = input_shape
self.Ws1 = self.add_weight(shape=(self.hidden_dim, self.nc),
initializer=self.kernel_initializer,
name='SelfAttention-Ws1',
regularizer=self.kernel_regularizer,
constraint=self.kernel_constraint)
self.Ws2 = self.add_weight(shape=(self.nc, self.n_hop),
initializer=self.kernel_initializer,
name='SelfAttention-Ws2',
regularizer=self.kernel_regularizer,
constraint=self.kernel_constraint)
super(SelfAttention, self).build(input_shape)
def compute_output_shape(self, input_shape):
assert input_shape and len(input_shape) >= 3
assert input_shape[-1]
batch_size, sequence_size, sequence_hidden_dim = input_shape
output_shape = tuple([batch_size, self.n_hop, sequence_hidden_dim])
if self.return_attention:
attention_shape = tuple([batch_size, self.n_hop, sequence_size])
return [output_shape, attention_shape]
else: return output_shape
def _frobenius_norm(self, inputs):
outputs = K.sqrt(K.sum(K.square(inputs)))
return outputs
def call(self, inputs):
shape=inputs.shape
H=inputs
x = K.tanh(tf.matmul(H,self.Ws1))
x = tf.matmul(x,self.Ws2)
A = K.softmax(x, axis=1) # softmax over the time axis, so each hop's weights sum to 1 within a sample
At=K.permute_dimensions(A,(0,2,1))
E = tf.matmul(At,H)
return E
def get_config(self):
config = super().get_config().copy()
config.update({
'n_hop': self.n_hop,
'hidden_dim': self.hidden_dim,
'nc': self.nc,
'penalty': self.penalty,
'kernel_initializer': self.kernel_initializer,
'kernel_regularizer': self.kernel_regularizer,
'kernel_constraint': self.kernel_constraint,
'return_attention': self.return_attention,
})
return config
def vgg_att(n_class):
inputs = keras.Input(shape=(300,40,1))
x=Conv2D(64, (3, 3), padding='same', name='block1_conv1',activation='relu')(inputs)
x=Conv2D(64, (3, 3), padding='same', name='block1_conv2',activation='relu')(x)
x=BatchNormalization()(x)
x=MaxPooling2D(pool_size = (2, 2), strides = (2, 2))(x)
print(x.shape)
x=Conv2D(128, (3, 3), padding='same', name='block2_conv1',activation='relu')(x)
x=Conv2D(128, (3, 3), padding='same', name='block2_conv2',activation='relu')(x)
x=BatchNormalization()(x)
x=MaxPooling2D(pool_size = (2, 2), strides = (2, 2))(x)
print(x.shape)
x=Conv2D(256, (3, 3), padding='same', name='block3_conv1',activation='relu')(x)
x=Conv2D(256, (3, 3), padding='same', name='block3_conv2',activation='relu')(x)
x=BatchNormalization()(x)
x=MaxPooling2D(pool_size = (2, 2), strides = (2, 2),padding="same")(x)
print(x.shape)
x=Conv2D(512, (3, 3), padding='same', name='block4_conv1',activation='relu')(x)
x=Conv2D(512, (3, 3), padding='same', name='block4_conv2',activation='relu')(x)
x=BatchNormalization()(x)
x=MaxPooling2D(pool_size = (2, 2), strides = (2, 2),padding="same")(x)
print(x.shape)
att=SelfAttention(n_hop=4,hidden_dim=1536)
x=Reshape((x.shape[1], x.shape[2]*x.shape[3]))(x)
print("after reshape")
print(x.shape)
x=att(x)
print("after attention")
print(x.shape)
x=AveragePooling1D(pool_size=4,data_format="channels_last")(x)
print("after avgpool")
print(x.shape)
x = Flatten()(x)
x = Dense(256, activation = 'relu')(x)
output = Dense(n_class,activation = 'softmax')(x)
model = keras.Model(inputs=inputs, outputs=output)
model.compile(loss='categorical_crossentropy',optimizer ='adam')#need hyperparam-tuning
model.summary()
return model
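The SelfAttention layer above is what turns the (None, 19, 1536) reshaped feature map into the (None, 4, 1536) tensor seen in the notebook summary. A standalone shape check, assuming the repository's dependencies are installed:

import tensorflow as tf
from models.model_keras import SelfAttention

x = tf.random.normal((2, 19, 1536))           # (batch, time, channels) after the Reshape layer
att = SelfAttention(n_hop=4, hidden_dim=1536)
print(att(x).shape)                           # (2, 4, 1536): one 1536-d summary per attention hop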

View File

@@ -0,0 +1,158 @@
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.layers import Dropout, GlobalMaxPooling2D
from tensorflow.keras.layers import Flatten, Conv2D, MaxPooling2D, ZeroPadding2D, AveragePooling1D, BatchNormalization ,Reshape
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.layers import Activation, Layer
from tensorflow.keras.initializers import GlorotUniform
import tensorflow.keras.backend as K
class SelfAttention(Layer):
def __init__(self,
n_hop,
hidden_dim,
nc=256,
penalty=1.0,
return_attention=False,
kernel_initializer=GlorotUniform(),
kernel_regularizer=None,
kernel_constraint=None,
**kwargs):
self.n_hop = n_hop
self.hidden_dim = hidden_dim
self.nc=nc
self.penalty = penalty
self.kernel_initializer = GlorotUniform() # tf.keras.initializers.get(kernel_initializer)
self.kernel_regularizer = None #tf.keras.regularizers.get(kernel_regularizer)
self.kernel_constraint = None #tf.keras.constraints.get(kernel_constraint)
self.return_attention = return_attention
super(SelfAttention, self).__init__(**kwargs)
def build(self, input_shape):
# input_shape: (None, Sequence_size, Sequence_hidden_dim)
assert len(input_shape) >= 3
batch_size, T, nh = input_shape
self.Ws1 = self.add_weight(shape=(self.hidden_dim, self.nc),
initializer=self.kernel_initializer,
name='SelfAttention-Ws1',
regularizer=self.kernel_regularizer,
constraint=self.kernel_constraint)
self.Ws2 = self.add_weight(shape=(self.nc, self.n_hop),
initializer=self.kernel_initializer,
name='SelfAttention-Ws2',
regularizer=self.kernel_regularizer,
constraint=self.kernel_constraint)
super(SelfAttention, self).build(input_shape)
def compute_output_shape(self, input_shape):
assert input_shape and len(input_shape) >= 3
assert input_shape[-1]
batch_size, sequence_size, sequence_hidden_dim = input_shape
output_shape = tuple([batch_size, self.n_hop, sequence_hidden_dim])
if self.return_attention:
attention_shape = tuple([batch_size, self.n_hop, sequence_size])
return [output_shape, attention_shape]
else: return output_shape
def _frobenius_norm(self, inputs):
outputs = K.sqrt(K.sum(K.square(inputs)))
return outputs
def call(self, inputs):
shape=inputs.shape
H=inputs
x = K.tanh(tf.matmul(H,self.Ws1))
x = tf.matmul(x,self.Ws2)
A = K.softmax(x, axis=1) # softmax over the time axis, so each hop's weights sum to 1 within a sample
At=K.permute_dimensions(A,(0,2,1))
E = tf.matmul(At,H)
return E
def get_config(self):
config = super().get_config().copy()
config.update({
'n_hop': self.n_hop,
'hidden_dim': self.hidden_dim,
'nc': self.nc,
'penalty': self.penalty,
'kernel_initializer': self.kernel_initializer,
'kernel_regularizer': self.kernel_regularizer,
'kernel_constraint': self.kernel_constraint,
'return_attention': self.return_attention,
})
return config
def vgg_att(n_class):
inputs = Input(shape=(300,40,1))
x=Conv2D(64, (3, 3), padding='same', name='block1_conv1',activation='relu')(inputs)
x=Conv2D(64, (3, 3), padding='same', name='block1_conv2',activation='relu')(x)
x=MaxPooling2D(pool_size = (2, 2), strides = (2, 2))(x)
x=BatchNormalization()(x)
x=Dropout(0.2)(x)
print(x.shape)
x=Conv2D(128, (3, 3), padding='same', name='block2_conv1',activation='relu')(x)
x=Conv2D(128, (3, 3), padding='same', name='block2_conv2',activation='relu')(x)
x=MaxPooling2D(pool_size = (2, 2), strides = (2, 2))(x)
x=BatchNormalization()(x)
x=Dropout(0.2)(x)
print(x.shape)
x=Conv2D(256, (3, 3), padding='same', name='block3_conv1',activation='relu')(x)
x=Conv2D(256, (3, 3), padding='same', name='block3_conv2',activation='relu')(x)
x=MaxPooling2D(pool_size = (2, 2), strides = (2, 2),padding="same")(x)
x=BatchNormalization()(x)
x=Dropout(0.2)(x)
print(x.shape)
x=Conv2D(512, (3, 3), padding='same', name='block4_conv1',activation='relu')(x)
x=Conv2D(512, (3, 3), padding='same', name='block4_conv2',activation='relu')(x)
x=MaxPooling2D(pool_size = (2, 2), strides = (2, 2),padding="same")(x)
x=BatchNormalization()(x)
x=Dropout(0.2)(x)
print(x.shape)
att=SelfAttention(n_hop=4,hidden_dim=1536)
x=Reshape((x.shape[1], x.shape[2]*x.shape[3]))(x)
print("after reshape")
print(x.shape)
x=att(x)
print("after attention")
print(x.shape)
x=AveragePooling1D(pool_size=4,data_format="channels_last")(x)
#x = GlobalMaxPooling2D()(x)
print("after avgpool")
print(x.shape)
x = Flatten()(x)
x = Dense(256, activation = 'relu')(x)
x=Dropout(0.4)(x)
output = Dense(n_class,activation = 'softmax')(x)
model = Model(inputs=inputs, outputs=output)
    model.compile(loss='categorical_crossentropy', optimizer='adam')  # optimizer and learning rate still need hyperparameter tuning
model.summary()
return model
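# --- Illustrative usage (not part of the original file) ---
# A minimal sketch of building and training the attention-augmented VGG above with Keras;
# the generators `tr_gen`/`val_gen` and the 109-speaker (VCTK) class count are assumptions
# for illustration only.
if __name__ == '__main__':
    model = vgg_att(n_class=109)
    # model.fit(tr_gen, validation_data=val_gen, epochs=15)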

View File

@@ -0,0 +1,158 @@
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.layers import Dropout, GlobalMaxPooling2D
from tensorflow.keras.layers import Flatten, Conv2D, MaxPooling2D, ZeroPadding2D, AveragePooling1D, BatchNormalization ,Reshape
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.layers import Activation, Layer
from tensorflow.keras.initializers import GlorotUniform
import tensorflow.keras.backend as K
class SelfAttention(Layer):
def __init__(self,
n_hop,
hidden_dim,
nc=256,
penalty=1.0,
return_attention=False,
kernel_initializer=GlorotUniform(),
kernel_regularizer=None,
kernel_constraint=None,
**kwargs):
self.n_hop = n_hop
self.hidden_dim = hidden_dim
self.nc=nc
self.penalty = penalty
        self.kernel_initializer = tf.keras.initializers.get(kernel_initializer)
        self.kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer)
        self.kernel_constraint = tf.keras.constraints.get(kernel_constraint)
self.return_attention = return_attention
super(SelfAttention, self).__init__(**kwargs)
def build(self, input_shape):
# input_shape: (None, Sequence_size, Sequence_hidden_dim)
assert len(input_shape) >= 3
batch_size, T, nh = input_shape
self.Ws1 = self.add_weight(shape=(self.hidden_dim, self.nc),
initializer=self.kernel_initializer,
name='SelfAttention-Ws1',
regularizer=self.kernel_regularizer,
constraint=self.kernel_constraint)
self.Ws2 = self.add_weight(shape=(self.nc, self.n_hop),
initializer=self.kernel_initializer,
name='SelfAttention-Ws2',
regularizer=self.kernel_regularizer,
constraint=self.kernel_constraint)
super(SelfAttention, self).build(input_shape)
def compute_output_shape(self, input_shape):
assert input_shape and len(input_shape) >= 3
assert input_shape[-1]
batch_size, sequence_size, sequence_hidden_dim = input_shape
output_shape = tuple([batch_size, self.n_hop, sequence_hidden_dim])
if self.return_attention:
attention_shape = tuple([batch_size, self.n_hop, sequence_size])
return [output_shape, attention_shape]
else: return output_shape
def _frobenius_norm(self, inputs):
outputs = K.sqrt(K.sum(K.square(inputs)))
return outputs
    def call(self, inputs):
        # inputs H: (batch, T, hidden_dim)
        H = inputs
        x = K.tanh(tf.matmul(H, self.Ws1))        # (batch, T, nc)
        x = tf.matmul(x, self.Ws2)                # (batch, T, n_hop)
        A = K.softmax(x, axis=1)                  # attention weights over the time axis, not the batch axis
        At = K.permute_dimensions(A, (0, 2, 1))   # (batch, n_hop, T)
        E = tf.matmul(At, H)                      # (batch, n_hop, hidden_dim)
        return E
def get_config(self):
config = super().get_config().copy()
config.update({
'n_hop': self.n_hop,
'hidden_dim': self.hidden_dim,
'nc': self.nc,
'penalty': self.penalty,
'kernel_initializer': self.kernel_initializer,
'kernel_regularizer': self.kernel_regularizer,
'kernel_constraint': self.kernel_constraint,
'return_attention': self.return_attention,
})
return config
def vgg_att(n_class):
inputs = Input(shape=(300,40,1))
x=Conv2D(64, (3, 3), padding='same', name='block1_conv1',activation='relu')(inputs)
x=Conv2D(64, (3, 3), padding='same', name='block1_conv2',activation='relu')(x)
x=MaxPooling2D(pool_size = (2, 2), strides = (2, 2))(x)
x=BatchNormalization()(x)
x=Dropout(0.2)(x)
print(x.shape)
x=Conv2D(128, (3, 3), padding='same', name='block2_conv1',activation='relu')(x)
x=Conv2D(128, (3, 3), padding='same', name='block2_conv2',activation='relu')(x)
x=MaxPooling2D(pool_size = (2, 2), strides = (2, 2))(x)
x=BatchNormalization()(x)
x=Dropout(0.2)(x)
print(x.shape)
x=Conv2D(256, (3, 3), padding='same', name='block3_conv1',activation='relu')(x)
x=Conv2D(256, (3, 3), padding='same', name='block3_conv2',activation='relu')(x)
x=MaxPooling2D(pool_size = (2, 2), strides = (2, 2),padding="same")(x)
x=BatchNormalization()(x)
x=Dropout(0.2)(x)
print(x.shape)
x=Conv2D(512, (3, 3), padding='same', name='block4_conv1',activation='relu')(x)
x=Conv2D(512, (3, 3), padding='same', name='block4_conv2',activation='relu')(x)
x=MaxPooling2D(pool_size = (2, 2), strides = (2, 2),padding="same")(x)
x=BatchNormalization()(x)
x=Dropout(0.2)(x)
print(x.shape)
att=SelfAttention(n_hop=4,hidden_dim=1536)
x=Reshape((x.shape[1], x.shape[2]*x.shape[3]))(x)
print("after reshape")
print(x.shape)
x=att(x)
print("after attention")
print(x.shape)
x=AveragePooling1D(pool_size=4,data_format="channels_last")(x)
#x = GlobalMaxPooling2D()(x)
print("after avgpool")
print(x.shape)
x = Flatten()(x)
x = Dense(256, activation = 'relu')(x)
x=Dropout(0.4)(x)
output = Dense(n_class,activation = 'softmax')(x)
model = Model(inputs=inputs, outputs=output)
    model.compile(loss='categorical_crossentropy', optimizer='adam')  # optimizer and learning rate still need hyperparameter tuning
model.summary()
return model

162
models/resnet18_keras.py Normal file
View File

@@ -0,0 +1,162 @@
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Flatten, Conv2D, MaxPooling2D, ZeroPadding2D, AveragePooling1D, BatchNormalization ,Reshape, Dropout
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.layers import Activation, Layer, Add, Input, GlobalAveragePooling2D
import tensorflow.keras.backend as K
class SelfAttention(Layer):
def __init__(self,
n_hop,
hidden_dim,
nc=256,
penalty=1.0,
return_attention=False,
kernel_initializer='glorot_uniform',
kernel_regularizer=None,
kernel_constraint=None,
**kwargs):
self.n_hop = n_hop
self.hidden_dim = hidden_dim
self.nc=nc
self.penalty = penalty
self.kernel_initializer = tf.keras.initializers.get(kernel_initializer)
self.kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer)
self.kernel_constraint = tf.keras.constraints.get(kernel_constraint)
self.return_attention = return_attention
super(SelfAttention, self).__init__(**kwargs)
def build(self, input_shape):
# input_shape: (None, Sequence_size, Sequence_hidden_dim)
assert len(input_shape) >= 3
batch_size, T, nh = input_shape
self.Ws1 = self.add_weight(shape=(self.hidden_dim, self.nc),
initializer=self.kernel_initializer,
name='SelfAttention-Ws1',
regularizer=self.kernel_regularizer,
constraint=self.kernel_constraint)
self.Ws2 = self.add_weight(shape=(self.nc, self.n_hop),
initializer=self.kernel_initializer,
name='SelfAttention-Ws2',
regularizer=self.kernel_regularizer,
constraint=self.kernel_constraint)
super(SelfAttention, self).build(input_shape)
def compute_output_shape(self, input_shape):
assert input_shape and len(input_shape) >= 3
assert input_shape[-1]
batch_size, sequence_size, sequence_hidden_dim = input_shape
output_shape = tuple([batch_size, self.n_hop, sequence_hidden_dim])
if self.return_attention:
attention_shape = tuple([batch_size, self.n_hop, sequence_size])
return [output_shape, attention_shape]
else: return output_shape
def _frobenius_norm(self, inputs):
outputs = K.sqrt(K.sum(K.square(inputs)))
return outputs
    def call(self, inputs):
        # inputs H: (batch, T, hidden_dim)
        H = inputs
        x = K.tanh(tf.matmul(H, self.Ws1))        # (batch, T, nc)
        x = tf.matmul(x, self.Ws2)                # (batch, T, n_hop)
        A = K.softmax(x, axis=1)                  # attention weights over the time axis, not the batch axis
        At = K.permute_dimensions(A, (0, 2, 1))   # (batch, n_hop, T)
        E = tf.matmul(At, H)                      # (batch, n_hop, hidden_dim)
        return E
def get_config(self):
config = super().get_config().copy()
config.update({
'n_hop': self.n_hop,
'hidden_dim': self.hidden_dim,
'nc': self.nc,
'penalty': self.penalty,
'kernel_initializer': self.kernel_initializer,
'kernel_regularizer': self.kernel_regularizer,
'kernel_constraint': self.kernel_constraint,
'return_attention': self.return_attention,
})
return config
def resnet_block(input_tensor, kernel_size, filters, downsample):
first_stride = 1
if downsample:
first_stride = 2
# First Block
x = Conv2D(kernel_size=kernel_size, filters=filters, padding="same", strides=first_stride)(input_tensor)
x = BatchNormalization()(x)
x = Activation("relu")(x)
# Second Block
x = Conv2D(kernel_size=kernel_size, filters=filters, padding="same", strides=1)(x)
x = BatchNormalization()(x)
input_tensor = Conv2D(kernel_size=(1,1), filters=filters, padding='same', strides=first_stride)(input_tensor)
# Final Add Layer
x = Add()([input_tensor, x])
x = Activation("relu")(x)
return x
def resnet18(n_class, add_attention):
input_layer = Input(shape=(300,40,1))
x=Conv2D(32, (7, 7), padding='same' ,strides=1)(input_layer)
x = BatchNormalization()(x)
x = Activation("relu")(x)
x = MaxPooling2D(pool_size=(3,3), strides=2, padding='same')(x)
x = Dropout(0.15)(x)
x = resnet_block(x, 3, 32, True)
x = resnet_block(x, 3, 32, False)
x = Dropout(0.15)(x)
x = resnet_block(x, 3, 64, True)
x = resnet_block(x, 3, 64, False)
x = Dropout(0.15)(x)
x = resnet_block(x, 3, 128, True)
x = resnet_block(x, 3, 128, False)
x = Dropout(0.15)(x)
x = resnet_block(x, 3, 256, True)
x = resnet_block(x, 3, 256, False)
x = Dropout(0.15)(x)
# Attention here
if add_attention:
att=SelfAttention(n_hop=4,hidden_dim=512)
x=Reshape((x.shape[1], x.shape[2]*x.shape[3]))(x)
x=att(x)
x=AveragePooling1D(pool_size=4,data_format="channels_last")(x)
x = Flatten()(x)
else:
x = GlobalAveragePooling2D()(x)
x = Dropout(0.15)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.3)(x)
    preds = Dense(n_class, activation='softmax')(x)  # use the n_class argument instead of the hard-coded 1251
model = Model(input_layer, preds)
model.summary()
return model
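# --- Illustrative usage (not part of the original file) ---
# A minimal sketch of building the attention variant of the ResNet above; the 1251-identity
# setting (VoxCeleb1) and the Adam optimizer choice are assumptions for illustration only.
if __name__ == '__main__':
    model = resnet18(n_class=1251, add_attention=True)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])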

58
models/vggish.py Normal file
View File

@@ -0,0 +1,58 @@
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Flatten, Dense, Input, Conv2D, MaxPooling2D, GlobalAveragePooling2D, GlobalMaxPooling2D, Activation, BatchNormalization
from tensorflow.keras import backend as K
model_path = '/home/bbekci/inzpeech/models/vggish_audioset_weights_without_fc2.h5'
def VGGish(input_shape, num_classes, add_output=True, load_weight=False):
aud_input = Input(shape=input_shape, name='input_1')
# Block 1
x = Conv2D(64, (3, 3), strides=(1, 1), activation=None, padding='same', name='conv1')(aud_input)
x = Activation('relu')(x)
x = MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='pool1')(x)
# Block 2
x = Conv2D(128, (3, 3), strides=(1, 1), activation=None, padding='same', name='conv2')(x)
x = Activation('relu')(x)
x = MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='pool2')(x)
# Block 3
x = Conv2D(256, (3, 3), strides=(1, 1), activation=None, padding='same', name='conv3/conv3_1')(x)
x = Activation('relu')(x)
x = Conv2D(256, (3, 3), strides=(1, 1), activation=None, padding='same', name='conv3/conv3_2')(x)
x = Activation('relu')(x)
x = MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='pool3')(x)
# Block 4
x = Conv2D(512, (3, 3), strides=(1, 1), activation=None, padding='same', name='conv4/conv4_1')(x)
x = Activation('relu')(x)
x = Conv2D(512, (3, 3), strides=(1, 1), activation=None, padding='same', name='conv4/conv4_2')(x)
x = Activation('relu')(x)
x = MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='pool4')(x)
    base_model = Model(aud_input, x)
    # load the pretrained VGGish weights only when explicitly requested
    if load_weight:
        base_model.load_weights(model_path)
x = GlobalMaxPooling2D()(base_model.output)
x = Dense(4096, activation=None, name='vggish_fc1/fc1_1')(x)
x = BatchNormalization()(x)
x = Activation('relu')(x)
x = Dense(4096, activation=None, name='vggish_fc1/fc1_2')(x)
x = BatchNormalization()(x)
preds = Dense(num_classes, activation='softmax', name='vggish_fc2')(x)
if add_output:
model = Model(aud_input, preds, name='VGGish')
else:
model = Model(aud_input, x, name='VGGish')
return model
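# --- Illustrative usage (not part of the original file) ---
# A minimal sketch of building the VGGish classifier for the (300, 40, 1) log-mel inputs
# used elsewhere in this repo; the 109-speaker (VCTK) class count is an assumption.
if __name__ == '__main__':
    model = VGGish((300, 40, 1), num_classes=109, add_output=True, load_weight=False)
    model.summary()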

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,163 @@
import os
import glob
import torch
import pickle
from os import listdir
from os.path import isfile, join
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
from scipy.fftpack import dct
from torch.utils.data import random_split, Dataset, DataLoader
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
def display_spectrogram(spectrogram):
librosa.display.specshow(spectrogram.transpose(), hop_length=220.5,y_axis='mel', fmax=8000, x_axis='s')
#getting 7 second in time axis, it should be 3, why???
plt.title('Mel Spectrogram')
plt.colorbar(format='%+2.0f dB')
plt.show()
def logmel_filterbanks(filename,pre_emphasis=0.97,frame_size = 0.025,frame_stride = 0.01,nfilt=40,normalize=True):
target_len = 66150
signal, sample_rate = librosa.load(filename,duration=3)
while(signal.shape[0] != target_len):
signal = np.append(signal, signal[:target_len - signal.shape[0]])
#Pre-Emphasis step
emphasized_signal = np.empty(shape=len(signal)+1)
emphasized_signal = np.append(signal[0], signal[1:] - pre_emphasis * signal[:-1])
#Framing
frame_length, frame_step = frame_size * sample_rate, frame_stride * sample_rate # Convert from seconds to samples
signal_length = len(emphasized_signal)
frame_length = int(round(frame_length))
frame_step = int(round(frame_step))
num_frames = int(np.ceil(float(np.abs(signal_length - frame_length)) / frame_step)) + 1 # Make sure that we have at least 1 frame
pad_signal_length = num_frames * frame_step + frame_length
z = np.zeros((pad_signal_length - signal_length))
pad_signal = np.append(emphasized_signal, z) # Pad Signal to make sure that all frames have equal number of samples without truncating any samples from the original signal
indices = np.tile(np.arange(0, frame_length), (num_frames, 1)) + np.tile(np.arange(0, num_frames * frame_step, frame_step), (frame_length, 1)).T
frames = pad_signal[indices.astype(np.int32, copy=False)]
#Hamming-Window
frames *= np.hamming(frame_length)
#FFT
NFFT = 512
mag_frames = np.absolute(np.fft.rfft(frames, NFFT)) # Magnitude of the FFT
pow_frames = ((1.0 / NFFT) * ((mag_frames) ** 2))
#Filter-Bank
low_freq_mel = 0
high_freq_mel = (2595 * np.log10(1 + (sample_rate / 2) / 700)) # Convert Hz to Mel
mel_points = np.linspace(low_freq_mel, high_freq_mel, nfilt + 2) # Equally spaced in Mel scale
hz_points = (700 * (10**(mel_points / 2595) - 1)) # Convert Mel to Hz
bin = np.floor((NFFT + 1) * hz_points / sample_rate)
fbank = np.zeros((nfilt, int(np.floor(NFFT / 2 + 1))))
for m in range(1, nfilt + 1):
f_m_minus = int(bin[m - 1]) # left
f_m = int(bin[m]) # center
f_m_plus = int(bin[m + 1]) # right
for k in range(f_m_minus, f_m):
fbank[m - 1, k] = (k - bin[m - 1]) / (bin[m] - bin[m - 1])
for k in range(f_m, f_m_plus):
fbank[m - 1, k] = (bin[m + 1] - k) / (bin[m + 1] - bin[m])
filter_banks = np.dot(pow_frames, fbank.T)
filter_banks = np.where(filter_banks == 0, np.finfo(float).eps, filter_banks) # Numerical Stability
filter_banks = 20 * np.log10(filter_banks) # dB
if normalize==True:
filter_banks = (filter_banks - filter_banks.mean()) / (filter_banks.max() - filter_banks.min())
return filter_banks
def mfcc(filter_banks,num_ceps=13):
return dct(filter_banks, type=2, axis=1, norm='ortho')[:, 1 : (num_ceps + 1)]
dataset_dir = '/home/bbekci/datasets/vctk/wav48_silence_trimmed'
data = []
c2i, i2c = {}, {}
for indx, cla in enumerate(os.listdir(dataset_dir)):
main_path = dataset_dir + '/' + cla + '/*.flac'
for file_path in glob.glob(main_path):
data.append((file_path, cla))
c2i[cla] = indx
i2c[indx] = cla
with open('preprocessed_vctk.pkl', 'wb') as pickle_file:
    result = []
    for i in range(0, len(data)):
        sound_path, class_name = data[i]
        sound_data = logmel_filterbanks(sound_path)
        label = c2i[class_name]
        result.append([label, sound_data])
    # dump once after all utterances are processed; the `with` block closes the file,
    # so the stray `f.close()` (an undefined name) from the original is dropped
    pickle.dump(result, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)
class PreprocessedDataset(Dataset):
def __init__(self, file_dir):
self.file_dir = file_dir
self.lst = 0
with open(file_dir, 'rb') as pickle_load:
self.lst = pickle.load(pickle_load)
def __len__(self):
return len(self.lst)
    def n_class(self):
        # labels are assigned 0..N-1 in insertion order, so the class count is the last label + 1
        return self.lst[-1][0] + 1
def __getitem__(self, idx):
if torch.is_tensor(idx):
idx = idx.tolist()
sound_data = self.lst[idx][1]
label = self.lst[idx][0]
sample = (sound_data, label)
return sample
dataset_dir = '/home/bbekci/inzpeech/preprocessed_vctk.pkl'
offset_dict = {}
max_epochs = 25
batch_size = 256
sound_data = PreprocessedDataset(file_dir=dataset_dir)
n_classes = sound_data.n_class()
train_data, test_data = random_split(sound_data,
[int(len(sound_data) * 0.8),
len(sound_data) - int(len(sound_data) * 0.8)]
)
train_dataset_loader = torch.utils.data.DataLoader(train_data,
batch_size=batch_size,
shuffle=True,
num_workers=4)
test_dataset_loader = torch.utils.data.DataLoader(test_data,
batch_size=batch_size,
shuffle=True,
num_workers=4)
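# --- Illustrative check (not part of the original script) ---
# Pull a single batch to confirm that the pickled log-mel features and labels load as
# expected; a sketch, assuming preprocessed_vctk.pkl has been written by the code above.
for sounds, labels in train_dataset_loader:
    print(sounds.shape, labels.shape)
    break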

View File

@@ -0,0 +1,131 @@
import os
import glob
import torch
import librosa
import pickle
import copy
import random
import numpy as np
import pandas as pd
import scipy.signal as signal
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from ResNet.model import Net_ResNet50
from torch.utils.data import random_split, Dataset, DataLoader
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from torchsummary import summary
from dataloaders.DatagenVoxCeleb1 import get_torch_datagens
# Parameters
max_epochs = 40
txt_dir = '/media/data/bbekci/voxceleb/iden_split.txt'
dataset_dir = '/media/data/bbekci/voxceleb/pkls/'
batch_size = 128
input_shape = (300, 40, 1)
# CUDA for PyTorch
use_cuda = torch.cuda.is_available()
device = torch.device('cuda:1' if use_cuda else 'cpu')
torch.backends.cudnn.benchmark = True
def test_val_calculations(data_set_loader, _n_classes, _net):
class_correct = [0] * _n_classes
class_total = [0] * _n_classes
with torch.no_grad():
for data in data_set_loader:
inputs = data[0].to(device, dtype=torch.float)
labels = data[1].to(device)
outputs = _net(inputs)
_, predicted = torch.max(outputs, 1)
c = (predicted == labels)
for i in range(len(labels)):
label = labels[i]
class_correct[label] += c[i].item()
class_total[label] += 1
mean_acc = 0
div_count = 0
for i in range(_n_classes):
if class_total[i] != 0:
mean_acc += (100 * class_correct[i] / class_total[i])
div_count += 1
return mean_acc / div_count
train_sound_data, val_sound_data, test_sound_data = get_torch_datagens( data_dir=dataset_dir, txt_dir=txt_dir)
len_train_sound_data = len(train_sound_data)
n_classes = train_sound_data.n_class()
train_data_count = int(len_train_sound_data * 0.8)
train_dataset_loader = torch.utils.data.DataLoader(train_sound_data,
batch_size=batch_size,
shuffle=True,
num_workers=16)
val_dataset_loader = torch.utils.data.DataLoader(val_sound_data,
batch_size=batch_size,
shuffle=True,
num_workers=16)
test_dataset_loader = torch.utils.data.DataLoader(test_sound_data,
batch_size=batch_size,
shuffle=True,
num_workers=16)
print('Test Data Size: %s' % len(test_dataset_loader.dataset))
print('Val Data Size: %s' % len(val_dataset_loader.dataset))
print('Train Data Size: %s' % len(train_dataset_loader.dataset))
net = Net_ResNet50(img_channel=1, num_classes=n_classes)
net.to(device)
# # net.load_state_dict(torch.load('/home/bbekci/inzpeech/ResNet/model/mode.pth'))
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=1e-4)
for epoch in range(max_epochs): # loop over the dataset multiple times
correct_pred = 0
for i, data in enumerate(train_dataset_loader):
# get the inputs; data is a list of [inputs, labels]
inputs = data[0].to(device, dtype=torch.float)
labels = data[1].to(device)
# zero the parameter gradients
optimizer.zero_grad()
# forward + backward + optimize
output = net(inputs)
loss = criterion(output, labels)
loss.backward()
optimizer.step()
_, predicted = torch.max(output.data, 1)
correct_pred += (predicted == labels).float().sum()
print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, loss))
# Validation
val_acc = test_val_calculations(val_dataset_loader, n_classes, net)
print('Val Acc: %.6f' % val_acc)
# Calculate Train Accuracy
train_acc = 100 * correct_pred / len(train_sound_data)
print('Train Acc: %.6f' % train_acc)
# # torch.save(best_net.state_dict(), '/home/bbekci/inzpeech/ResNet/model/model.pth')
test_acc = test_val_calculations(test_dataset_loader, n_classes, net)
print('Test Acc: %.6f' % test_acc)

138
run_model_voxceleb.py Normal file
View File

@@ -0,0 +1,138 @@
import os
import glob
import torch
import librosa
import pickle
import copy
import random
import numpy as np
import pandas as pd
import scipy.signal as signal
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from ResNet.model import Net_ResNet50
from torch.utils.data import random_split, Dataset, DataLoader
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from torchsummary import summary
from dataloaders.DatagenVoxCeleb import get_torch_datagens
# Parameters
dataset_dir = '/media/data/bbekci/voxceleb2/data/dev/pkls'
max_epochs = 10
batch_size = 256
# CUDA for PyTorch
use_cuda = torch.cuda.is_available()
device = torch.device('cuda:1' if use_cuda else 'cpu')
torch.backends.cudnn.benchmark = True
def test_val_calculations(data_set_loader, _n_classes, _net):
class_correct = [0] * _n_classes
class_total = [0] * _n_classes
with torch.no_grad():
for data in data_set_loader:
inputs = data[0].to(device, dtype=torch.float)
labels = data[1].to(device)
outputs = _net(inputs)
_, predicted = torch.max(outputs, 1)
c = (predicted == labels)
for i in range(len(labels)):
label = labels[i]
class_correct[label] += c[i].item()
class_total[label] += 1
mean_acc = 0
div_count = 0
for i in range(_n_classes):
if class_total[i] != 0:
mean_acc += (100 * class_correct[i] / class_total[i])
div_count += 1
return mean_acc / div_count
txt_dir_list = ('/home/bbekci/inzpeech/txts/tr_voxceleb_video_pkl_paths.txt',
'/home/bbekci/inzpeech/txts/val_voxceleb_video_pkl_paths.txt')
train_sound_data, val_sound_data = get_torch_datagens(
data_dir=dataset_dir, txt_dirs=txt_dir_list)
# print('1', len(train_sound_data))
# print('2', len(val_sound_data))
len_train_sound_data = len(train_sound_data)
n_classes = train_sound_data.n_class()
train_data_count = int(len_train_sound_data * 0.8)
test_data_count = len_train_sound_data - train_data_count
train_sound_data, test_sound_data = random_split(train_sound_data,
[train_data_count,
test_data_count]
)
train_dataset_loader = torch.utils.data.DataLoader(train_sound_data,
batch_size=batch_size,
shuffle=True,
num_workers=16)
val_dataset_loader = torch.utils.data.DataLoader(val_sound_data,
batch_size=batch_size,
shuffle=True,
num_workers=16)
test_dataset_loader = torch.utils.data.DataLoader(test_sound_data,
batch_size=batch_size,
shuffle=True,
num_workers=16)
print('Test Data Size: %s' % len(test_dataset_loader.dataset))
print('Val Data Size: %s' % len(val_dataset_loader.dataset))
print('Train Data Size: %s' % len(train_dataset_loader.dataset))
net = Net_ResNet50(img_channel=1, num_classes=n_classes)
net.to(device)
# # net.load_state_dict(torch.load('/home/bbekci/inzpeech/ResNet/model/mode.pth'))
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=1e-4)
for epoch in range(max_epochs): # loop over the dataset multiple times
correct_pred = 0
for i, data in enumerate(train_dataset_loader):
# get the inputs; data is a list of [inputs, labels]
inputs = data[0].to(device, dtype=torch.float)
labels = data[1].to(device)
# zero the parameter gradients
optimizer.zero_grad()
# forward + backward + optimize
output = net(inputs)
loss = criterion(output, labels)
loss.backward()
optimizer.step()
_, predicted = torch.max(output.data, 1)
correct_pred += (predicted == labels).float().sum()
print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, loss))
# Validation
val_acc = test_val_calculations(val_dataset_loader, n_classes, net)
print('Val Acc: %.6f' % val_acc)
# Calculate Train Accuracy
train_acc = 100 * correct_pred / len(train_sound_data)
print('Train Acc: %.6f' % train_acc)
# # torch.save(best_net.state_dict(), '/home/bbekci/inzpeech/ResNet/model/model.pth')
test_acc = test_val_calculations(test_dataset_loader, n_classes, net)
print('Test Acc: %.6f' % test_acc)

41
train_vctk_keras.py Normal file
View File

@@ -0,0 +1,41 @@
#!/usr/bin/env python
# coding: utf-8
# In[1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from tensorflow.keras.utils import Sequence, to_categorical
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from models.vggish import VGGish
from datagen_vctk import get_datagen
from utils import apply_melspectrogram_to_file
import math
import numpy as np
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
sample_per_person = 30000
batch_size = 64
num_class = 109
input_shape = (300, 40, 1)
tr_gen, val_gen, te_gen = get_datagen(sample_per_person, batch_size, apply_melspectrogram_to_file)
for x, y in tr_gen:
print(x.shape)
print(y.shape)
break
reduceLR = ReduceLROnPlateau(factor=0.5, patience=5, verbose=True)
earlystop = EarlyStopping(patience=15, verbose=True)
model = VGGish(input_shape, num_class)
opt = Adam(lr=2e-3)
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(tr_gen, validation_data=val_gen, epochs=15, callbacks=[reduceLR])
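# --- Illustrative evaluation (not part of the original script) ---
# A sketch of scoring the held-out generator returned by get_datagen above; assumes
# te_gen yields (x, y) batches compatible with the compiled loss and metrics.
loss, acc = model.evaluate(te_gen)
print('test loss: %.4f, test acc: %.4f' % (loss, acc))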

View File

@@ -24,41 +24,66 @@ hop_length = number of samples between successive frames
https://librosa.org/doc/latest/generated/librosa.feature.melspectrogram.html
"""
def return_files(directory):
all_files = []
for file in os.listdir(os.getcwd()+'/'+directory):
if file.endswith(".wav"):
all_files.append(os.getcwd()+'/'+directory+'/'+file)
print(all_files)
return all_files
def return_spectograms(directory):
files = return_files(directory)
spectograms = []
for file in files:
spectograms.append(apply_melspectrogram_to_file(file))
return spectograms
def apply_melspectrogram_to_file(filename):
    target_len = 66150
    y, sample_rate = librosa.load(filename, duration=3)
    while(y.shape[0] != target_len):
        y = np.append(y, y[:target_len - y.shape[0]])
    duration = len(y) / sample_rate
    print(duration)
    #librosa.display.waveplot(y=y, sr=sample_rate)
    if y.shape[0] == 0:
        print("y.shape[0] == 0")
        return None
    else:
        print(y.shape)
    window_time = .025
    hop_time = .01
    n_fft = sample_rate * window_time
    print(n_fft)
    hop_len = sample_rate*hop_time
    # print(int(n_fft))
    melspectrogram = librosa.feature.melspectrogram(y=librosa.effects.preemphasis(
        y), sr=sample_rate, n_mels=40, n_fft=int(n_fft), hop_length=int(hop_len), window=signal.windows.hamming)
    #melspectrogram = librosa.feature.melspectrogram(y=librosa.effects.preemphasis(y), sr=sample_rate,n_mels=40,window=signal.windows.hamming)
    log_melspectrogram = librosa.power_to_db(melspectrogram, ref=np.max)
    #normalized_melspectrogram = (log_melspectrogram - log_melspectrogram.mean()) / log_melspectrogram.std()
    melspectrogram = log_melspectrogram.transpose()[:-1]
    print(melspectrogram.shape)
    return melspectrogram
def display_all_spectograms(spectrograms):
    for i in range(0, len(spectrograms)):
        display_spectrogram(spectrograms[i])
def display_spectrogram(spectrogram):
    librosa.display.specshow(spectrogram.transpose(),
                             y_axis='mel', fmax=8000, x_axis='s')
    plt.title('Mel Spectrogram')
    plt.colorbar(format='%+2.0f dB')
    plt.show()

View File

@@ -0,0 +1,33 @@
import os
import pickle
from preprocessed_feature_extraction import logmel_filterbanks
dataset_dir = "/media/data/bbekci/voxceleb/wav/"
output_dir = "/media/data/bbekci/voxceleb/pkls_colwise_normed"
def process_id(idpath, idname):
video_names = os.listdir(idpath)
for video_name in video_names:
video_path = os.path.join(idpath, video_name)
audio_names = [fname for fname in os.listdir(video_path) if fname.endswith('.wav')]
audio_paths = [os.path.join(video_path, audio_name) for audio_name in audio_names if audio_name.endswith('.wav')]
for i, audio_path in enumerate(audio_paths):
features = logmel_filterbanks(audio_path)
dest_path = os.path.join(output_dir, idname, video_name)
os.makedirs(dest_path, exist_ok=True)
full_dest_path = os.path.join(dest_path, audio_names[i].replace('.wav','.pkl'))
with open(full_dest_path, 'wb') as pickle_file:
pickle.dump([idname, video_name, features], pickle_file, protocol=pickle.HIGHEST_PROTOCOL)
os.makedirs(output_dir, exist_ok=True)
ids = [fname for fname in os.listdir(dataset_dir) if 'id' in fname]
id_paths = [os.path.join(dataset_dir, nid) for nid in ids]
for i, idp in enumerate(id_paths):
print("Process person: ", ids[i])
process_id(idp, ids[i])

View File

@@ -0,0 +1,71 @@
import os
from os import listdir
from os.path import isfile, join
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
from scipy.fftpack import dct
def display_spectrogram(spectrogram):
librosa.display.specshow(spectrogram.transpose(), hop_length=220.5,y_axis='mel', fmax=8000, x_axis='s')
#getting 7 second in time axis, it should be 3, why???
plt.title('Mel Spectrogram')
plt.colorbar(format='%+2.0f dB')
plt.show()
def logmel_filterbanks(filename,pre_emphasis=0.97,frame_size = 0.025,frame_stride = 0.01,nfilt=40):
signal, sample_rate = librosa.load(filename,duration=3)
#Pre-Emphasis step
emphasized_signal = np.empty(shape=len(signal)+1)
emphasized_signal = np.append(signal[0], signal[1:] - pre_emphasis * signal[:-1])
#Framing
frame_length, frame_step = frame_size * sample_rate, frame_stride * sample_rate # Convert from seconds to samples
signal_length = len(emphasized_signal)
frame_length = int(round(frame_length))
frame_step = int(round(frame_step))
num_frames = int(np.ceil(float(np.abs(signal_length - frame_length)) / frame_step)) + 1 # Make sure that we have at least 1 frame
pad_signal_length = num_frames * frame_step + frame_length
z = np.zeros((pad_signal_length - signal_length))
pad_signal = np.append(emphasized_signal, z) # Pad Signal to make sure that all frames have equal number of samples without truncating any samples from the original signal
indices = np.tile(np.arange(0, frame_length), (num_frames, 1)) + np.tile(np.arange(0, num_frames * frame_step, frame_step), (frame_length, 1)).T
frames = pad_signal[indices.astype(np.int32, copy=False)]
#Hamming-Window
frames *= np.hamming(frame_length)
#FFT
NFFT = 512
mag_frames = np.absolute(np.fft.rfft(frames, NFFT)) # Magnitude of the FFT
pow_frames = ((1.0 / NFFT) * ((mag_frames) ** 2))
#Filter-Bank
low_freq_mel = 0
high_freq_mel = (2595 * np.log10(1 + (sample_rate / 2) / 700)) # Convert Hz to Mel
mel_points = np.linspace(low_freq_mel, high_freq_mel, nfilt + 2) # Equally spaced in Mel scale
hz_points = (700 * (10**(mel_points / 2595) - 1)) # Convert Mel to Hz
bin = np.floor((NFFT + 1) * hz_points / sample_rate)
fbank = np.zeros((nfilt, int(np.floor(NFFT / 2 + 1))))
for m in range(1, nfilt + 1):
f_m_minus = int(bin[m - 1]) # left
f_m = int(bin[m]) # center
f_m_plus = int(bin[m + 1]) # right
for k in range(f_m_minus, f_m):
fbank[m - 1, k] = (k - bin[m - 1]) / (bin[m] - bin[m - 1])
for k in range(f_m, f_m_plus):
fbank[m - 1, k] = (bin[m + 1] - k) / (bin[m + 1] - bin[m])
filter_banks = np.dot(pow_frames, fbank.T)
filter_banks = np.where(filter_banks == 0, np.finfo(float).eps, filter_banks) # Numerical Stability
filter_banks = 20 * np.log10(filter_banks) # dB
return filter_banks
def mfcc(filter_banks,num_ceps=13):
return dct(filter_banks, type=2, axis=1, norm='ortho')[:, 1 : (num_ceps + 1)]
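# --- Illustrative usage (not part of the original file) ---
# A sketch of extracting 40-dim log-mel filterbanks and 13 MFCCs for a single utterance;
# 'example.wav' is a placeholder path.
if __name__ == '__main__':
    fbanks = logmel_filterbanks('example.wav')   # (num_frames, 40)
    coeffs = mfcc(fbanks)                        # (num_frames, 13)
    print(fbanks.shape, coeffs.shape)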

View File

@@ -0,0 +1,168 @@
import os
import glob
import torch
import pickle
from os import listdir
from os.path import isfile, join
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
from scipy.fftpack import dct
from torch.utils.data import random_split, Dataset, DataLoader
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
def display_spectrogram(spectrogram):
librosa.display.specshow(spectrogram.transpose(), hop_length=220.5,y_axis='mel', fmax=8000, x_axis='s')
#getting 7 second in time axis, it should be 3, why???
plt.title('Mel Spectrogram')
plt.colorbar(format='%+2.0f dB')
plt.show()
def logmel_filterbanks(filename,pre_emphasis=0.97,frame_size = 0.025,frame_stride = 0.01,nfilt=40,normalize=True):
target_len = 66150
signal, sample_rate = librosa.load(filename)
while(signal.shape[0] < target_len):
signal = np.append(signal, signal[:target_len - signal.shape[0]])
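    # NOTE: unlike the fixed-length variant earlier in this commit, the clip is loaded without
    # duration=3 and longer clips are not truncated, so the number of frames can vary per utterance.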
#Pre-Emphasis step
emphasized_signal = np.empty(shape=len(signal)+1)
emphasized_signal = np.append(signal[0], signal[1:] - pre_emphasis * signal[:-1])
#Framing
frame_length, frame_step = frame_size * sample_rate, frame_stride * sample_rate # Convert from seconds to samples
signal_length = len(emphasized_signal)
frame_length = int(round(frame_length))
frame_step = int(round(frame_step))
num_frames = int(np.ceil(float(np.abs(signal_length - frame_length)) / frame_step)) + 1 # Make sure that we have at least 1 frame
pad_signal_length = num_frames * frame_step + frame_length
z = np.zeros((pad_signal_length - signal_length))
pad_signal = np.append(emphasized_signal, z) # Pad Signal to make sure that all frames have equal number of samples without truncating any samples from the original signal
indices = np.tile(np.arange(0, frame_length), (num_frames, 1)) + np.tile(np.arange(0, num_frames * frame_step, frame_step), (frame_length, 1)).T
frames = pad_signal[indices.astype(np.int32, copy=False)]
#Hamming-Window
frames *= np.hamming(frame_length)
#FFT
NFFT = 512
mag_frames = np.absolute(np.fft.rfft(frames, NFFT)) # Magnitude of the FFT
pow_frames = ((1.0 / NFFT) * ((mag_frames) ** 2))
#Filter-Bank
low_freq_mel = 0
high_freq_mel = (2595 * np.log10(1 + (sample_rate / 2) / 700)) # Convert Hz to Mel
mel_points = np.linspace(low_freq_mel, high_freq_mel, nfilt + 2) # Equally spaced in Mel scale
hz_points = (700 * (10**(mel_points / 2595) - 1)) # Convert Mel to Hz
bin = np.floor((NFFT + 1) * hz_points / sample_rate)
fbank = np.zeros((nfilt, int(np.floor(NFFT / 2 + 1))))
for m in range(1, nfilt + 1):
f_m_minus = int(bin[m - 1]) # left
f_m = int(bin[m]) # center
f_m_plus = int(bin[m + 1]) # right
for k in range(f_m_minus, f_m):
fbank[m - 1, k] = (k - bin[m - 1]) / (bin[m] - bin[m - 1])
for k in range(f_m, f_m_plus):
fbank[m - 1, k] = (bin[m + 1] - k) / (bin[m + 1] - bin[m])
filter_banks = np.dot(pow_frames, fbank.T)
filter_banks = np.where(filter_banks == 0, np.finfo(float).eps, filter_banks) # Numerical Stability
filter_banks = 20 * np.log10(filter_banks) # dB
if normalize==True:
#filter_banks = (filter_banks - filter_banks.mean()) / (filter_banks.max() - filter_banks.min())
normed_filter_banks = (filter_banks - filter_banks.mean(axis=0)) / filter_banks.std(axis=0)
return normed_filter_banks
return filter_banks
def mfcc(filter_banks,num_ceps=13):
return dct(filter_banks, type=2, axis=1, norm='ortho')[:, 1 : (num_ceps + 1)]
if __name__=='__main__':
dataset_dir = '/home/bbekci/datasets/vctk/wav48_silence_trimmed'
data = []
c2i, i2c = {}, {}
for indx, cla in enumerate(os.listdir(dataset_dir)):
main_path = dataset_dir + '/' + cla + '/*.flac'
for file_path in glob.glob(main_path):
data.append((file_path, cla))
c2i[cla] = indx
i2c[indx] = cla
    with open('preprocessed_vctk.pkl', 'wb') as pickle_file:
        result = []
        for i in range(0, len(data)):
            sound_path, class_name = data[i]
            sound_data = logmel_filterbanks(sound_path)
            label = c2i[class_name]
            result.append([label, sound_data])
        # dump once after all utterances are processed; the `with` block closes the file,
        # so the stray `f.close()` (an undefined name) from the original is dropped
        pickle.dump(result, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)
class PreprocessedDataset(Dataset):
def __init__(self, file_dir):
self.file_dir = file_dir
self.lst = 0
with open(file_dir, 'rb') as pickle_load:
self.lst = pickle.load(pickle_load)
def __len__(self):
return len(self.lst)
    def n_class(self):
        # labels are assigned 0..N-1 in insertion order, so the class count is the last label + 1
        return self.lst[-1][0] + 1
def __getitem__(self, idx):
if torch.is_tensor(idx):
idx = idx.tolist()
sound_data = self.lst[idx][1]
label = self.lst[idx][0]
sample = (sound_data, label)
return sample
dataset_dir = '/home/bbekci/inzpeech/preprocessed_vctk.pkl'
offset_dict = {}
max_epochs = 25
batch_size = 256
sound_data = PreprocessedDataset(file_dir=dataset_dir)
n_classes = sound_data.n_class()
train_data, test_data = random_split(sound_data,
[int(len(sound_data) * 0.8),
len(sound_data) - int(len(sound_data) * 0.8)]
)
train_dataset_loader = torch.utils.data.DataLoader(train_data,
batch_size=batch_size,
shuffle=True,
num_workers=4)
test_dataset_loader = torch.utils.data.DataLoader(test_data,
batch_size=batch_size,
shuffle=True,
num_workers=4)

87
vgg-model-torch.py Normal file
View File

@@ -0,0 +1,87 @@
import torch
import torch.nn as nn
from collections import OrderedDict
# pytorch_lightning is assumed as a dependency, since VGGM below subclasses pl.LightningModule
import pytorch_lightning as pl
class SelfAttention(nn.Module):
    # structured self-attention: E = softmax(tanh(H W1) W2)^T H
    def __init__(self, seq_hidden_dim, hidden_dim_nc, n_hop, device='cpu'):
        super(SelfAttention, self).__init__()
        self.device = device
        self.n_hop = n_hop
        self.W1 = nn.Parameter(nn.init.xavier_uniform_(torch.empty((seq_hidden_dim, hidden_dim_nc))).to(self.device))
        self.W2 = nn.Parameter(nn.init.xavier_uniform_(torch.empty((hidden_dim_nc, n_hop))).to(self.device))
    def forward(self, H):
        size = H.size()                             # expected: [batch_size, 19, 3, 512]
        H = H.view(size[0], size[1], size[2] * size[3])  # flatten frequency and channel axes: (batch, T, hidden)
        x = torch.tanh(torch.matmul(H, self.W1))    # (batch, T, nc)
        x = torch.matmul(x, self.W2)                # (batch, T, n_hop)
        A = torch.softmax(x, dim=1)                 # attention over the time axis
        E = torch.bmm(A.transpose(1, 2), H)         # (batch, n_hop, hidden)
        return E
"""
alphas = torch.transpose(alphas, 1, 2).contiguous() # [bsz, hops, seq_len]
alphas = self.softmax(alphas.view(-1, size[1])) # [bsz*hop, seq_len]
alphas = alphas.view(size[0], self.attention_hops, size[1]) # [bsz, hop, seq_len]
return torch.bmm(alphas, outh), alphas """
class VGGM(pl.LightningModule):
def __init__(self, n_classes=1251):
super(VGGM, self).__init__()
self.n_classes=n_classes
self.conv_part=nn.Sequential(OrderedDict([
('conv1', nn.Conv2d(in_channels=1, out_channels=64, kernel_size=(3,3), stride=(1,1), padding=1)),
('bn1', nn.BatchNorm2d(64, momentum=0.1)),
('relu1', nn.ReLU()),
('mpool1', nn.MaxPool2d(kernel_size=(2,2), stride=(2,2))),
('conv2', nn.Conv2d(in_channels=64, out_channels=128, kernel_size=(3,3), stride=(1,1), padding=1)),
('bn2', nn.BatchNorm2d(128, momentum=0.5)),
('relu2', nn.ReLU()),
('mpool2', nn.MaxPool2d(kernel_size=(2,2), stride=(2,2))),
('conv3', nn.Conv2d(in_channels=128, out_channels=256, kernel_size=(3,3), stride=(1,1), padding=1)),
('bn3', nn.BatchNorm2d(256, momentum=0.5)),
('relu3', nn.ReLU()),
('mpool3', nn.MaxPool2d(kernel_size=(2,2), stride=(2,2))),
('conv4', nn.Conv2d(in_channels=256, out_channels=512, kernel_size=(3,3), stride=(1,1), padding=1)),
            ('bn4', nn.BatchNorm2d(512, momentum=0.5)),  # conv4 outputs 512 channels
('relu4', nn.ReLU()),
('mpool4', nn.MaxPool2d(kernel_size=(2,2), stride=(2,2))),
            ('conv5', nn.Conv2d(in_channels=512, out_channels=512, kernel_size=(3,3), stride=(1,1), padding=1)),  # conv4 outputs 512 channels
('bn5', nn.BatchNorm2d(512, momentum=0.5)),
('relu5', nn.ReLU()),
('mpool5', nn.MaxPool2d(kernel_size=(2,2), stride=(2,2))),
]))
        # d_model must match the 512 conv channels; nhead=8 is an assumed value, since the original
        # line referenced undefined attributes (hidden_size_lstm, num_heads_self_attn)
        self.attention = nn.TransformerEncoderLayer(d_model=512, dim_feedforward=512, nhead=8)
        self.avgpool = nn.AvgPool2d((4, 1))
        # NOTE: for a (1, 300, 40) input the pooled attention output flattens to 2*512 = 1024
        # features, not 4096 as in the original file; 1024 is assumed here so the head matches.
        self.classifier = nn.Sequential(OrderedDict([
            ('fc7', nn.Linear(1024, 1024)),
            #('drop1', nn.Dropout()),
            ('relu7', nn.ReLU()),
            ('fc8', nn.Linear(1024, n_classes))]))
    def forward(self, inp):
        x = self.conv_part(inp)              # (batch, 512, 9, 1) for a (1, 300, 40) input
        x = x.squeeze(-1).permute(2, 0, 1)   # (seq=9, batch, d_model=512) for the transformer layer
        x = self.attention(x)
        x = x.permute(1, 0, 2).unsqueeze(1)  # (batch, 1, 9, 512)
        x = self.avgpool(x)                  # (batch, 1, 2, 512)
        x = torch.flatten(x, 1)
        x = self.classifier(x)
        return x