mirror of https://github.com/inzva/inzpeech.git synced 2021-06-01 09:25:07 +03:00
This commit is contained in:
Bekci
2020-11-13 20:58:59 +03:00
9 changed files with 1016 additions and 41 deletions

File diff suppressed because one or more lines are too long

BIN
ResNet/.DS_Store vendored Normal file

Binary file not shown.

370
ResNet/Resnet.ipynb Normal file
View File

@@ -0,0 +1,370 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from os import listdir\n",
"from os.path import isfile, join\n",
"import numpy as np\n",
"import librosa\n",
"import matplotlib.pyplot as plt\n",
"from scipy import signal\n",
"\n",
"\n",
"def apply_melspectrogram(filename):\n",
" y, sample_rate = librosa.load(filename,duration=3)\n",
"\n",
" if y.shape[0] == 0:\n",
" print(\"y.shape[0] == 0\")\n",
" return None\n",
" \n",
" window_time = .025\n",
" hop_time = .01\n",
" n_fft = sample_rate * window_time\n",
"\n",
" hop_len = sample_rate * hop_time\n",
"\n",
" melspectrogram = librosa.feature.melspectrogram(y=librosa.effects.preemphasis(y),\n",
" sr=sample_rate,\n",
" n_mels=40,\n",
" n_fft=int(n_fft), \n",
" hop_length = int(hop_len),\n",
" window=signal.windows.hamming)\n",
" log_melspectrogram = librosa.power_to_db(melspectrogram, ref=np.max)\n",
"\n",
" melspectrogram = log_melspectrogram.T[:-1]\n",
"\n",
"\n",
" return melspectrogram\n"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(300, 40)\n"
]
}
],
"source": [
"sound = apply_melspectrogram('data/1.wav')\n",
"print(sound.shape)"
]
},
{
"cell_type": "code",
"execution_count": 92,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"torch.Size([2, 1000])\n"
]
}
],
"source": [
"import math\n",
"\n",
"import torch\n",
"import torch.nn as nn\n",
"import torch.nn.functional as F\n",
"\n",
"class SelfAttention(nn.Module):\n",
" def __init__(self, embed_size, heads):\n",
" super(SelfAttention, self).__init__()\n",
" self.embed_size = embed_size\n",
" self.heads = heads\n",
" self.head_dim = embed_size // heads\n",
"\n",
" assert (\n",
" self.head_dim * heads == embed_size\n",
" ), \"Embedding size needs to be divisible by heads\"\n",
"\n",
" self.values = nn.Linear(self.head_dim, self.head_dim, bias=False)\n",
" self.keys = nn.Linear(self.head_dim, self.head_dim, bias=False)\n",
" self.queries = nn.Linear(self.head_dim, self.head_dim, bias=False)\n",
" self.fc_out = nn.Linear(heads * self.head_dim, embed_size)\n",
"\n",
" def forward(self, values, keys, query, mask=None):\n",
" # Get number of training examples\n",
" N = query.shape[0]\n",
"\n",
" value_len, key_len, query_len = values.shape[1], keys.shape[1], query.shape[1]\n",
"\n",
" # Split the embedding into self.heads different pieces\n",
" values = values.reshape(N, value_len, self.heads, self.head_dim)\n",
" keys = keys.reshape(N, key_len, self.heads, self.head_dim)\n",
" query = query.reshape(N, query_len, self.heads, self.head_dim)\n",
"\n",
" values = self.values(values) # (N, value_len, heads, head_dim)\n",
" keys = self.keys(keys) # (N, key_len, heads, head_dim)\n",
" queries = self.queries(query) # (N, query_len, heads, heads_dim)\n",
"\n",
" # Einsum does matrix mult. for query*keys for each training example\n",
" # with every other training example, don't be confused by einsum\n",
" # it's just how I like doing matrix multiplication & bmm\n",
"\n",
" energy = torch.einsum(\"nqhd,nkhd->nhqk\", [queries, keys])\n",
" # queries shape: (N, query_len, heads, heads_dim),\n",
" # keys shape: (N, key_len, heads, heads_dim)\n",
" # energy: (N, heads, query_len, key_len)\n",
"\n",
" # Mask padded indices so their weights become 0\n",
" if mask is not None:\n",
" energy = energy.masked_fill(mask == 0, float(\"-1e20\"))\n",
"\n",
" # Normalize energy values similarly to seq2seq + attention\n",
" # so that they sum to 1. Also divide by scaling factor for\n",
" # better stability\n",
" attention = torch.softmax(energy / (self.embed_size ** (1 / 2)), dim=3)\n",
" # attention shape: (N, heads, query_len, key_len)\n",
"\n",
" out = torch.einsum(\"nhql,nlhd->nqhd\", [attention, values]).reshape(\n",
" N, query_len, self.heads * self.head_dim\n",
" )\n",
" # attention shape: (N, heads, query_len, key_len)\n",
" # values shape: (N, value_len, heads, heads_dim)\n",
" # out after matrix multiply: (N, query_len, heads, head_dim), then\n",
" # we reshape and flatten the last two dimensions.\n",
"\n",
" out = self.fc_out(out)\n",
" # Linear layer doesn't modify the shape, final shape will be\n",
" # (N, query_len, embed_size)\n",
"\n",
" return out\n",
" \n",
"class block(nn.Module):\n",
" def __init__(\n",
" self, in_channels, intermediate_channels, out_channels, identity_downsample=None, stride=1\n",
" ):\n",
" super(block, self).__init__()\n",
" self.conv1 = nn.Conv2d(\n",
" in_channels, intermediate_channels, kernel_size=1, stride=1, padding=0\n",
" )\n",
" self.bn1 = nn.BatchNorm2d(intermediate_channels)\n",
" self.conv2 = nn.Conv2d(\n",
" intermediate_channels,\n",
" intermediate_channels,\n",
" kernel_size=3,\n",
" stride=stride,\n",
" padding=1,\n",
" )\n",
" self.bn2 = nn.BatchNorm2d(intermediate_channels)\n",
" self.conv3 = nn.Conv2d(\n",
" intermediate_channels,\n",
" out_channels,\n",
" kernel_size=1,\n",
" stride=1,\n",
" padding=0,\n",
" )\n",
" self.bn3 = nn.BatchNorm2d(out_channels)\n",
" self.relu = nn.ReLU()\n",
" self.identity_downsample = identity_downsample\n",
" self.stride = stride\n",
"\n",
" def forward(self, x):\n",
" identity = x.clone()\n",
"\n",
" x = self.conv1(x)\n",
" x = self.bn1(x)\n",
" x = self.relu(x)\n",
" x = self.conv2(x)\n",
" x = self.bn2(x)\n",
" x = self.relu(x)\n",
" x = self.conv3(x)\n",
" x = self.bn3(x)\n",
"\n",
" if self.identity_downsample is not None:\n",
" identity = self.identity_downsample(identity)\n",
"\n",
" x += identity\n",
" x = self.relu(x)\n",
" return x\n",
" \n",
"\n",
"class Net(nn.Module):\n",
" def __init__(self, block, layers, image_channels, num_classes, expansion):\n",
" super(Net, self).__init__()\n",
" self.in_channels = 64\n",
" self.conv1 = nn.Conv2d(image_channels, 64, kernel_size=7, stride=2, padding=3)\n",
" self.bn1 = nn.BatchNorm2d(64)\n",
" self.relu = nn.ReLU()\n",
" self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)\n",
"\n",
" # Essentially the entire ResNet architecture are in these 4 lines below\n",
" self.layer1 = self._make_layer(\n",
" block, layers[0], intermediate_channels=64, out_channels=64*expansion, stride=1\n",
" )\n",
" self.layer2 = self._make_layer(\n",
" block, layers[1], intermediate_channels=128, out_channels=128*expansion, stride=2\n",
" )\n",
" self.layer3 = self._make_layer(\n",
" block, layers[2], intermediate_channels=256, out_channels=256*expansion, stride=2\n",
" )\n",
" self.layer4 = self._make_layer(\n",
" block, layers[3], intermediate_channels=512, out_channels=512*expansion, stride=2\n",
" )\n",
" \n",
" self.attention = SelfAttention(heads=4, embed_size=512*expansion)\n",
" \n",
" self.avgpool = nn.AvgPool2d((20, 1))\n",
" \n",
" self.fc1 = nn.Linear(512*expansion, 512*expansion//2)\n",
" self.fc2 = nn.Linear(512*expansion//2, 512*expansion//4)\n",
" self.fc3 = nn.Linear(512*expansion//4, num_classes)\n",
"\n",
" def forward(self, x):\n",
" # ResNet layer\n",
" x = self.conv1(x)\n",
" x = self.bn1(x)\n",
" x = self.relu(x)\n",
" x = self.maxpool(x)\n",
" x = self.layer1(x)\n",
" x = self.layer2(x)\n",
" x = self.layer3(x)\n",
" x = self.layer4(x)\n",
" \n",
" x = x.reshape(x.shape[0], x.shape[2] * x.shape[3], x.shape[1])\n",
" # Attenntion Layer\n",
" x = self.attention(x, x, x)\n",
" x = self.avgpool(x)\n",
" \n",
" # FC Layer\n",
" x = x.reshape(x.shape[0], -1)\n",
" x = self.relu(self.fc1(x))\n",
" x = self.relu(self.fc2(x))\n",
" x = self.relu(self.fc3(x))\n",
"\n",
" return x\n",
"\n",
" def _make_layer(self, block, num_residual_blocks, intermediate_channels, out_channels, stride):\n",
" identity_downsample = None\n",
" layers = []\n",
"\n",
" # Either if we half the input space for ex, 56x56 -> 28x28 (stride=2), or channels changes\n",
" # we need to adapt the Identity (skip connection) so it will be able to be added\n",
" # to the layer that's ahead\n",
" if stride != 1 or self.in_channels != out_channels:\n",
" identity_downsample = nn.Sequential(\n",
" nn.Conv2d(\n",
" self.in_channels,\n",
" out_channels,\n",
" kernel_size=1,\n",
" stride=stride,\n",
" ),\n",
" nn.BatchNorm2d(out_channels),\n",
" )\n",
"\n",
" layers.append(\n",
" block(self.in_channels, intermediate_channels, out_channels, identity_downsample, stride)\n",
" )\n",
" \n",
" self.in_channels = out_channels\n",
"\n",
" # For example for first resnet layer: 256 will be mapped to 64 as intermediate layer,\n",
" # then finally back to 256. Hence no identity downsample is needed, since stride = 1,\n",
" # and also same amount of channels.\n",
" for i in range(num_residual_blocks - 1):\n",
" layers.append(block(self.in_channels, intermediate_channels, out_channels))\n",
"\n",
" return nn.Sequential(*layers)\n",
" \n",
"\n",
"def Net_ResNet50(img_channel=3, num_classes=1000):\n",
" return Net(block, [3, 4, 6, 3], img_channel, num_classes, expansion=4)\n",
"\n",
"\n",
"def Net_ResNet101(img_channel=3, num_classes=1000):\n",
" return Net(block, [3, 4, 23, 3], img_channel, num_classes, expansion=4)\n",
"\n",
"\n",
"def Net_ResNet152(img_channel=3, num_classes=1000):\n",
" return Net(block, [3, 8, 36, 3], img_channel, num_classes, expansion=4)\n",
"\n",
"\n",
"def test():\n",
" net = Net_ResNet101(img_channel=1)\n",
" x = torch.randn(2, 1, 300, 40)\n",
" y = net(x).to('cpu')\n",
" print(y.shape)\n",
" \n",
"test()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
},
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 4
}
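For reference, a rough shape walk-through of Net.forward for the (2, 1, 300, 40) log-mel input used in test() above; the numbers below follow from the layer hyperparameters in the notebook and explain the AvgPool2d((20, 1)) and the embed_size=512*expansion attention.

# Shape bookkeeping for Net (expansion=4) on a (batch, 1, 300, 40) input:
#   conv1   (k=7, s=2, p=3) -> (batch,   64, 150, 20)
#   maxpool (k=3, s=2, p=1) -> (batch,   64,  75, 10)
#   layer1  (stride 1)      -> (batch,  256,  75, 10)
#   layer2  (stride 2)      -> (batch,  512,  38,  5)
#   layer3  (stride 2)      -> (batch, 1024,  19,  3)
#   layer4  (stride 2)      -> (batch, 2048,  10,  2)
# reshape -> (batch, 10*2 = 20, 2048): 20 time-frequency positions, each a
# 2048-dim vector, so SelfAttention runs with embed_size=2048 and
# AvgPool2d((20, 1)) averages the 20 positions into a single 2048-dim embedding
# before the fully connected layers.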

View File

@@ -4,7 +4,6 @@ import torch
import torch.nn as nn
import torch.nn.functional as F
class SelfAttention(nn.Module):
def __init__(self, embed_size, heads):
super(SelfAttention, self).__init__()
@@ -68,8 +67,7 @@ class SelfAttention(nn.Module):
# (N, query_len, embed_size)
return out
class block(nn.Module):
def __init__(
self, in_channels, intermediate_channels, out_channels, identity_downsample=None, stride=1
@@ -117,14 +115,13 @@ class block(nn.Module):
x += identity
x = self.relu(x)
return x
class Net(nn.Module):
def __init__(self, block, layers, image_channels, num_classes, expansion):
super(Net, self).__init__()
self.in_channels = 64
self.conv1 = nn.Conv2d(
image_channels, 64, kernel_size=7, stride=2, padding=3)
self.conv1 = nn.Conv2d(image_channels, 64, kernel_size=7, stride=2, padding=3)
self.bn1 = nn.BatchNorm2d(64)
self.relu = nn.ReLU()
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
@@ -142,11 +139,11 @@ class Net(nn.Module):
self.layer4 = self._make_layer(
block, layers[3], intermediate_channels=512, out_channels=512*expansion, stride=2
)
self.attention = SelfAttention(heads=4, embed_size=512*expansion)
self.avgpool = nn.AvgPool2d((20, 1))
self.fc1 = nn.Linear(512*expansion, 512*expansion//2)
self.fc2 = nn.Linear(512*expansion//2, 512*expansion//4)
self.fc3 = nn.Linear(512*expansion//4, num_classes)
@@ -161,17 +158,17 @@ class Net(nn.Module):
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
x = x.reshape(x.shape[0], x.shape[2] * x.shape[3], x.shape[1])
# Attention Layer
# Attenntion Layer
x = self.attention(x, x, x)
x = self.avgpool(x)
# FC Layer
x = x.reshape(x.shape[0], -1)
x = self.relu(self.fc1(x))
x = self.relu(self.fc2(x))
x = self.fc3(x)
x = self.relu(self.fc3(x))
return x
@@ -194,21 +191,19 @@ class Net(nn.Module):
)
layers.append(
block(self.in_channels, intermediate_channels,
out_channels, identity_downsample, stride)
block(self.in_channels, intermediate_channels, out_channels, identity_downsample, stride)
)
self.in_channels = out_channels
# For example for first resnet layer: 256 will be mapped to 64 as intermediate layer,
# then finally back to 256. Hence no identity downsample is needed, since stride = 1,
# and also same amount of channels.
for i in range(num_residual_blocks - 1):
layers.append(
block(self.in_channels, intermediate_channels, out_channels))
layers.append(block(self.in_channels, intermediate_channels, out_channels))
return nn.Sequential(*layers)
def Net_ResNet50(img_channel=3, num_classes=1000):
return Net(block, [3, 4, 6, 3], img_channel, num_classes, expansion=4)
@@ -220,3 +215,5 @@ def Net_ResNet101(img_channel=3, num_classes=1000):
def Net_ResNet152(img_channel=3, num_classes=1000):
return Net(block, [3, 8, 36, 3], img_channel, num_classes, expansion=4)
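A minimal standalone shape check for the SelfAttention module touched in the diff above (a sketch; it assumes the class can be imported from the project's model module, the same module run_model.py imports Net_ResNet50 from — otherwise paste the class definition from the notebook):

import torch
from model import SelfAttention  # assumed import path

attn = SelfAttention(embed_size=2048, heads=4)
x = torch.randn(2, 20, 2048)   # (batch, positions, embed), as produced by the reshape in Net.forward
out = attn(x, x, x)            # self-attention: values = keys = queries
print(out.shape)               # expected: torch.Size([2, 20, 2048])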

144
ResNet/run_model.py Normal file
View File

@@ -0,0 +1,144 @@
import os
import glob
import torch
import librosa
import numpy as np
import pandas as pd
import scipy.signal as signal
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from model import Net_ResNet50
from torch.utils.data import random_split, Dataset, DataLoader
from tqdm import tqdm
# Parameters
dataset_dir = '/home/bbekci/datasets/vctk/wav48_silence_trimmed'
max_epochs = 100
batch_size = 64
# CUDA for PyTorch
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
torch.backends.cudnn.benchmark = True
class VCTKData(Dataset):
def __init__(self, root_dir, transform=None):
self.data = []
self.c2i, self.i2c = {}, {}
for indx, cla in enumerate(os.listdir(root_dir)):
main_path = root_dir + '/' + cla + '/*.flac'
for file_path in glob.glob(main_path):
self.data.append((file_path, cla))
self.c2i[cla] = indx
self.i2c[indx] = cla
self.transform = transform
def __len__(self):
return len(self.data)
def n_class(self):
return len(list(self.c2i.keys()))
# According to our input 66150 is the length
def apply_melspectrogram(self, filename):
target_len = 66150
y, sample_rate = librosa.load(filename, duration=3)
if y.shape[0] == 0:
print("y.shape[0] == 0")
return None
# Repeat the waveform until it reaches the target length, then truncate
while y.shape[0] < target_len:
y = np.append(y, y[:target_len - y.shape[0]])
y = y[:target_len]
window_time = .025
hop_time = .01
n_fft = int(sample_rate * window_time)
hop_len = int(sample_rate * hop_time)
melspectrogram = librosa.feature.melspectrogram(y=librosa.effects.preemphasis(y),
sr=sample_rate,
n_mels=40,
n_fft=n_fft,
hop_length=hop_len,
window=signal.windows.hamming)
log_melspectrogram = librosa.power_to_db(melspectrogram, ref=np.max)
melspectrogram = log_melspectrogram.T[:-1]
out = np.expand_dims(melspectrogram, axis=0)
return out
def __getitem__(self, idx):
if torch.is_tensor(idx):
idx = idx.tolist()
sound_path, label = self.data[idx]
sample = (self.apply_melspectrogram(sound_path), self.c2i[label])
if self.transform:
sample = self.transform(sample)
return sample
sound_data = VCTKData(root_dir=dataset_dir)
n_classes = sound_data.n_class()
train_data, test_data = random_split(sound_data,
[int(len(sound_data) * 0.8),
len(sound_data) - int(len(sound_data) * 0.8)]
)
train_dataset_loader = torch.utils.data.DataLoader(train_data,
batch_size=batch_size,
shuffle=True,
num_workers=4)
test_dataset_loader = torch.utils.data.DataLoader(test_data,
batch_size=batch_size,
shuffle=True,
num_workers=4)
net = Net_ResNet50(img_channel=1, num_classes=n_classes)
net.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(net.parameters())
for epoch in range(max_epochs): # loop over the dataset multiple times
running_loss = 0.0
for i, data in enumerate(train_dataset_loader):
# get the inputs; data is a list of [inputs, labels]
inputs, labels = data[0].to(device), data[1].to(device)
# zero the parameter gradients
optimizer.zero_grad()
# forward + backward + optimize
outputs = net(inputs)
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()
# print statistics
running_loss += loss.item()
if i % 2000 == 1999: # print every 2000 mini-batches
print('[%d, %5d] loss: %.3f' %
(epoch + 1, i + 1, running_loss / 2000))
running_loss = 0.0
print('Finished Training')
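run_model.py builds test_dataset_loader but stops after training; a minimal evaluation sketch that could follow the loop above, reusing net, device and test_dataset_loader to report accuracy on the held-out split:

net.eval()
correct, total = 0, 0
with torch.no_grad():
    for inputs, labels in test_dataset_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = net(inputs)
        predictions = outputs.argmax(dim=1)
        total += labels.size(0)
        correct += (predictions == labels).sum().item()
print('Test accuracy: %.3f' % (correct / total))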

View File

@@ -21,6 +21,7 @@
},
{
"cell_type": "code",
<<<<<<< HEAD
"execution_count": 13,
"metadata": {},
"outputs": [],
@@ -29,35 +30,21 @@
"tr_txt = 'txts/tr_voxceleb_video_pkl_paths.txt'\n",
"val_txt = 'txts/val_voxceleb_video_pkl_paths.txt' \n",
"batch_size = 32"
]
},
{
"cell_type": "code",
"execution_count": 14,
=======
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"tr_gen, val_gen = get_keras_datagens(data_dir, batch_size, split_by='video', split_size=0.3, txt_dirs=[tr_txt, val_txt])"
"sample_per_person = 10\n",
"batch_size = 64\n",
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"tags": [
"outputPrepend"
]
<<<<<<< HEAD
"execution_count": 14,
>>>>>>> 90355144ce4a7d9932e7274ea8bfbec292c5288f
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
" 364 2552 4947 1090 2189 3961 4623\n",
" 4922 2577 964 2048 5547 1662 4686 3146 2605 2089 3819 5493 2437 1326\n",
" 5154 940 5694 3133]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[3636 1114 831 4209 866 529 3704 2501 2403 2645 1307 386 4005 4568\n",
" 5172 5787 4859 4257 547 1116 268 1585 354 1716 5165 1408 5708 4017\n",
" 3690 4062 4107 602]\n",
@@ -612,18 +599,37 @@
"\u001b[0;32m~/inzpeech/dataloaders/DatagenVoxCeleb.py\u001b[0m in \u001b[0;36mget_batch_sample\u001b[0;34m(self, idx, batch_size)\u001b[0m\n\u001b[1;32m 49\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpp\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msample_pickle_path\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 50\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'rb'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mpickle_load\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 51\u001b[0;31m \u001b[0mloaded_sample\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpickle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpickle_load\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 52\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 53\u001b[0m \u001b[0midname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvideoname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfeatures\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mloaded_sample\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/miniconda3/envs/inzpeech/lib/python3.8/site-packages/numpy/core/numeric.py\u001b[0m in \u001b[0;36m_frombuffer\u001b[0;34m(buf, dtype, shape, order)\u001b[0m\n\u001b[1;32m 1810\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1811\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1812\u001b[0;31m \u001b[0;32mdef\u001b[0m \u001b[0m_frombuffer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbuf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mshape\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0morder\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1813\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mfrombuffer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbuf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreshape\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0morder\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0morder\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1814\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
=======
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(64, 100, 40, 1)\n",
"(64, 109)\n"
>>>>>>> 90355144ce4a7d9932e7274ea8bfbec292c5288f
]
}
],
"source": [
<<<<<<< HEAD
"for x,y in tr_gen:\n",
" print(x.shape)\n",
" print(y.shape)\n",
" print(y)"
=======
"for x, y in tr_gen:\n",
" print(x.shape)\n",
" print(y.shape)\n",
" break"
>>>>>>> 90355144ce4a7d9932e7274ea8bfbec292c5288f
]
},
{
"cell_type": "code",
<<<<<<< HEAD
"execution_count": 15,
"metadata": {},
"outputs": [],
@@ -694,20 +700,37 @@
],
"source": [
"model = vgg_att(num_class)"
=======
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"model = VGGish(input_shape, 109)"
>>>>>>> 90355144ce4a7d9932e7274ea8bfbec292c5288f
]
},
{
"cell_type": "code",
<<<<<<< HEAD
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"opt = Adam(lr=1e-3)\n",
"model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])"
=======
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"opt = Adam(lr=1e-4)\n",
"model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])"
>>>>>>> 90355144ce4a7d9932e7274ea8bfbec292c5288f
]
},
{
"cell_type": "code",
<<<<<<< HEAD
"execution_count": 20,
"metadata": {},
"outputs": [
@@ -738,6 +761,23 @@
"\u001b[0;32m~/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/eager/def_function.py\u001b[0m in \u001b[0;36mwrapped_fn\u001b[0;34m(*args, **kwds)\u001b[0m\n\u001b[1;32m 439\u001b[0m \u001b[0;31m# __wrapped__ allows AutoGraph to swap in a converted function. We give\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 440\u001b[0m \u001b[0;31m# the function a weak reference to itself to avoid a reference cycle.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 441\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mweak_wrapped_fn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__wrapped__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 442\u001b[0m \u001b[0mweak_wrapped_fn\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mweakref\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mref\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mwrapped_fn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 443\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/framework/func_graph.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 966\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# pylint:disable=broad-except\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 967\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mhasattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"ag_error_metadata\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 968\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mag_error_metadata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_exception\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 969\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 970\u001b[0m \u001b[0;32mraise\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mValueError\u001b[0m: in user code:\n\n /home/bbekci/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:571 train_function *\n outputs = self.distribute_strategy.run(\n /home/bbekci/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:951 run **\n return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)\n /home/bbekci/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:2290 call_for_each_replica\n return self._call_for_each_replica(fn, args, kwargs)\n /home/bbekci/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:2649 _call_for_each_replica\n return fn(*args, **kwargs)\n /home/bbekci/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:532 train_step **\n loss = self.compiled_loss(\n /home/bbekci/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/keras/engine/compile_utils.py:205 __call__\n loss_value = loss_obj(y_t, y_p, sample_weight=sw)\n /home/bbekci/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/keras/losses.py:143 __call__\n losses = self.call(y_true, y_pred)\n /home/bbekci/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/keras/losses.py:246 call\n return self.fn(y_true, y_pred, **self._fn_kwargs)\n /home/bbekci/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/keras/losses.py:1527 categorical_crossentropy\n return K.categorical_crossentropy(y_true, y_pred, from_logits=from_logits)\n /home/bbekci/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/keras/backend.py:4561 categorical_crossentropy\n target.shape.assert_is_compatible_with(output.shape)\n /home/bbekci/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/framework/tensor_shape.py:1117 assert_is_compatible_with\n raise ValueError(\"Shapes %s and %s are incompatible\" % (self, other))\n\n ValueError: Shapes (None, 1) and (None, 1251) are incompatible\n"
=======
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 1/20\n",
"14/14 [==============================] - 31s 2s/step - loss: 9.0328 - accuracy: 0.0238 - val_loss: 32030.5371 - val_accuracy: 0.0202\n",
"Epoch 2/20\n",
"14/14 [==============================] - 31s 2s/step - loss: 5.8168 - accuracy: 0.0306 - val_loss: 12789.2061 - val_accuracy: 0.0202\n",
"Epoch 3/20\n",
"14/14 [==============================] - 32s 2s/step - loss: 5.0801 - accuracy: 0.0522 - val_loss: 4288.9995 - val_accuracy: 0.0000e+00\n",
"Epoch 4/20\n",
"14/14 [==============================] - ETA: 0s - loss: 5.0360 - accuracy: 0.0782"
>>>>>>> 90355144ce4a7d9932e7274ea8bfbec292c5288f
]
}
],
@@ -755,6 +795,7 @@
],
"metadata": {
"kernelspec": {
<<<<<<< HEAD
"name": "Python 3.8.5 64-bit ('inzpeech': conda)",
"display_name": "Python 3.8.5 64-bit ('inzpeech': conda)",
"metadata": {
@@ -762,6 +803,11 @@
"hash": "fcc15a4440aa802b6aa76ba989d07fd1e1f9e303ad2563ebf174689c6e63879d"
}
}
=======
"display_name": "Python 3",
"language": "python",
"name": "python3"
>>>>>>> 90355144ce4a7d9932e7274ea8bfbec292c5288f
},
"language_info": {
"codemirror_mode": {
@@ -773,9 +819,17 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
<<<<<<< HEAD
"version": "3.8.5-final"
=======
"version": "3.8.5"
>>>>>>> 90355144ce4a7d9932e7274ea8bfbec292c5288f
}
},
"nbformat": 4,
"nbformat_minor": 4
}
<<<<<<< HEAD
}
=======
}
>>>>>>> 90355144ce4a7d9932e7274ea8bfbec292c5288f

65
datagen_vctk.py Normal file
View File

@@ -0,0 +1,65 @@
#!/usr/bin/env python
# coding: utf-8
from tensorflow.keras.utils import Sequence, to_categorical
from load_vctk import get_model_data
import math
import numpy as np
import os
data_main_dir = os.path.join('..', 'datasets', 'vctk', 'wav48_silence_trimmed')
class VCTKDatagen(Sequence):
def __init__(self, audio_paths, labels, batch_size, num_class, audio_load_func, shuffle=False):
self.aud_paths = audio_paths
self.labels = labels
self.b_size = batch_size
self.num_class = num_class
self.audio_load_func = audio_load_func
self.shuffle = shuffle
def __len__(self):
return math.ceil( len( self.aud_paths) / self.b_size )
def __getitem__(self, idx):
# Get portion of data for batch
batch_paths = self.aud_paths[idx*self.b_size:(idx+1)*self.b_size]
batch_labels = self.labels[idx*self.b_size:(idx+1)*self.b_size]
model_in = np.array([self.audio_load_func(ap) for ap in batch_paths])
model_out = to_categorical(batch_labels, num_classes=self.num_class)
return np.expand_dims(model_in, axis=-1), model_out
def on_epoch_end(self):
if self.shuffle:
idx = np.arange(len(self.aud_paths))
np.random.shuffle(idx)
self.aud_paths = np.array(self.aud_paths)[idx].tolist()
self.labels = np.array(self.labels)[idx].tolist()
def get_datagen(sample_per_person, batch_size, audio_load_func, split=[0.1, 0.1], shuffle=True, mics=[1, 2]):
"""
Get datagens for vctk dataset.
Params:
sample_per_person: Number of samples to select for each person.
batch_size: Batch size of the model
audio_load_func: Function used to load an audio file and turn it into a model input
split: Ratios for the test and validation sets. Default values are 0.1 for test and 0.1 for validation.
shuffle: Whether to shuffle the paths and labels before returning them. If set to False, consecutive audio files
will be obtained from the same person.
mics: Mic numbers of the selected audio samples. Can be one of [1], [2] or [1, 2]. If both mics are included,
the same utterance may be returned once per mic recording.
Returns:
Datagens for train, validation and test sets
"""
[tr_aud, tr_label], [val_aud, val_label], [te_aud, te_label] = get_model_data(data_main_dir , sample_per_person, split, shuffle, mics)
# -2 for s5 and log.txt files
n_person = len(os.listdir(data_main_dir)) - 2
tr_gen = VCTKDatagen(tr_aud, tr_label, batch_size, n_person, audio_load_func, shuffle)
val_gen = VCTKDatagen(val_aud, val_label, batch_size, n_person, audio_load_func, shuffle)
te_gen = VCTKDatagen(te_aud, te_label, batch_size, n_person, audio_load_func, shuffle)
return tr_gen, val_gen, te_gen
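A usage sketch for the generators above. load_melspectrogram is a hypothetical stand-in for the audio_load_func actually used in the project (it must return a fixed-size 2-D feature array per file), and model is any Keras model compiled with categorical_crossentropy; sample_per_person=10 and batch_size=64 match the values used in the Keras notebook.

tr_gen, val_gen, te_gen = get_datagen(sample_per_person=10,
                                      batch_size=64,
                                      audio_load_func=load_melspectrogram,  # hypothetical loader
                                      split=[0.1, 0.1],
                                      shuffle=True,
                                      mics=[1])
model.fit(tr_gen, validation_data=val_gen, epochs=20)
model.evaluate(te_gen)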

145
model-keras.py Normal file
View File

@@ -0,0 +1,145 @@
import keras
from keras_self_attention import SeqSelfAttention
import numpy as np
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Flatten
from keras.layers import Input, Layer, Reshape, AveragePooling1D
from keras.models import Model
from keras import backend as K
from keras import initializers, regularizers, constraints
from keras.constraints import maxnorm
from keras.optimizers import SGD
from keras.layers import Activation
from keras.layers.convolutional import Conv2D, MaxPooling2D, ZeroPadding2D, AveragePooling2D
from keras.layers.normalization import BatchNormalization
class SelfAttention(Layer):
def __init__(self,
n_hop,
hidden_dim,
penalty=1.0,
return_attention=False,
kernel_initializer='glorot_uniform',
kernel_regularizer=None,
kernel_constraint=None,
**kwargs):
self.n_hop = n_hop
self.hidden_dim = hidden_dim
self.penalty = penalty
self.kernel_initializer = keras.initializers.get(kernel_initializer)
self.kernel_regularizer = keras.regularizers.get(kernel_regularizer)
self.kernel_constraint = keras.constraints.get(kernel_constraint)
self.return_attention = return_attention
super(SelfAttention, self).__init__(**kwargs)
def build(self, input_shape):
# input_shape: (None, Sequence_size, Sequence_hidden_dim)
assert len(input_shape) >= 3
batch_size, sequence_size, sequence_hidden_dim = input_shape
self.Ws1 = self.add_weight(shape=(self.hidden_dim, sequence_hidden_dim),
initializer=self.kernel_initializer,
name='SelfAttention-Ws1',
regularizer=self.kernel_regularizer,
constraint=self.kernel_constraint)
self.Ws2 = self.add_weight(shape=(self.n_hop, self.hidden_dim),
initializer=self.kernel_initializer,
name='SelfAttention-Ws2',
regularizer=self.kernel_regularizer,
constraint=self.kernel_constraint)
super(SelfAttention, self).build(input_shape)
def call(self, inputs):
batch_size = K.cast(K.shape(inputs)[0], K.floatx())
inputs_t = K.permute_dimensions(inputs, (1,2,0)) # H.T
d1 = K.tanh(K.permute_dimensions(K.dot(self.Ws1, inputs_t), (2,0,1))) # d1 = tanh(dot(Ws1, H.T))
d1 = K.permute_dimensions(d1, (2,1,0))
A = K.softmax(K.permute_dimensions(K.dot(self.Ws2, d1), (2,0,1))) # A = softmax(dot(Ws2, d1))
H = K.permute_dimensions(inputs, (0,2,1))
outputs = K.batch_dot(A, H, axes=2) # M = AH
A_t = K.permute_dimensions(A, (0,2,1))
I = K.eye(self.n_hop)
P = K.square(self._frobenius_norm(K.batch_dot(A, A_t) - I)) # P = (frobenius_norm(dot(A, A.T) - I))**2
self.add_loss(self.penalty*(P/batch_size))
if self.return_attention:
return [outputs, A]
else:
return outputs
def compute_output_shape(self, input_shape):
assert input_shape and len(input_shape) >= 3
assert input_shape[-1]
batch_size, sequence_size, sequence_hidden_dim = input_shape
output_shape = tuple([batch_size, self.n_hop, sequence_hidden_dim])
if self.return_attention:
attention_shape = tuple([batch_size, self.n_hop, sequence_size])
return [output_shape, attention_shape]
else: return output_shape
def get_config(self):
config = {
'n_hop': self.n_hop,
'hidden_dim': self.hidden_dim,
'penalty':self.penalty,
'return_attention': self.return_attention,
'kernel_initializer': initializers.serialize(self.kernel_initializer),
'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
'kernel_constraint': constraints.serialize(self.kernel_constraint)
}
base_config = super(SelfAttention, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def _frobenius_norm(self, inputs):
outputs = K.sqrt(K.sum(K.square(inputs)))
return outputs
def vgg_att():
inputs = keras.Input(shape=(300, 40, 1))  # (frames, mel bins, channels)
x=Conv2D(64, (3, 3), padding='same', name='block1_conv1',activation='relu')(inputs)
x=Conv2D(64, (3, 3), padding='same', name='block1_conv2',activation='relu')(x)
x=BatchNormalization()(x)
x=MaxPooling2D(pool_size = (2, 2), strides = (2, 2))(x)
x=Conv2D(128, (3, 3), padding='same', name='block2_conv1',activation='relu')(x)
x=Conv2D(128, (3, 3), padding='same', name='block2_conv2',activation='relu')(x)
x=BatchNormalization()(x)
x=MaxPooling2D(pool_size = (2, 2), strides = (2, 2))(x)
x=Conv2D(256, (3, 3), padding='same', name='block3_conv1',activation='relu')(x)
x=Conv2D(256, (3, 3), padding='same', name='block3_conv2',activation='relu')(x)
x=BatchNormalization()(x)
x=MaxPooling2D(pool_size = (2, 2), strides = (2, 2))(x)
x=Conv2D(512, (3, 3), padding='same', name='block4_conv1',activation='relu')(x)
x=Conv2D(512, (3, 3), padding='same', name='block4_conv2',activation='relu')(x)
x=BatchNormalization()(x)
x=MaxPooling2D(pool_size = (2, 2), strides = (2, 2))(x)
# Flatten the frequency and channel axes so attention runs over the time axis,
# mirroring the reshape in the PyTorch Net: (time, freq, 512) -> (time, freq*512)
x=Reshape((-1, 2*512))(x)
att=SelfAttention(n_hop=4,hidden_dim=1536)
x=att(x)                              # (batch, n_hop, freq*512)
x=AveragePooling1D(pool_size=4)(x)    # average the 4 attention hops
x = Flatten()(x)
x = Dense(256, activation = 'relu')(x)
output = Dense(1251,activation = 'softmax')(x)
model = Model(inputs=inputs, outputs=output)
model.compile(loss='categorical_crossentropy', optimizer='adam')  # needs hyperparameter tuning
model.summary()
return model
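The loss added in SelfAttention.call is the structured self-attention penalty P = ||A·Aᵀ − I||²_F, which is small when the n_hop attention rows focus on different positions. A tiny numpy sketch of what it measures (illustrative values only):

import numpy as np

A = np.array([[0.9, 0.1, 0.0],    # hop 1 attends mostly to position 0
              [0.0, 0.2, 0.8]])   # hop 2 attends mostly to position 2
I = np.eye(A.shape[0])
P = np.sum((A @ A.T - I) ** 2)    # squared Frobenius norm of A.A^T - I
print(P)                          # ~0.14: the two hops are nearly orthogonal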

49
vggish.py Normal file
View File

@@ -0,0 +1,49 @@
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Flatten, Dense, Input, Conv2D, MaxPooling2D, GlobalAveragePooling2D, GlobalMaxPooling2D, Activation, BatchNormalization
from tensorflow.keras import backend as K
def VGGish(input_shape, num_classes):
aud_input = Input(shape=input_shape, name='input_1')
# Block 1
x = Conv2D(64, (3, 3), strides=(1, 1), activation=None, padding='same', name='conv1')(aud_input)
x = BatchNormalization()(x)
x = Activation('relu')(x)
x = MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='pool1')(x)
# Block 2
x = Conv2D(128, (3, 3), strides=(1, 1), activation=None, padding='same', name='conv2')(x)
x = BatchNormalization()(x)
x = Activation('relu')(x)
x = MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='pool2')(x)
# Block 3
x = Conv2D(256, (3, 3), strides=(1, 1), activation=None, padding='same', name='conv3/conv3_1')(x)
x = BatchNormalization()(x)
x = Activation('relu')(x)
x = Conv2D(256, (3, 3), strides=(1, 1), activation=None, padding='same', name='conv3/conv3_2')(x)
x = BatchNormalization()(x)
x = Activation('relu')(x)
x = MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='pool3')(x)
# Block 4
x = Conv2D(512, (3, 3), strides=(1, 1), activation=None, padding='same', name='conv4/conv4_1')(x)
x = BatchNormalization()(x)
x = Activation('relu')(x)
x = Conv2D(512, (3, 3), strides=(1, 1), activation=None, padding='same', name='conv4/conv4_2')(x)
x = BatchNormalization()(x)
x = Activation('relu')(x)
x = MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='pool4')(x)
x = Flatten(name='flatten_')(x)
x = Dense(4096, activation=None, name='vggish_fc1/fc1_1')(x)
x = BatchNormalization()(x)
x = Activation('relu')(x)
x = Dense(4096, activation=None, name='vggish_fc1/fc1_2')(x)
x = BatchNormalization()(x)
preds = Dense(num_classes, activation='softmax', name='vggish_fc2')(x)
model = Model(aud_input, preds, name='VGGish')
return model
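A minimal usage sketch for VGGish. The (100, 40, 1) input shape and 109 classes are assumptions taken from the Keras notebook above, which prints (64, 100, 40, 1) batches and builds the model with VGGish(input_shape, 109):

from tensorflow.keras.optimizers import Adam

model = VGGish(input_shape=(100, 40, 1), num_classes=109)
model.compile(optimizer=Adam(learning_rate=1e-4),
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()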