mirror of https://github.com/inzva/inzpeech.git
synced 2021-06-01 09:25:07 +03:00
Merged
151 .ipynb_checkpoints/feature_extraction-checkpoint.ipynb Normal file
File diff suppressed because one or more lines are too long
BIN ResNet/.DS_Store vendored Normal file
Binary file not shown.
370 ResNet/Resnet.ipynb Normal file
@@ -0,0 +1,370 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from os import listdir\n",
    "from os.path import isfile, join\n",
    "import numpy as np\n",
    "import librosa\n",
    "import matplotlib.pyplot as plt\n",
    "from scipy import signal\n",
    "\n",
    "\n",
    "def apply_melspectrogram(filename):\n",
    "    y, sample_rate = librosa.load(filename, duration=3)\n",
    "\n",
    "    if y.shape[0] == 0:\n",
    "        print(\"y.shape[0] == 0\")\n",
    "        return None\n",
    "\n",
    "    window_time = .025\n",
    "    hop_time = .01\n",
    "    n_fft = sample_rate * window_time\n",
    "\n",
    "    hop_len = sample_rate * hop_time\n",
    "\n",
    "    melspectrogram = librosa.feature.melspectrogram(y=librosa.effects.preemphasis(y),\n",
    "                                                    sr=sample_rate,\n",
    "                                                    n_mels=40,\n",
    "                                                    n_fft=int(n_fft),\n",
    "                                                    hop_length=int(hop_len),\n",
    "                                                    window=signal.windows.hamming)\n",
    "    log_melspectrogram = librosa.power_to_db(melspectrogram, ref=np.max)\n",
    "\n",
    "    melspectrogram = log_melspectrogram.T[:-1]\n",
    "\n",
    "\n",
    "    return melspectrogram\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(300, 40)\n"
     ]
    }
   ],
   "source": [
    "sound = apply_melspectrogram('data/1.wav')\n",
    "print(sound.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "torch.Size([2, 1000])\n"
     ]
    }
   ],
   "source": [
    "import math\n",
    "\n",
    "import torch\n",
    "import torch.nn as nn\n",
    "import torch.nn.functional as F\n",
    "\n",
    "class SelfAttention(nn.Module):\n",
    "    def __init__(self, embed_size, heads):\n",
    "        super(SelfAttention, self).__init__()\n",
    "        self.embed_size = embed_size\n",
    "        self.heads = heads\n",
    "        self.head_dim = embed_size // heads\n",
    "\n",
    "        assert (\n",
    "            self.head_dim * heads == embed_size\n",
    "        ), \"Embedding size needs to be divisible by heads\"\n",
    "\n",
    "        self.values = nn.Linear(self.head_dim, self.head_dim, bias=False)\n",
    "        self.keys = nn.Linear(self.head_dim, self.head_dim, bias=False)\n",
    "        self.queries = nn.Linear(self.head_dim, self.head_dim, bias=False)\n",
    "        self.fc_out = nn.Linear(heads * self.head_dim, embed_size)\n",
    "\n",
    "    def forward(self, values, keys, query, mask=None):\n",
    "        # Get number of training examples\n",
    "        N = query.shape[0]\n",
    "\n",
    "        value_len, key_len, query_len = values.shape[1], keys.shape[1], query.shape[1]\n",
    "\n",
    "        # Split the embedding into self.heads different pieces\n",
    "        values = values.reshape(N, value_len, self.heads, self.head_dim)\n",
    "        keys = keys.reshape(N, key_len, self.heads, self.head_dim)\n",
    "        query = query.reshape(N, query_len, self.heads, self.head_dim)\n",
    "\n",
    "        values = self.values(values)  # (N, value_len, heads, head_dim)\n",
    "        keys = self.keys(keys)  # (N, key_len, heads, head_dim)\n",
    "        queries = self.queries(query)  # (N, query_len, heads, head_dim)\n",
    "\n",
    "        # Einsum does the query*keys matrix mult. for each training example;\n",
    "        # don't be confused by einsum, it's just a compact way to write the\n",
    "        # batched matrix multiplications (bmm)\n",
    "\n",
    "        energy = torch.einsum(\"nqhd,nkhd->nhqk\", [queries, keys])\n",
    "        # queries shape: (N, query_len, heads, head_dim),\n",
    "        # keys shape: (N, key_len, heads, head_dim)\n",
    "        # energy: (N, heads, query_len, key_len)\n",
    "\n",
    "        # Mask padded indices so their weights become 0\n",
    "        if mask is not None:\n",
    "            energy = energy.masked_fill(mask == 0, float(\"-1e20\"))\n",
    "\n",
    "        # Normalize energy values similarly to seq2seq + attention\n",
    "        # so that they sum to 1. Also divide by scaling factor for\n",
    "        # better stability\n",
    "        attention = torch.softmax(energy / (self.embed_size ** (1 / 2)), dim=3)\n",
    "        # attention shape: (N, heads, query_len, key_len)\n",
    "\n",
    "        out = torch.einsum(\"nhql,nlhd->nqhd\", [attention, values]).reshape(\n",
    "            N, query_len, self.heads * self.head_dim\n",
    "        )\n",
    "        # attention shape: (N, heads, query_len, key_len)\n",
    "        # values shape: (N, value_len, heads, head_dim)\n",
    "        # out after matrix multiply: (N, query_len, heads, head_dim), then\n",
    "        # we reshape and flatten the last two dimensions.\n",
    "\n",
    "        out = self.fc_out(out)\n",
    "        # Linear layer doesn't modify the shape, final shape will be\n",
    "        # (N, query_len, embed_size)\n",
    "\n",
    "        return out\n",
    "\n",
    "class block(nn.Module):\n",
    "    def __init__(\n",
    "        self, in_channels, intermediate_channels, out_channels, identity_downsample=None, stride=1\n",
    "    ):\n",
    "        super(block, self).__init__()\n",
    "        self.conv1 = nn.Conv2d(\n",
    "            in_channels, intermediate_channels, kernel_size=1, stride=1, padding=0\n",
    "        )\n",
    "        self.bn1 = nn.BatchNorm2d(intermediate_channels)\n",
    "        self.conv2 = nn.Conv2d(\n",
    "            intermediate_channels,\n",
    "            intermediate_channels,\n",
    "            kernel_size=3,\n",
    "            stride=stride,\n",
    "            padding=1,\n",
    "        )\n",
    "        self.bn2 = nn.BatchNorm2d(intermediate_channels)\n",
    "        self.conv3 = nn.Conv2d(\n",
    "            intermediate_channels,\n",
    "            out_channels,\n",
    "            kernel_size=1,\n",
    "            stride=1,\n",
    "            padding=0,\n",
    "        )\n",
    "        self.bn3 = nn.BatchNorm2d(out_channels)\n",
    "        self.relu = nn.ReLU()\n",
    "        self.identity_downsample = identity_downsample\n",
    "        self.stride = stride\n",
    "\n",
    "    def forward(self, x):\n",
    "        identity = x.clone()\n",
    "\n",
    "        x = self.conv1(x)\n",
    "        x = self.bn1(x)\n",
    "        x = self.relu(x)\n",
    "        x = self.conv2(x)\n",
    "        x = self.bn2(x)\n",
    "        x = self.relu(x)\n",
    "        x = self.conv3(x)\n",
    "        x = self.bn3(x)\n",
    "\n",
    "        if self.identity_downsample is not None:\n",
    "            identity = self.identity_downsample(identity)\n",
    "\n",
    "        x += identity\n",
    "        x = self.relu(x)\n",
    "        return x\n",
    "\n",
    "\n",
    "class Net(nn.Module):\n",
    "    def __init__(self, block, layers, image_channels, num_classes, expansion):\n",
    "        super(Net, self).__init__()\n",
    "        self.in_channels = 64\n",
    "        self.conv1 = nn.Conv2d(image_channels, 64, kernel_size=7, stride=2, padding=3)\n",
    "        self.bn1 = nn.BatchNorm2d(64)\n",
    "        self.relu = nn.ReLU()\n",
    "        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)\n",
    "\n",
    "        # Essentially the entire ResNet architecture is in these 4 lines below\n",
    "        self.layer1 = self._make_layer(\n",
    "            block, layers[0], intermediate_channels=64, out_channels=64*expansion, stride=1\n",
    "        )\n",
    "        self.layer2 = self._make_layer(\n",
    "            block, layers[1], intermediate_channels=128, out_channels=128*expansion, stride=2\n",
    "        )\n",
    "        self.layer3 = self._make_layer(\n",
    "            block, layers[2], intermediate_channels=256, out_channels=256*expansion, stride=2\n",
    "        )\n",
    "        self.layer4 = self._make_layer(\n",
    "            block, layers[3], intermediate_channels=512, out_channels=512*expansion, stride=2\n",
    "        )\n",
    "\n",
    "        self.attention = SelfAttention(heads=4, embed_size=512*expansion)\n",
    "\n",
    "        self.avgpool = nn.AvgPool2d((20, 1))\n",
    "\n",
    "        self.fc1 = nn.Linear(512*expansion, 512*expansion//2)\n",
    "        self.fc2 = nn.Linear(512*expansion//2, 512*expansion//4)\n",
    "        self.fc3 = nn.Linear(512*expansion//4, num_classes)\n",
    "\n",
    "    def forward(self, x):\n",
    "        # ResNet layers\n",
    "        x = self.conv1(x)\n",
    "        x = self.bn1(x)\n",
    "        x = self.relu(x)\n",
    "        x = self.maxpool(x)\n",
    "        x = self.layer1(x)\n",
    "        x = self.layer2(x)\n",
    "        x = self.layer3(x)\n",
    "        x = self.layer4(x)\n",
    "\n",
    "        x = x.reshape(x.shape[0], x.shape[2] * x.shape[3], x.shape[1])\n",
    "        # Attention layer\n",
    "        x = self.attention(x, x, x)\n",
    "        x = self.avgpool(x)\n",
    "\n",
    "        # FC layers\n",
    "        x = x.reshape(x.shape[0], -1)\n",
    "        x = self.relu(self.fc1(x))\n",
    "        x = self.relu(self.fc2(x))\n",
    "        x = self.relu(self.fc3(x))\n",
    "\n",
    "        return x\n",
    "\n",
    "    def _make_layer(self, block, num_residual_blocks, intermediate_channels, out_channels, stride):\n",
    "        identity_downsample = None\n",
    "        layers = []\n",
    "\n",
    "        # Either when we halve the input space, e.g. 56x56 -> 28x28 (stride=2), or when\n",
    "        # the channel count changes, we need to adapt the identity (skip connection)\n",
    "        # so it can be added to the layer that's ahead\n",
    "        if stride != 1 or self.in_channels != out_channels:\n",
    "            identity_downsample = nn.Sequential(\n",
    "                nn.Conv2d(\n",
    "                    self.in_channels,\n",
    "                    out_channels,\n",
    "                    kernel_size=1,\n",
    "                    stride=stride,\n",
    "                ),\n",
    "                nn.BatchNorm2d(out_channels),\n",
    "            )\n",
    "\n",
    "        layers.append(\n",
    "            block(self.in_channels, intermediate_channels, out_channels, identity_downsample, stride)\n",
    "        )\n",
    "\n",
    "        self.in_channels = out_channels\n",
    "\n",
    "        # For example, in the first resnet layer: 256 will be mapped to 64 as the intermediate\n",
    "        # layer, then finally back to 256. Hence no identity downsample is needed, since\n",
    "        # stride = 1 and the channel count stays the same.\n",
    "        for i in range(num_residual_blocks - 1):\n",
    "            layers.append(block(self.in_channels, intermediate_channels, out_channels))\n",
    "\n",
    "        return nn.Sequential(*layers)\n",
    "\n",
    "\n",
    "def Net_ResNet50(img_channel=3, num_classes=1000):\n",
    "    return Net(block, [3, 4, 6, 3], img_channel, num_classes, expansion=4)\n",
    "\n",
    "\n",
    "def Net_ResNet101(img_channel=3, num_classes=1000):\n",
    "    return Net(block, [3, 4, 23, 3], img_channel, num_classes, expansion=4)\n",
    "\n",
    "\n",
    "def Net_ResNet152(img_channel=3, num_classes=1000):\n",
    "    return Net(block, [3, 8, 36, 3], img_channel, num_classes, expansion=4)\n",
    "\n",
    "\n",
    "def test():\n",
    "    net = Net_ResNet101(img_channel=1)\n",
    "    x = torch.randn(2, 1, 300, 40)\n",
    "    y = net(x).to('cpu')\n",
    "    print(y.shape)\n",
    "\n",
    "test()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
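A quick sanity check on the (300, 40) shape printed above (a sketch, not part of the commit): librosa.load resamples to 22050 Hz by default, so duration=3 gives 66150 samples; with hop_len = int(22050 * .01) = 220 and librosa's centered framing, the mel-spectrogram has 1 + 66150 // 220 = 301 frames, and the transpose plus [:-1] drops the last one.

sample_rate = 22050                  # librosa.load default
n_samples = sample_rate * 3          # duration=3
hop_len = int(sample_rate * .01)     # 220
n_frames = 1 + n_samples // hop_len  # 301 centered frames
print(n_frames - 1, 40)              # 300 40, matching apply_melspectrogram's output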
@@ -4,7 +4,6 @@ import torch
import torch.nn as nn
import torch.nn.functional as F
-

class SelfAttention(nn.Module):
    def __init__(self, embed_size, heads):
        super(SelfAttention, self).__init__()
@@ -68,8 +67,7 @@ class SelfAttention(nn.Module):
        # (N, query_len, embed_size)

        return out
-

class block(nn.Module):
    def __init__(
        self, in_channels, intermediate_channels, out_channels, identity_downsample=None, stride=1
@@ -117,14 +115,13 @@ class block(nn.Module):
        x += identity
        x = self.relu(x)
        return x
-

class Net(nn.Module):
    def __init__(self, block, layers, image_channels, num_classes, expansion):
        super(Net, self).__init__()
        self.in_channels = 64
-        self.conv1 = nn.Conv2d(
-            image_channels, 64, kernel_size=7, stride=2, padding=3)
+        self.conv1 = nn.Conv2d(image_channels, 64, kernel_size=7, stride=2, padding=3)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU()
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
@@ -142,11 +139,11 @@ class Net(nn.Module):
        self.layer4 = self._make_layer(
            block, layers[3], intermediate_channels=512, out_channels=512*expansion, stride=2
        )

        self.attention = SelfAttention(heads=4, embed_size=512*expansion)

        self.avgpool = nn.AvgPool2d((20, 1))

        self.fc1 = nn.Linear(512*expansion, 512*expansion//2)
        self.fc2 = nn.Linear(512*expansion//2, 512*expansion//4)
        self.fc3 = nn.Linear(512*expansion//4, num_classes)
@@ -161,17 +158,17 @@ class Net(nn.Module):
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = x.reshape(x.shape[0], x.shape[2] * x.shape[3], x.shape[1])
        # Attention layer
        x = self.attention(x, x, x)
        x = self.avgpool(x)

        # FC layers
        x = x.reshape(x.shape[0], -1)
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
-        x = self.fc3(x)
+        x = self.relu(self.fc3(x))

        return x
@@ -194,21 +191,19 @@ class Net(nn.Module):
        )

        layers.append(
-            block(self.in_channels, intermediate_channels,
-                  out_channels, identity_downsample, stride)
+            block(self.in_channels, intermediate_channels, out_channels, identity_downsample, stride)
        )

        self.in_channels = out_channels

        # For example for first resnet layer: 256 will be mapped to 64 as intermediate layer,
        # then finally back to 256. Hence no identity downsample is needed, since stride = 1,
        # and also same amount of channels.
        for i in range(num_residual_blocks - 1):
-            layers.append(
-                block(self.in_channels, intermediate_channels, out_channels))
+            layers.append(block(self.in_channels, intermediate_channels, out_channels))

        return nn.Sequential(*layers)
@@ -220,3 +215,5 @@ def Net_ResNet101(img_channel=3, num_classes=1000):

def Net_ResNet152(img_channel=3, num_classes=1000):
    return Net(block, [3, 8, 36, 3], img_channel, num_classes, expansion=4)
+
+
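The AvgPool2d((20, 1)) above is tied to the input geometry: for the (2, 1, 300, 40) tensor used in test(), conv1 and the maxpool halve the map twice and layer2-4 halve it three more times, leaving a 10x2 grid with 512*expansion channels, which the reshape in forward turns into 20 attention tokens. A minimal sketch of that bookkeeping (the 10x2 size is derived from the stride arithmetic above, not stated in the commit):

import torch

feat = torch.randn(2, 2048, 10, 2)  # (N, C, H, W) after layer4 of Net_ResNet101 for a (2, 1, 300, 40) input
seq = feat.reshape(feat.shape[0], feat.shape[2] * feat.shape[3], feat.shape[1])
print(seq.shape)                    # torch.Size([2, 20, 2048]): 20 tokens, hence AvgPool2d((20, 1))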
144 ResNet/run_model.py Normal file
@@ -0,0 +1,144 @@
import os
import glob
import torch
import librosa

import numpy as np
import pandas as pd
import scipy.signal as signal

import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F

from model import Net_ResNet50

from torch.utils.data import random_split, Dataset, DataLoader
from tqdm import tqdm

# Parameters
dataset_dir = '/home/bbekci/datasets/vctk/wav48_silence_trimmed'
max_epochs = 100
batch_size = 64

# CUDA for PyTorch
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
torch.backends.cudnn.benchmark = True


class VCTKData(Dataset):
    def __init__(self, root_dir, transform=None):
        self.data = []
        self.c2i, self.i2c = {}, {}
        for indx, cla in enumerate(os.listdir(root_dir)):
            main_path = root_dir + '/' + cla + '/*.flac'
            for file_path in glob.glob(main_path):
                self.data.append((file_path, cla))

            self.c2i[cla] = indx
            self.i2c[indx] = cla

        self.transform = transform

    def __len__(self):
        return len(self.data)

    def n_class(self):
        return len(list(self.c2i.keys()))

    # With our 3 s inputs at 22050 Hz, 66150 samples is the target length
    def apply_melspectrogram(self, filename):
        target_len = 66150
        y, sample_rate = librosa.load(filename, duration=3)

        while y.shape[0] != target_len:
            y = np.append(y, y[:target_len - y.shape[0]])

        if y.shape[0] == 0:
            print("y.shape[0] == 0")
            return None

        window_time = .025
        hop_time = .01
        n_fft = int(sample_rate * window_time)

        hop_len = int(sample_rate * hop_time)

        melspectrogram = librosa.feature.melspectrogram(y=librosa.effects.preemphasis(y),
                                                        sr=sample_rate,
                                                        n_mels=40,
                                                        n_fft=n_fft,
                                                        hop_length=hop_len,
                                                        window=signal.windows.hamming)
        log_melspectrogram = librosa.power_to_db(melspectrogram, ref=np.max)

        melspectrogram = log_melspectrogram.T[:-1]

        out = np.expand_dims(melspectrogram, axis=0)

        return out

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        sound_path, label = self.data[idx]
        sample = (self.apply_melspectrogram(sound_path), self.c2i[label])

        if self.transform:
            sample = self.transform(sample)

        return sample


sound_data = VCTKData(root_dir=dataset_dir)
n_classes = sound_data.n_class()


train_data, test_data = random_split(sound_data,
                                     [int(len(sound_data) * 0.8),
                                      len(sound_data) - int(len(sound_data) * 0.8)]
                                     )

train_dataset_loader = torch.utils.data.DataLoader(train_data,
                                                   batch_size=batch_size,
                                                   shuffle=True,
                                                   num_workers=4)

test_dataset_loader = torch.utils.data.DataLoader(test_data,
                                                  batch_size=batch_size,
                                                  shuffle=True,
                                                  num_workers=4)


net = Net_ResNet50(img_channel=1, num_classes=n_classes)
net.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(net.parameters())

for epoch in range(max_epochs):  # loop over the dataset multiple times

    running_loss = 0.0
    for i, data in enumerate(train_dataset_loader):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data[0].to(device), data[1].to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 2000 == 1999:  # print every 2000 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 2000))
            running_loss = 0.0

print('Finished Training')
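run_model.py trains but never measures held-out performance; a minimal evaluation sketch (not in the commit) reusing the net, device and test_dataset_loader defined above could look like this:

correct, total = 0, 0
net.eval()                      # switch batch norm to inference statistics
with torch.no_grad():
    for inputs, labels in test_dataset_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        preds = net(inputs).argmax(dim=1)   # index of the highest logit per sample
        correct += (preds == labels).sum().item()
        total += labels.size(0)
print('test accuracy: %.3f' % (correct / total))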
@@ -21,6 +21,7 @@
},
{
"cell_type": "code",
<<<<<<< HEAD
"execution_count": 13,
"metadata": {},
"outputs": [],
@@ -29,35 +30,21 @@
"tr_txt = 'txts/tr_voxceleb_video_pkl_paths.txt'\n",
"val_txt = 'txts/val_voxceleb_video_pkl_paths.txt' \n",
"batch_size = 32"
]
},
{
"cell_type": "code",
"execution_count": 14,
=======
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"tr_gen, val_gen = get_keras_datagens(data_dir, batch_size, split_by='video', split_size=0.3, txt_dirs=[tr_txt, val_txt])"
"sample_per_person = 10\n",
"batch_size = 64\n",
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"tags": [
"outputPrepend"
]
<<<<<<< HEAD
"execution_count": 14,
>>>>>>> 90355144ce4a7d9932e7274ea8bfbec292c5288f
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"  364 2552 4947 1090 2189 3961 4623\n",
" 4922 2577  964 2048 5547 1662 4686 3146 2605 2089 3819 5493 2437 1326\n",
" 5154  940 5694 3133]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[3636 1114  831 4209  866  529 3704 2501 2403 2645 1307  386 4005 4568\n",
" 5172 5787 4859 4257  547 1116  268 1585  354 1716 5165 1408 5708 4017\n",
" 3690 4062 4107  602]\n",
@@ -612,18 +599,37 @@
"\u001b[0;32m~/inzpeech/dataloaders/DatagenVoxCeleb.py\u001b[0m in \u001b[0;36mget_batch_sample\u001b[0;34m(self, idx, batch_size)\u001b[0m\n\u001b[1;32m 49\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpp\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msample_pickle_path\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 50\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'rb'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mpickle_load\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 51\u001b[0;31m \u001b[0mloaded_sample\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpickle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpickle_load\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 52\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 53\u001b[0m \u001b[0midname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvideoname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfeatures\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mloaded_sample\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/miniconda3/envs/inzpeech/lib/python3.8/site-packages/numpy/core/numeric.py\u001b[0m in \u001b[0;36m_frombuffer\u001b[0;34m(buf, dtype, shape, order)\u001b[0m\n\u001b[1;32m 1810\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1811\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1812\u001b[0;31m \u001b[0;32mdef\u001b[0m \u001b[0m_frombuffer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbuf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mshape\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0morder\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1813\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mfrombuffer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbuf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreshape\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0morder\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0morder\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1814\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
=======
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(64, 100, 40, 1)\n",
"(64, 109)\n"
>>>>>>> 90355144ce4a7d9932e7274ea8bfbec292c5288f
]
}
],
"source": [
<<<<<<< HEAD
"for x,y in tr_gen:\n",
"    print(x.shape)\n",
"    print(y.shape)\n",
"    print(y)"
=======
"for x, y in tr_gen:\n",
"    print(x.shape)\n",
"    print(y.shape)\n",
"    break"
>>>>>>> 90355144ce4a7d9932e7274ea8bfbec292c5288f
]
},
{
"cell_type": "code",
<<<<<<< HEAD
"execution_count": 15,
"metadata": {},
"outputs": [],
@@ -694,20 +700,37 @@
],
"source": [
"model = vgg_att(num_class)"
=======
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"model = VGGish(input_shape, 109)"
>>>>>>> 90355144ce4a7d9932e7274ea8bfbec292c5288f
]
},
{
"cell_type": "code",
<<<<<<< HEAD
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"opt = Adam(lr=1e-3)\n",
"model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])"
=======
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"opt = Adam(lr=1e-4)\n",
"model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])"
>>>>>>> 90355144ce4a7d9932e7274ea8bfbec292c5288f
]
},
{
"cell_type": "code",
<<<<<<< HEAD
"execution_count": 20,
"metadata": {},
"outputs": [
@@ -738,6 +761,23 @@
"\u001b[0;32m~/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/eager/def_function.py\u001b[0m in \u001b[0;36mwrapped_fn\u001b[0;34m(*args, **kwds)\u001b[0m\n\u001b[1;32m 439\u001b[0m \u001b[0;31m# __wrapped__ allows AutoGraph to swap in a converted function. We give\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 440\u001b[0m \u001b[0;31m# the function a weak reference to itself to avoid a reference cycle.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 441\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mweak_wrapped_fn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__wrapped__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 442\u001b[0m \u001b[0mweak_wrapped_fn\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mweakref\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mref\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mwrapped_fn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 443\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/framework/func_graph.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 966\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# pylint:disable=broad-except\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 967\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mhasattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"ag_error_metadata\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 968\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mag_error_metadata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_exception\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 969\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 970\u001b[0m \u001b[0;32mraise\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mValueError\u001b[0m: in user code:\n\n /home/bbekci/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:571 train_function *\n outputs = self.distribute_strategy.run(\n /home/bbekci/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:951 run **\n return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)\n /home/bbekci/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:2290 call_for_each_replica\n return self._call_for_each_replica(fn, args, kwargs)\n /home/bbekci/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:2649 _call_for_each_replica\n return fn(*args, **kwargs)\n /home/bbekci/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:532 train_step **\n loss = self.compiled_loss(\n /home/bbekci/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/keras/engine/compile_utils.py:205 __call__\n loss_value = loss_obj(y_t, y_p, sample_weight=sw)\n /home/bbekci/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/keras/losses.py:143 __call__\n losses = self.call(y_true, y_pred)\n /home/bbekci/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/keras/losses.py:246 call\n return self.fn(y_true, y_pred, **self._fn_kwargs)\n /home/bbekci/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/keras/losses.py:1527 categorical_crossentropy\n return K.categorical_crossentropy(y_true, y_pred, from_logits=from_logits)\n /home/bbekci/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/keras/backend.py:4561 categorical_crossentropy\n target.shape.assert_is_compatible_with(output.shape)\n /home/bbekci/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/framework/tensor_shape.py:1117 assert_is_compatible_with\n raise ValueError(\"Shapes %s and %s are incompatible\" % (self, other))\n\n ValueError: Shapes (None, 1) and (None, 1251) are incompatible\n"
=======
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 1/20\n",
"14/14 [==============================] - 31s 2s/step - loss: 9.0328 - accuracy: 0.0238 - val_loss: 32030.5371 - val_accuracy: 0.0202\n",
"Epoch 2/20\n",
"14/14 [==============================] - 31s 2s/step - loss: 5.8168 - accuracy: 0.0306 - val_loss: 12789.2061 - val_accuracy: 0.0202\n",
"Epoch 3/20\n",
"14/14 [==============================] - 32s 2s/step - loss: 5.0801 - accuracy: 0.0522 - val_loss: 4288.9995 - val_accuracy: 0.0000e+00\n",
"Epoch 4/20\n",
"14/14 [==============================] - ETA: 0s - loss: 5.0360 - accuracy: 0.0782"
>>>>>>> 90355144ce4a7d9932e7274ea8bfbec292c5288f
]
}
],
@@ -755,6 +795,7 @@
],
"metadata": {
"kernelspec": {
<<<<<<< HEAD
"name": "Python 3.8.5 64-bit ('inzpeech': conda)",
"display_name": "Python 3.8.5 64-bit ('inzpeech': conda)",
"metadata": {
@@ -762,6 +803,11 @@
"hash": "fcc15a4440aa802b6aa76ba989d07fd1e1f9e303ad2563ebf174689c6e63879d"
}
}
=======
"display_name": "Python 3",
"language": "python",
"name": "python3"
>>>>>>> 90355144ce4a7d9932e7274ea8bfbec292c5288f
},
"language_info": {
"codemirror_mode": {
@@ -773,9 +819,17 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
<<<<<<< HEAD
"version": "3.8.5-final"
=======
"version": "3.8.5"
>>>>>>> 90355144ce4a7d9932e7274ea8bfbec292c5288f
}
},
"nbformat": 4,
"nbformat_minor": 4
}
<<<<<<< HEAD
}
=======
}
>>>>>>> 90355144ce4a7d9932e7274ea8bfbec292c5288f
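One detail in the conflicted cells above: the incoming branch constructs opt = Adam(lr=1e-4) but then compiles with the string 'adam', so the custom learning rate is silently ignored. A sketch of the fix (the Adam import is assumed; the notebook's own import statement is not shown in this diff):

from tensorflow.keras.optimizers import Adam  # assumed import

opt = Adam(lr=1e-4)
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])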
65 datagen_vctk.py Normal file
@@ -0,0 +1,65 @@
#!/usr/bin/env python
# coding: utf-8
from tensorflow.keras.utils import Sequence, to_categorical
from load_vctk import get_model_data
import math
import numpy as np
import os


data_main_dir = os.path.join('..', 'datasets', 'vctk', 'wav48_silence_trimmed')

class VCTKDatagen(Sequence):
    def __init__(self, audio_paths, labels, batch_size, num_class, audio_load_func, shuffle=False):
        self.aud_paths = audio_paths
        self.labels = labels
        self.b_size = batch_size
        self.num_class = num_class
        self.audio_load_func = audio_load_func
        self.shuffle = shuffle

    def __len__(self):
        return math.ceil(len(self.aud_paths) / self.b_size)

    def __getitem__(self, idx):

        # Get portion of data for batch
        batch_paths = self.aud_paths[idx*self.b_size:(idx+1)*self.b_size]
        batch_labels = self.labels[idx*self.b_size:(idx+1)*self.b_size]

        model_in = np.array([self.audio_load_func(ap) for ap in batch_paths])
        model_out = to_categorical(batch_labels, num_classes=self.num_class)

        return np.expand_dims(model_in, axis=-1), model_out

    def on_epoch_end(self):
        if self.shuffle:
            idx = np.arange(len(self.aud_paths))
            np.random.shuffle(idx)
            self.aud_paths = np.array(self.aud_paths)[idx].tolist()
            self.labels = np.array(self.labels)[idx].tolist()

def get_datagen(sample_per_person, batch_size, audio_load_func, split=[0.1, 0.1], shuffle=True, mics=[1, 2]):
    """
    Get datagens for the VCTK dataset.
    Params:
        sample_per_person: Number of samples to select for each person.
        batch_size: Batch size of the model.
        audio_load_func: Function used to load an audio file into a model input.
        split: Ratios for the test and validation sets. Defaults are 0.1 for test and 0.1 for validation.
        shuffle: Whether to shuffle the paths and labels before returning them. If False, consecutive audio files
                 will be obtained from the same person.
        mics: Mic numbers of the selected audio samples. Can be one of [1], [2], [1, 2]. If both mics are included,
              the same utterance may be returned once per mic.
    Returns:
        Datagens for the train, validation and test sets.
    """
    [tr_aud, tr_label], [val_aud, val_label], [te_aud, te_label] = get_model_data(data_main_dir, sample_per_person, split, shuffle, mics)

    # -2 for the s5 and log.txt entries in the dataset directory
    n_person = len(os.listdir(data_main_dir)) - 2
    tr_gen = VCTKDatagen(tr_aud, tr_label, batch_size, n_person, audio_load_func, shuffle)
    val_gen = VCTKDatagen(val_aud, val_label, batch_size, n_person, audio_load_func, shuffle)
    te_gen = VCTKDatagen(te_aud, te_label, batch_size, n_person, audio_load_func, shuffle)

    return tr_gen, val_gen, te_gen
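A usage sketch for get_datagen (assumed, not in the commit): audio_load_func is any callable mapping a file path to a fixed-size array, e.g. mel-spectrogram extraction along the lines of ResNet/run_model.py; load_mel below is a hypothetical such loader returning a (300, 40) array, and the generator appends the channel axis itself.

tr_gen, val_gen, te_gen = get_datagen(sample_per_person=10,
                                      batch_size=64,
                                      audio_load_func=load_mel,  # hypothetical (300, 40) loader
                                      split=[0.1, 0.1],
                                      shuffle=True,
                                      mics=[1])
x, y = tr_gen[0]             # first batch
print(x.shape, y.shape)      # (64, 300, 40, 1) and (64, n_person) after to_categorical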
145 model-keras.py Normal file
@@ -0,0 +1,145 @@
import keras
from keras_self_attention import SeqSelfAttention
import numpy as np
import matplotlib.pyplot as plt
from keras.models import Sequential, Model
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Flatten
from keras.layers import Input, Layer, Reshape, AveragePooling1D
from keras.constraints import maxnorm
from keras.optimizers import SGD
from keras.layers import Activation
from keras.layers.convolutional import Conv2D, MaxPooling2D, ZeroPadding2D, AveragePooling2D
from keras.layers.normalization import BatchNormalization
from keras import backend as K
from keras import initializers, regularizers, constraints

class SelfAttention(Layer):
    def __init__(self,
                 n_hop,
                 hidden_dim,
                 penalty=1.0,
                 return_attention=False,
                 kernel_initializer='glorot_uniform',
                 kernel_regularizer=None,
                 kernel_constraint=None,
                 **kwargs):
        self.n_hop = n_hop
        self.hidden_dim = hidden_dim
        self.penalty = penalty
        self.kernel_initializer = keras.initializers.get(kernel_initializer)
        self.kernel_regularizer = keras.regularizers.get(kernel_regularizer)
        self.kernel_constraint = keras.constraints.get(kernel_constraint)
        self.return_attention = return_attention
        super(SelfAttention, self).__init__(**kwargs)

    def build(self, input_shape):
        # input_shape: (None, Sequence_size, Sequence_hidden_dim)
        assert len(input_shape) >= 3
        batch_size, sequence_size, sequence_hidden_dim = input_shape

        self.Ws1 = self.add_weight(shape=(self.hidden_dim, sequence_hidden_dim),
                                   initializer=self.kernel_initializer,
                                   name='SelfAttention-Ws1',
                                   regularizer=self.kernel_regularizer,
                                   constraint=self.kernel_constraint)

        self.Ws2 = self.add_weight(shape=(self.n_hop, self.hidden_dim),
                                   initializer=self.kernel_initializer,
                                   name='SelfAttention-Ws2',
                                   regularizer=self.kernel_regularizer,
                                   constraint=self.kernel_constraint)

        super(SelfAttention, self).build(input_shape)

    def call(self, inputs):
        batch_size = K.cast(K.shape(inputs)[0], K.floatx())
        inputs_t = K.permute_dimensions(inputs, (1, 2, 0))  # H.T
        d1 = K.tanh(K.permute_dimensions(K.dot(self.Ws1, inputs_t), (2, 0, 1)))  # d1 = tanh(dot(Ws1, H.T))
        d1 = K.permute_dimensions(d1, (2, 1, 0))
        A = K.softmax(K.permute_dimensions(K.dot(self.Ws2, d1), (2, 0, 1)))  # A = softmax(dot(Ws2, d1))
        H = K.permute_dimensions(inputs, (0, 2, 1))
        outputs = K.batch_dot(A, H, axes=2)  # M = AH

        A_t = K.permute_dimensions(A, (0, 2, 1))
        I = K.eye(self.n_hop)
        P = K.square(self._frobenius_norm(K.batch_dot(A, A_t) - I))  # P = (frobenius_norm(dot(A, A.T) - I))**2
        self.add_loss(self.penalty * (P / batch_size))

        if self.return_attention:
            return [outputs, A]
        else:
            return outputs

    def compute_output_shape(self, input_shape):
        assert input_shape and len(input_shape) >= 3
        assert input_shape[-1]
        batch_size, sequence_size, sequence_hidden_dim = input_shape
        output_shape = tuple([batch_size, self.n_hop, sequence_hidden_dim])

        if self.return_attention:
            attention_shape = tuple([batch_size, self.n_hop, sequence_size])
            return [output_shape, attention_shape]
        else:
            return output_shape

    def get_config(self):
        config = {
            'n_hop': self.n_hop,
            'hidden_dim': self.hidden_dim,
            'penalty': self.penalty,
            'return_attention': self.return_attention,
            'kernel_initializer': initializers.serialize(self.kernel_initializer),
            'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
            'kernel_constraint': constraints.serialize(self.kernel_constraint)
        }
        base_config = super(SelfAttention, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

    def _frobenius_norm(self, inputs):
        outputs = K.sqrt(K.sum(K.square(inputs)))
        return outputs


def vgg_att(num_class=1251):
    # log-mel input; the trailing channel axis is needed so Conv2D sees a 4D tensor
    inputs = keras.Input(shape=(300, 40, 1))
    x = Conv2D(64, (3, 3), padding='same', name='block1_conv1', activation='relu')(inputs)
    x = Conv2D(64, (3, 3), padding='same', name='block1_conv2', activation='relu')(x)
    x = BatchNormalization()(x)
    x = MaxPooling2D(pool_size=(2, 2), strides=(2, 2))(x)

    x = Conv2D(128, (3, 3), padding='same', name='block2_conv1', activation='relu')(x)
    x = Conv2D(128, (3, 3), padding='same', name='block2_conv2', activation='relu')(x)
    x = BatchNormalization()(x)
    x = MaxPooling2D(pool_size=(2, 2), strides=(2, 2))(x)

    x = Conv2D(256, (3, 3), padding='same', name='block3_conv1', activation='relu')(x)
    x = Conv2D(256, (3, 3), padding='same', name='block3_conv2', activation='relu')(x)
    x = BatchNormalization()(x)
    x = MaxPooling2D(pool_size=(2, 2), strides=(2, 2))(x)

    x = Conv2D(512, (3, 3), padding='same', name='block4_conv1', activation='relu')(x)
    x = Conv2D(512, (3, 3), padding='same', name='block4_conv2', activation='relu')(x)
    x = BatchNormalization()(x)
    x = MaxPooling2D(pool_size=(2, 2), strides=(2, 2))(x)

    # SelfAttention expects (batch, steps, features): flatten the spatial grid into steps
    x = Reshape((-1, 512))(x)
    att = SelfAttention(n_hop=4, hidden_dim=1536)
    x = att(x)
    x = AveragePooling1D(pool_size=4)(x)  # average over the n_hop attention outputs
    x = Flatten()(x)
    x = Dense(256, activation='relu')(x)
    output = Dense(num_class, activation='softmax')(x)
    model = Model(inputs=inputs, outputs=output)

    model.compile(loss='categorical_crossentropy', optimizer='adam')  # needs hyperparameter tuning
    model.summary()
    return model
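The penalty added in call is the orthogonality regularizer from Lin et al.'s structured self-attention: P = (frobenius_norm(dot(A, A.T) - I))**2 pushes the n_hop attention distributions to attend to different frames. A standalone shape check (a sketch, assuming the same-era Keras imported above):

from keras.layers import Input

seq = Input(shape=(36, 512))                       # (steps, features), as the Reshape in vgg_att produces
att_out = SelfAttention(n_hop=4, hidden_dim=1536)(seq)
print(att_out.shape)                               # (None, 4, 512): one pooled vector per attention hop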
49 vggish.py Normal file
@@ -0,0 +1,49 @@
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Flatten, Dense, Input, Conv2D, MaxPooling2D, GlobalAveragePooling2D, GlobalMaxPooling2D, Activation, BatchNormalization
from tensorflow.keras import backend as K

def VGGish(input_shape, num_classes):

    aud_input = Input(shape=input_shape, name='input_1')

    # Block 1
    x = Conv2D(64, (3, 3), strides=(1, 1), activation=None, padding='same', name='conv1')(aud_input)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='pool1')(x)

    # Block 2
    x = Conv2D(128, (3, 3), strides=(1, 1), activation=None, padding='same', name='conv2')(x)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='pool2')(x)

    # Block 3
    x = Conv2D(256, (3, 3), strides=(1, 1), activation=None, padding='same', name='conv3/conv3_1')(x)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = Conv2D(256, (3, 3), strides=(1, 1), activation=None, padding='same', name='conv3/conv3_2')(x)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='pool3')(x)

    # Block 4
    x = Conv2D(512, (3, 3), strides=(1, 1), activation=None, padding='same', name='conv4/conv4_1')(x)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = Conv2D(512, (3, 3), strides=(1, 1), activation=None, padding='same', name='conv4/conv4_2')(x)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='pool4')(x)

    x = Flatten(name='flatten_')(x)
    x = Dense(4096, activation=None, name='vggish_fc1/fc1_1')(x)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = Dense(4096, activation=None, name='vggish_fc1/fc1_2')(x)
    x = BatchNormalization()(x)
    preds = Dense(num_classes, activation='softmax', name='vggish_fc2')(x)

    model = Model(aud_input, preds, name='VGGish')

    return model
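A usage sketch (assumed, not in the commit), matching the (64, 100, 40, 1) batches and 109 classes printed in the conflicted notebook above:

model = VGGish(input_shape=(100, 40, 1), num_classes=109)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()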