mirror of https://github.com/inzva/inzpeech.git synced 2021-06-01 09:25:07 +03:00
This commit is contained in:
Bekci
2020-11-13 20:58:59 +03:00
9 changed files with 1016 additions and 41 deletions

File diff suppressed because one or more lines are too long

BIN
ResNet/.DS_Store vendored Normal file

Binary file not shown.

370
ResNet/Resnet.ipynb Normal file
View File

@@ -0,0 +1,370 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from os import listdir\n",
"from os.path import isfile, join\n",
"import numpy as np\n",
"import librosa\n",
"import matplotlib.pyplot as plt\n",
"from scipy import signal\n",
"\n",
"\n",
"def apply_melspectrogram(filename):\n",
" y, sample_rate = librosa.load(filename,duration=3)\n",
"\n",
" if y.shape[0] == 0:\n",
" print(\"y.shape[0] == 0\")\n",
" return None\n",
" \n",
" window_time = .025\n",
" hop_time = .01\n",
" n_fft = sample_rate * window_time\n",
"\n",
" hop_len = sample_rate * hop_time\n",
"\n",
" melspectrogram = librosa.feature.melspectrogram(y=librosa.effects.preemphasis(y),\n",
" sr=sample_rate,\n",
" n_mels=40,\n",
" n_fft=int(n_fft), \n",
" hop_length = int(hop_len),\n",
" window=signal.windows.hamming)\n",
" log_melspectrogram = librosa.power_to_db(melspectrogram, ref=np.max)\n",
"\n",
" melspectrogram = log_melspectrogram.T[:-1]\n",
"\n",
"\n",
" return melspectrogram\n"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(300, 40)\n"
]
}
],
"source": [
"sound = apply_melspectrogram('data/1.wav')\n",
"print(sound.shape)"
]
},
{
"cell_type": "code",
"execution_count": 92,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"torch.Size([2, 1000])\n"
]
}
],
"source": [
"import math\n",
"\n",
"import torch\n",
"import torch.nn as nn\n",
"import torch.nn.functional as F\n",
"\n",
"class SelfAttention(nn.Module):\n",
" def __init__(self, embed_size, heads):\n",
" super(SelfAttention, self).__init__()\n",
" self.embed_size = embed_size\n",
" self.heads = heads\n",
" self.head_dim = embed_size // heads\n",
"\n",
" assert (\n",
" self.head_dim * heads == embed_size\n",
" ), \"Embedding size needs to be divisible by heads\"\n",
"\n",
" self.values = nn.Linear(self.head_dim, self.head_dim, bias=False)\n",
" self.keys = nn.Linear(self.head_dim, self.head_dim, bias=False)\n",
" self.queries = nn.Linear(self.head_dim, self.head_dim, bias=False)\n",
" self.fc_out = nn.Linear(heads * self.head_dim, embed_size)\n",
"\n",
" def forward(self, values, keys, query, mask=None):\n",
" # Get number of training examples\n",
" N = query.shape[0]\n",
"\n",
" value_len, key_len, query_len = values.shape[1], keys.shape[1], query.shape[1]\n",
"\n",
" # Split the embedding into self.heads different pieces\n",
" values = values.reshape(N, value_len, self.heads, self.head_dim)\n",
" keys = keys.reshape(N, key_len, self.heads, self.head_dim)\n",
" query = query.reshape(N, query_len, self.heads, self.head_dim)\n",
"\n",
" values = self.values(values) # (N, value_len, heads, head_dim)\n",
" keys = self.keys(keys) # (N, key_len, heads, head_dim)\n",
" queries = self.queries(query) # (N, query_len, heads, heads_dim)\n",
"\n",
" # Einsum does matrix mult. for query*keys for each training example\n",
" # with every other training example, don't be confused by einsum\n",
" # it's just how I like doing matrix multiplication & bmm\n",
"\n",
" energy = torch.einsum(\"nqhd,nkhd->nhqk\", [queries, keys])\n",
" # queries shape: (N, query_len, heads, heads_dim),\n",
" # keys shape: (N, key_len, heads, heads_dim)\n",
" # energy: (N, heads, query_len, key_len)\n",
"\n",
" # Mask padded indices so their weights become 0\n",
" if mask is not None:\n",
" energy = energy.masked_fill(mask == 0, float(\"-1e20\"))\n",
"\n",
" # Normalize energy values similarly to seq2seq + attention\n",
" # so that they sum to 1. Also divide by scaling factor for\n",
" # better stability\n",
" attention = torch.softmax(energy / (self.embed_size ** (1 / 2)), dim=3)\n",
" # attention shape: (N, heads, query_len, key_len)\n",
"\n",
" out = torch.einsum(\"nhql,nlhd->nqhd\", [attention, values]).reshape(\n",
" N, query_len, self.heads * self.head_dim\n",
" )\n",
" # attention shape: (N, heads, query_len, key_len)\n",
" # values shape: (N, value_len, heads, heads_dim)\n",
" # out after matrix multiply: (N, query_len, heads, head_dim), then\n",
" # we reshape and flatten the last two dimensions.\n",
"\n",
" out = self.fc_out(out)\n",
" # Linear layer doesn't modify the shape, final shape will be\n",
" # (N, query_len, embed_size)\n",
"\n",
" return out\n",
" \n",
"class block(nn.Module):\n",
" def __init__(\n",
" self, in_channels, intermediate_channels, out_channels, identity_downsample=None, stride=1\n",
" ):\n",
" super(block, self).__init__()\n",
" self.conv1 = nn.Conv2d(\n",
" in_channels, intermediate_channels, kernel_size=1, stride=1, padding=0\n",
" )\n",
" self.bn1 = nn.BatchNorm2d(intermediate_channels)\n",
" self.conv2 = nn.Conv2d(\n",
" intermediate_channels,\n",
" intermediate_channels,\n",
" kernel_size=3,\n",
" stride=stride,\n",
" padding=1,\n",
" )\n",
" self.bn2 = nn.BatchNorm2d(intermediate_channels)\n",
" self.conv3 = nn.Conv2d(\n",
" intermediate_channels,\n",
" out_channels,\n",
" kernel_size=1,\n",
" stride=1,\n",
" padding=0,\n",
" )\n",
" self.bn3 = nn.BatchNorm2d(out_channels)\n",
" self.relu = nn.ReLU()\n",
" self.identity_downsample = identity_downsample\n",
" self.stride = stride\n",
"\n",
" def forward(self, x):\n",
" identity = x.clone()\n",
"\n",
" x = self.conv1(x)\n",
" x = self.bn1(x)\n",
" x = self.relu(x)\n",
" x = self.conv2(x)\n",
" x = self.bn2(x)\n",
" x = self.relu(x)\n",
" x = self.conv3(x)\n",
" x = self.bn3(x)\n",
"\n",
" if self.identity_downsample is not None:\n",
" identity = self.identity_downsample(identity)\n",
"\n",
" x += identity\n",
" x = self.relu(x)\n",
" return x\n",
" \n",
"\n",
"class Net(nn.Module):\n",
" def __init__(self, block, layers, image_channels, num_classes, expansion):\n",
" super(Net, self).__init__()\n",
" self.in_channels = 64\n",
" self.conv1 = nn.Conv2d(image_channels, 64, kernel_size=7, stride=2, padding=3)\n",
" self.bn1 = nn.BatchNorm2d(64)\n",
" self.relu = nn.ReLU()\n",
" self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)\n",
"\n",
" # Essentially the entire ResNet architecture are in these 4 lines below\n",
" self.layer1 = self._make_layer(\n",
" block, layers[0], intermediate_channels=64, out_channels=64*expansion, stride=1\n",
" )\n",
" self.layer2 = self._make_layer(\n",
" block, layers[1], intermediate_channels=128, out_channels=128*expansion, stride=2\n",
" )\n",
" self.layer3 = self._make_layer(\n",
" block, layers[2], intermediate_channels=256, out_channels=256*expansion, stride=2\n",
" )\n",
" self.layer4 = self._make_layer(\n",
" block, layers[3], intermediate_channels=512, out_channels=512*expansion, stride=2\n",
" )\n",
" \n",
" self.attention = SelfAttention(heads=4, embed_size=512*expansion)\n",
" \n",
" self.avgpool = nn.AvgPool2d((20, 1))\n",
" \n",
" self.fc1 = nn.Linear(512*expansion, 512*expansion//2)\n",
" self.fc2 = nn.Linear(512*expansion//2, 512*expansion//4)\n",
" self.fc3 = nn.Linear(512*expansion//4, num_classes)\n",
"\n",
" def forward(self, x):\n",
" # ResNet layer\n",
" x = self.conv1(x)\n",
" x = self.bn1(x)\n",
" x = self.relu(x)\n",
" x = self.maxpool(x)\n",
" x = self.layer1(x)\n",
" x = self.layer2(x)\n",
" x = self.layer3(x)\n",
" x = self.layer4(x)\n",
" \n",
" x = x.reshape(x.shape[0], x.shape[2] * x.shape[3], x.shape[1])\n",
" # Attenntion Layer\n",
" x = self.attention(x, x, x)\n",
" x = self.avgpool(x)\n",
" \n",
" # FC Layer\n",
" x = x.reshape(x.shape[0], -1)\n",
" x = self.relu(self.fc1(x))\n",
" x = self.relu(self.fc2(x))\n",
" x = self.relu(self.fc3(x))\n",
"\n",
" return x\n",
"\n",
" def _make_layer(self, block, num_residual_blocks, intermediate_channels, out_channels, stride):\n",
" identity_downsample = None\n",
" layers = []\n",
"\n",
" # Either if we half the input space for ex, 56x56 -> 28x28 (stride=2), or channels changes\n",
" # we need to adapt the Identity (skip connection) so it will be able to be added\n",
" # to the layer that's ahead\n",
" if stride != 1 or self.in_channels != out_channels:\n",
" identity_downsample = nn.Sequential(\n",
" nn.Conv2d(\n",
" self.in_channels,\n",
" out_channels,\n",
" kernel_size=1,\n",
" stride=stride,\n",
" ),\n",
" nn.BatchNorm2d(out_channels),\n",
" )\n",
"\n",
" layers.append(\n",
" block(self.in_channels, intermediate_channels, out_channels, identity_downsample, stride)\n",
" )\n",
" \n",
" self.in_channels = out_channels\n",
"\n",
" # For example for first resnet layer: 256 will be mapped to 64 as intermediate layer,\n",
" # then finally back to 256. Hence no identity downsample is needed, since stride = 1,\n",
" # and also same amount of channels.\n",
" for i in range(num_residual_blocks - 1):\n",
" layers.append(block(self.in_channels, intermediate_channels, out_channels))\n",
"\n",
" return nn.Sequential(*layers)\n",
" \n",
"\n",
"def Net_ResNet50(img_channel=3, num_classes=1000):\n",
" return Net(block, [3, 4, 6, 3], img_channel, num_classes, expansion=4)\n",
"\n",
"\n",
"def Net_ResNet101(img_channel=3, num_classes=1000):\n",
" return Net(block, [3, 4, 23, 3], img_channel, num_classes, expansion=4)\n",
"\n",
"\n",
"def Net_ResNet152(img_channel=3, num_classes=1000):\n",
" return Net(block, [3, 8, 36, 3], img_channel, num_classes, expansion=4)\n",
"\n",
"\n",
"def test():\n",
" net = Net_ResNet101(img_channel=1)\n",
" x = torch.randn(2, 1, 300, 40)\n",
" y = net(x).to('cpu')\n",
" print(y.shape)\n",
" \n",
"test()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
},
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 4
}
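For reference, a rough shape walk-through of Net.forward for the (2, 1, 300, 40) log-mel input used in test() above; the numbers below follow from the layer hyperparameters in the notebook and explain the AvgPool2d((20, 1)) and the embed_size=512*expansion attention.

# Shape bookkeeping for Net (expansion=4) on a (batch, 1, 300, 40) input:
#   conv1   (k=7, s=2, p=3) -> (batch,   64, 150, 20)
#   maxpool (k=3, s=2, p=1) -> (batch,   64,  75, 10)
#   layer1  (stride 1)      -> (batch,  256,  75, 10)
#   layer2  (stride 2)      -> (batch,  512,  38,  5)
#   layer3  (stride 2)      -> (batch, 1024,  19,  3)
#   layer4  (stride 2)      -> (batch, 2048,  10,  2)
# reshape -> (batch, 10*2 = 20, 2048): 20 time-frequency positions, each a
# 2048-dim vector, so SelfAttention runs with embed_size=2048 and
# AvgPool2d((20, 1)) averages the 20 positions into a single 2048-dim embedding
# before the fully connected layers.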

View File

@@ -4,7 +4,6 @@ import torch
import torch.nn as nn
import torch.nn.functional as F
class SelfAttention(nn.Module):
def __init__(self, embed_size, heads):
super(SelfAttention, self).__init__()
@@ -68,8 +67,7 @@ class SelfAttention(nn.Module):
# (N, query_len, embed_size)
return out
class block(nn.Module):
def __init__(
self, in_channels, intermediate_channels, out_channels, identity_downsample=None, stride=1
@@ -117,14 +115,13 @@ class block(nn.Module):
x += identity
x = self.relu(x)
return x
class Net(nn.Module):
def __init__(self, block, layers, image_channels, num_classes, expansion):
super(Net, self).__init__()
self.in_channels = 64
self.conv1 = nn.Conv2d(
image_channels, 64, kernel_size=7, stride=2, padding=3)
self.conv1 = nn.Conv2d(image_channels, 64, kernel_size=7, stride=2, padding=3)
self.bn1 = nn.BatchNorm2d(64)
self.relu = nn.ReLU()
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
@@ -142,11 +139,11 @@ class Net(nn.Module):
self.layer4 = self._make_layer(
block, layers[3], intermediate_channels=512, out_channels=512*expansion, stride=2
)
self.attention = SelfAttention(heads=4, embed_size=512*expansion)
self.avgpool = nn.AvgPool2d((20, 1))
self.fc1 = nn.Linear(512*expansion, 512*expansion//2)
self.fc2 = nn.Linear(512*expansion//2, 512*expansion//4)
self.fc3 = nn.Linear(512*expansion//4, num_classes)
@@ -161,17 +158,17 @@ class Net(nn.Module):
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
x = x.reshape(x.shape[0], x.shape[2] * x.shape[3], x.shape[1])
# Attention Layer
# Attenntion Layer
x = self.attention(x, x, x)
x = self.avgpool(x)
# FC Layer
x = x.reshape(x.shape[0], -1)
x = self.relu(self.fc1(x))
x = self.relu(self.fc2(x))
x = self.fc3(x)
x = self.relu(self.fc3(x))
return x
@@ -194,21 +191,19 @@ class Net(nn.Module):
)
layers.append(
block(self.in_channels, intermediate_channels,
out_channels, identity_downsample, stride)
block(self.in_channels, intermediate_channels, out_channels, identity_downsample, stride)
)
self.in_channels = out_channels
# For example for first resnet layer: 256 will be mapped to 64 as intermediate layer,
# then finally back to 256. Hence no identity downsample is needed, since stride = 1,
# and also same amount of channels.
for i in range(num_residual_blocks - 1):
layers.append(
block(self.in_channels, intermediate_channels, out_channels))
layers.append(block(self.in_channels, intermediate_channels, out_channels))
return nn.Sequential(*layers)
def Net_ResNet50(img_channel=3, num_classes=1000):
return Net(block, [3, 4, 6, 3], img_channel, num_classes, expansion=4)
@@ -220,3 +215,5 @@ def Net_ResNet101(img_channel=3, num_classes=1000):
def Net_ResNet152(img_channel=3, num_classes=1000):
return Net(block, [3, 8, 36, 3], img_channel, num_classes, expansion=4)
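A minimal standalone shape check for the SelfAttention module touched in the diff above (a sketch; it assumes the class can be imported from the project's model module, the same module run_model.py imports Net_ResNet50 from — otherwise paste the class definition from the notebook):

import torch
from model import SelfAttention  # assumed import path

attn = SelfAttention(embed_size=2048, heads=4)
x = torch.randn(2, 20, 2048)   # (batch, positions, embed), as produced by the reshape in Net.forward
out = attn(x, x, x)            # self-attention: values = keys = queries
print(out.shape)               # expected: torch.Size([2, 20, 2048])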

144
ResNet/run_model.py Normal file
View File

@@ -0,0 +1,144 @@
import os
import glob
import torch
import librosa
import numpy as np
import pandas as pd
import scipy.signal as signal
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from model import Net_ResNet50
from torch.utils.data import random_split, Dataset, DataLoader
from tqdm import tqdm
# Parameters
dataset_dir = '/home/bbekci/datasets/vctk/wav48_silence_trimmed'
max_epochs = 100
batch_size = 64
# CUDA for PyTorch
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
torch.backends.cudnn.benchmark = True
class VCTKData(Dataset):
def __init__(self, root_dir, transform=None):
self.data = []
self.c2i, self.i2c = {}, {}
for indx, cla in enumerate(os.listdir(root_dir)):
main_path = root_dir + '/' + cla + '/*.flac'
for file_path in glob.glob(main_path):
self.data.append((file_path, cla))
self.c2i[cla] = indx
self.i2c[indx] = cla
self.transform = transform
def __len__(self):
return len(self.data)
def n_class(self):
return len(list(self.c2i.keys()))
# According to our input 66150 is the length
def apply_melspectrogram(self, filename):
target_len = 66150
y, sample_rate = librosa.load(filename, duration=3)
if y.shape[0] == 0:
print("y.shape[0] == 0")
return None
# Repeat the waveform until it reaches the target length, then truncate
while y.shape[0] < target_len:
y = np.append(y, y[:target_len - y.shape[0]])
y = y[:target_len]
window_time = .025
hop_time = .01
n_fft = int(sample_rate * window_time)
hop_len = int(sample_rate * hop_time)
melspectrogram = librosa.feature.melspectrogram(y=librosa.effects.preemphasis(y),
sr=sample_rate,
n_mels=40,
n_fft=n_fft,
hop_length=hop_len,
window=signal.windows.hamming)
log_melspectrogram = librosa.power_to_db(melspectrogram, ref=np.max)
melspectrogram = log_melspectrogram.T[:-1]
out = np.expand_dims(melspectrogram, axis=0)
return out
def __getitem__(self, idx):
if torch.is_tensor(idx):
idx = idx.tolist()
sound_path, label = self.data[idx]
sample = (self.apply_melspectrogram(sound_path), self.c2i[label])
if self.transform:
sample = self.transform(sample)
return sample
sound_data = VCTKData(root_dir=dataset_dir)
n_classes = sound_data.n_class()
train_data, test_data = random_split(sound_data,
[int(len(sound_data) * 0.8),
len(sound_data) - int(len(sound_data) * 0.8)]
)
train_dataset_loader = torch.utils.data.DataLoader(train_data,
batch_size=batch_size,
shuffle=True,
num_workers=4)
test_dataset_loader = torch.utils.data.DataLoader(test_data,
batch_size=batch_size,
shuffle=True,
num_workers=4)
net = Net_ResNet50(img_channel=1, num_classes=n_classes)
net.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(net.parameters())
for epoch in range(max_epochs): # loop over the dataset multiple times
running_loss = 0.0
for i, data in enumerate(train_dataset_loader):
# get the inputs; data is a list of [inputs, labels]
inputs, labels = data[0].to(device), data[1].to(device)
# zero the parameter gradients
optimizer.zero_grad()
# forward + backward + optimize
outputs = net(inputs)
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()
# print statistics
running_loss += loss.item()
if i % 2000 == 1999: # print every 2000 mini-batches
print('[%d, %5d] loss: %.3f' %
(epoch + 1, i + 1, running_loss / 2000))
running_loss = 0.0
print('Finished Training')
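run_model.py builds test_dataset_loader but stops after training; a minimal evaluation sketch that could follow the loop above, reusing net, device and test_dataset_loader to report accuracy on the held-out split:

net.eval()
correct, total = 0, 0
with torch.no_grad():
    for inputs, labels in test_dataset_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = net(inputs)
        predictions = outputs.argmax(dim=1)
        total += labels.size(0)
        correct += (predictions == labels).sum().item()
print('Test accuracy: %.3f' % (correct / total))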

View File

@@ -21,6 +21,7 @@
},
{
"cell_type": "code",
<<<<<<< HEAD
"execution_count": 13,
"metadata": {},
"outputs": [],
@@ -29,35 +30,21 @@
"tr_txt = 'txts/tr_voxceleb_video_pkl_paths.txt'\n",
"val_txt = 'txts/val_voxceleb_video_pkl_paths.txt' \n",
"batch_size = 32"
]
},
{
"cell_type": "code",
"execution_count": 14,
=======
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"tr_gen, val_gen = get_keras_datagens(data_dir, batch_size, split_by='video', split_size=0.3, txt_dirs=[tr_txt, val_txt])"
"sample_per_person = 10\n",
"batch_size = 64\n",
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"tags": [
"outputPrepend"
]
<<<<<<< HEAD
"execution_count": 14,
>>>>>>> 90355144ce4a7d9932e7274ea8bfbec292c5288f
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
" 364 2552 4947 1090 2189 3961 4623\n",
" 4922 2577 964 2048 5547 1662 4686 3146 2605 2089 3819 5493 2437 1326\n",
" 5154 940 5694 3133]\n",
"(32, 300, 40)\n",
"(32,)\n",
"[3636 1114 831 4209 866 529 3704 2501 2403 2645 1307 386 4005 4568\n",
" 5172 5787 4859 4257 547 1116 268 1585 354 1716 5165 1408 5708 4017\n",
" 3690 4062 4107 602]\n",
@@ -612,18 +599,37 @@
"\u001b[0;32m~/inzpeech/dataloaders/DatagenVoxCeleb.py\u001b[0m in \u001b[0;36mget_batch_sample\u001b[0;34m(self, idx, batch_size)\u001b[0m\n\u001b[1;32m 49\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpp\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msample_pickle_path\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 50\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'rb'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mpickle_load\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 51\u001b[0;31m \u001b[0mloaded_sample\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpickle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpickle_load\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 52\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 53\u001b[0m \u001b[0midname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvideoname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfeatures\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mloaded_sample\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/miniconda3/envs/inzpeech/lib/python3.8/site-packages/numpy/core/numeric.py\u001b[0m in \u001b[0;36m_frombuffer\u001b[0;34m(buf, dtype, shape, order)\u001b[0m\n\u001b[1;32m 1810\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1811\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1812\u001b[0;31m \u001b[0;32mdef\u001b[0m \u001b[0m_frombuffer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbuf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mshape\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0morder\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1813\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mfrombuffer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbuf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreshape\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0morder\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0morder\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1814\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
=======
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(64, 100, 40, 1)\n",
"(64, 109)\n"
>>>>>>> 90355144ce4a7d9932e7274ea8bfbec292c5288f
]
}
],
"source": [
<<<<<<< HEAD
"for x,y in tr_gen:\n",
" print(x.shape)\n",
" print(y.shape)\n",
" print(y)"
=======
"for x, y in tr_gen:\n",
" print(x.shape)\n",
" print(y.shape)\n",
" break"
>>>>>>> 90355144ce4a7d9932e7274ea8bfbec292c5288f
]
},
{
"cell_type": "code",
<<<<<<< HEAD
"execution_count": 15,
"metadata": {},
"outputs": [],
@@ -694,20 +700,37 @@
],
"source": [
"model = vgg_att(num_class)"
=======
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"model = VGGish(input_shape, 109)"
>>>>>>> 90355144ce4a7d9932e7274ea8bfbec292c5288f
]
},
{
"cell_type": "code",
<<<<<<< HEAD
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"opt = Adam(lr=1e-3)\n",
"model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])"
=======
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"opt = Adam(lr=1e-4)\n",
"model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])"
>>>>>>> 90355144ce4a7d9932e7274ea8bfbec292c5288f
]
},
{
"cell_type": "code",
<<<<<<< HEAD
"execution_count": 20,
"metadata": {},
"outputs": [
@@ -738,6 +761,23 @@
"\u001b[0;32m~/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/eager/def_function.py\u001b[0m in \u001b[0;36mwrapped_fn\u001b[0;34m(*args, **kwds)\u001b[0m\n\u001b[1;32m 439\u001b[0m \u001b[0;31m# __wrapped__ allows AutoGraph to swap in a converted function. We give\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 440\u001b[0m \u001b[0;31m# the function a weak reference to itself to avoid a reference cycle.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 441\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mweak_wrapped_fn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__wrapped__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 442\u001b[0m \u001b[0mweak_wrapped_fn\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mweakref\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mref\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mwrapped_fn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 443\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/framework/func_graph.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 966\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# pylint:disable=broad-except\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 967\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mhasattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"ag_error_metadata\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 968\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mag_error_metadata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_exception\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 969\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 970\u001b[0m \u001b[0;32mraise\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mValueError\u001b[0m: in user code:\n\n /home/bbekci/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:571 train_function *\n outputs = self.distribute_strategy.run(\n /home/bbekci/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:951 run **\n return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)\n /home/bbekci/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:2290 call_for_each_replica\n return self._call_for_each_replica(fn, args, kwargs)\n /home/bbekci/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:2649 _call_for_each_replica\n return fn(*args, **kwargs)\n /home/bbekci/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:532 train_step **\n loss = self.compiled_loss(\n /home/bbekci/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/keras/engine/compile_utils.py:205 __call__\n loss_value = loss_obj(y_t, y_p, sample_weight=sw)\n /home/bbekci/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/keras/losses.py:143 __call__\n losses = self.call(y_true, y_pred)\n /home/bbekci/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/keras/losses.py:246 call\n return self.fn(y_true, y_pred, **self._fn_kwargs)\n /home/bbekci/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/keras/losses.py:1527 categorical_crossentropy\n return K.categorical_crossentropy(y_true, y_pred, from_logits=from_logits)\n /home/bbekci/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/keras/backend.py:4561 categorical_crossentropy\n target.shape.assert_is_compatible_with(output.shape)\n /home/bbekci/miniconda3/envs/inzpeech/lib/python3.8/site-packages/tensorflow/python/framework/tensor_shape.py:1117 assert_is_compatible_with\n raise ValueError(\"Shapes %s and %s are incompatible\" % (self, other))\n\n ValueError: Shapes (None, 1) and (None, 1251) are incompatible\n"
=======
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 1/20\n",
"14/14 [==============================] - 31s 2s/step - loss: 9.0328 - accuracy: 0.0238 - val_loss: 32030.5371 - val_accuracy: 0.0202\n",
"Epoch 2/20\n",
"14/14 [==============================] - 31s 2s/step - loss: 5.8168 - accuracy: 0.0306 - val_loss: 12789.2061 - val_accuracy: 0.0202\n",
"Epoch 3/20\n",
"14/14 [==============================] - 32s 2s/step - loss: 5.0801 - accuracy: 0.0522 - val_loss: 4288.9995 - val_accuracy: 0.0000e+00\n",
"Epoch 4/20\n",
"14/14 [==============================] - ETA: 0s - loss: 5.0360 - accuracy: 0.0782"
>>>>>>> 90355144ce4a7d9932e7274ea8bfbec292c5288f
]
}
],
@@ -755,6 +795,7 @@
],
"metadata": {
"kernelspec": {
<<<<<<< HEAD
"name": "Python 3.8.5 64-bit ('inzpeech': conda)",
"display_name": "Python 3.8.5 64-bit ('inzpeech': conda)",
"metadata": {
@@ -762,6 +803,11 @@
"hash": "fcc15a4440aa802b6aa76ba989d07fd1e1f9e303ad2563ebf174689c6e63879d"
}
}
=======
"display_name": "Python 3",
"language": "python",
"name": "python3"
>>>>>>> 90355144ce4a7d9932e7274ea8bfbec292c5288f
},
"language_info": {
"codemirror_mode": {
@@ -773,9 +819,17 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
<<<<<<< HEAD
"version": "3.8.5-final"
=======
"version": "3.8.5"
>>>>>>> 90355144ce4a7d9932e7274ea8bfbec292c5288f
}
},
"nbformat": 4,
"nbformat_minor": 4
}
<<<<<<< HEAD
}
=======
}
>>>>>>> 90355144ce4a7d9932e7274ea8bfbec292c5288f

65
datagen_vctk.py Normal file
View File

@@ -0,0 +1,65 @@
#!/usr/bin/env python
# coding: utf-8
from tensorflow.keras.utils import Sequence, to_categorical
from load_vctk import get_model_data
import math
import numpy as np
import os
data_main_dir = os.path.join('..', 'datasets', 'vctk', 'wav48_silence_trimmed')
class VCTKDatagen(Sequence):
def __init__(self, audio_paths, labels, batch_size, num_class, audio_load_func, shuffle=False):
self.aud_paths = audio_paths
self.labels = labels
self.b_size = batch_size
self.num_class = num_class
self.audio_load_func = audio_load_func
self.shuffle = shuffle
def __len__(self):
return math.ceil( len( self.aud_paths) / self.b_size )
def __getitem__(self, idx):
# Get portion of data for batch
batch_paths = self.aud_paths[idx*self.b_size:(idx+1)*self.b_size]
batch_labels = self.labels[idx*self.b_size:(idx+1)*self.b_size]
model_in = np.array([self.audio_load_func(ap) for ap in batch_paths])
model_out = to_categorical(batch_labels, num_classes=self.num_class)
return np.expand_dims(model_in, axis=-1), model_out
def on_epoch_end(self):
if self.shuffle:
idx = np.arange(len(self.aud_paths))
np.random.shuffle(idx)
self.aud_paths = np.array(self.aud_paths)[idx].tolist()
self.labels = np.array(self.labels)[idx].tolist()
def get_datagen(sample_per_person, batch_size, audio_load_func, split=[0.1, 0.1], shuffle=True, mics=[1, 2]):
"""
Get datagens for vctk dataset.
Params:
sample_per_person: Number of samples to select for each person.
batch_size: Batch size of the model
audio_load_func: Function used to load an audio file and turn it into a model input
split: Ratios for the test and validation sets. Default values are 0.1 for test and 0.1 for validation.
shuffle: Whether to shuffle the paths and labels before returning them. If set to False, consecutive audio files
will be obtained from the same person.
mics: Mic numbers of the selected audio samples. Can be one of [1], [2] or [1, 2]. If both mics are included,
the same utterance may be returned once per mic recording.
Returns:
Datagens for train, validation and test sets
"""
[tr_aud, tr_label], [val_aud, val_label], [te_aud, te_label] = get_model_data(data_main_dir , sample_per_person, split, shuffle, mics)
# -2 for s5 and log.txt files
n_person = len(os.listdir(data_main_dir)) - 2
tr_gen = VCTKDatagen(tr_aud, tr_label, batch_size, n_person, audio_load_func, shuffle)
val_gen = VCTKDatagen(val_aud, val_label, batch_size, n_person, audio_load_func, shuffle)
te_gen = VCTKDatagen(te_aud, te_label, batch_size, n_person, audio_load_func, shuffle)
return tr_gen, val_gen, te_gen
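A usage sketch for the generators above. load_melspectrogram is a hypothetical stand-in for the audio_load_func actually used in the project (it must return a fixed-size 2-D feature array per file), and model is any Keras model compiled with categorical_crossentropy; sample_per_person=10 and batch_size=64 match the values used in the Keras notebook.

tr_gen, val_gen, te_gen = get_datagen(sample_per_person=10,
                                      batch_size=64,
                                      audio_load_func=load_melspectrogram,  # hypothetical loader
                                      split=[0.1, 0.1],
                                      shuffle=True,
                                      mics=[1])
model.fit(tr_gen, validation_data=val_gen, epochs=20)
model.evaluate(te_gen)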

145
model-keras.py Normal file
View File

@@ -0,0 +1,145 @@
import keras
from keras_self_attention import SeqSelfAttention
import numpy as np
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Flatten
from keras.layers import Input, Layer, Reshape, AveragePooling1D
from keras.models import Model
from keras import backend as K
from keras import initializers, regularizers, constraints
from keras.constraints import maxnorm
from keras.optimizers import SGD
from keras.layers import Activation
from keras.layers.convolutional import Conv2D, MaxPooling2D, ZeroPadding2D, AveragePooling2D
from keras.layers.normalization import BatchNormalization
class SelfAttention(Layer):
def __init__(self,
n_hop,
hidden_dim,
penalty=1.0,
return_attention=False,
kernel_initializer='glorot_uniform',
kernel_regularizer=None,
kernel_constraint=None,
**kwargs):
self.n_hop = n_hop
self.hidden_dim = hidden_dim
self.penalty = penalty
self.kernel_initializer = keras.initializers.get(kernel_initializer)
self.kernel_regularizer = keras.regularizers.get(kernel_regularizer)
self.kernel_constraint = keras.constraints.get(kernel_constraint)
self.return_attention = return_attention
super(SelfAttention, self).__init__(**kwargs)
def build(self, input_shape):
# input_shape: (None, Sequence_size, Sequence_hidden_dim)
assert len(input_shape) >= 3
batch_size, sequence_size, sequence_hidden_dim = input_shape
self.Ws1 = self.add_weight(shape=(self.hidden_dim, sequence_hidden_dim),
initializer=self.kernel_initializer,
name='SelfAttention-Ws1',
regularizer=self.kernel_regularizer,
constraint=self.kernel_constraint)
self.Ws2 = self.add_weight(shape=(self.n_hop, self.hidden_dim),
initializer=self.kernel_initializer,
name='SelfAttention-Ws2',
regularizer=self.kernel_regularizer,
constraint=self.kernel_constraint)
super(SelfAttention, self).build(input_shape)
def call(self, inputs):
batch_size = K.cast(K.shape(inputs)[0], K.floatx())
inputs_t = K.permute_dimensions(inputs, (1,2,0)) # H.T
d1 = K.tanh(K.permute_dimensions(K.dot(self.Ws1, inputs_t), (2,0,1))) # d1 = tanh(dot(Ws1, H.T))
d1 = K.permute_dimensions(d1, (2,1,0))
A = K.softmax(K.permute_dimensions(K.dot(self.Ws2, d1), (2,0,1))) # A = softmax(dot(Ws2, d1))
H = K.permute_dimensions(inputs, (0,2,1))
outputs = K.batch_dot(A, H, axes=2) # M = AH
A_t = K.permute_dimensions(A, (0,2,1))
I = K.eye(self.n_hop)
P = K.square(self._frobenius_norm(K.batch_dot(A, A_t) - I)) # P = (frobenius_norm(dot(A, A.T) - I))**2
self.add_loss(self.penalty*(P/batch_size))
if self.return_attention:
return [outputs, A]
else:
return outputs
def compute_output_shape(self, input_shape):
assert input_shape and len(input_shape) >= 3
assert input_shape[-1]
batch_size, sequence_size, sequence_hidden_dim = input_shape
output_shape = tuple([batch_size, self.n_hop, sequence_hidden_dim])
if self.return_attention:
attention_shape = tuple([batch_size, self.n_hop, sequence_size])
return [output_shape, attention_shape]
else: return output_shape
def get_config(self):
config = {
'n_hop': self.n_hop,
'hidden_dim': self.hidden_dim,
'penalty':self.penalty,
'return_attention': self.return_attention,
'kernel_initializer': initializers.serialize(self.kernel_initializer),
'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
'kernel_constraint': constraints.serialize(self.kernel_constraint)
}
base_config = super(SelfAttention, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def _frobenius_norm(self, inputs):
outputs = K.sqrt(K.sum(K.square(inputs)))
return outputs
def vgg_att():
inputs = keras.Input(shape=(300, 40, 1))  # (frames, mel bins, channels)
x=Conv2D(64, (3, 3), padding='same', name='block1_conv1',activation='relu')(inputs)
x=Conv2D(64, (3, 3), padding='same', name='block1_conv2',activation='relu')(x)
x=BatchNormalization()(x)
x=MaxPooling2D(pool_size = (2, 2), strides = (2, 2))(x)
x=Conv2D(128, (3, 3), padding='same', name='block2_conv1',activation='relu')(x)
x=Conv2D(128, (3, 3), padding='same', name='block2_conv2',activation='relu')(x)
x=BatchNormalization()(x)
x=MaxPooling2D(pool_size = (2, 2), strides = (2, 2))(x)
x=Conv2D(256, (3, 3), padding='same', name='block3_conv1',activation='relu')(x)
x=Conv2D(256, (3, 3), padding='same', name='block3_conv2',activation='relu')(x)
x=BatchNormalization()(x)
x=MaxPooling2D(pool_size = (2, 2), strides = (2, 2))(x)
x=Conv2D(512, (3, 3), padding='same', name='block4_conv1',activation='relu')(x)
x=Conv2D(512, (3, 3), padding='same', name='block4_conv2',activation='relu')(x)
x=BatchNormalization()(x)
x=MaxPooling2D(pool_size = (2, 2), strides = (2, 2))(x)
# Flatten the frequency and channel axes so attention runs over the time axis,
# mirroring the reshape in the PyTorch Net: (time, freq, 512) -> (time, freq*512)
x=Reshape((-1, 2*512))(x)
att=SelfAttention(n_hop=4,hidden_dim=1536)
x=att(x)                              # (batch, n_hop, freq*512)
x=AveragePooling1D(pool_size=4)(x)    # average the 4 attention hops
x = Flatten()(x)
x = Dense(256, activation = 'relu')(x)
output = Dense(1251,activation = 'softmax')(x)
model = Model(inputs=inputs, outputs=output)
model.compile(loss='categorical_crossentropy', optimizer='adam')  # needs hyperparameter tuning
model.summary()
return model
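The loss added in SelfAttention.call is the structured self-attention penalty P = ||A·Aᵀ − I||²_F, which is small when the n_hop attention rows focus on different positions. A tiny numpy sketch of what it measures (illustrative values only):

import numpy as np

A = np.array([[0.9, 0.1, 0.0],    # hop 1 attends mostly to position 0
              [0.0, 0.2, 0.8]])   # hop 2 attends mostly to position 2
I = np.eye(A.shape[0])
P = np.sum((A @ A.T - I) ** 2)    # squared Frobenius norm of A.A^T - I
print(P)                          # ~0.14: the two hops are nearly orthogonal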

49
vggish.py Normal file
View File

@@ -0,0 +1,49 @@
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Flatten, Dense, Input, Conv2D, MaxPooling2D, GlobalAveragePooling2D, GlobalMaxPooling2D, Activation, BatchNormalization
from tensorflow.keras import backend as K
def VGGish(input_shape, num_classes):
aud_input = Input(shape=input_shape, name='input_1')
# Block 1
x = Conv2D(64, (3, 3), strides=(1, 1), activation=None, padding='same', name='conv1')(aud_input)
x = BatchNormalization()(x)
x = Activation('relu')(x)
x = MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='pool1')(x)
# Block 2
x = Conv2D(128, (3, 3), strides=(1, 1), activation=None, padding='same', name='conv2')(x)
x = BatchNormalization()(x)
x = Activation('relu')(x)
x = MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='pool2')(x)
# Block 3
x = Conv2D(256, (3, 3), strides=(1, 1), activation=None, padding='same', name='conv3/conv3_1')(x)
x = BatchNormalization()(x)
x = Activation('relu')(x)
x = Conv2D(256, (3, 3), strides=(1, 1), activation=None, padding='same', name='conv3/conv3_2')(x)
x = BatchNormalization()(x)
x = Activation('relu')(x)
x = MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='pool3')(x)
# Block 4
x = Conv2D(512, (3, 3), strides=(1, 1), activation=None, padding='same', name='conv4/conv4_1')(x)
x = BatchNormalization()(x)
x = Activation('relu')(x)
x = Conv2D(512, (3, 3), strides=(1, 1), activation=None, padding='same', name='conv4/conv4_2')(x)
x = BatchNormalization()(x)
x = Activation('relu')(x)
x = MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='pool4')(x)
x = Flatten(name='flatten_')(x)
x = Dense(4096, activation=None, name='vggish_fc1/fc1_1')(x)
x = BatchNormalization()(x)
x = Activation('relu')(x)
x = Dense(4096, activation=None, name='vggish_fc1/fc1_2')(x)
x = BatchNormalization()(x)
preds = Dense(num_classes, activation='softmax', name='vggish_fc2')(x)
model = Model(aud_input, preds, name='VGGish')
return model
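A minimal usage sketch for VGGish. The (100, 40, 1) input shape and 109 classes are assumptions taken from the Keras notebook above, which prints (64, 100, 40, 1) batches and builds the model with VGGish(input_shape, 109):

from tensorflow.keras.optimizers import Adam

model = VGGish(input_shape=(100, 40, 1), num_classes=109)
model.compile(optimizer=Adam(learning_rate=1e-4),
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()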