inzpeech/vgg-model-torch.py (mirror of https://github.com/inzva/inzpeech.git)

from collections import OrderedDict

import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl
class SelfAttention(nn.Module):
    def __init__(self, seq_hidden_dim, hidden_dim_nc, n_hop, device='cpu'):
        super(SelfAttention, self).__init__()
        self.device = device
        # W1: [feat, hidden], W2: [hidden, n_hop], structured self-attention style
        self.W1 = nn.Parameter(nn.init.xavier_uniform_(torch.empty((seq_hidden_dim, hidden_dim_nc))).to(self.device))
        self.W2 = nn.Parameter(nn.init.xavier_uniform_(torch.empty((hidden_dim_nc, n_hop))).to(self.device))

    def forward(self, H):
        size = H.size()  # expected: [batch_size, 19, 3, 512]
        # Merge the two spatial dims into a single sequence dim: [bsz, seq_len, feat]
        H = H.view(size[0], size[1] * size[2], size[3])
        x = torch.tanh(torch.matmul(H, self.W1))     # [bsz, seq_len, hidden_dim_nc]
        x = torch.matmul(x, self.W2)                 # [bsz, seq_len, n_hop]
        A = F.softmax(x, dim=1)                      # normalize over the sequence dim
        E = torch.bmm(torch.transpose(A, 1, 2), H)   # [bsz, n_hop, feat]
        return E
"""
alphas = torch.transpose(alphas, 1, 2).contiguous() # [bsz, hops, seq_len]
alphas = self.softmax(alphas.view(-1, size[1])) # [bsz*hop, seq_len]
alphas = alphas.view(size[0], self.attention_hops, size[1]) # [bsz, hop, seq_len]
return torch.bmm(alphas, outh), alphas """
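
# A usage sketch (my addition, not from the repo): SelfAttention is defined but never
# wired into VGGM below. With the [batch, 19, 3, 512] activation shape noted in
# forward(), and arbitrary illustrative values hidden_dim_nc=128, n_hop=3:
#
#   att = SelfAttention(seq_hidden_dim=512, hidden_dim_nc=128, n_hop=3)
#   E = att(torch.randn(2, 19, 3, 512))   # -> torch.Size([2, 3, 512])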
class VGGM(pl.LightningModule):
    def __init__(self, n_classes=1251):
        super(VGGM, self).__init__()
        self.n_classes = n_classes
        self.conv_part = nn.Sequential(OrderedDict([
            ('conv1', nn.Conv2d(in_channels=1, out_channels=64, kernel_size=(3,3), stride=(1,1), padding=1)),
            ('bn1', nn.BatchNorm2d(64, momentum=0.1)),
            ('relu1', nn.ReLU()),
            ('mpool1', nn.MaxPool2d(kernel_size=(2,2), stride=(2,2))),
            ('conv2', nn.Conv2d(in_channels=64, out_channels=128, kernel_size=(3,3), stride=(1,1), padding=1)),
            ('bn2', nn.BatchNorm2d(128, momentum=0.5)),
            ('relu2', nn.ReLU()),
            ('mpool2', nn.MaxPool2d(kernel_size=(2,2), stride=(2,2))),
            ('conv3', nn.Conv2d(in_channels=128, out_channels=256, kernel_size=(3,3), stride=(1,1), padding=1)),
            ('bn3', nn.BatchNorm2d(256, momentum=0.5)),
            ('relu3', nn.ReLU()),
            ('mpool3', nn.MaxPool2d(kernel_size=(2,2), stride=(2,2))),
            ('conv4', nn.Conv2d(in_channels=256, out_channels=512, kernel_size=(3,3), stride=(1,1), padding=1)),
            ('bn4', nn.BatchNorm2d(512, momentum=0.5)),  # must match conv4's 512 output channels
            ('relu4', nn.ReLU()),
            ('mpool4', nn.MaxPool2d(kernel_size=(2,2), stride=(2,2))),
            ('conv5', nn.Conv2d(in_channels=512, out_channels=512, kernel_size=(3,3), stride=(1,1), padding=1)),  # conv4 outputs 512 channels
            ('bn5', nn.BatchNorm2d(512, momentum=0.5)),
            ('relu5', nn.ReLU()),
            ('mpool5', nn.MaxPool2d(kernel_size=(2,2), stride=(2,2))),
        ]))
        # d_model must equal the conv feature dim (512); nhead=8 is an assumed default,
        # since self.hidden_size_lstm and self.num_heads_self_attn were never defined.
        self.attention = nn.TransformerEncoderLayer(d_model=512, dim_feedforward=512, nhead=8)
        self.avgpool = nn.AvgPool2d((4, 1))  # averages out the frequency axis of the [bsz, 512, 4, w] feature map
        self.flatten = nn.Flatten()
        self.classifier = nn.Sequential(OrderedDict([
            ('fc7', nn.Linear(4096, 1024)),
            #('drop1', nn.Dropout()),
            ('relu7', nn.ReLU()),
            ('fc8', nn.Linear(1024, n_classes))]))
    def forward(self, inp):
        x = self.conv_part(inp)                    # [bsz, 512, h, w]
        bsz, c, h, w = x.shape
        x = x.flatten(2).permute(2, 0, 1)          # [h*w, bsz, 512]: one token per spatial position
        x = self.attention(x)
        x = x.permute(1, 2, 0).view(bsz, c, h, w)  # restore the conv layout
        x = self.avgpool(x)                        # [bsz, 512, 1, w]
        x = self.classifier(self.flatten(x))       # nn.Flatten is a module, instantiated in __init__
        return x
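
# Minimal smoke test (my addition, not part of the repo). The [1, 128, 256]
# spectrogram shape is an assumption chosen so that the conv stack yields
# [512, 4, 8] and, after avgpool, flattening gives fc7's expected 4096 (= 512 * 8).
if __name__ == '__main__':
    model = VGGM(n_classes=1251)
    logits = model(torch.randn(2, 1, 128, 256))
    print(logits.shape)  # torch.Size([2, 1251])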