poonehmousavi committed
Commit 04455e3
1 Parent(s): c5e640e

Update custom_interface.py

Files changed (1)
  1. custom_interface.py +97 -0
custom_interface.py CHANGED
@@ -1,6 +1,103 @@
 import torch
 from speechbrain.inference.interfaces import Pretrained

+class AttentionMLP(torch.nn.Module):
+    def __init__(self, input_dim, hidden_dim):
+        super(AttentionMLP, self).__init__()
+        self.layers = torch.nn.Sequential(
+            torch.nn.Linear(input_dim, hidden_dim),
+            torch.nn.ReLU(),
+            torch.nn.Linear(hidden_dim, 1, bias=False),
+        )
+
+    def forward(self, x):
+        x = self.layers(x)
+        att_w = torch.nn.functional.softmax(x, dim=2)
+        return att_w
+
+
+class Discrete_EmbeddingLayer(torch.nn.Module):
+    """This class handles embedding layers for discrete tokens.
+
+    Arguments
+    ---------
+    num_codebooks : int
+        Number of codebooks of the tokenizer.
+    vocab_size : int
+        Size of the dictionary of embeddings.
+    emb_dim : int
+        The size of each embedding vector.
+    pad_index : int (default: 0)
+        If specified, the entries at pad_index do not contribute to the gradient.
+    init : boolean (default: False)
+        If set to True, initialize the embedding with the tokenizer embedding; otherwise initialize it randomly.
+    freeze : boolean (default: False)
+        If True, the embedding is frozen. If False, the embedding is trained
+        alongside the rest of the pipeline.
+
+    Example
+    -------
+    >>> from speechbrain.lobes.models.huggingface_transformers.encodec import Encodec
+    >>> model_hub = "facebook/encodec_24khz"
+    >>> save_path = "savedir"
+    >>> model = Encodec(model_hub, save_path)
+    >>> audio = torch.randn(4, 1000)
+    >>> length = torch.tensor([1.0, .5, .75, 1.0])
+    >>> tokens, emb = model.encode(audio, length)
+    >>> print(tokens.shape)
+    torch.Size([4, 4, 2])
+    >>> emb = Discrete_EmbeddingLayer(2, 1024, 1024)
+    >>> in_emb = emb(tokens)
+    >>> print(in_emb.shape)
+    torch.Size([4, 4, 2, 1024])
+    """
+
+    def __init__(
+        self,
+        num_codebooks,
+        vocab_size,
+        emb_dim,
+        pad_index=0,
+        init=False,
+        freeze=False,
+    ):
+        super(Discrete_EmbeddingLayer, self).__init__()
+        self.vocab_size = vocab_size
+        self.num_codebooks = num_codebooks
+        self.freeze = freeze
+        self.embedding = torch.nn.Embedding(
+            num_codebooks * vocab_size, emb_dim
+        ).requires_grad_(not self.freeze)
+        self.init = init
+
+    def init_embedding(self, weights):
+        with torch.no_grad():
+            self.embedding.weight = torch.nn.Parameter(weights)
+
+    def forward(self, in_tokens):
+        """Computes the embeddings for discrete tokens.
+
+        Arguments
+        ---------
+        in_tokens : torch.Tensor
+            A (Batch x Time x num_codebooks) tensor of discrete token indices.
+
+        Returns
+        -------
+        in_embs : torch.Tensor
+            A (Batch x Time x num_codebooks x emb_dim) tensor of token embeddings.
+        """
+        with torch.set_grad_enabled(not self.freeze):
+            # Offset the IDs of each codebook by codebook_index * vocab_size so
+            # tokens from different codebooks index disjoint rows of the shared
+            # embedding table. The addition is out-of-place to avoid mutating
+            # the caller's tensor.
+            in_tokens = in_tokens + torch.arange(
+                0,
+                self.num_codebooks * self.vocab_size,
+                self.vocab_size,
+                device=in_tokens.device,
+            )
+            # Forward pass through the embedding layer.
+            in_embs = self.embedding(in_tokens)
+            return in_embs

 class CustomEncoderClassifier(Pretrained):
     """A ready-to-use class for utterance-level classification (e.g., speaker-id,