class WeightingModelAttention(nn.Module):
"""
A PyTorch module that computes the weights for each model in the ensemble.
Args:
feature_count (int): The number of features in the input data.
in_size (int): The maximum input size of the models in the ensemble.
model_count (int): The number of models in the ensemble.
out_size (int, optional): The output size of the models in the ensemble. Defaults to 48.
hidden_size (int, optional): The number of nodes in the hidden layer of the weighting model. Defaults to 256.
num_layers (int, optional): The number of layers in the weighting model. Defaults to 2.
dropout (float, optional): The dropout rate of the weighting model. Defaults to 0.25.
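
    Example (a minimal sketch; the concrete shapes and parameter values below are assumptions):
        weighting = WeightingModelAttention(feature_count=12, in_size=144, model_count=3)
        x = torch.rand(8, 144, 12)       # original input: (batch, time, features)
        y_hats = torch.rand(8, 3, 48)    # stacked base-model outputs: (batch, model_count, out_size)
        context = weighting(x, y_hats)   # cross-attention context, shape (8, 48, 64) = (batch, out_size, d_out_kq)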
"""
def __init__(self,feature_count,in_size,model_count,out_size=48,hidden_size=256,num_layers=2,dropout=0.25,norm_func='softmax'):
super().__init__()
self.model_count = model_count
self.out_size = out_size
self.activation = nn.ReLU()
self.norm_func_name = norm_func
self.d_out_kq = 64
        # here the ensemble output is the query
        self.W_query = nn.Linear(model_count,self.d_out_kq,bias=False)
        self.W_key = nn.Linear(feature_count,self.d_out_kq,bias=False) #TODO turn bias on
        self.W_value = nn.Linear(feature_count,self.d_out_kq,bias=False) #TODO the value projection is a free choice
        # here the original input is the query
        #self.W_query = nn.Linear(feature_count,self.d_out_kq,bias=False)
        #self.W_key = nn.Linear(model_count,self.d_out_kq,bias=False) #TODO turn bias on
        #self.W_value = nn.Linear(model_count,self.d_out_kq,bias=False) #TODO the value projection is a free choice
#self.W_query = nn.Parameter(torch.rand(feature_count,self.d_out_kq))
#self.W_key = nn.Parameter(torch.rand(model_count,self.d_out_kq))
#self.W_value = nn.Parameter(torch.rand(model_count,self.d_out_kq))
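        # norm_func maps raw per-model scores to weights that are non-negative and sum to 1 along
        # the model dimension (dim 1): either a plain softmax or a min-max shift-and-rescale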
if norm_func == 'softmax':
self.norm_func = lambda x: nn.functional.softmax(x,dim=1)
elif norm_func == 'minmax':
            self.norm_func = lambda x: (
                (x - x.min(dim=1,keepdim=True).values.repeat(1,self.model_count,1))
                / (x - x.min(dim=1,keepdim=True).values.repeat(1,self.model_count,1)).sum(dim=1,keepdim=True)
            )
#self.l1 = nn.Linear(in_size*feature_count,hidden_size)
#self.l2 = nn.Linear(hidden_size,model_count*out_size)
def forward(self, x_1,x_2):
        x_1, x_2 = x_2, x_1 #TODO clean up
x_1 = torch.movedim(x_1,2,1)
#x_2 = torch.movedim(x_2,2,1)
queries_1 = self.W_query(x_1)
keys_2 = self.W_key(x_2) # new
values_2 = self.W_value(x_2) # new
attn_scores = queries_1 @ keys_2.mT # new
#queries_1 = x_1 @ self.W_query
#keys_2 = x_2 @ self.W_key # new
#values_2 = x_2 @ self.W_value # new
#attn_scores = queries_1 @ keys_2.T # new
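        # scaled dot-product attention: softmax(Q K^T / sqrt(d_k)) V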
attn_weights = torch.softmax(
attn_scores / self.d_out_kq**0.5, dim=-1)
context_vec = attn_weights @ values_2
return context_vec
class WeightingModel(nn.Module):
"""
A PyTorch module that computes the weights for each model in the ensemble.
Args:
feature_count (int): The number of features in the input data.
in_size (int): The maximum input size of the models in the ensemble.
model_count (int): The number of models in the ensemble.
out_size (int, optional): The output size of the models in the ensemble. Defaults to 48.
hidden_size (int, optional): The number of nodes in the hidden layer of the weighting model. Defaults to 256.
num_layers (int, optional): The number of layers in the weighting model. Defaults to 2.
        dropout (float, optional): The dropout rate of the weighting model. Defaults to 0.25.
        norm_func (str, optional): How the raw model scores are normalized, either 'softmax' or 'minmax'. Defaults to 'softmax'.
    """
    def __init__(self,feature_count,in_size,model_count,out_size=48,hidden_size=256,num_layers=2,dropout=0.25,norm_func='softmax'):
super().__init__()
self.model_count = model_count
self.out_size = out_size
self.activation = nn.ReLU()
self.norm_func_name = norm_func
layers = [
nn.Linear(in_size*feature_count,hidden_size),
nn.Dropout(dropout),
self.activation,
]
for _ in range(num_layers - 2):
layers += [
nn.Linear(in_features=hidden_size, out_features=hidden_size),
self.activation,
nn.Dropout(dropout),
]
layers += [nn.Linear(hidden_size,model_count*out_size)]
self.mlp = nn.Sequential(*layers)
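        # the MLP maps the flattened input (in_size * feature_count) to model_count * out_size raw
        # scores, i.e. one score per ensemble member and forecast step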
if norm_func == 'softmax':
self.norm_func = lambda x: nn.functional.softmax(x,dim=1)
elif norm_func == 'minmax':
            self.norm_func = lambda x: (
                (x - x.min(dim=1,keepdim=True).values.repeat(1,self.model_count,1))
                / (x - x.min(dim=1,keepdim=True).values.repeat(1,self.model_count,1)).sum(dim=1,keepdim=True)
            )
#self.l1 = nn.Linear(in_size*feature_count,hidden_size)
#self.l2 = nn.Linear(hidden_size,model_count*out_size)
def forward(self, x):
x = torch.reshape(x, (x.size(0), -1))
x = self.mlp(x)
#x = self.l1(x)
#x = self.activation(x)
#x = self.l2(x)
return x
class WaVoLightningEnsemble(pl.LightningModule):
""" Since the data for this is normalized using the scaler from the first model in model_list all models should be trained on the same or at least similar data i think?
"""
# pylint: disable-next=unused-argument
def __init__(self, model_list,model_path_list,hidden_size,num_layers,dropout,norm_func,learning_rate):
super().__init__()
self.model_list = model_list
self.max_in_size = max([model.hparams['in_size'] for model in self.model_list])
self.out_size = model_list[0].hparams['out_size']
self.feature_count = model_list[0].hparams['feature_count']
assert len(set([model.target_idx for model in self.model_list])) == 1, "All models in the ensemble must have the same target_idx"
self.target_idx = model_list[0].target_idx
self.model_architecture = 'ensemble'
self.scaler = model_list[0].scaler
self.weighting = WeightingModel(
self.feature_count,
self.max_in_size,
len(model_list),
self.out_size,
hidden_size=hidden_size,
num_layers=num_layers,
dropout=dropout,
norm_func=norm_func)
self.save_hyperparameters(ignore=['model_list','scaler'])
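        # keep a pickled copy of the scaler in the hyperparameters so it is stored in the checkpoint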
self.save_hyperparameters({"scaler": pickle.dumps(self.scaler)})
for model in self.model_list:
model.freeze()
def _common_step(self, batch, batch_idx,dataloader_idx=0):
"""
Computes the weighted average of the predictions of the models in the ensemble.
Args:
batch (tuple): A tuple containing the input data and the target data.
batch_idx (int): The index of the batch.
dataloader_idx (int, optional): The index of the dataloader. Defaults to 0.
Returns:
torch.Tensor: The weighted average of the predictions of the models in the ensemble.
"""
        x = batch[0]
        y_hat_list = []
        for model in self.model_list:
            #if ut.need_classic_input(model.model_architecture):
            #    temp_batch = batch[0],batch[1]
            #else:
            #    temp_batch = batch
            # each frozen base model produces its own forecast for the batch
            y_hat_list.append(model.predict_step(batch,batch_idx,dataloader_idx))
        #TODO what about scaling?! before/after?
        # blend the stacked forecasts with the weights from the weighting model
        y_hats = torch.stack(y_hat_list,axis=1)
        w = self.weighting(x)
        y_hat = torch.sum((y_hats*w),axis=1)
        return y_hat
# pylint: disable-next=arguments-differ
def training_step(self, batch, batch_idx):
"""
Computes the training loss for the ensemble.
Args:
batch (tuple): A tuple containing the input data and the target data.
batch_idx (int): The index of the batch.
Returns:
torch.Tensor: The training loss for the ensemble.
"""
#This is kinda overengineered, pretty much useless if we only experiment with lstms and don't use the transformer stuff
if len(batch) == 4:
_, y, _, _ = batch
y = y[:, -self.out_size:,self.target_idx]
elif len(batch) == 2:
_, y = batch
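        # undo the standardization of the target (using the stored scaler) so the loss is computed on the original scale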
y_inv = ut.inv_standard(y, self.scaler.mean_[self.target_idx], self.scaler.scale_[self.target_idx])
y_hat = self._common_step(batch, batch_idx)
loss = nn.functional.mse_loss(y_hat,y_inv)
self.log("hp/train_loss", loss)
return loss
# pylint: disable-next=arguments-differ
def validation_step(self, batch, batch_idx):
"""
Computes the validation loss for the ensemble.
Args:
batch (tuple): A tuple containing the input data and the target data.
batch_idx (int): The index of the batch.
"""
#This is kinda overengineered, pretty much useless if we only experiment with lstms and don't use the transformer stuff
if len(batch) == 4:
_, y, _, _ = batch
y = y[:, -self.out_size:,self.target_idx]
elif len(batch) == 2:
_, y = batch
y_inv = ut.inv_standard(y, self.scaler.mean_[self.target_idx], self.scaler.scale_[self.target_idx])
y_hat = self._common_step(batch, batch_idx)
val_loss = nn.functional.mse_loss(y_hat, y_inv)
self.log("hp/val_loss", val_loss, sync_dist=True)
# pylint: disable-next=arguments-differ
def test_step(self, batch, batch_idx):
"""
Computes the test loss for the ensemble.
Args:
batch (tuple): A tuple containing the input data and the target data.
batch_idx (int): The index of the batch.
"""
#This is kinda overengineered, pretty much useless if we only experiment with lstms and don't use the transformer stuff
if len(batch) == 4:
_, y, _, _ = batch
y = y[:, -self.out_size:,self.target_idx]
elif len(batch) == 2:
_, y = batch
y_inv = ut.inv_standard(y, self.scaler.mean_[self.target_idx], self.scaler.scale_[self.target_idx])
y_hat = self._common_step(batch, batch_idx)
test_loss = nn.functional.mse_loss(y_hat, y_inv)
self.log("hp/test_loss", test_loss, sync_dist=True)
def predict_step(self, batch, batch_idx, dataloader_idx=0):
y_hat = self._common_step(batch, batch_idx,dataloader_idx)
return y_hat
# pylint: disable-next=arguments-differ
def forward_old(self, x): #TODO remove?
y_hat_list = []
for model in self.model_list:
y_hat_list.append(model.predict_step((x,None),batch_idx=1))
y_hats = torch.stack(y_hat_list,axis=1)
w = self.weighting(x)
y_hat = torch.sum((y_hats*w),axis=1)
return y_hat
def configure_optimizers(self):
optimizer = optim.Adam(
self.parameters(), lr=self.hparams.learning_rate)
return optimizer
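# Rough usage sketch (hypothetical: `BaseModel`, `ckpt_paths` and `dm` stand in for the project's own
# forecast model class, checkpoint paths and datamodule; they are not defined in this file):
#
#     models = [BaseModel.load_from_checkpoint(p) for p in ckpt_paths]
#     ensemble = WaVoLightningEnsemble(models, ckpt_paths, hidden_size=256, num_layers=2,
#                                      dropout=0.25, norm_func='softmax', learning_rate=1e-3)
#     trainer = pl.Trainer(max_epochs=20)
#     trainer.fit(ensemble, datamodule=dm)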
class WaVoLightningAttentionEnsemble(pl.LightningModule):
""" Since the data for this is normalized using the scaler from the first model in model_list all models should be trained on the same or at least similar data i think?
"""
# pylint: disable-next=unused-argument
def __init__(self, model_list,model_path_list,hidden_size,num_layers,dropout,norm_func,learning_rate):
super().__init__()
self.model_list = model_list
self.max_in_size = max([model.hparams['in_size'] for model in self.model_list])
self.out_size = model_list[0].hparams['out_size']
self.feature_count = model_list[0].hparams['feature_count']
assert len(set([model.target_idx for model in self.model_list])) == 1, "All models in the ensemble must have the same target_idx"
self.target_idx = model_list[0].target_idx
self.model_architecture = 'ensemble'
self.scaler = model_list[0].scaler
self.weighting = WeightingModelAttention(
self.feature_count,
self.max_in_size,
len(model_list),
self.out_size,
hidden_size=hidden_size,
num_layers=num_layers,
dropout=dropout,
norm_func=norm_func)
#TODO different insizes
#TODO ModuleList?!
self.save_hyperparameters(ignore=['model_list','scaler'])
self.save_hyperparameters({"scaler": pickle.dumps(self.scaler)})
for model in self.model_list:
model.freeze()
def _common_step(self, batch, batch_idx,dataloader_idx=0):
"""
Computes the weighted average of the predictions of the models in the ensemble.
Args:
batch (tuple): A tuple containing the input data and the target data.
batch_idx (int): The index of the batch.
dataloader_idx (int, optional): The index of the dataloader. Defaults to 0.
Returns:
torch.Tensor: The weighted average of the predictions of the models in the ensemble.
"""
x = batch[0]
y_hat_list = []
for model in self.model_list:
#if ut.need_classic_input(model.model_architecture):
# temp_batch = batch[0],batch[1]
#else:
# temp_batch = batch
#y_hat_list.append(model.predict_step(batch,batch_idx,dataloader_idx))
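            # each frozen base model forecasts directly from the shared input x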
y_hat_list.append(model(x))
#TODO what about scaling?! before/after?
y_hats = torch.stack(y_hat_list,axis=1)
w = self.weighting(x,y_hats)
y_hat = torch.sum((y_hats*w),axis=1)
return y_hat
# pylint: disable-next=arguments-differ
def training_step(self, batch, batch_idx):
"""
Computes the training loss for the ensemble.
Args:
batch (tuple): A tuple containing the input data and the target data.
batch_idx (int): The index of the batch.
Returns:
torch.Tensor: The training loss for the ensemble.
"""
#This is kinda overengineered, pretty much useless if we only experiment with lstms and don't use the transformer stuff
if len(batch) == 4:
_, y, _, _ = batch
y = y[:, -self.out_size:,self.target_idx]
elif len(batch) == 2:
_, y = batch
y_inv = ut.inv_standard(y, self.scaler.mean_[self.target_idx], self.scaler.scale_[self.target_idx])
        y_hat = self._common_step(batch, batch_idx)
        loss = nn.functional.mse_loss(y_hat,y_inv)
self.log("hp/train_loss", loss)
return loss
# pylint: disable-next=arguments-differ
def validation_step(self, batch, batch_idx):
"""
Computes the validation loss for the ensemble.
Args:
batch (tuple): A tuple containing the input data and the target data.
batch_idx (int): The index of the batch.
"""
#This is kinda overengineered, pretty much useless if we only experiment with lstms and don't use the transformer stuff
if len(batch) == 4:
_, y, _, _ = batch
y = y[:, -self.out_size:,self.target_idx]
elif len(batch) == 2:
_, y = batch
y_inv = ut.inv_standard(y, self.scaler.mean_[self.target_idx], self.scaler.scale_[self.target_idx])
y_hat = self._common_step(batch, batch_idx)
        val_loss = nn.functional.mse_loss(y_hat, y_inv)
        self.log("hp/val_loss", val_loss, sync_dist=True)
# pylint: disable-next=arguments-differ
def test_step(self, batch, batch_idx):
"""
Computes the test loss for the ensemble.
Args:
batch (tuple): A tuple containing the input data and the target data.
batch_idx (int): The index of the batch.
"""
#This is kinda overengineered, pretty much useless if we only experiment with lstms and don't use the transformer stuff
if len(batch) == 4:
_, y, _, _ = batch
y = y[:, -self.out_size:,self.target_idx]
elif len(batch) == 2:
_, y = batch
y_inv = ut.inv_standard(y, self.scaler.mean_[self.target_idx], self.scaler.scale_[self.target_idx])
        y_hat = self._common_step(batch, batch_idx)
        test_loss = nn.functional.mse_loss(y_hat, y_inv)
self.log("hp/test_loss", test_loss, sync_dist=True)
def predict_step(self, batch, batch_idx, dataloader_idx=0):
y_hat = self._common_step(batch, batch_idx,dataloader_idx)
return y_hat
# pylint: disable-next=arguments-differ
def forward_old(self, x): #TODO remove?
y_hat_list = []
for model in self.model_list:
y_hat_list.append(model.predict_step((x,None),batch_idx=1))
y_hats = torch.stack(y_hat_list,axis=1)
        w = self.weighting(x,y_hats)  # WeightingModelAttention expects both the input and the stacked predictions
y_hat = torch.sum((y_hats*w),axis=1)
return y_hat
def configure_optimizers(self):
optimizer = optim.Adam(
self.parameters(), lr=self.hparams.learning_rate)
return optimizer