Source code for torch_timeseries.dataloader.uea

from typing import Sequence, Tuple, Type

import numpy as np
import torch

from torch_timeseries.dataset import UEA
from torch_timeseries.utils.timefeatures import time_features
from ..scaler import Scaler
from torch_timeseries.core import (
    TimeSeriesDataset,
    TimeseriesSubset,
)
from torch.utils.data import Dataset, DataLoader, RandomSampler, Subset

from .wrapper import MultiStepTimeFeatureSet





class UEADataset(Dataset):
    """Map-style dataset that serves one UEA time series (plus label) per item.

    The constructor picks the train or test split of ``dataset``, optionally
    fits ``scaler`` on the training data, and keeps both the raw and the
    scaled feature frames. Samples are addressed by the unique values of the
    feature frame's index (presumably one index value per series, with one
    row per time step -- confirm against the ``UEA`` dataset class).

    Args:
        dataset: UEA dataset exposing ``train_df`` / ``test_df``,
            ``train_labels`` / ``test_labels`` and ``train_features_data``.
        scaler: Object with ``fit`` and ``transform`` methods.
        flag: Split to serve, either ``'train'`` or ``'test'``.
        window: Stored on the instance; not used by this class itself.
        scaler_fit: When true and ``flag == 'train'``, fit ``scaler`` on
            ``dataset.train_features_data`` before transforming.

    Raises:
        ValueError: If ``flag`` is neither ``'train'`` nor ``'test'``.
    """

    def __init__(self, dataset: UEA, scaler: Scaler, flag: str ='train', window: int = 168, scaler_fit=True):
        # Fail early: an unknown flag previously left feature_df unset and
        # surfaced later as an unrelated AttributeError.
        if flag not in ('train', 'test'):
            raise ValueError(f"flag must be 'train' or 'test', got {flag!r}")

        self.dataset = dataset
        self.window = window
        self.scaler = scaler
        self.flag = flag

        if flag == 'train':
            if scaler_fit:
                self.scaler.fit(self.dataset.train_features_data)
            self.feature_df = self.dataset.train_df
            self.labels = self.dataset.train_labels
        else:
            self.feature_df = self.dataset.test_df
            self.labels = self.dataset.test_labels

        self.scaled_feature_df = self.scaler.transform(self.feature_df)

        # Unique index labels in order of first appearance; position i in
        # this array is what samplers hand to __getitem__.
        self.indexes = self.feature_df.index.unique()

    def __getitem__(self, ind):
        """Return ``(scaled_x, x, y)`` tensors for the sample at position ``ind``.

        ``ind`` is a position in ``self.indexes`` (what a sampler yields), and
        is translated to the actual index label before the ``.loc`` lookups so
        that frames whose labels are not exactly ``0..n-1`` are handled
        correctly. An out-of-range position raises ``IndexError``.
        """
        sample_id = self.indexes[ind]
        scaled_x = torch.tensor(self.scaled_feature_df.loc[sample_id].values)
        x = torch.tensor(self.feature_df.loc[sample_id].values)
        y = torch.tensor(self.labels.loc[sample_id].values)
        return scaled_x, x, y

    def __len__(self):
        """Return the number of distinct samples (unique index labels)."""
        return len(self.indexes)




class UEAClassification:
    """
    Class for handling the classification of UEA datasets.

    Builds train / validation / test ``UEADataset`` objects and wraps each in
    a ``DataLoader`` whose batches are padded or clipped to ``window`` steps.

    Attributes:
        batch_size (int): Number of samples per batch.
        num_worker (int): Number of worker processes for data loading.
        dataset (UEA): UEA dataset to be used.
        scaler (Scaler): Scaler to normalize the data.
        window (int): Window size for the time series data. If not enough
            data, zeros will be used for padding.
        shuffle_train (bool): Whether to shuffle the training data.
    """

    def __init__(
        self,
        dataset: UEA,
        scaler: Scaler,
        window: int = 168,
        shuffle_train=True,
        batch_size: int = 32,
        num_worker: int = 3,
    ) -> None:
        self.batch_size = batch_size
        self.num_worker = num_worker
        self.dataset = dataset
        self.scaler = scaler
        self.window = window
        self.shuffle_train = shuffle_train

        self._load()

    def _load(self):
        """Build the datasets first, then the data loaders that wrap them."""
        self._load_dataset()
        self._load_dataloader()

    def _load_dataset(self):
        """
        Create ``train_dataset``, ``val_dataset`` and ``test_dataset``.

        The scaler is fitted only while building the training dataset. Both
        the validation and the test dataset are built with the ``'test'``
        flag, so they serve the same split. Nothing is returned; the datasets
        are stored on the instance.
        """
        # fixed sequence dataset
        self.train_dataset = UEADataset(
            self.dataset, self.scaler, 'train', self.window, scaler_fit=True
        )
        self.val_dataset = UEADataset(
            self.dataset, self.scaler, 'test', self.window, scaler_fit=False
        )
        self.test_dataset = UEADataset(
            self.dataset, self.scaler, 'test', self.window, scaler_fit=False
        )

    def _load_dataloader(self):
        """Create ``train_loader``, ``val_loader`` and ``test_loader``.

        Also records the size of each dataset as ``train_size``, ``val_size``
        and ``test_size``.
        """
        # Local import keeps this block self-contained with respect to the
        # file's import section.
        from functools import partial

        self.train_size = len(self.train_dataset)
        self.val_size = len(self.val_dataset)
        self.test_size = len(self.test_dataset)

        # A partial over the module-level collate_fn can be pickled, unlike a
        # lambda, so worker processes also work with the "spawn" start method.
        collate = partial(collate_fn, max_len=self.window)

        self.train_loader = DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            # Honour the constructor option instead of always shuffling.
            shuffle=self.shuffle_train,
            num_workers=self.num_worker,
            drop_last=False,
            collate_fn=collate,
        )
        self.val_loader = DataLoader(
            self.val_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=self.num_worker,
            drop_last=False,
            collate_fn=collate,
        )
        self.test_loader = DataLoader(
            self.test_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=self.num_worker,
            drop_last=False,
            collate_fn=collate,
        )
def collate_fn(data, max_len=None):
    """Build padded mini-batch tensors from a list of dataset samples.

    Args:
        data: list (length ``batch_size``) of ``(scaled_x, x, y)`` tuples.
            - scaled_x: tensor of shape (seq_length, feat_dim), scaled
              features; ``seq_length`` may vary between samples.
            - x: tensor of shape (seq_length, feat_dim), raw features.
            - y: tensor of shape (num_labels,), class indices or numerical
              targets.
        max_len: global fixed sequence length. Used for architectures
            requiring fixed length input, where the batch length cannot vary
            dynamically. Longer sequences are clipped, shorter are padded
            with 0s. Defaults to the longest sequence in the batch.

    Returns:
        scaled_X: (batch_size, padded_length, feat_dim) tensor of scaled
            features, zero padded.
        X: (batch_size, padded_length, feat_dim) tensor of raw features,
            zero padded.
        targets: (batch_size, num_labels) tensor of stacked labels.
        padding_masks: (batch_size, padded_length) boolean tensor, True means
            keep the vector at this position, False means padding.
    """
    batch_size = len(data)
    features, raw_features, labels = zip(*data)

    # Original sequence length of every time series in the batch.
    lengths = [X.shape[0] for X in features]
    if max_len is None:
        max_len = max(lengths)

    # Stack and pad (convert 2D samples to 3D batch tensors).
    feat_dim = features[0].shape[-1]
    X = torch.zeros(batch_size, max_len, feat_dim)
    scaled_X = torch.zeros(batch_size, max_len, feat_dim)
    for i in range(batch_size):
        end = min(lengths[i], max_len)  # clip sequences longer than max_len
        scaled_X[i, :end, :] = features[i][:end, :]
        X[i, :end, :] = raw_features[i][:end, :]

    targets = torch.stack(labels, dim=0)  # (batch_size, num_labels)

    # 64-bit lengths: int16 would overflow for lengths or windows > 32767.
    padding_masks = padding_mask(
        torch.tensor(lengths, dtype=torch.long), max_len=max_len
    )  # (batch_size, padded_length) boolean tensor, True means keep

    return scaled_X, X, targets, padding_masks


def padding_mask(lengths, max_len=None):
    """
    Used to mask padded positions: creates a (batch_size, max_len) boolean
    mask from a tensor of sequence lengths, where True means keep the element
    at this position (time step).

    Args:
        lengths: integer tensor holding one sequence length per sample.
        max_len: number of mask columns. When omitted (or 0) the largest
            value in ``lengths`` is used.

    Returns:
        Boolean tensor of shape (batch_size, max_len).
    """
    batch_size = lengths.numel()
    if not max_len:
        # torch.Tensor has no ``max_val`` method; ``max`` is the reduction
        # that yields the longest sequence length.
        max_len = int(lengths.max())
    return (torch.arange(0, max_len, device=lengths.device)
            .type_as(lengths)
            .repeat(batch_size, 1)
            .lt(lengths.unsqueeze(1)))