import torch
import logging
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader
from preprocess.imdb_reviews import PreprocessIMDB
from core.word2vec.dataset import Word2VecDataset
class RNNDataset(Word2VecDataset):
    """
    RNN Dataset for sequence classification on IMDB reviews

    :param config_dict: Config Params Dictionary
    :type config_dict: dict
    """

    def __init__(self, config_dict):
        self.logger = logging.getLogger(__name__)
        self.config_dict = config_dict
        self.num_vocab = config_dict["dataset"]["num_vocab"]
        # preprocess() and get_vocab() are inherited from Word2VecDataset;
        # they build the text corpus and the word2id/id2word mappings.
        self.preprocess()
        self.get_vocab()
    def get_data(self):
        """
        Generates training tokens and one-hot labels from the extracted data

        :return: Input tokens, Labels
        :rtype: tuple (numpy.ndarray [num_samples, seq_len], numpy.ndarray [num_samples, num_classes])
        """
        # Register a padding token at the end of the vocabulary
        self.word2id["<PAD>"] = len(self.word2id)
        self.id2word[len(self.id2word)] = "<PAD>"
        # Fit the label encoder here so get_test_data() can reuse it
        self.label_encoder = OneHotEncoder()
        X = self.pad_slice_text(self.text_ls)
        y = np.array(self.preproc_cls.label_ls).reshape(-1, 1)
        y = self.label_encoder.fit_transform(y).toarray()
        return np.array(X), y
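
    # A minimal sketch (not part of the class) of what the label encoding
    # above produces, assuming binary IMDB labels "neg"/"pos":
    #
    #   enc = OneHotEncoder()
    #   y = enc.fit_transform(np.array(["neg", "pos", "pos"]).reshape(-1, 1))
    #   y.toarray()  # -> [[1., 0.], [0., 1.], [0., 1.]]
    #
    # i.e. y has shape (num_samples, num_classes) with one column per class.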
    def get_test_data(self):
        """
        Generates test tokens and one-hot labels from the extracted data

        :return: Input tokens, Labels
        :rtype: tuple (numpy.ndarray [num_samples, seq_len], numpy.ndarray [num_samples, num_classes])
        """
        root_path = self.config_dict["paths"]["test_folder"]
        explore_folder = self.config_dict["dataset"]["explore_folder"]
        num_samples = self.config_dict["dataset"]["test_samples"]
        operations = self.config_dict["preprocess"]["operations"]
        randomize = self.config_dict["preprocess"]["randomize"]
        test_preproc_cls = PreprocessIMDB(
            root_path, explore_folder, num_samples, operations, randomize
        )
        test_preproc_cls.run()
        test_text_ls, test_labels = test_preproc_cls.text_ls, test_preproc_cls.label_ls
        X_test = self.pad_slice_text(test_text_ls)
        y_test = np.array(test_labels).reshape(-1, 1)
        # Reuse the encoder fitted in get_data() so train/test label columns match
        y_test = self.label_encoder.transform(y_test).toarray()
        return np.array(X_test), y_test
    def pad_slice_text(self, text_ls):
        """
        Truncates each text to seq_len tokens and pads shorter ones with <PAD>

        :param text_ls: List of text samples
        :type text_ls: list
        :return: List of token-id sequences, each of length seq_len
        :rtype: list
        """
        seq_len = self.config_dict["dataset"]["seq_len"]
        X = []
        for text in text_ls:
            # Map each word to its id, falling back to <UNK> for OOV words
            ls = [
                self.word2id.get(word, self.word2id["<UNK>"])
                for word in text.split()[:seq_len]
            ]
            # Right-pad short sequences up to seq_len
            if len(ls) < seq_len:
                ls.extend([self.word2id["<PAD>"]] * (seq_len - len(ls)))
            X.append(ls)
        return X
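
    # A worked example (hypothetical vocab) of the padding/truncation above:
    # with seq_len = 5 and word2id = {"the": 0, "movie": 1, "<UNK>": 2, "<PAD>": 3},
    # "the movie was great"       -> [0, 1, 2, 2, 3]  (two OOV words, one pad)
    # "the the the the the the"   -> [0, 0, 0, 0, 0]  (truncated to 5 tokens)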
def create_dataloader(X, y, val_split=0.2, batch_size=32, seed=2024):
    """
    Creates Train, Validation DataLoaders

    :param X: Input tokens
    :type X: torch.Tensor (num_samples, seq_len)
    :param y: Output Labels
    :type y: torch.Tensor (num_samples, num_classes)
    :param val_split: Validation split fraction, defaults to 0.2
    :type val_split: float, optional
    :param batch_size: Batch size, defaults to 32
    :type batch_size: int, optional
    :param seed: Random seed for the split, defaults to 2024
    :type seed: int, optional
    :return: Train, Val dataloaders
    :rtype: tuple (torch.utils.data.DataLoader, torch.utils.data.DataLoader)
    """
    train_x, val_x, train_y, val_y = train_test_split(
        X, y, test_size=val_split, random_state=seed
    )
    # NOTE: torch.Tensor(...) yields float32 tensors; cast the token batch
    # with .long() before feeding it to an nn.Embedding layer.
    train_ds = TensorDataset(torch.Tensor(train_x), torch.Tensor(train_y))
    train_loader = DataLoader(
        train_ds,
        batch_size=batch_size,
        shuffle=True,
        drop_last=True,
        num_workers=1,
        pin_memory=True,
    )
    val_ds = TensorDataset(torch.Tensor(val_x), torch.Tensor(val_y))
    val_loader = DataLoader(
        val_ds,
        batch_size=batch_size,
        shuffle=False,
        drop_last=True,
        num_workers=1,
        pin_memory=True,
    )
    return train_loader, val_loader
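

# A minimal end-to-end usage sketch. The keys below are the ones this module
# reads; the values (and any extra keys Word2VecDataset.preprocess()/get_vocab()
# may expect) are assumptions for illustration, not the project's actual config.
if __name__ == "__main__":
    config_dict = {
        "paths": {"test_folder": "data/aclImdb/test"},  # assumed path
        "dataset": {
            "num_vocab": 10000,
            "seq_len": 200,
            "explore_folder": "test",  # assumed value
            "test_samples": 1000,
        },
        "preprocess": {"operations": ["lower"], "randomize": True},  # assumed
    }
    ds = RNNDataset(config_dict)
    X, y = ds.get_data()
    train_loader, val_loader = create_dataloader(X, y, val_split=0.2, batch_size=32)
    xb, yb = next(iter(train_loader))
    print(xb.shape, yb.shape)  # -> (32, seq_len), (32, num_classes)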