Source code for src.core.transformer.dataset
import torch
import logging
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader
from preprocess.utils import preprocess_text, BytePairEncoding
class PreprocessTransformer:
"""
    A class to preprocess text data for Transformer training
:param config_dict: Config Params Dictionary
:type config_dict: dict
"""
def __init__(self, config_dict):
self.logger = logging.getLogger(__name__)
self.input_path = config_dict["paths"]["input_file"]
self.num_samples = config_dict["dataset"]["num_samples"]
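        # +1: each stored row carries one extra token so that downstream code
        # can split it into seq_len inputs and seq_len shifted next-token
        # targets (create_dataloader's docstring describes X as seq_len + 1 wide)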
self.seq_len = 1 + config_dict["dataset"]["seq_len"]
self.operations = config_dict["preprocess"]["operations"]
self.bpe = BytePairEncoding(config_dict)
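
        # A minimal config_dict sketch (the keys mirror those read above; the
        # concrete values are illustrative assumptions, not project defaults):
        #
        #     config_dict = {
        #         "paths": {"input_file": "data/lyrics.csv"},
        #         "dataset": {"num_samples": 10000, "seq_len": 64},
        #         "preprocess": {"operations": [...]},
        #     }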
def get_data(self):
"""
Converts extracted data into tokens
:return: Text tokens
:rtype: numpy.ndarray (num_samples, seq_len)
"""
text_ls = self.extract_data()
text_ls = self.preprocess_text(text_ls)
bpe_words = self.get_vocab(text_ls)
        # Integer ids, one row per text, padded/truncated to seq_len
        text_tokens = np.zeros((len(text_ls), self.seq_len), dtype=np.int64)
        text_lens = [len(text.split()) for text in text_ls]
        count = 0
        for i, text_len in enumerate(text_lens):
            # bpe_words holds one encoded string per original word (see
            # get_vocab), so slice out this text's words and re-split to
            # expand the space-separated subword units
            text = " ".join(bpe_words[count : count + text_len]).split()
            count += text_len
            text = text[: self.seq_len]
            if len(text) < self.seq_len:
                text = text + ["<PAD>"] * (self.seq_len - len(text))
            text_tokens[i] = np.array([self.word2id[w] for w in text])
return text_tokens
def preprocess_text(self, text_ls):
"""
Preprocesses list of strings
:param text_ls: List of Raw strings
:type text_ls: list
        :return: List of preprocessed strings
:rtype: list
"""
text_ls = [preprocess_text(text, self.operations) for text in text_ls]
return text_ls
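
    # Note: each entry in the "operations" config list names a cleaning step;
    # the set of supported names is defined by preprocess.utils.preprocess_text.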
def get_vocab(self, text_ls):
"""
Generates Vocabulary
:param text_ls: List of preprocessed strings
:type text_ls: list
        :return: Corpus generated by Byte Pair Encoding
:rtype: list
"""
self.logger.info("Building Vocabulary using Byte Pair Encoding method")
words = self.bpe.fit(text_ls)
vocab = ["<PAD>"] + list(self.bpe.vocab_freq.keys())
self.word2id = {w: i for i, w in enumerate(vocab)}
self.id2word = {v: k for k, v in self.word2id.items()}
return words
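
    # After fitting, self.word2id["<PAD>"] == 0 and each symbol in
    # self.bpe.vocab_freq gets a unique positive id; self.id2word inverts the map.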
def batched_ids2tokens(self, tokens):
"""
Converting sentence of ids to tokens
:param tokens: Tokens Array, 2D array (num_samples, seq_len)
:type tokens: numpy.ndarray
:return: List of decoded sentences
:rtype: list
"""
        # int() makes the lookup robust if the id array arrives as floats
        vect_func = np.vectorize(lambda x: self.id2word[int(x)])
        tokens = vect_func(tokens)
sentences = []
for words in tokens:
txt = ""
for word in words:
if word not in ["<PAD>"]:
txt += f"{word} "
txt = txt[:-1]
txt = txt.split("</w>")
txt = " ".join([i.replace(" ", "") for i in txt])
sentences.append(txt)
return sentences
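
    # Worked example (illustrative subword ids): if a row decodes to
    # ["lo", "ve</w>", "me</w>", "<PAD>"], joining the non-PAD pieces gives
    # "lo ve</w> me</w>"; splitting on "</w>" and removing the internal
    # spaces then yields "love me".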
def extract_data(self):
"""
        Extracts data from the lyrics CSV file
        :return: List of raw strings
:rtype: list
"""
df = pd.read_csv(self.input_path, nrows=self.num_samples)
return df["lyrics"]
def create_dataloader(X, val_split=0.2, test_split=0.2, batch_size=32, seed=2024):
"""
Creates Train, Validation and Test DataLoader
    :param X: Input tokens
    :type X: numpy.ndarray or torch.Tensor (num_samples, seq_len + 1)
:param val_split: validation split, defaults to 0.2
:type val_split: float, optional
:param test_split: test split, defaults to 0.2
:type test_split: float, optional
:param batch_size: Batch size, defaults to 32
:type batch_size: int, optional
:param seed: Seed, defaults to 2024
:type seed: int, optional
:return: Train, Val, Test dataloaders
:rtype: tuple (torch.utils.data.DataLoader, torch.utils.data.DataLoader, torch.utils.data.DataLoader)
"""
    train_X, holdout_X = train_test_split(
        X, test_size=val_split + test_split, random_state=seed
    )
    # Split the held-out portion (not X again) into validation and test sets
    val_X, test_X = train_test_split(
        holdout_X, test_size=test_split / (val_split + test_split), random_state=seed
    )
train_ds = TensorDataset(torch.Tensor(train_X))
train_loader = DataLoader(
train_ds,
batch_size=batch_size,
shuffle=True,
drop_last=True,
num_workers=1,
pin_memory=True,
)
val_ds = TensorDataset(torch.Tensor(val_X))
val_loader = DataLoader(
val_ds,
batch_size=batch_size,
shuffle=False,
drop_last=True,
num_workers=1,
pin_memory=True,
)
test_ds = TensorDataset(torch.Tensor(test_X))
test_loader = DataLoader(
test_ds,
batch_size=batch_size,
shuffle=False,
drop_last=False,
num_workers=1,
pin_memory=False,
)
return train_loader, val_loader, test_loader
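
# Example end-to-end usage (a minimal sketch; config_dict is assumed to follow
# the shape sketched in __init__ above):
#
#     prep = PreprocessTransformer(config_dict)
#     tokens = prep.get_data()                    # (num_samples, seq_len + 1)
#     train_dl, val_dl, test_dl = create_dataloader(tokens)
#     for (batch,) in train_dl:                   # TensorDataset yields 1-tuples
#         inputs, targets = batch[:, :-1], batch[:, 1:]  # shift for next-token prediction
#         ...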