import torch
import logging
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from preprocess.utils import preprocess_text, WordPiece
class PreprocessBERTPretrain:
"""
A class to preprocess BERT Pretraining data
:param config_dict: Config Params Dictionary
:type config_dict: dict
"""
def __init__(self, config_dict):
self.logger = logging.getLogger(__name__)
self.input_path = config_dict["paths"]["input_file"]
self.num_samples = config_dict["dataset"]["num_samples"]
self.seq_len = config_dict["dataset"]["seq_len"]
self.operations = config_dict["preprocess"]["operations"]
self.wordpiece = WordPiece(config_dict)
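# Sketch of the config_dict shape this module appears to expect, inferred from the
# keys accessed in this class and in BERTPretrainDataset below; all values are
# illustrative placeholders, and WordPiece may read additional keys of its own:
#
# config_dict = {
#     "paths": {"input_file": "data/corpus.txt"},
#     "dataset": {"num_samples": 10000, "seq_len": 128,
#                 "num_vocab": 30000, "num_extra_tokens": 5},
#     "preprocess": {
#         "operations": ["lower", "remove_punctuation"],
#         "replace_token": {"prediction": 0.15, "mask": 0.8, "random": 0.1},
#     },
# }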
def get_data(self):
"""
Converts extracted data into tokens and Next Sentence Prediction labels
:return: Pretraining data (tokens, NSP labels)
:rtype: tuple (numpy.ndarray [2 * num_samples, seq_len], numpy.ndarray [2 * num_samples,])
"""
text_ls = self.extract_data()
text_ls = self.preprocess_text(text_ls)
text_lens = [len(text.split()) for text in text_ls]
corpus = self.get_vocab(text_ls)
half_seq_len = self.seq_len // 2 - 1
text_tokens_a = np.zeros((len(text_ls), half_seq_len + 1))
text_tokens_b = np.zeros((len(text_ls), half_seq_len + 1))
count = 0
for sample_id, text_len in enumerate(text_lens):
# Gather this document's WordPiece tokens and flatten them into one token list
tokens_ls = corpus[count : count + text_len]
tokens = [i for ls in tokens_ls for i in ls]
text = " ".join(tokens).split()
count += text_len
# Truncate or pad every document to exactly seq_len tokens
text = text[: self.seq_len]
if len(text) < self.seq_len:
text = text + ["<PAD>"] * (self.seq_len - len(text))
# Segment A: <CLS> followed by the first half of the sequence
text_tokens_a[sample_id] = np.array(
[self.word2id[ch] for ch in ["<CLS>"] + text[:half_seq_len]]
)
# Segment B: <SEP> followed by the second half of the sequence
text_tokens_b[sample_id] = np.array(
[
self.word2id[ch]
for ch in ["<SEP>"] + text[half_seq_len : 2 * half_seq_len]
]
)
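# Duplicate the data below: the first copy keeps the original (A, B) pairing
# ("IsNext"), while the second copy pairs each segment A with a randomly permuted
# segment B ("NotNext")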
reorder_tokens_b = np.random.choice(len(text_ls), len(text_ls), replace=False)
text_tokens_a = np.concatenate([text_tokens_a, text_tokens_a], axis=0)
text_tokens_b = np.concatenate(
[text_tokens_b, text_tokens_b[reorder_tokens_b]], axis=0
)
text_tokens = np.concatenate([text_tokens_a, text_tokens_b], axis=-1)
nsp_labels = np.array(["IsNext"] * len(text_ls) + ["NotNext"] * len(text_ls))
self.lencoder = LabelEncoder()
nsp_labels = self.lencoder.fit_transform(nsp_labels)
return text_tokens, nsp_labels
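# Rough usage sketch (names hypothetical): with num_samples input texts, get_data
# returns token ids of shape (2 * num_samples, seq_len) and NSP labels of shape
# (2 * num_samples,).
#
# preprocessor = PreprocessBERTPretrain(config_dict)
# text_tokens, nsp_labels = preprocessor.get_data()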
def preprocess_text(self, text_ls):
"""
Preprocesses list of strings
:param text_ls: List of Raw strings
:type text_ls: list
:return: List of preprocessed strings
:rtype: list
"""
text_ls = [preprocess_text(text, self.operations) for text in text_ls]
return text_ls
def get_vocab(self, text_ls):
"""
Generates Vocabulary
:param text_ls: List of preprocessed strings
:type text_ls: list
:return: Corpus generated by WordPiece
:rtype: list
"""
self.logger.info("Building Vocabulary using Word piece Tokenization method")
corpus = self.wordpiece.fit(text_ls)
vocab = ["<PAD>", "<UNK>", "<CLS>", "<SEP>", "<MASK>"] + list(
self.wordpiece.vocab_freq.keys()
)
self.word2id = {w: i for i, w in enumerate(vocab)}
self.id2word = {v: k for k, v in self.word2id.items()}
return corpus
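# Note: the five special tokens above occupy ids 0-4 in word2id; presumably this is
# what config_dict["dataset"]["num_extra_tokens"] (used in BERTPretrainDataset to
# keep random replacement tokens out of the special-token id range) is meant to match.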
def batched_ids2tokens(self, tokens):
"""
Converts a batch of token ids back into sentences, merging WordPiece "##"
continuation pieces into whole words
:param tokens: Batch of token ids
:type tokens: numpy.ndarray (num_samples, seq_len)
:return: Reconstructed sentences
:rtype: list
"""
vect_func = np.vectorize(lambda x: self.id2word[x])
tokens = vect_func(tokens)
sentences = []
for seq in tokens:
start_id = 0
words = []
for i, ch in enumerate(seq):
if "##" != ch[:2] and i != 0:
tokens = seq[start_id:i]
word = "".join(
[w if i == 0 else w[2:] for i, w in enumerate(tokens)]
)
words.append(word)
start_id = i
final_word = "".join(
[w if i == 0 else w[2:] for i, w in enumerate(seq[start_id:])]
)
words.append(final_word)
sentences.append(" ".join(words))
return sentences
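# Illustrative example (hypothetical token ids): if a row decodes to
# ["<CLS>", "play", "##ing", "foot", "##ball", "<SEP>", ...], batched_ids2tokens
# merges the "##" continuation pieces and yields "<CLS> playing football <SEP> ...".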
class BERTPretrainDataset(Dataset):
"""
A class to generate a BERT pretraining dataset (masked tokens and NSP labels) from preprocessed text tokens
:param text_tokens: Preprocessed text tokens
:type text_tokens: torch.Tensor (num_samples, seq_len)
:param nsp_labels: NSP labels
:type nsp_labels: torch.Tensor (num_samples,)
:param config_dict: Config Params Dictionary
:type config_dict: dict
:param word2id: Words to Ids mapping
:type word2id: dict
"""
def __init__(self, text_tokens, nsp_labels, config_dict, word2id):
self.text_tokens = text_tokens
self.nsp_labels = nsp_labels
self.word2id = word2id
self.num_vocab = config_dict["dataset"]["num_vocab"]
self.seq_len = config_dict["dataset"]["seq_len"]
self.half_seq_len = self.seq_len // 2 - 1
pred_prob = config_dict["preprocess"]["replace_token"]["prediction"]
pred_mask = config_dict["preprocess"]["replace_token"]["mask"]
pred_random = config_dict["preprocess"]["replace_token"]["random"]
self.num_extra_tokens = config_dict["dataset"]["num_extra_tokens"]
self.num_pred_tokens_half = int(pred_prob * self.seq_len)
self.num_mask_tokens = int(2 * self.num_pred_tokens_half * pred_mask)
self.num_rand_tokens = int(2 * self.num_pred_tokens_half * pred_random)
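# Worked example with illustrative config values (not from the original project):
# seq_len=128, prediction=0.15, mask=0.8, random=0.1 gives
# num_pred_tokens_half = int(0.15 * 128) = 19 predicted positions per segment
# (38 per sample), of which num_mask_tokens = int(38 * 0.8) = 30 become <MASK>,
# num_rand_tokens = int(38 * 0.1) = 3 become random vocabulary ids, and the
# remaining 5 predicted positions keep their original token.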
def __len__(self):
"""
Returns number of samples
:return: Number of samples
:rtype: int
"""
return len(self.text_tokens)
def __getitem__(self, idx):
"""
Returns a masked data sample
:param idx: Id of the sample
:type idx: int
:return: Masked data sample (text tokens, label mask, NSP label)
:rtype: tuple (torch.Tensor [seq_len,], torch.Tensor [seq_len,], torch.Tensor [scalar])
"""
nsp_label = self.nsp_labels[idx]
text_token = self.text_tokens[idx].to(torch.int64)
text_token, lbl_mask = self._apply_mask(text_token)
return text_token, lbl_mask, nsp_label
def _apply_mask(self, text_token):
"""
Applies masks to random tokens of each sample using the token-replacement strategy described in the BERT paper
:param text_token: Sample text tokens
:type text_token: torch.Tensor (seq_len,)
:return: Masked tokens, label mask
:rtype: tuple (torch.Tensor [seq_len,], torch.Tensor [seq_len,])
"""
lbl_mask_ids_a = (
1 + torch.randperm(self.half_seq_len)[: self.num_pred_tokens_half]
)
lbl_mask_ids_b = (
1
+ self.seq_len // 2
+ torch.randperm(self.half_seq_len)[: self.num_pred_tokens_half]
)
lbl_mask_ids = torch.cat([lbl_mask_ids_a, lbl_mask_ids_b], dim=0)
lbl_mask = torch.zeros_like(text_token)
lbl_mask[lbl_mask_ids] = 1
rand_tokens = (
torch.randperm(self.num_vocab - self.num_extra_tokens)[
: self.num_rand_tokens
]
+ self.num_extra_tokens
)
text_token[lbl_mask_ids[: self.num_mask_tokens]] = self.word2id["<MASK>"]
text_token[
lbl_mask_ids[
self.num_mask_tokens : self.num_mask_tokens + self.num_rand_tokens
]
] = rand_tokens
return text_token, lbl_mask
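# Note: lbl_mask marks the positions the MLM loss should be computed on. The +1
# offsets above skip index 0 (<CLS>) and index seq_len // 2 (<SEP>), so special
# tokens are never selected for prediction, and random replacements are drawn from
# ids >= num_extra_tokens, i.e. outside the special-token range. Padding positions,
# however, can still be selected.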
def create_dataloader_pretrain(
X, y, config_dict, word2id, val_split=0.2, test_split=0.2, batch_size=32, seed=2024
):
"""
Creates PyTorch DataLoader for Pretraining data
:param X: Input text tokens (masking is applied on the fly inside BERTPretrainDataset)
:type X: numpy.ndarray (num_samples, seq_len)
:param y: NSP labels
:type y: numpy.ndarray (num_samples,)
:param config_dict: Config Params Dictionary
:type config_dict: dict
:param word2id: Words to Ids mapping
:type word2id: dict
:param val_split: validation split, defaults to 0.2
:type val_split: float, optional
:param test_split: Test split, defaults to 0.2
:type test_split: float, optional
:param batch_size: Batch size, defaults to 32
:type batch_size: int, optional
:param seed: Seed, defaults to 2024
:type seed: int, optional
:return: Train, Val and Test dataloaders
:rtype: tuple (torch.utils.data.DataLoader, torch.utils.data.DataLoader, torch.utils.data.DataLoader)
"""
train_X, val_X, train_y, val_y = train_test_split(
X, y, test_size=val_split + test_split, random_state=seed
)
val_X, test_X, val_y, test_y = train_test_split(
val_X, val_y, test_size=test_split / (val_split + test_split), random_state=seed
)
train_ds = BERTPretrainDataset(
torch.Tensor(train_X), torch.Tensor(train_y), config_dict, word2id
)
train_loader = DataLoader(
train_ds,
batch_size=batch_size,
shuffle=True,
drop_last=True,
num_workers=1,
pin_memory=True,
)
val_ds = BERTPretrainDataset(
torch.Tensor(val_X), torch.Tensor(val_y), config_dict, word2id
)
val_loader = DataLoader(
val_ds,
batch_size=batch_size,
shuffle=False,
drop_last=True,
num_workers=1,
pin_memory=True,
)
test_ds = BERTPretrainDataset(
torch.Tensor(test_X), torch.Tensor(test_y), config_dict, word2id
)
test_loader = DataLoader(
test_ds,
batch_size=batch_size,
shuffle=False,
drop_last=False,
num_workers=1,
pin_memory=False,
)
return train_loader, val_loader, test_loader
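# End-to-end usage sketch (minimal, hypothetical config_dict as sketched above;
# not part of the original module):
#
# preprocessor = PreprocessBERTPretrain(config_dict)
# text_tokens, nsp_labels = preprocessor.get_data()
# train_loader, val_loader, test_loader = create_dataloader_pretrain(
#     text_tokens, nsp_labels, config_dict, preprocessor.word2id, batch_size=32
# )
# for tokens, lbl_mask, nsp_label in train_loader:
#     ...  # feed the batch to the BERT pretraining model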