Source code for src.core.tfidf.tfidf

import os
import numpy as np
import pandas as pd
import logging

from core.bow.bow import BOW
from plot_utils import plot_pca_pairplot


class TFIDF(BOW):
    """
    A class to run TF-IDF data preprocessing, training and inference

    :param config_dict: Config Params Dictionary
    :type config_dict: dict
    """

    def __init__(self, config_dict):
        self.logger = logging.getLogger(__name__)
        self.config_dict = config_dict
        self.return_label = self.config_dict["model"]["output_label"]
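    # Illustrative config sketch (not from the repository): only the keys that
    # this class itself reads are shown, with placeholder values. The inherited
    # BOW.preprocess() presumably needs additional keys (e.g. input paths) that
    # are not listed here.
    #
    # config_dict = {
    #     "model": {
    #         "output_label": "label",     # placeholder; stored as self.return_label
    #         "tf_mode": "raw_count",      # one of the modes handled in get_tf()
    #         "idf_mode": "log_scaled",    # one of the modes handled in get_idf()
    #     },
    #     "paths": {"output_folder": "output/"},
    #     "visualize": True,
    # }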
    def run(self):
        """
        Runs TF-IDF Fit, Transform and saves output
        """
        self.preprocess()
        X, y = self.fit_transform()
        self.save_output(X, y)
    def fit(self, text_ls=None, y=None):
        """
        Fits the TF-IDF vocabulary on preprocessed text

        :param text_ls: List of preprocessed strings, defaults to None
        :type text_ls: list, optional
        :param y: Labels, defaults to None
        :type y: list, optional
        """
        self.logger.info("Fitting TF-IDF to Extracted Data")
        if text_ls is None:
            text_ls, y = self.text_ls, self.y

        # Build the vocabulary from the unique words across all documents
        self.vocab = []
        for text in text_ls:
            uniq_words = list(set(text.split()))
            self.vocab.extend(uniq_words)
        self.vocab = list(set(self.vocab))
        self.vocab_dict = {k: v for v, k in enumerate(self.vocab)}
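    # Illustrative example (not part of the class): for the preprocessed
    # documents ["good movie", "bad bad movie"], fit() collects the unique
    # words and could end up with vocab_dict = {"good": 0, "bad": 1, "movie": 2}
    # (the ordering is arbitrary because it comes from set()); this mapping is
    # reused in the TF and IDF examples further below.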
    def fit_transform(self, text_ls=None, y=None):
        """
        Fits and Transforms preprocessed text

        :param text_ls: List of preprocessed strings, defaults to None
        :type text_ls: list, optional
        :param y: Labels, defaults to None
        :type y: list, optional
        :return: Word vectors, Labels
        :rtype: tuple (numpy.ndarray [num_samples, num_vocab], numpy.ndarray [num_samples])
        """
        if text_ls is None:
            text_ls, y = self.text_ls, self.y

        self.fit(text_ls, y)
        X, y = self.transform(text_ls, y)

        return X, y
    def transform(self, text_ls=None, y=None):
        """
        Transforms preprocessed text into TF-IDF vectors

        :param text_ls: List of preprocessed strings, defaults to None
        :type text_ls: list, optional
        :param y: Labels, defaults to None
        :type y: list, optional
        :return: Word vectors, Labels
        :rtype: tuple (numpy.ndarray [num_samples, num_vocab], numpy.ndarray [num_samples])
        """
        self.logger.info("Transforming Text Data into Vectors")
        if text_ls is None:
            text_ls, y = self.text_ls, self.y

        self.tf_arr = self.get_tf(text_ls)
        self.idf_arr = self.get_idf(text_ls)

        # tf_arr is (num_vocab, num_samples); transposing it and scaling each
        # column by its IDF weight gives X with shape (num_samples, num_vocab)
        X = (self.tf_arr.T) * (self.idf_arr.T)

        return X, y
    def get_tf(self, text_ls):
        """
        Creates the Term Frequency array

        Supported tf_mode values: Binary, Raw Count, Term Frequency,
        Log Norm, Double Norm

        :param text_ls: List of preprocessed strings
        :type text_ls: list
        :return: Term Frequency array
        :rtype: numpy.ndarray (num_vocab, num_samples)
        """
        self.logger.info("Calculating TF")
        tf_mode = self.config_dict["model"]["tf_mode"]

        # Raw counts: rows index vocabulary words, columns index documents
        tf_arr = np.zeros((len(self.vocab), len(text_ls)))
        for i, text in enumerate(text_ls):
            for word in text.split():
                tf_arr[self.vocab_dict[word]][i] += 1

        if tf_mode == "binary":
            tf_arr = np.where(tf_arr > 0, 1, 0)
        elif tf_mode == "raw_count":
            pass  # raw counts are used as-is
        elif tf_mode == "term_frequency":
            # Normalise each column (document) by its total word count
            tf_arr = tf_arr / np.sum(tf_arr, axis=0)
        elif tf_mode == "log_norm":
            tf_arr = np.log(1 + tf_arr)
        elif tf_mode == "double_norm":
            # Normalise by the most frequent term in each document
            tf_arr = 0.5 + 0.5 * tf_arr / np.max(tf_arr, axis=0)
        else:
            self.logger.error("Invalid TF Mode.")

        return tf_arr
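    # Illustrative example (not part of the class): for the toy corpus
    # ["good movie", "bad bad movie"] with the vocab_dict from the fit()
    # example above, the raw-count tf_arr (num_vocab x num_samples) is
    #     [[1, 0],
    #      [0, 2],
    #      [1, 1]]
    # "binary" maps every non-zero count to 1, while "term_frequency" divides
    # each column by that document's word count (2 and 3 here).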
    def get_idf(self, text_ls):
        """
        Creates the Inverse Document Frequency array

        Supported idf_mode values: Unary, Log Scaled, Log Scaled Smoothing,
        Log Scaled Max, Log Scaled Probabilistic

        :param text_ls: List of preprocessed strings
        :type text_ls: list
        :return: Inverse Document Frequency array
        :rtype: numpy.ndarray (num_vocab,)
        """
        self.logger.info("Calculating IDF")
        idf_mode = self.config_dict["model"]["idf_mode"]

        # Document frequency: number of documents with a non-zero TF entry per word
        tf_binary = np.where(self.tf_arr > 0, 1, 0)
        idf_arr = np.sum(tf_binary, axis=1)
        N = len(text_ls)

        if idf_mode == "unary":
            idf_arr = np.where(idf_arr > 0, 1, 0)
        elif idf_mode == "log_scaled":
            idf_arr = np.log(N / idf_arr)
        elif idf_mode == "log_scaled_smoothing":
            idf_arr = 1 + np.log(N / (1 + idf_arr))
        elif idf_mode == "log_scaled_max":
            idf_arr = np.log(np.max(idf_arr) / (1 + idf_arr))
        elif idf_mode == "log_scaled_probablistic":
            idf_arr = np.log(N / idf_arr - 1)
        else:
            self.logger.error("Invalid IDF Mode.")

        return idf_arr
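    # Illustrative example (not part of the class): continuing the toy corpus
    # above, the document frequencies are good=1, bad=1, movie=2 with N=2, so
    # "log_scaled" gives idf ≈ [ln(2/1), ln(2/1), ln(2/2)] = [0.693, 0.693, 0.0].
    # The word shared by every document ("movie") is weighted down to zero,
    # which is the usual motivation for the IDF term.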
    def save_output(self, X, y):
        """
        Saves Training and Inference results

        :param X: Word vectors
        :type X: numpy.ndarray (num_samples, num_vocab)
        :param y: Labels
        :type y: list
        """
        self.logger.info("Saving Vectors and Plots into Output Folder")
        output_folder = self.config_dict["paths"]["output_folder"]
        os.makedirs(output_folder, exist_ok=True)
        visualize = self.config_dict["visualize"]

        np.save(os.path.join(output_folder, "Text Vector.npy"), X)
        np.save(os.path.join(output_folder, "Text Label.npy"), y)

        if visualize:
            plot_pca_pairplot(X, y, output_folder)
            plot_pca_pairplot(self.tf_arr.T, y, output_folder, name="TF PCA Pairplot")
            idf_df = pd.DataFrame.from_dict({"Token": self.vocab, "IDF": self.idf_arr})
            idf_df.to_csv(os.path.join(output_folder, "IDF.csv"), index=False)
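
# Minimal usage sketch (illustrative, not from the repository): assumes a
# config_dict like the one sketched after __init__ above, and that the
# inherited BOW.preprocess() populates self.text_ls and self.y, which is how
# run() relies on it.
#
# tfidf = TFIDF(config_dict)
# tfidf.preprocess()               # inherited from BOW
# X, y = tfidf.fit_transform()     # X has shape (num_samples, num_vocab)
#
# # or, to also write the .npy / plot outputs to paths.output_folder:
# TFIDF(config_dict).run()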