import os
import numpy as np
import pandas as pd
import logging
from preprocess.imdb_reviews import PreprocessIMDB
from plot_utils import plot_topk_freq, plot_wordcloud
# [docs]  (Sphinx "view source" link residue; not part of the module code)
class BOW:
    """
    A class to run BOW data preprocessing, training and inference

    :param config_dict: Config Params Dictionary
    :type config_dict: dict
    """

    def __init__(self, config_dict):
        self.logger = logging.getLogger(__name__)
        self.config_dict = config_dict
        # Cached copy of config_dict["model"]["output_label"]; the code that
        # consumes it is not part of this section.
        self.return_label = self.config_dict["model"]["output_label"]

    def run(self):
        """
        Runs BOW Fit, Transform and saves output

        NOTE(review): ``fit_transform`` is not defined in this section of the
        class; it is presumably provided elsewhere, returns ``(X, y)`` and
        populates ``self.vocab_freq`` (read by ``save_output``) -- confirm.
        """
        self.preprocess()
        X, y = self.fit_transform()
        self.save_output(X, y)

    def fit(self, text_ls=None, y=None):
        """
        Fits BOW algo on preprocessed text

        Builds ``self.vocab``, the sorted list of unique whitespace-separated
        tokens found across all texts.

        :param text_ls: List of preprocessed strings, defaults to None, in
            which case ``self.text_ls`` (set by ``preprocess``) is used
        :type text_ls: list, optional
        :param y: Labels, defaults to None. Accepted to keep the signature
            stable; building the vocabulary does not use it
        :type y: list, optional
        """
        self.logger.info("Fitting BOW to Extracted Data")
        if text_ls is None:
            text_ls = self.text_ls

        # Accumulate straight into a set instead of extending a list with
        # duplicates and de-duplicating afterwards.
        vocab = set()
        for text in text_ls:
            vocab.update(text.split())

        # Sorted so the vocabulary order is reproducible across runs; plain
        # set iteration order varies with string hash randomisation.
        self.vocab = sorted(vocab)

    def preprocess(self):
        """
        Preprocesses text

        Builds a ``PreprocessIMDB`` instance from the config, runs it, and
        stores its ``text_ls`` and ``label_ls`` as ``self.text_ls`` and
        ``self.y``.
        """
        root_path = self.config_dict["paths"]["input_folder"]
        explore_folder = self.config_dict["dataset"]["explore_folder"]
        num_samples = self.config_dict["dataset"]["num_samples"]
        operations = self.config_dict["preprocess"]["operations"]
        randomize = self.config_dict["preprocess"]["randomize"]

        self.preproc_cls = PreprocessIMDB(
            root_path, explore_folder, num_samples, operations, randomize
        )
        self.preproc_cls.run()
        self.text_ls, self.y = self.preproc_cls.text_ls, self.preproc_cls.label_ls
        self.logger.info("Preprocessing Done")

    def save_output(self, X, y):
        """
        Saves Training and Inference results

        Writes ``Text Vector.npy``, ``Text Label.npy`` and
        ``Vocab Frequency.csv`` into the configured output folder, creating
        the folder if needed. Plots are produced only when
        ``config_dict["visualize"]`` is truthy.

        NOTE(review): reads ``self.vocab_freq``, which is not assigned in this
        section of the class -- presumably a word -> count mapping set during
        transform; confirm.

        :param X: Word vectors
        :type X: numpy.ndarray (num_samples, num_vocab)
        :param y: Labels
        :type y: list
        """
        self.logger.info("Saving Vectors and Plots into Output Folder")
        output_folder = self.config_dict["paths"]["output_folder"]
        os.makedirs(output_folder, exist_ok=True)
        visualize = self.config_dict["visualize"]

        np.save(os.path.join(output_folder, "Text Vector.npy"), X)
        np.save(os.path.join(output_folder, "Text Label.npy"), y)

        # Materialise the dict views as lists so the columns are plain
        # sequences rather than live views of the dict.
        df = pd.DataFrame.from_dict(
            {
                "Word": list(self.vocab_freq.keys()),
                "Frequency": list(self.vocab_freq.values()),
            }
        )
        df.to_csv(os.path.join(output_folder, "Vocab Frequency.csv"), index=False)

        if visualize:
            plot_topk_freq(self.vocab_freq, output_folder)
            plot_wordcloud(self.vocab_freq, output_folder)