Source code for src.preprocess.imdb_reviews

import os
import glob
from tqdm import tqdm
import numpy as np
import logging

from .utils import preprocess_text


class PreprocessIMDB:
    """
    Load IMDB review texts and labels from folders of ``.txt`` files.

    Constructing the object immediately reads the files (via
    :meth:`extract_data`) and populates two parallel lists: ``text_ls`` with
    the raw review strings and ``label_ls`` with the name of the folder each
    review came from. Call :meth:`run` afterwards to preprocess ``text_ls``.

    :param root_path: Root Folder with all the classes Folders with txt files
        for each sample or can have txt files
    :type root_path: str
    :param explore_folder: Whether the root_path has classes folder or txt
        files
    :type explore_folder: bool
    :param num_samples: How many samples to select from each folder
    :type num_samples: int
    :param operations: Any combinations of {'lcase', 'remalpha', 'stopwords',
        'stemming'}. list of preprocessing Operations
    :type operations: list
    :param randomize: Select first num_samples or at random
    :type randomize: bool
    """

    def __init__(self, root_path, explore_folder, num_samples, operations, randomize):
        self.logger = logging.getLogger(__name__)
        self.extract_data(root_path, explore_folder, num_samples, randomize)
        self.logger.info("Extracted Data from TXT Files")
        self.operations = operations

    def run(self):
        """
        Preprocess every loaded review in place.

        Replaces ``self.text_ls`` with the result of calling
        ``preprocess_text`` on each entry using ``self.operations``.
        """
        self.text_ls = [
            preprocess_text(text, self.operations) for text in self.text_ls
        ]

    def extract_data(self, root_path, explore_folder, num_samples, randomize):
        """
        Read reviews from disk into ``self.text_ls`` / ``self.label_ls``.

        Both lists are reset first. Each review is labelled with the name of
        the folder it was read from.

        :param root_path: Root Folder with all the classes Folders with txt
            files for each sample or can have txt files
        :type root_path: str
        :param explore_folder: Whether the root_path has classes folder or
            txt files
        :type explore_folder: bool
        :param num_samples: How many samples to select from each folder
        :type num_samples: int
        :param randomize: Select first num_samples or at random
        :type randomize: bool
        """
        self.text_ls, self.label_ls = [], []

        if explore_folder:
            # Sorted so that the class order is reproducible; os.listdir
            # returns entries in arbitrary order.
            candidates = [
                (name, os.path.join(root_path, name))
                for name in sorted(os.listdir(root_path))
            ]
            # Only sub-directories are class folders. Stray files in the root
            # would otherwise be globbed as if they were folders and fail.
            sources = [
                (label, path) for label, path in candidates if os.path.isdir(path)
            ]
        else:
            # normpath drops a trailing separator, which would otherwise make
            # basename return an empty label.
            label = os.path.basename(os.path.normpath(root_path))
            sources = [(label, root_path)]

        for label, fold_path in sources:
            texts = self.extract_data_folder(fold_path, num_samples, randomize)
            self.text_ls.extend(texts)
            # Tie the label count to what was actually read so the two lists
            # always stay the same length.
            self.label_ls.extend([label] * len(texts))

    def extract_data_folder(self, fold_path, num_samples, randomize):
        """
        Read ``num_samples`` ``.txt`` files from a single folder.

        Files are considered in sorted path order. With ``randomize`` the
        files are drawn at random, without repeats whenever the folder holds
        at least ``num_samples`` files; otherwise the first ``num_samples``
        files are taken.

        :param fold_path: Path to Folder
        :type fold_path: str
        :param num_samples: How many samples to select from each folder
        :type num_samples: int
        :param randomize: Select first num_samples or at random
        :type randomize: bool
        :return: List of sentences from the folder
        :rtype: list
        :raises IndexError: if ``randomize`` is false and the folder holds
            fewer than ``num_samples`` ``.txt`` files
        """
        # Sorted so "first num_samples" is reproducible; glob order is
        # arbitrary.
        path_ls = sorted(glob.glob(f"{fold_path}/*.txt"))

        if randomize:
            # Draw distinct files when enough exist. Fall back to sampling
            # with replacement only when more samples are requested than
            # files are available, so the returned count is unchanged.
            oversample = num_samples > len(path_ls)
            path_ids = np.random.choice(
                len(path_ls), size=num_samples, replace=oversample
            )
        else:
            path_ids = np.arange(num_samples)

        folder_name = os.path.basename(os.path.normpath(fold_path))
        desc = f"Extracting from: {folder_name}"

        text_ls = []
        for idx in tqdm(path_ids, desc=desc):
            # Context manager guarantees the handle is closed even if reading
            # raises part-way through.
            with open(path_ls[idx], "r", encoding="utf8", errors="ignore") as handle:
                text_ls.append(" ".join(handle.readlines()))
        return text_ls