Source code for src.preprocess.imdb_reviews
import os
import glob
from tqdm import tqdm
import numpy as np
import logging
from .utils import preprocess_text
[docs]
class PreprocessIMDB:
    """
    Loading and Generating Reviews, labels for IMDB dataset

    After construction, ``text_ls`` holds the raw text of every selected txt file and
    ``label_ls`` holds, at the same index, the name of the folder the text was read from.
    Call :meth:`run` to apply ``operations`` to ``text_ls``.

    :param root_path: Root Folder with all the classes Folders with txt files for each sample or can have txt files
    :type root_path: str
    :param explore_folder: Whether the root_path has classes folder or txt files
    :type explore_folder: bool
    :param num_samples: How many samples to select from each folder
    :type num_samples: int
    :param operations: Any combinations of {'lcase', 'remalpha', 'stopwords', 'stemming'}. list of preprocessing Operations
    :type operations: list
    :param randomize: Select first num_samples or at random
    :type randomize: bool
    """

    def __init__(self, root_path, explore_folder, num_samples, operations, randomize):
        self.logger = logging.getLogger(__name__)
        self.extract_data(root_path, explore_folder, num_samples, randomize)
        self.logger.info("Extracted Data from TXT Files")
        self.operations = operations

    def run(self):
        """
        Preprocessing list of sentences

        Replaces every entry of ``text_ls`` with the value returned by
        ``preprocess_text(entry, self.operations)``.
        """
        self.text_ls = [preprocess_text(text, self.operations) for text in self.text_ls]

    def extract_data(self, root_path, explore_folder, num_samples, randomize):
        """
        Extracting data from txt files

        Resets ``text_ls`` and ``label_ls`` and fills them folder by folder. The two lists
        always end up with the same length: one label is added per text actually read.

        :param root_path: Root Folder with all the classes Folders with txt files for each sample or can have txt files
        :type root_path: str
        :param explore_folder: Whether the root_path has classes folder or txt files
        :type explore_folder: bool
        :param num_samples: How many samples to select from each folder
        :type num_samples: int
        :param randomize: Select first num_samples or at random
        :type randomize: bool
        """
        self.text_ls, self.label_ls = [], []
        if explore_folder:
            # os.listdir returns entries in arbitrary order; sorting keeps the output order stable.
            candidates = [os.path.join(root_path, name) for name in sorted(os.listdir(root_path))]
            # Plain files sitting next to the class folders are not classes.
            fold_paths = [path for path in candidates if os.path.isdir(path)]
        else:
            fold_paths = [root_path]

        for fold_path in fold_paths:
            # normpath drops a trailing separator, which would otherwise give an empty label.
            label = os.path.basename(os.path.normpath(fold_path))
            text_ls = self.extract_data_folder(fold_path, num_samples, randomize)
            self.text_ls.extend(text_ls)
            # One label per text read, so the lists stay aligned when a folder is short.
            self.label_ls.extend([label] * len(text_ls))

    def extract_data_folder(self, fold_path, num_samples, randomize):
        """
        Extracting txt data from each folder

        If the folder holds fewer than ``num_samples`` txt files, all of them are read and a
        warning is logged.

        :param fold_path: Path to Folder
        :type fold_path: str
        :param num_samples: How many samples to select from each folder
        :type num_samples: int
        :param randomize: Select first num_samples or at random
        :type randomize: bool
        :return: List of sentences from the folder
        :rtype: list
        """
        # glob returns matches in arbitrary order; sorting makes "first num_samples" reproducible.
        path_ls = sorted(glob.glob(f"{fold_path}/*.txt"))
        if num_samples > len(path_ls):
            self.logger.warning(
                "Requested %s samples from %s but only %s txt files are available",
                num_samples,
                fold_path,
                len(path_ls),
            )
        selected = self._choose_paths(path_ls, num_samples, randomize)

        desc = f"Extracting from: {os.path.basename(os.path.normpath(fold_path))}"
        text_ls = []
        for path in tqdm(selected, desc=desc):
            # Context manager closes the file even if reading raises.
            with open(path, "r", encoding="utf8", errors="ignore") as f:
                text_ls.append(" ".join(f.readlines()))
        return text_ls

    @staticmethod
    def _choose_paths(path_ls, num_samples, randomize):
        """
        Pick at most ``num_samples`` entries of ``path_ls``

        :param path_ls: Candidate file paths
        :type path_ls: list
        :param num_samples: How many paths to select
        :type num_samples: int
        :param randomize: Select first num_samples or at random
        :type randomize: bool
        :return: Selected paths, without duplicates
        :rtype: list
        """
        count = max(0, min(num_samples, len(path_ls)))
        if count == 0:
            return []
        if not randomize:
            return list(path_ls[:count])
        # replace=False: every file is selected at most once.
        picked = np.random.choice(len(path_ls), size=count, replace=False)
        return [path_ls[idx] for idx in picked]