# Source code for src.preprocess.flickr
import os
import logging
import numpy as np
import pandas as pd
from collections import Counter
import albumentations as A
from albumentations.pytorch import ToTensorV2
from .utils import preprocess_text
class PreprocessFlickr:
    """
    Preprocessing pipeline for the Flickr image-captioning dataset.

    :param config_dict: Config Params Dictionary
    :type config_dict: dict
    """

    def __init__(self, config_dict):
        # NOTE: extract_data() reads the captions file from disk, so
        # constructing this object performs I/O immediately.
        self.config_dict = config_dict
        self.logger = logging.getLogger(__name__)
        self.train_df, self.test_df = self.extract_data()
[docs]
def get_data(self):
"""
Preprocessing
:return: Returns image paths, Train Tokens, (Train, Test Transforms)
:rtype: (list, numpy.ndarray [num_samples, seq_len], (albumentations.Compose, albumentations.Compose))
"""
self.get_vocab(self.train_df)
train_tokens = self.word_tokens(self.train_df)
train_transforms, test_transforms = self.image_transforms("train")
return (
list(self.train_df["Path"]),
train_tokens,
(train_transforms, test_transforms),
)
[docs]
def get_test_data(self):
"""
Generating test Data
:return: Returns image paths, Test Tokens, Test Transforms
:rtype: (list, numpy.ndarray [num_samples, seq_len], albumentations.Compose)
"""
test_tokens = self.word_tokens(self.test_df)
test_transforms = self.image_transforms("test")
return list(self.test_df["Path"]), test_tokens, test_transforms
[docs]
def extract_data(self):
"""
Extracting Image and Captions Data
:return: Train and Test DataFrames
:rtype: tuple (pandas.DataFrame, pandas.DataFrame)
"""
self.logger.info(
"Creating a DataFrame from images folder and captions txt file"
)
im_folder = self.config_dict["paths"]["image_folder"]
caption_file = self.config_dict["paths"]["captions_file"]
operations = self.config_dict["preprocess"]["operations"]
with open(caption_file, "r") as f:
lines = np.array(f.readlines()[1:])
num_train = self.config_dict["dataset"]["train_samples"]
num_test = self.config_dict["dataset"]["test_samples"]
rand_ids = np.random.choice(len(lines), num_train + num_test)
paths, captions = zip(*(s.split(",") for s in lines[rand_ids]))
paths = [os.path.join(im_folder, i) for i in paths]
df = pd.DataFrame.from_dict({"Path": paths, "Caption": captions})
df["Caption"] = df["Caption"].map(lambda x: preprocess_text(x, operations))
train_df = df.iloc[:num_train]
test_df = df.iloc[num_train:]
return train_df, test_df
[docs]
def get_vocab(self, train_df):
"""
Generates Vocabulary
:param train_df: DataFrame with Training Captions
:type train_df: pandas.DataFrame
"""
self.logger.info("Building Vocabulary from training data captions")
num_vocab = (
self.config_dict["dataset"]["num_vocab"]
- self.config_dict["dataset"]["num_extra_tokens"]
)
all_words = []
for text in train_df["Caption"]:
all_words += text.split()
topk_vocab_freq = Counter(all_words).most_common(num_vocab)
self.vocab = ["<START>", "<END>", "<PAD>", "<UNK>"] + [
i[0] for i in topk_vocab_freq
]
self.word2id = {w: i for i, w in enumerate(self.vocab)}
self.id2word = {v: k for k, v in self.word2id.items()}
[docs]
def word_tokens(self, df):
"""
Coverting Sentences to Tokens
:param df: Captions DataFrame
:type df: pandas.DataFrame
:return: Tokens array (num_samples, seq_len)
:rtype: numpy.ndarray
"""
seq_len = self.config_dict["dataset"]["seq_len"]
tokens = np.zeros((len(df), seq_len))
for i, text in enumerate(df["Caption"]):
words = ["<START>"] + text.split()[: seq_len - 2]
if len(words) < seq_len - 1:
words += ["<PAD>"] * (seq_len - 1 - len(words))
words += ["<END>"]
for j, w in enumerate(words):
if w in self.vocab:
tokens[i, j] = self.word2id[w]
else:
tokens[i, j] = self.word2id["<UNK>"]
return tokens
[docs]
def batched_ids2captions(self, tokens):
"""
Converting sentence of ids to tokens
:param tokens: Tokens Array, 2D array (num_samples, seq_len)
:type tokens: numpy.ndarray
:return: List of decoded sentences
:rtype: list
"""
func = lambda x: self.id2word[x]
vect_func = np.vectorize(func)
tokens = vect_func(tokens)
captions = []
for words in tokens:
txt = ""
for word in words:
if word not in ["<START>", "<END>", "<PAD>"]:
txt += f"{word} "
captions.append(txt[:-1])
return captions
[docs]
def image_transforms(self, data_type):
"""
Creating Albumentations Transforms for train or test data
:param data_type: {'train', 'test'}. Type of Data
:type data_type: str
:return: Transforms
:rtype: albumentations.Compose
"""
im_w, im_h = self.config_dict["preprocess"]["image_dim"][1:]
train_transforms = A.Compose(
[
A.Resize(im_w, im_h),
A.HorizontalFlip(p=0.5),
A.RandomBrightnessContrast(p=0.2),
A.Normalize(
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225],
max_pixel_value=255.0,
p=1.0,
),
ToTensorV2(),
],
p=1.0,
)
test_transforms = A.Compose(
[
A.Resize(im_w, im_h),
A.Normalize(
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225],
max_pixel_value=255.0,
p=1.0,
),
ToTensorV2(),
],
p=1.0,
)
if data_type == "train":
return train_transforms, test_transforms
else:
return test_transforms