import os
import logging
import numpy as np
import pandas as pd
from collections import Counter
import albumentations as A
from albumentations.pytorch import ToTensorV2
from .utils import preprocess_text
class PreprocessFlickr:
"""
    Preprocesses the Flickr captioning dataset: builds the caption vocabulary,
    converts captions to fixed-length token-id sequences, and creates the image
    transforms for training and evaluation.

    :param config_dict: Config params dictionary
    :type config_dict: dict
"""
def __init__(self, config_dict):
self.logger = logging.getLogger(__name__)
self.config_dict = config_dict
self.train_df, self.test_df = self.extract_data()
def get_data(self):
"""
        Preprocesses the training split: builds the vocabulary, tokenizes the
        training captions, and creates the train and test image transforms.
:return: Returns image paths, Train Tokens, (Train, Test Transforms)
:rtype: (list, numpy.ndarray [num_samples, seq_len], (albumentations.Compose, albumentations.Compose))
"""
self.get_vocab(self.train_df)
train_tokens = self.word_tokens(self.train_df)
train_transforms, test_transforms = self.image_transforms("train")
return (
list(self.train_df["Path"]),
train_tokens,
(train_transforms, test_transforms),
)
def get_test_data(self):
"""
        Generates the test data: tokenizes the test captions and creates the
        test image transforms.
:return: Returns image paths, Test Tokens, Test Transforms
:rtype: (list, numpy.ndarray [num_samples, seq_len], albumentations.Compose)
"""
test_tokens = self.word_tokens(self.test_df)
test_transforms = self.image_transforms("test")
return list(self.test_df["Path"]), test_tokens, test_transforms
def get_vocab(self, train_df):
"""
        Builds the vocabulary and the word/id lookup tables from the training
        captions.
:param train_df: DataFrame with Training Captions
:type train_df: pandas.DataFrame
"""
self.logger.info("Building Vocabulary from training data captions")
        # Reserve slots for the special tokens prepended below
        num_vocab = (
            self.config_dict["dataset"]["num_vocab"]
            - self.config_dict["dataset"]["num_extra_tokens"]
        )
all_words = []
for text in train_df["Caption"]:
all_words += text.split()
topk_vocab_freq = Counter(all_words).most_common(num_vocab)
self.vocab = ["<START>", "<END>", "<PAD>", "<UNK>"] + [
i[0] for i in topk_vocab_freq
]
self.word2id = {w: i for i, w in enumerate(self.vocab)}
self.id2word = {v: k for k, v in self.word2id.items()}
def word_tokens(self, df):
"""
        Converts caption sentences to fixed-length arrays of token ids.
:param df: Captions DataFrame
:type df: pandas.DataFrame
:return: Tokens array (num_samples, seq_len)
:rtype: numpy.ndarray
"""
seq_len = self.config_dict["dataset"]["seq_len"]
        # Integer token-id matrix, one fixed-length row per caption
        tokens = np.zeros((len(df), seq_len), dtype=np.int64)
for i, text in enumerate(df["Caption"]):
words = ["<START>"] + text.split()[: seq_len - 2]
if len(words) < seq_len - 1:
words += ["<PAD>"] * (seq_len - 1 - len(words))
words += ["<END>"]
            for j, w in enumerate(words):
                # O(1) dict lookup; out-of-vocabulary words map to <UNK>
                tokens[i, j] = self.word2id.get(w, self.word2id["<UNK>"])
return tokens
def batched_ids2captions(self, tokens):
"""
        Converts arrays of token ids back into caption strings.
:param tokens: Tokens Array, 2D array (num_samples, seq_len)
:type tokens: numpy.ndarray
:return: List of decoded sentences
:rtype: list
"""
        # Map every token id back to its word in a single vectorized pass
        vect_func = np.vectorize(lambda x: self.id2word[x])
        tokens = vect_func(tokens)
        captions = []
        for words in tokens:
            # Drop the special tokens and join the remaining words
            caption = " ".join(
                word for word in words if word not in ["<START>", "<END>", "<PAD>"]
            )
            captions.append(caption)
        return captions