Source code for src.core.word2vec.huffman
import logging
[docs]
class Node:
"""
A class to initialize Node in a Huffman tree
:param word_idx: Word Id
:type word_idx: int
:param freq: Frequency of node
:type freq: int
:param left: Left nodes, defaults to None
:type left: list, optional
:param right: Right nodes, defaults to None
:type right: list, optional
"""
def __init__(self, word_idx, freq, left=None, right=None):
self.word_idx = word_idx
self.freq = freq
self.huffman_code = []
self.huffman_path = []
self.left = left
self.right = right
[docs]
class HuffmanBTree:
"""
Creates Huffman Binary Tree to perform Softmax
:param vocab_freq_dict: Vocabulary Frequency Dictionary
:type vocab_freq_dict: dict
"""
def __init__(self, vocab_freq_dict):
self.logger = logging.getLogger(__name__)
self.vocab = list(vocab_freq_dict.keys())
self.freq = list(vocab_freq_dict.values())
self.construct_tree()
self.word_code, self.word_path = {}, {}
self.generate_huffman_code(self.huffman_tree, [], [])
self.logger.info("Generated Huffman code for all the vocabulary")
self.left_huff_dict, self.right_huff_dict = {}, {}
self.separate_left_right_path()
[docs]
def construct_tree(self):
"""
Constructs Huffman Binary Tree
"""
node_list = []
for w, f in zip(self.vocab, self.freq):
node_list.append(Node(w, f))
count = len(self.vocab)
while len(node_list) > 1:
node_list = sorted(node_list, key=lambda a: a.freq, reverse=True)
left = node_list[-2]
right = node_list[-1]
freq = left.freq + right.freq
word_idx = count
self.huffman_tree = Node(word_idx, freq, left, right)
node_list = node_list[:-2]
node_list.append(self.huffman_tree)
count += 1
self.logger.info("Constructed Huffman Tree")
[docs]
def generate_huffman_code(self, tree, code, path):
"""
Generates Huffman code for vocabulary
:param tree: Node object that initialized HuffmanTree
:type tree: Node
:param code: Binary code for each vocab
:type code: list
:param path: Path for each vocab wrto Node Id
:type path: list
"""
if tree.left is None and tree.right is None:
self.word_code[tree.word_idx] = code
self.word_path[tree.word_idx] = path
else:
self.generate_huffman_code(tree.left, code + [1], path + [tree.word_idx])
self.generate_huffman_code(tree.right, code + [0], path + [tree.word_idx])
[docs]
def separate_left_right_path(self):
"""
Separates Left and Right paths
"""
for widx, code, path in zip(
self.word_code.keys(), self.word_code.values(), self.word_path.values()
):
left, right = [], []
for c, p in zip(code, path):
if c == 1:
left.append(p)
else:
right.append(p)
self.left_huff_dict[widx] = left
self.right_huff_dict[widx] = right
self.logger.info("Separated Paths into Left and Right branches")