The most common approach is to multiply each word vector by its corresponding tf-idf score; you will often see this approach in academic papers. You could do something like this:

Create the tf-idf scores:
import tensorflow as tf
import numpy as np
import gensim.downloader as api
from sklearn.feature_extraction.text import TfidfVectorizer
import collections

def tf_idf_word2weight(text):
    print("Creating TfidfVectorizer...")
    # Each document arrives as a list of strings, so join it into one string first
    tfidf = TfidfVectorizer(preprocessor=' '.join)
    tfidf.fit(text)
    # If a word was never seen, treat it as at least as infrequent as any known word
    max_idf = max(tfidf.idf_)
    return collections.defaultdict(
        lambda: max_idf,
        [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

text = [['she let the balloon float up into the air with her hopes and dreams'],
        ['the old rusted farm equipment surrounded the house predicting its demise'],
        ['he was so preoccupied with whether or not he could that he failed to stop to consider if he should']]

tf_idf = tf_idf_word2weight(text)

text = np.concatenate(text)
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(text)
text_sequences = tokenizer.texts_to_sequences(text)
text_sequences = tf.keras.preprocessing.sequence.pad_sequences(text_sequences, padding='post')
vocab_size = len(tokenizer.word_index) + 1
print(tf_idf.items())
print(vocab_size)
Creating TfidfVectorizer...
dict_items([('she', 1.6931471805599454), ('let', 1.6931471805599454), ('the', 1.2876820724517808), ('balloon', 1.6931471805599454), ('float', 1.6931471805599454), ('up', 1.6931471805599454), ('into', 1.6931471805599454), ('air', 1.6931471805599454), ('with', 1.2876820724517808), ('her', 1.6931471805599454), ('hopes', 1.6931471805599454), ('and', 1.6931471805599454), ('dreams', 1.6931471805599454), ('old', 1.6931471805599454), ('rusted', 1.6931471805599454), ('farm', 1.6931471805599454), ('equipment', 1.6931471805599454), ('surrounded', 1.6931471805599454), ('house', 1.6931471805599454), ('predicting', 1.6931471805599454), ('its', 1.6931471805599454), ('demise', 1.6931471805599454), ('he', 1.6931471805599454), ('was', 1.6931471805599454), ('so', 1.6931471805599454), ('preoccupied', 1.6931471805599454), ('whether', 1.6931471805599454), ('or', 1.6931471805599454), ('not', 1.6931471805599454), ('could', 1.6931471805599454), ('that', 1.6931471805599454), ('failed', 1.6931471805599454), ('to', 1.6931471805599454), ('stop', 1.6931471805599454), ('consider', 1.6931471805599454), ('if', 1.6931471805599454), ('should', 1.6931471805599454)])
38
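Because the weights live in a collections.defaultdict, any word that was never seen during fitting falls back to max_idf, as the comment in the function notes. A quick sanity check (the word 'zebra' is just a hypothetical out-of-vocabulary example) should print the max_idf value seen in the output above:

print(tf_idf['zebra'])  # unseen word falls back to max_idf
1.6931471805599454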
Create the tf-idf-weighted embedding matrix:
model = api.load("glove-twitter-25")
embedding_dim = 25
weight_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    try:
        # Scale the pretrained GloVe vector by the word's tf-idf weight
        embedding_vector = model[word] * tf_idf[word]
        weight_matrix[i] = embedding_vector
    except KeyError:
        # Word not in the GloVe vocabulary: fall back to a random vector
        weight_matrix[i] = np.random.uniform(-5, 5, embedding_dim)
print(weight_matrix.shape)
(38, 25)
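From here, the weighted matrix can be used like any other pretrained embedding matrix. As a minimal sketch (the pooling and dense head below are illustrative assumptions, not part of the approach itself), you could load it into a frozen Keras Embedding layer:

embedding_layer = tf.keras.layers.Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    embeddings_initializer=tf.keras.initializers.Constant(weight_matrix),
    trainable=False)  # keep the tf-idf-weighted vectors fixed
# Hypothetical downstream model, just to show the layer in context
clf = tf.keras.Sequential([
    embedding_layer,
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(1, activation='sigmoid')])
print(clf(text_sequences[:1]).shape)
(1, 1)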