Is there a way to use pre-trained embeddings with tf-idf in TensorFlow?

The most common approach is to multiply each word vector by its corresponding tf-idf score; you often see this approach in academic papers. You could do something like this:

Create tf-idf scores:

import tensorflow as tf
import numpy as np
import gensim.downloader as api
from sklearn.feature_extraction.text import TfidfVectorizer
import collections

def tf_idf_word2weight(text):
    print("Creating TfidfVectorizer...")
    # each document is a list containing a single string, so join it into one string
    tfidf = TfidfVectorizer(preprocessor=' '.join)
    tfidf.fit(text)

    # if a word was never seen, treat it as at least as infrequent as any known word
    max_idf = max(tfidf.idf_)
    return collections.defaultdict(
        lambda: max_idf,
        [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

text = [['she let the balloon float up into the air with her hopes and dreams'],
        ['the old rusted farm equipment surrounded the house predicting its demise'],
        ['he was so preoccupied with whether or not he could that he failed to stop to consider if he should']]

tf_idf = tf_idf_word2weight(text)

# flatten the list of single-string documents into a flat array of strings
text = np.concatenate(text)
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(text)
text_sequences = tokenizer.texts_to_sequences(text)
text_sequences = tf.keras.preprocessing.sequence.pad_sequences(text_sequences, padding='post')
# reserve index 0 for padding
vocab_size = len(tokenizer.word_index) + 1
print(tf_idf.items())
print(vocab_size)
Creating TfidfVectorizer...
dict_items([('she', 1.6931471805599454), ('let', 1.6931471805599454), ('the', 1.2876820724517808), ('balloon', 1.6931471805599454), ('float', 1.6931471805599454), ('up', 1.6931471805599454), ('into', 1.6931471805599454), ('air', 1.6931471805599454), ('with', 1.2876820724517808), ('her', 1.6931471805599454), ('hopes', 1.6931471805599454), ('and', 1.6931471805599454), ('dreams', 1.6931471805599454), ('old', 1.6931471805599454), ('rusted', 1.6931471805599454), ('farm', 1.6931471805599454), ('equipment', 1.6931471805599454), ('surrounded', 1.6931471805599454), ('house', 1.6931471805599454), ('predicting', 1.6931471805599454), ('its', 1.6931471805599454), ('demise', 1.6931471805599454), ('he', 1.6931471805599454), ('was', 1.6931471805599454), ('so', 1.6931471805599454), ('preoccupied', 1.6931471805599454), ('whether', 1.6931471805599454), ('or', 1.6931471805599454), ('not', 1.6931471805599454), ('could', 1.6931471805599454), ('that', 1.6931471805599454), ('failed', 1.6931471805599454), ('to', 1.6931471805599454), ('stop', 1.6931471805599454), ('consider', 1.6931471805599454), ('if', 1.6931471805599454), ('should', 1.6931471805599454)])
38
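
Note that the defaultdict falls back to max_idf for any word it has never seen, so every lookup returns a usable weight. For instance (with 'zeppelin' as a hypothetical out-of-vocabulary word):

# a frequent word from the corpus gets a low idf weight
print(tf_idf['the'])       # 1.2876820724517808
# an unseen word falls back to max_idf
print(tf_idf['zeppelin'])  # 1.6931471805599454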

Create the tf-idf-weighted embedding matrix:

# load 25-dimensional GloVe vectors trained on Twitter data
model = api.load("glove-twitter-25")
embedding_dim = 25
weight_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    try:
        # scale the pre-trained vector by the word's idf weight
        embedding_vector = model[word] * tf_idf[word]
        weight_matrix[i] = embedding_vector
    except KeyError:
        # word is not in the GloVe vocabulary - fall back to a random vector
        weight_matrix[i] = np.random.uniform(-5, 5, embedding_dim)
print(weight_matrix.shape)
(38, 25)
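
You can then hand weight_matrix to a Keras Embedding layer. A minimal sketch (the layer is frozen here; set trainable=True if you want to fine-tune the weighted vectors):

embedding_layer = tf.keras.layers.Embedding(
    vocab_size,
    embedding_dim,
    embeddings_initializer=tf.keras.initializers.Constant(weight_matrix),
    trainable=False)  # freeze the pre-trained, idf-weighted vectors

# the padded sequences from above can be fed straight in
embedded = embedding_layer(text_sequences)
print(embedded.shape)  # (num_sentences, max_sequence_length, embedding_dim)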
