import pandas as pd, numpy as np, torch, scipy.sparse as sps
from sklearn.feature_extraction.text import TfidfVectorizer
from rime.util.score_array import auto_cast_lazy_score
from collections.abc import Iterable
class TF_IDF:
    """Score candidate items against each user's most recent history item.

    Every item is represented by one embedding row: either a precomputed
    vector from the ``embedding`` column of ``item_df``, or a tf-idf vector
    fitted on the ``TITLE`` column.  The score of a (user, item) pair is the
    dot product between the embedding of the user's last history item and the
    embedding of the candidate item.

    NOTE: the dot product equals cosine similarity only when the rows have
    unit L2 norm.  ``TfidfVectorizer`` normalizes rows by default
    (``norm='l2'``); user-supplied embeddings are used as given, so they must
    be pre-normalized if cosine similarity is intended.

    An extra all-zero row is appended after the item embeddings.
    ``Index.get_indexer`` returns ``-1`` for unknown labels, and index ``-1``
    selects that last row, so unknown or missing items score 0.
    """

    def __init__(self, item_df):
        """Build the zero-padded item embedding matrix.

        Args:
            item_df: DataFrame indexed by item id, with either an
                ``embedding`` column (one vector per item, takes precedence)
                or a ``TITLE`` column of strings.

        Raises:
            AssertionError: if neither ``TITLE`` nor ``embedding`` is present.
        """
        assert "TITLE" in item_df or "embedding" in item_df, "require TITLE or embedding"

        self.item_id = item_df.index

        if 'embedding' in item_df:
            tfidf_emb = np.vstack(item_df['embedding'].tolist())
            # Build the padding row from explicit zeros; ``row * 0`` would
            # turn NaN/inf entries of the first embedding into NaN.
            zeros = np.zeros_like(tfidf_emb[:1])
            self.tfidf_pad_zeros = np.vstack([tfidf_emb, zeros])
        else:
            titles = item_df['TITLE'].tolist()
            self.tfidf_fit = TfidfVectorizer().fit(titles)
            tfidf_emb = self.tfidf_fit.transform(titles)
            # Empty sparse row with matching width and dtype.
            zeros = sps.csr_matrix((1, tfidf_emb.shape[1]), dtype=tfidf_emb.dtype)
            # Request CSR explicitly: ``transform`` relies on row indexing.
            self.tfidf_pad_zeros = sps.vstack([tfidf_emb, zeros], format="csr")

    @staticmethod
    def _last_item(hist):
        """Return the last element of a non-empty history, else ``None``."""
        if isinstance(hist, Iterable) and len(hist) > 0:
            return hist[-1]
        return None

    def fit(self, *args, **kw):
        """Do nothing and return ``self``; all fitting happens in ``__init__``."""
        return self

    def transform(self, D):
        """Return the user-by-item score matrix for ``D``.

        Args:
            D: object exposing ``user_in_test`` (with a ``_hist_items`` column
                holding each user's item history) and ``item_in_test`` (whose
                index holds the candidate item ids).

        Returns:
            The product ``user_emb @ item_emb.T`` after both operands are
            passed through ``auto_cast_lazy_score`` (project helper; the
            concrete return type is defined there).
        """
        # Users with a missing or empty history map to None, which
        # get_indexer resolves to -1, i.e. the zero padding row.
        user_last_item = D.user_in_test['_hist_items'].apply(self._last_item)
        user_last_item_index = self.item_id.get_indexer(user_last_item.values)
        user_emb = self.tfidf_pad_zeros[user_last_item_index]

        item_index = self.item_id.get_indexer(D.item_in_test.index)
        item_emb = self.tfidf_pad_zeros[item_index]

        return auto_cast_lazy_score(user_emb) @ auto_cast_lazy_score(item_emb).T