Hello guys
My friend asked me if he could have my code, and I want to give it to him, but I'm sure he won't be able to rewrite it himself. Could somebody please rewrite the following code for me?
from math import log
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Shared TF-IDF vectoriser used by cos_sim(), which re-fits it on every call.
vectorizer = TfidfVectorizer()
# Candidate rows to search, taken from imdb_all by position and ordered by
# 'rating', highest first.
# NOTE(review): `imdb_all` and `intersection` are not defined in this snippet;
# presumably a DataFrame of movies and a list of positional row indices built
# in an earlier step -- confirm.
posting_list = imdb_all.iloc[intersection].sort_values('rating', ascending=False)
def create_query_term_count(posting_list, query_terms):
    """Count query-term hits per plot and return the matching rows, best first.

    Two columns are added to ``posting_list`` in place:

    * ``term_count`` -- number of whitespace-separated tokens in ``plotterms``.
    * ``query_term_count`` -- number of those tokens that equal one of the
      whitespace-separated terms of ``query_terms`` (repeated tokens are
      counted every time they occur).

    Args:
        posting_list: DataFrame with a string column ``plotterms``.
        query_terms: Query string; it is split on whitespace into terms.

    Returns:
        The rows of ``posting_list`` whose ``query_term_count`` is greater
        than zero, sorted by ``query_term_count`` in descending order.
    """
    # Split the query a single time and keep it as a set so that the
    # per-token membership test below does not re-split the query or scan a
    # list for every token of every plot.
    wanted = set(query_terms.split())

    posting_list['term_count'] = posting_list['plotterms'].apply(
        lambda plot: len(plot.split())
    )
    # Count how many tokens of the plot are query terms.
    posting_list['query_term_count'] = posting_list['plotterms'].apply(
        lambda plot: sum(token in wanted for token in plot.split())
    )

    # Keep only plots with at least one hit, most hits first.
    has_match = posting_list['query_term_count'] > 0
    return posting_list[has_match].sort_values('query_term_count', ascending=False)
def inverse_document_frequency(all_documents, terms):
    """Return one smoothed inverse-document-frequency value for a query.

    For every whitespace-separated term in ``terms`` the number of documents
    whose ``plotterms`` contain that term as a token is counted, and these
    per-term counts are added together.  The result is
    ``1 + ln(number_of_documents / summed_count)``.

    Note that the counts are summed across terms, so a document containing
    two different query terms contributes twice, and a term repeated in the
    query is counted once per repetition.

    Args:
        all_documents: DataFrame with a string column ``plotterms``.
        terms: Query string; it is split on whitespace into terms.

    Returns:
        The IDF as a float, or ``1.0`` when no document contains any term
        (which also covers an empty query or an empty corpus).
    """
    # Tokenise each document once, instead of re-splitting every plot for
    # every query term while walking the frame row by row.
    doc_token_sets = [set(plot.split()) for plot in all_documents['plotterms']]

    num_documents_with_this_term = sum(
        term in tokens
        for term in terms.split()
        for tokens in doc_token_sets
    )

    if num_documents_with_this_term > 0:
        return 1.0 + log(float(len(all_documents)) / num_documents_with_this_term)
    return 1.0
def cos_sim(df, query):
    """Score plots against ``query`` by TF-IDF cosine similarity.

    The module-level ``vectorizer`` is fitted on ``imdb_all['plotterms']``
    (the whole corpus) on every call, the query is transformed into the same
    vector space, and one similarity per corpus row is stored in
    ``imdb_all['cos_sim']``.  The scores are then also copied onto ``df`` so
    that the frame handed in by the caller actually carries the result.

    Args:
        df: DataFrame to annotate with a ``cos_sim`` column.  Its index labels
            are expected to be labels of ``imdb_all`` (true for frames derived
            from ``imdb_all`` by row selection); labels not found there get
            NaN.
        query: Query string to compare every plot against.

    Returns:
        None.  ``imdb_all`` and ``df`` are modified in place.
    """
    X = vectorizer.fit_transform(imdb_all['plotterms'])
    query_vec = vectorizer.transform([query])
    # cosine_similarity returns an (n_documents, 1) array; flatten it to one
    # score per corpus row.
    imdb_all['cos_sim'] = cosine_similarity(X, query_vec).flatten()
    # Assigning a Series aligns on index labels, so each row of df receives
    # the score of the corpus row with the same label.
    # NOTE(review): relies on imdb_all having unique index labels -- confirm.
    df['cos_sim'] = imdb_all['cos_sim']
def tf_idf(posting_list, query):
    """Add ``term_frequency`` and ``tf_idf`` columns to ``posting_list`` in place.

    ``term_frequency`` is ``query_term_count / term_count`` per row, so both
    of those columns must already exist.  ``tf_idf`` is that frequency
    multiplied by the single IDF value that ``inverse_document_frequency``
    returns for ``query`` over the module-level ``imdb_all``.

    Args:
        posting_list: DataFrame with ``query_term_count`` and ``term_count``.
        query: Query string passed on to ``inverse_document_frequency``.

    Returns:
        None.
    """
    # Share of each plot's tokens that are query terms.
    frequency = posting_list['query_term_count'].div(posting_list['term_count'])
    posting_list['term_frequency'] = frequency

    # One corpus-wide weight for the whole query.
    corpus_weight = inverse_document_frequency(imdb_all, query)

    posting_list['tf_idf'] = frequency.mul(corpus_weight)
def search(query):
    """Run ``query`` against the module-level ``posting_list``.

    The rows matching at least one query term are selected with
    ``create_query_term_count`` and then passed, together with the query, to
    ``tf_idf`` and ``cos_sim`` in that order.

    Args:
        query: Query string of whitespace-separated terms.

    Returns:
        The DataFrame returned by ``create_query_term_count`` after both
        scoring functions have been called on it.
    """
    matches = create_query_term_count(posting_list, query)

    # Both scoring steps return nothing; they are called for their side effects.
    tf_idf(matches, query)
    cos_sim(matches, query)

    return matches
# Sample queries: the two-word phrase and each of its words on its own.
queries = ['american dream', 'american', 'dream']

# Run every query and show its result table.
# NOTE(review): `display` is not imported in this snippet; presumably the
# IPython/Jupyter notebook built-in -- confirm.
for query in queries:
    query_list = search(query)
    display(query_list)
best regards
willien