Spaces:
Runtime error
Runtime error
| # -*- coding: utf-8 -*- | |
| """HS_Text-based_Recom_Metacritic.ipynb | |
| Automatically generated by Colaboratory. | |
| Original file is located at | |
| https://colab.research.google.com/drive/1MmWRwRJT04GVAO2SKCpwSqQ2bWghVGtQ | |
| """ | |
| import pandas as pd | |
| import numpy as np | |
| from fuzzywuzzy import fuzz | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.metrics.pairwise import cosine_similarity | |
# Load the Metacritic reviews, skipping malformed CSV rows instead of failing.
# pandas 1.3 deprecated ``error_bad_lines`` in favour of ``on_bad_lines`` and
# pandas 2.0 removed it, so try the current keyword first and fall back to the
# legacy one only when the installed pandas does not know ``on_bad_lines``.
try:
    df = pd.read_csv("Metacritic_Reviews_Only.csv", on_bad_lines="skip", encoding='utf-8')
except TypeError:
    df = pd.read_csv("Metacritic_Reviews_Only.csv", error_bad_lines=False, encoding='utf-8')
| #Remove title from review | |
def remove_title(row):
    """Return the row's review text with the game's own title removed.

    Args:
        row: A mapping-like row (e.g. a DataFrame row) exposing the
            ``'Game Title'`` and ``'Reviews'`` keys.

    Returns:
        The review text with every occurrence of the title replaced by an
        empty string. If the review is not a string (for example a missing
        value), or the title is missing or empty, the review is returned
        unchanged.
    """
    game_title = row['Game Title']
    body_text = row['Reviews']
    # This function is applied before null rows are dropped, so missing
    # values must pass through untouched instead of raising on ``.replace``.
    if not isinstance(body_text, str):
        return body_text
    if not isinstance(game_title, str) or not game_title:
        return body_text
    return body_text.replace(game_title, "")
# Drop the redundant column; tolerate its absence.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

# Drop null reviews BEFORE touching the text, then renumber the rows so index
# labels and row positions agree for every later lookup.
df = df.dropna().reset_index(drop=True)

# Strip each game's own title out of its review so the title words do not
# dominate the TF-IDF similarity.
df['Reviews'] = df.apply(remove_title, axis=1)

# Instantiate the vectorizer.
# A term must appear in at least 2 documents to be kept; terms that appear in
# more than 70% of the documents are discarded.
vectorizer = TfidfVectorizer(min_df=2, max_df=0.7)

# Fit and transform the review column.
vectorized_data = vectorizer.fit_transform(df['Reviews'])

# ``get_feature_names`` was removed in scikit-learn 1.2; prefer the current
# ``get_feature_names_out`` and fall back only on releases that lack it.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Create a DataFrame from the TF-IDF array, one row per review.
tfidf_df = pd.DataFrame(vectorized_data.toarray(), columns=feature_names)

# Assign the game titles to the index.
tfidf_df.index = df['Game Title']

# Cosine similarity between every pair of games.
cosine_similarity_array = cosine_similarity(tfidf_df)

# Wrap the similarity matrix in a DataFrame labelled by game title on both axes.
cosine_similarity_df = pd.DataFrame(cosine_similarity_array, index=tfidf_df.index, columns=tfidf_df.index)
| # create a function to find the closest title | |
def matching_score(a, b):
    """Score how closely strings ``a`` and ``b`` match.

    Delegates to ``fuzz.ratio`` and returns its result unchanged; two
    identical strings score 100.
    """
    score = fuzz.ratio(a, b)
    return score
| # exactly the same, the score becomes 100 | |
| #Convert index to title_year | |
def get_title_from_index(index):
    """Return the game title of the row in ``df`` whose index label is ``index``.

    The lookup compares against ``df.index`` (labels, not row positions) and
    returns the first matching title; an ``IndexError`` is raised when no row
    carries that label.
    """
    label_matches = df.index == index
    matching_titles = df.loc[label_matches, 'Game Title']
    return matching_titles.values[0]
| # A function to return the most similar title to the words a user type | |
| # Without this, the recommender only works when a user enters the exact title which the data has. | |
def find_closest_title(title):
    """Return the dataset title that best matches what the user typed.

    Every entry of ``df['Game Title']`` is scored against ``title`` with
    ``matching_score`` and the highest-scoring entry wins; on ties the entry
    that appears first in ``df`` is chosen.

    Args:
        title: Free-text game title entered by the user.

    Returns:
        A ``(closest_title, distance_score)`` tuple: the matched title from
        the dataset and its matching score.

    Raises:
        IndexError: If ``df`` contains no titles to match against.
    """
    titles = df['Game Title']
    if titles.empty:
        raise IndexError("no game titles available to match against")

    # matching_score(a, b): a is the current row's title, b is the user's text.
    scores = titles.apply(matching_score, b=title)

    # Select the winner by row POSITION and read the title by position too.
    # Looking it up by index label goes wrong once rows have been dropped,
    # because the labels then no longer line up with the positions.
    best_position = int(scores.values.argmax())
    closest_title = titles.iloc[best_position]
    distance_score = int(scores.iloc[best_position])
    return closest_title, distance_score
| # Bejeweled Twist, 100 | |
| #find_closest_title('Batman Arkham Knight') | |
| """# Build Recommender Function | |
| Our recommender function takes three game titles, three exclusion keywords and the maximum number of results to return. The keyword exclusion was added when I realised that the recommendations were returning a lot of DLCs and sequels, which doesn't make for a very useful recommender. | |
| By combining everything we've done from building the user profile onwards we will pull out the Top 5 games we want to recommend. | |
| 1. Text Match the closest title in the dataset | |
| 2. Assign number for the final ranking | |
| 3. Create your user profile based on previous games | |
| 4. Create TFIDF subset without previously mentioned titles | |
| 5. Calculate cosine similarity based on selected titles and convert back into DataFrame | |
| 6. Sort DataFrame by similarity | |
| 7. Return the most similar game titles that don't contain any of the excluded keywords | |
| """ | |
def recommend_games(game1, game2, game3, keyword1, keyword2, keyword3, max_results):
    """Recommend games whose reviews resemble those of three games the user enjoyed.

    The three inputs are matched to the closest titles in the dataset, their
    TF-IDF vectors are averaged into a user profile, and every other game is
    ranked by cosine similarity to that profile. Titles containing any of the
    exclusion keywords (case-insensitive) are skipped.

    Args:
        game1, game2, game3: Free-text titles of games the user enjoyed.
        keyword1, keyword2, keyword3: Substrings that disqualify a
            recommended title (e.g. to filter out sequels and DLC). Blank
            keywords are ignored.
        max_results: Maximum number of recommendations to produce.

    Returns:
        A DataFrame with the columns ``Rank``, ``Game Title`` and
        ``Match (%)``, holding at most ``max_results`` rows. The same
        information is also printed.
    """
    # Resolve the user's text to titles that exist in the dataset.
    title1, _ = find_closest_title(game1)
    title2, _ = find_closest_title(game2)
    title3, _ = find_closest_title(game3)

    print('Recommended because you played {}, {} and {}:\n'.format(title1, title2, title3))

    list_of_games_enjoyed = [title1, title2, title3]

    # User profile: mean TF-IDF vector of the games the user enjoyed.
    games_enjoyed_df = tfidf_df.reindex(list_of_games_enjoyed)
    user_prof = games_enjoyed_df.mean()

    # Never recommend the games the user already named.
    tfidf_subset_df = tfidf_df.drop(list_of_games_enjoyed, axis=0)

    similarity_array = cosine_similarity(user_prof.values.reshape(1, -1), tfidf_subset_df)
    similarity_df = pd.DataFrame(similarity_array.T, index=tfidf_subset_df.index, columns=["similarity_score"])

    # Sort from most to least similar.
    sorted_similarity_df = similarity_df.sort_values(by="similarity_score", ascending=False)

    # Inspect the most similar to the user preferences.
    print("Without Keywords Exclusions:")
    print(sorted_similarity_df.head())
    print("\n")
    print("With Keywords Exclusions:\n ")

    # A blank keyword is a substring of every title and would exclude
    # everything, so only non-blank keywords take part in the filter.
    excluded_keywords = [
        keyword.lower()
        for keyword in (keyword1, keyword2, keyword3)
        if keyword and keyword.strip()
    ]

    recommendations = []
    # Iterate title and score together so the reported score always belongs
    # to the title being reported, regardless of how many rows were skipped.
    for game_title, score in sorted_similarity_df["similarity_score"].items():
        if len(recommendations) >= max_results:
            break  # enough results; no need to scan the rest of the catalogue
        lowered_title = str(game_title).lower()
        if any(keyword in lowered_title for keyword in excluded_keywords):
            continue
        rank = len(recommendations) + 1
        match_percent = round(float(score) * 100, 2)
        print("#" + str(rank) + ": " + str(game_title) + ", " + str(match_percent) + "% " + "match")
        recommendations.append((rank, game_title, match_percent))

    return pd.DataFrame(recommendations, columns=["Rank", "Game Title", "Match (%)"])
| # recommend_games('Mortal Kombat', 'Street Fighter', 'Overwatch', 'Kombat', 'Fighter', 'Overwatch', 5) | |
| import gradio as gr | |
# ``gr.inputs.Slider`` is the legacy component namespace and no longer exists
# in current Gradio releases, where the component is exposed as ``gr.Slider``.
# Use the current name when available and fall back to the legacy one
# otherwise.
slider_component = getattr(gr, "Slider", None)
if slider_component is None:
    slider_component = gr.inputs.Slider

# Web UI: three game titles, three exclusion keywords and a result-count
# slider (1-20) are passed positionally to ``recommend_games``.
recommender_interface = gr.Interface(
    fn=recommend_games,
    inputs=["text", "text", "text", "text", "text", "text", slider_component(1, 20, step=1)],
    title="Text-based Recommendation Engine for Video Games",
    description="""This is a Recommendation Engine based on the review texts of Metacritic critics for games between 2011-2019.
You need to enter 3 games you've enjoyed playing followed by 3 keywords from those game titles so that I can avoid recommending the same games to you.""",
    examples=[
        ['Mortal Kombat', 'Street Fighter', 'Overwatch', 'Kombat', 'Fighter', 'Overwatch', 5],
        ["Batman Arkham Knight", "Dying Light", "Left 4 Dead", "Batman", "Dying", "Left", 10],
        ["Mario Kart", "Zelda", "Final Fantasy", "Mario", "Zelda", "Final", 7],
    ],
    outputs=["dataframe"],
)
recommender_interface.launch(debug=True)