-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
136 lines (76 loc) · 3.32 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
# -*- coding: utf-8 -*-
"""main.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1FFsKVtmeuD_YTA6hbDzV5hKnaCK1xpJ3
"""
"""# New Section"""
import pandas as pd
# Load the movie catalogue from the CSV file in the current working directory.
data = pd.read_csv('movie_dataset.csv')
print(data)
# Notebook-style inspection cells. The bare describe()/head() expressions only
# render a table when run as notebook cells; in a plain script their results
# are computed and discarded. info() prints its summary itself.
data.describe()
data.head()
data.info()
"""here we can check the possible values that are empty or null in the dataset using this code"""
# Per-column count of missing values (the result is only shown in a notebook).
data.isnull().sum()
"""check the columns and do the feature engineering to select the imp features columns for the pridiction"""
data.columns
# Feature selection: keep only the identifier, the title and the two text
# columns (overview, genre) that the recommender is built from.
# .copy() makes `movies` an independent frame, so adding the `tags` column
# below does not trigger pandas' SettingWithCopyWarning and never depends on
# whether the selection happens to be a view of `data`.
movies = data[['id', 'title', 'overview', 'genre']].copy()
movies
# Build one free-text `tags` field per movie from its overview and its genre;
# recommendations are later based on how similar these tags are.
# - fillna(''): a missing overview or genre would otherwise turn the whole tag
#   into NaN and discard the text that is present.
# - the ' ' separator keeps the last word of the overview from fusing with the
#   first genre name into a single token.
movies['tags'] = movies['overview'].fillna('') + ' ' + movies['genre'].fillna('')
movies
# overview and genre are now folded into `tags`; drop them so the working
# table is just id, title and tags.
new_data = movies.drop(columns=['overview', 'genre'])
new_data
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectoriser for the movie tags.
# max_features caps the vocabulary at the 10,000 most frequent terms, i.e. it
# bounds the number of columns of the count matrix; it is independent of how
# many movies (rows) the dataset has. stop_words='english' drops common
# English filler words before counting.
cv = CountVectorizer(max_features=10000, stop_words='english')
cv
# Convert the textual tags into count vectors, one row per movie.
# astype('U') coerces every tag to a unicode string so non-string values
# (e.g. NaN) are converted to text instead of breaking tokenisation.
# NOTE: .toarray() materialises a dense (movies x vocabulary) matrix; it is
# kept so `vector` stays a NumPy array for the code that uses it afterwards.
vector = cv.fit_transform(new_data['tags'].values.astype('U')).toarray()
# (number of movies, vocabulary size)
vector.shape
"""max_features = 10,000
count_vectors = 10,000
cosine similarities
1. action -> avatar
2. action -> iron man
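Movies with similar tags have vectors that lie close together, so the recommender ranks movies by how close their vectors are.
Cosine similarity (used below): cos(theta) = (A . B) / (|A| * |B|)
Euclidean distance, for comparison: sqrt((X1 - X2)^2 + (Y1 - Y2)^2)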
"""
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between the rows of `vector` (one row per movie),
# so similarity[i] holds movie i's score against every movie in the dataset.
similarity = cosine_similarity(vector)
# Bare expression: shows the matrix in a notebook, has no effect in a script.
similarity
"""here are the similiar movie pars that are similiar like action, romantic, etc"""
# Worked example: rank every movie against 'The Godfather'.
# The bare expressions below only display a value when run as notebook cells.
new_data[new_data['title'] == 'The Godfather']
# Positional row of the first matching title. The similarity matrix is indexed
# by position, so use the looked-up position rather than a hard-coded row
# number; this stays correct if the dataset order or index labels change.
godfather_row = (new_data['title'] == 'The Godfather').to_numpy().nonzero()[0][0]
godfather_row
# (movie position, similarity score) pairs, still in dataset order.
godfather_scores = list(enumerate(similarity[godfather_row]))
godfather_scores
# from google.colab import drive
# drive.mount('/content/drive')
# to save it automatically in the drive

# Sort by score, highest first, so the closest matches lead the list.
distance = sorted(godfather_scores, key=lambda pair: pair[1], reverse=True)
distance
# Keep only the five best-scoring entries and print their titles. A movie
# scores highest against its own tags, so it normally appears in this list.
for movie_position, _score in distance[0:5]:
    print(new_data['title'].iloc[movie_position])
# The titles printed above are the movies whose tags are most similar to the
# selected movie's tags.
def recommend(movie, top_n=5, *, catalog=None, scores=None):
    """Print the titles of the movies most similar to *movie*.

    Args:
        movie: Exact title to look up in the ``title`` column.
        top_n: Number of recommendations to print (default 5).
        catalog: DataFrame with a ``title`` column, one row per movie.
            Defaults to the module-level ``new_data``.
        scores: Square similarity matrix whose row/column order matches the
            row order of ``catalog``. Defaults to the module-level
            ``similarity``.

    Raises:
        IndexError: If no row of ``catalog`` has the requested title.
    """
    if catalog is None:
        catalog = new_data
    if scores is None:
        scores = similarity

    # The similarity matrix is indexed by row position, not by index label,
    # so locate the title positionally.
    matches = (catalog['title'] == movie).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"movie {movie!r} not found in the catalogue")
    query_row = matches[0]

    # Rank every movie by its similarity to the query, best first.
    ranked = sorted(
        enumerate(scores[query_row]),
        key=lambda pair: pair[1],
        reverse=True,
    )
    # Skip the query itself: it always matches its own tags perfectly and is
    # not a useful recommendation.
    recommended = [row for row, _score in ranked if row != query_row][:top_n]
    for row in recommended:
        print(catalog['title'].iloc[row])
# Example run: print the recommendations for "Iron Man".
recommend("Iron Man")
import pickle

# Persist the processed movie table and the similarity matrix so the web
# application can load them instead of recomputing them.
# `with` closes each file as soon as it has been written, so the pickles are
# fully flushed to disk before anything reads them back.
with open('movies.pkl', 'wb') as movies_file:
    pickle.dump(new_data, movies_file)
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)

# Read the table back as a quick check that the pickle is loadable.
# Only unpickle files this script wrote itself: pickle.load can execute
# arbitrary code from an untrusted file.
with open('movies.pkl', 'rb') as movies_file:
    pickle.load(movies_file)
import streamlit as st
# Page header for the Streamlit front end.
st.header("movie_recommender_system")