AnanyaDas162 · SubodhSenpai · Oct 20, 2022
diff --git a/Jupyter_Notebook/Cancer_Prediction_Model.ipynb b/Jupyter_Notebook/Cancer_Prediction_Model.ipynb
@@ -0,0 +1,177 @@
+# Breast Cancer Prediction 
+
+#importing libraries
+import numpy
+import matplotlib.pyplot as plt
+import pandas as pd
+import seaborn as sns
+
+
+import os 
+os.getcwd()
+
+#cahnging the directory using os library
+#reading the database using os pandas
+#display the dataframe using display function
+os.chdir('C:\\Users\\Subod\\Downloads\\ML\\ML Day20- Projects\\Projects\\PRJ Cancer Prediction')
+df=pd.read_csv("data.csv")
+display(df)
+
+#shape of dataset
+df.shape
+
+#taking the first 5 entries from dataframe using the head function
+df.head()
+
+#printing the Non-null Values( A method to find the null values)
+print(df.info())
+
+#return all the coulumns with null values(another method to find the null values)
+display(df.isna().sum())
+
+#describe the dataset
+df.describe()
+
+df['diagnosis']= df['diagnosis'].astype('category')
+df.dtypes
+
+# Get the count of malignant<M> adn Benign<B> cells
+df['diagnosis'].value_counts()
+
+sns.countplot(df['diagnosis'],label='count')
+
+#Label_encoding(convert the value of M and B into 1 and 0)
+from sklearn.preprocessing import LabelEncoder
+labelencoder_Y = LabelEncoder()
+df.iloc[:,1]=labelencoder_Y.fit_transform(df.iloc[:,1].values)
+
+#verifying the encoding using the head funciton
+df.head()
+
+#Ploting some columns using the seaborn library
+sns.pairplot(df.iloc[:,1:8], hue='diagnosis')
+
+#get the corelation matrix
+df.iloc[:,1:32].corr()
+
+#Visualize the correlation
+plt.rcParams['figure.figsize']=(120,108)
+sns.set(font_scale=5.8)
+sns.heatmap(df.iloc[:,1:32].corr(),annot=True,fmt=".0%")
+
+#split the dataset into dependent(X) and independent(Y) datasets
+X=df.iloc[:,2:31].values
+Y=df.iloc[:,1].values
+#It is no more a dataframe it is a list now so the head and other functions will not work on them and print funtion can be used
+
+print(X)
+
+print(Y)
+
+#splitting the data into training and test dataset
+from sklearn.model_selection import train_test_split
+X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.20, random_state=0)
+
+#features scaling
+from sklearn.preprocessing import StandardScaler
+X_train=StandardScaler().fit_transform(X_train)
+X_test=StandardScaler().fit_transform(X_test)
+
+#models / Algorithms
+
+def models(X_train,Y_train):
+    #logistic regression
+    from sklearn.linear_model import LogisticRegression
+    log=LogisticRegression(random_state=0)
+    log.fit(X_train,Y_train)
+
+    #Decision Tree
+    from sklearn.tree import DecisionTreeClassifier
+    tree=DecisionTreeClassifier(random_state=0, criterion="entropy")
+    tree.fit(X_train,Y_train)
+
+    #Random Forest
+    from sklearn.ensemble import RandomForestClassifier
+    forest = RandomForestClassifier(random_state=0,n_estimators=10, criterion="entropy")
+    forest.fit(X_train,Y_train)
+
+    #SVC 
+    clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
+    clf.fit(X, y)
+
+    print('[0]logistic regression accuracy:', log.score(X_train,Y_train))
+    print('[1]Decision Tree accuracy:', tree.score(X_train,Y_train))
+    print('[2]Random Forest accuracy:', forest.score(X_train,Y_train))
+
+
+    return log,tree,forest
+
+model=models(X_train,Y_train)
+
+#testing the models/result
+
+from sklearn.metrics import accuracy_score
+from sklearn.metrics import classification_report
+
+
+for i in range(len(model)):
+    print("Model",i)
+    print(classification_report(Y_test,model[i].predict(X_test)))
+    print("Accuracy :" , accuracy_score(Y_test,model[i].predict(X_test)))
+
+#prediction of random-forest
+
+
+pred=model[2].predict(X_test)
+print('Predicted values')
+print(pred)
+print('Actual values')
+print(Y_test)
+
+#dump the model
+import joblib
+filename = 'finalized_model.sav'
+joblib.dump(model[2], filename)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+