Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
177 changes: 177 additions & 0 deletions Jupyter_Notebook/Cancer_Prediction_Model.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
# Breast Cancer Prediction

#importing libraries
import numpy
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns


import os
os.getcwd()

#cahnging the directory using os library
#reading the database using os pandas
#display the dataframe using display function
os.chdir('C:\\Users\\Subod\\Downloads\\ML\\ML Day20- Projects\\Projects\\PRJ Cancer Prediction')
df=pd.read_csv("data.csv")
display(df)

#shape of dataset
df.shape

#taking the first 5 entries from dataframe using the head function
df.head()

#printing the Non-null Values( A method to find the null values)
print(df.info())

#return all the coulumns with null values(another method to find the null values)
display(df.isna().sum())

#describe the dataset
df.describe()

df['diagnosis']= df['diagnosis'].astype('category')
df.dtypes

# Get the count of malignant<M> adn Benign<B> cells
df['diagnosis'].value_counts()

sns.countplot(df['diagnosis'],label='count')

#Label_encoding(convert the value of M and B into 1 and 0)
from sklearn.preprocessing import LabelEncoder
labelencoder_Y = LabelEncoder()
df.iloc[:,1]=labelencoder_Y.fit_transform(df.iloc[:,1].values)

#verifying the encoding using the head funciton
df.head()

#Ploting some columns using the seaborn library
sns.pairplot(df.iloc[:,1:8], hue='diagnosis')

#get the corelation matrix
df.iloc[:,1:32].corr()

#Visualize the correlation
plt.rcParams['figure.figsize']=(120,108)
sns.set(font_scale=5.8)
sns.heatmap(df.iloc[:,1:32].corr(),annot=True,fmt=".0%")

#split the dataset into dependent(X) and independent(Y) datasets
X=df.iloc[:,2:31].values
Y=df.iloc[:,1].values
#It is no more a dataframe it is a list now so the head and other functions will not work on them and print funtion can be used

print(X)

print(Y)

#splitting the data into training and test dataset
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.20, random_state=0)

#features scaling
from sklearn.preprocessing import StandardScaler
X_train=StandardScaler().fit_transform(X_train)
X_test=StandardScaler().fit_transform(X_test)

#models / Algorithms

def models(X_train,Y_train):
#logistic regression
from sklearn.linear_model import LogisticRegression
log=LogisticRegression(random_state=0)
log.fit(X_train,Y_train)

#Decision Tree
from sklearn.tree import DecisionTreeClassifier
tree=DecisionTreeClassifier(random_state=0, criterion="entropy")
tree.fit(X_train,Y_train)

#Random Forest
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(random_state=0,n_estimators=10, criterion="entropy")
forest.fit(X_train,Y_train)

#SVC
clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
clf.fit(X, y)

print('[0]logistic regression accuracy:', log.score(X_train,Y_train))
print('[1]Decision Tree accuracy:', tree.score(X_train,Y_train))
print('[2]Random Forest accuracy:', forest.score(X_train,Y_train))


return log,tree,forest

model=models(X_train,Y_train)

#testing the models/result

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report


for i in range(len(model)):
print("Model",i)
print(classification_report(Y_test,model[i].predict(X_test)))
print("Accuracy :" , accuracy_score(Y_test,model[i].predict(X_test)))

#prediction of random-forest


pred=model[2].predict(X_test)
print('Predicted values')
print(pred)
print('Actual values')
print(Y_test)

#dump the model
import joblib
filename = 'finalized_model.sav'
joblib.dump(model[2], filename)