diff --git a/Jupyter_Notebook/Cancer_Prediction_Model.ipynb b/Jupyter_Notebook/Cancer_Prediction_Model.ipynb
new file mode 100644
index 0000000..ca8e150
--- /dev/null
+++ b/Jupyter_Notebook/Cancer_Prediction_Model.ipynb
@@ -0,0 +1,177 @@
+# Breast Cancer Prediction
+
+# importing libraries
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+import seaborn as sns
+
+import os
+os.getcwd()
+
+# change the working directory with the os library,
+# read the dataset with pandas,
+# and show the dataframe with display()
+os.chdir('C:\\Users\\Subod\\Downloads\\ML\\ML Day20- Projects\\Projects\\PRJ Cancer Prediction')
+df = pd.read_csv("data.csv")
+display(df)
+
+# shape of the dataset
+df.shape
+
+# first 5 rows of the dataframe, using the head function
+df.head()
+
+# print the non-null counts (one way to find null values)
+print(df.info())
+
+# count the null values per column (another way to find null values)
+display(df.isna().sum())
+
+# describe the dataset
+df.describe()
+
+df['diagnosis'] = df['diagnosis'].astype('category')
+df.dtypes
+
+# get the count of Malignant and Benign cells
+df['diagnosis'].value_counts()
+
+sns.countplot(x='diagnosis', data=df)
+
+# label encoding (convert the values M and B into 1 and 0);
+# assigning to the column replaces the categorical dtype with the encoded ints
+from sklearn.preprocessing import LabelEncoder
+labelencoder_Y = LabelEncoder()
+df['diagnosis'] = labelencoder_Y.fit_transform(df['diagnosis'])
+
+# verify the encoding using the head function
+df.head()
+
+# plot some columns using the seaborn library
+sns.pairplot(df.iloc[:, 1:8], hue='diagnosis')
+
+# get the correlation matrix
+df.iloc[:, 1:32].corr()
+
+# visualize the correlation
+plt.rcParams['figure.figsize'] = (120, 108)
+sns.set(font_scale=5.8)
+sns.heatmap(df.iloc[:, 1:32].corr(), annot=True, fmt=".0%")
+
+# split the dataset into the independent features (X) and the dependent target (Y);
+# columns 2..31 hold the 30 feature columns
+X = df.iloc[:, 2:32].values
+Y = df.iloc[:, 1].values
+# X and Y are NumPy arrays now, not dataframes, so head() and the other
+# dataframe methods will not work on them; use print() instead
+
+print(X)
+
+print(Y)
+
+# splitting the data into training and test datasets
+from sklearn.model_selection import train_test_split
+X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=0)
+
+# feature scaling: fit the scaler on the training set only and reuse it on the
+# test set, so both are scaled consistently and no test data leaks into the fit
+from sklearn.preprocessing import StandardScaler
+sc = StandardScaler()
+X_train = sc.fit_transform(X_train)
+X_test = sc.transform(X_test)
+
+# models / algorithms
+
+def models(X_train, Y_train):
+    # logistic regression
+    from sklearn.linear_model import LogisticRegression
+    log = LogisticRegression(random_state=0)
+    log.fit(X_train, Y_train)
+
+    # decision tree
+    from sklearn.tree import DecisionTreeClassifier
+    tree = DecisionTreeClassifier(random_state=0, criterion="entropy")
+    tree.fit(X_train, Y_train)
+
+    # random forest
+    from sklearn.ensemble import RandomForestClassifier
+    forest = RandomForestClassifier(random_state=0, n_estimators=10, criterion="entropy")
+    forest.fit(X_train, Y_train)
+
+    # SVC (the data is already scaled above, so fit it directly)
+    from sklearn.svm import SVC
+    svc = SVC(gamma='auto')
+    svc.fit(X_train, Y_train)
+
+    print('[0]Logistic Regression accuracy:', log.score(X_train, Y_train))
+    print('[1]Decision Tree accuracy:', tree.score(X_train, Y_train))
+    print('[2]Random Forest accuracy:', forest.score(X_train, Y_train))
+    print('[3]SVC accuracy:', svc.score(X_train, Y_train))
+
+    return log, tree, forest, svc
+
+model = models(X_train, Y_train)
+
+# testing the models / results
+
+from sklearn.metrics import accuracy_score
+from sklearn.metrics import classification_report
+
+for i in range(len(model)):
+    print("Model", i)
+    print(classification_report(Y_test, model[i].predict(X_test)))
+    print("Accuracy:", accuracy_score(Y_test, model[i].predict(X_test)))
+
+# predictions of the random forest
+pred = model[2].predict(X_test)
+print('Predicted values')
+print(pred)
+print('Actual values')
+print(Y_test)
+
+# dump the model
+import joblib
+filename = 'finalized_model.sav'
+joblib.dump(model[2], filename)
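+
+# A minimal sketch of loading the dumped model back and reusing it, assuming
+# 'finalized_model.sav' is the file written by the joblib.dump call above and
+# that X_test/Y_test are still the scaled test arrays from this notebook.
+loaded_model = joblib.load('finalized_model.sav')
+print('Reloaded model accuracy:', accuracy_score(Y_test, loaded_model.predict(X_test)))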