-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathdataexploration.py
More file actions
97 lines (70 loc) · 4.14 KB
/
dataexploration.py
File metadata and controls
97 lines (70 loc) · 4.14 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import streamlit as st
import pandas as pd
import dataframefunctions
import plots
import featuresanalysis
# Exploration modes offered to the user in the sidebar selectbox of load_page.
POSSIBLE_DATAEXP_ACTIONS = ["Dataset first look", "Plots", "Features"]
def load_page(dataframe):
    """Entry point of the data exploration page.

    Shows an error when no dataset is available. Otherwise asks, through a
    sidebar selectbox, which kind of exploration to run and renders the
    matching view.
    """
    # Guard clause: nothing can be explored without a dataset.
    if dataframe is None:
        st.error("Please upload your dataset!")
        return

    chosen_action = st.sidebar.selectbox("What do you want to explore?", POSSIBLE_DATAEXP_ACTIONS)

    # The two delegating views are handled first; each one owns its whole page.
    if chosen_action == "Plots":
        plots.load_page(dataframe)
        return
    if chosen_action == "Features":
        featuresanalysis.load_page(dataframe)
        return
    if chosen_action != "Dataset first look":
        return

    # "Dataset first look": the head of the data plus two opt-in analyses.
    render_first_look(dataframe)

    wants_missing_values = st.sidebar.checkbox("Compute missing values")
    if wants_missing_values:
        render_missing_data(dataframe)

    wants_correlation = st.sidebar.checkbox("Compute linear correlation")
    if wants_correlation:
        render_linear_correlation(dataframe)
def render_missing_data(dataframe):
    """Show a table of missing values and missing percentages.

    Both series come from ``dataframefunctions.get_missing_values`` and are
    placed side by side under the column keys "Total" and "percent".
    """
    totals, percentages = dataframefunctions.get_missing_values(dataframe)
    st.markdown("## **Missing values :mag:** ##")
    summary_table = pd.concat(
        [totals, percentages],
        axis=1,
        keys=["Total", "percent"],
    )
    st.dataframe(summary_table)
def render_first_look(dataframe):
    """Render the head of the dataset, then comments about its content.

    A sidebar slider (min 1, max 150, initial value 10) selects how many rows
    are shown. A sidebar checkbox, ticked by default, toggles whether each
    cell is styled through ``dataframefunctions.color_null_red``. Comments
    regarding instances, columns and missing values follow the table.
    """
    number_of_rows = st.sidebar.slider('Number of rows', 1, 150, 10)
    st.markdown("## **Exploring the dataset :mag:** ##")
    if st.sidebar.checkbox("Color NaN values in red", value=True):
        styler = dataframe.head(number_of_rows).style
        # Styler.applymap was renamed to Styler.map in pandas 2.1 and the old
        # name is deprecated. Prefer the new name and fall back to the old one
        # so the page keeps working on older pandas versions.
        apply_elementwise = getattr(styler, "map", None) or styler.applymap
        st.dataframe(apply_elementwise(dataframefunctions.color_null_red))
    else:
        st.dataframe(dataframe.head(number_of_rows))
    render_firstlook_comments(dataframe)
# TODO: improve this so that all column types are considered
def render_firstlook_comments(dataframe):
    """Make a first analysis of the dataset and show comments based on it.

    Three bullet points are written:

    * number of observations and variables, with their integer ratio;
    * number of categorical and numerical columns, each with one example
      column (empty string when there is none);
    * total number of missing values, with its percentage over all cells.

    An empty dataset (no rows or no columns) reports a ratio of 0 and 0.00%
    missing values instead of failing on a division by zero.
    """
    num_instances, num_features = dataframe.shape
    categorical_columns = dataframefunctions.get_categorical_columns(dataframe)
    numerical_columns = dataframefunctions.get_numeric_columns(dataframe)
    # len() is used on purpose: the helpers' return type is not visible here,
    # and a pandas Index/array would not support a plain truthiness test.
    cat_column = categorical_columns[0] if len(categorical_columns) > 0 else ""
    num_column = numerical_columns[0] if len(numerical_columns) > 0 else ""
    total_missing_values = dataframe.isnull().sum().sum()

    # Guard both divisions so that an empty frame does not crash the page.
    total_cells = num_instances * num_features
    instances_features_ratio = num_instances // num_features if num_features else 0
    missing_percentage = 100 * total_missing_values / total_cells if total_cells else 0.0

    st.write(
        "* The dataset has **%d** observations and **%d** variables. "
        "Hence, the _instances-features ratio_ is ~**%d**."
        % (num_instances, num_features, instances_features_ratio)
    )
    st.write(
        "* The dataset has **%d** categorical columns (e.g. %s) and **%d** numerical columns (e.g. %s)."
        % (len(categorical_columns), cat_column, len(numerical_columns), num_column)
    )
    st.write(
        "* Total number of missing values: **%d** (~**%.2f**%%)."
        % (total_missing_values, missing_percentage)
    )
def render_linear_correlation(dataframe):
    """Render the linear correlation between the features and the label.

    The label is taken to be the last column of ``dataframe``. When that
    column is categorical, an explanatory message is shown instead and no
    correlation is computed.
    """
    # No whitespace before the closing "**": with a space there the markers
    # are not parsed as bold and the asterisks are displayed literally.
    st.markdown("## **Linear correlation** ##")
    label_name = dataframe.columns[-1]

    # If the label is categorical, show an error and stop.
    if dataframefunctions.is_categorical(dataframe[label_name]):
        display_correlation_error()
        return

    positive_corr = dataframefunctions.get_linear_correlation(dataframe, label_name, positive=True)
    negative_corr = dataframefunctions.get_linear_correlation(dataframe, label_name, positive=False)
    st.write('Positively correlated features :chart_with_upwards_trend:', positive_corr)
    st.write('Negatively correlated features :chart_with_downwards_trend:', negative_corr)
def display_correlation_error():
    """Explain that a categorical label has no linear correlation.

    Writes three messages in order: a row of warning emojis, the explanation,
    and a link to further reading.
    """
    messages = (
        ":no_entry::no_entry::no_entry:",
        "It's **not** possible to determine a linear correlation with a categorical label.",
        "For more info, please check [this link.]"
        "(https://stackoverflow.com/questions/47894387/how-to-correlate-an-ordinal-categorical-column-in-pandas)",
    )
    for message in messages:
        st.write(message)