-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathEDA Analysis.py
301 lines (180 loc) Β· 8.34 KB
/
EDA Analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
"""
@author: Kianoush
GitHUb: https://github.com/Kianoush-h
YouTube: https://www.youtube.com/channel/UCvf9_53f6n3YjNEA4NxAkJA
LinkedIn: https://www.linkedin.com/in/kianoush-haratiannejadi/
Email: [email protected]
"""
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# set style of visualization
sns.set_style("darkgrid")
sns.set_palette("RdBu")
# =============================================================================
# Gathering data from raw data
# =============================================================================
# read data set
data = pd.read_excel("data/Coffee Shop Sales.xlsx")
# see top 5 rows
head = data.head()
# see number of rows and columns
print(f" shape is: {data.shape}")
# check missing values
data.isna().sum()
# see quick info
data.info()
# See quick info of numeric data
data.describe()
# see quick info of categorical data
data.describe(include = object)
# check duplicated rows
data.duplicated().any()
# see unique values in each column
# 1- create new data frame with number of unique value in each column
columnValue = data.nunique().reset_index()
# 2- rename column name
columnValue.rename(columns = {"index" : "Column _name", 0 : "Uniue values"}, inplace = True)
# 3- see columns and number of unique values of each
print(columnValue)
# drop some columns such as "product_id", "transaction_id" and "store_id"
data.drop(columns = ["transaction_id", "store_id", "product_id"], inplace = True)
# =============================================================================
# Analysis & Visualizations
# =============================================================================
# create function to visualized categorical column using count plot
def count_plot(x_axis = None, y_axis = None, hue = None, rotation = 0, top = None):
"""
1) input : x_axis, column name, data type must be object or categorical
3) output : cout plot using seaborn modules, unique values in x-axis and frequency in y-axis
4) use bar_label to show frequency of each unique values above each column in graph
5) top parameter i use it to specify indexes i want to see it
"""
if x_axis: # if we neet to visualized in x-axis
order = data[x_axis].value_counts().iloc[:top].index
else : # if we neet to visualized in y-axis
order = data[y_axis].value_counts().iloc[:top].index
graph = sns.countplot(x = x_axis, y = y_axis, data = data, hue = hue, order = order, palette = "RdBu")
for container in graph.containers:
graph.bar_label(container)
plt.xticks(rotation = rotation)
plt.show()
# create function that visualized categorical column using pie plot
def pie_plot(column_name, explodeIndex = None):
"""
1) input : column name, column data type must be object or categorical
2) explodeIndex, is the index i need to explode it
2) output : circle chart that shows size of each unique values and percentage
"""
# Create explode list with zeros of size equal to the number of unique values
explodeList = [0] * data[column_name].nunique()
# Check and set explodeIndex value
if explodeIndex is not None:
explodeList[explodeIndex] = 0.1
# Create pie plot
plt.pie(data[column_name].value_counts(), labels = data[column_name].value_counts().index, shadow = True, autopct = "%1.1f%%", explode = explodeList)
plt.show()
# =============================================================================
# PART 1: Discovering transaction_date column
# =============================================================================
# Extract some information such as year , month and day
# add new column year
data["year"] = data["transaction_date"].dt.year
# add new column month
data["month"] = data["transaction_date"].dt.month_name()
# add new column day
data["day"] = data["transaction_date"].dt.day_name()
# =============================================================================
# PART 2: transactions over months
# =============================================================================
# see number of transaction in each month
count_plot(x_axis = "month")
plt.title("Sales Over Months")
# =============================================================================
# PART 3: transactions over days of week
# =============================================================================
# see top 5 day of transaction
# set figure size
plt.figure(figsize = (10,6))
# call function i create it in cell 12
count_plot(x_axis = "day")
plt.title("Transaction Over Days of Week")
# =============================================================================
# PART 4: transactions over hours
# =============================================================================
data["hour"] = data["transaction_time"].apply(lambda x : x.hour)
# see distribution
sns.kdeplot(data["hour"])
plt.show()
plt.title("Transaction Over Hours")
# =============================================================================
# PART 5: Discovering transaction_qty column
# =============================================================================
# see number of sales quantity
count_plot(x_axis = "transaction_qty")
plt.title("Sales Over Transaction Quantity")
# =============================================================================
# PART 6: Discovering store_location column
# =============================================================================
# see number of sales in each store
pie_plot(column_name = "store_location", explodeIndex = 0)
plt.title("Sales in Each Store")
# =============================================================================
# PART 7: Discovering product_category column
# =============================================================================
# see most common category
# set figure size
plt.figure(figsize = (15,6))
# call function i create it in cell 12
count_plot(x_axis = "product_category")
plt.title("Most Common Category")
# =============================================================================
# PART 8: Discovering product_type column
# =============================================================================
# Assuming 'counts' is the DataFrame obtained from your groupby operation
counts = data.groupby(["product_category", "product_type"]).size().reset_index(name = "count")
# Get unique categories
categories = counts['product_category'].unique()
# Create subplots for each category in a 4x2 grid
fig, axes = plt.subplots(5, 2, figsize = (15, 20))
# Flatten the axes for easier iteration
axes = axes.flatten()
for i, category in enumerate(categories):
# Filter data for each category
subset = counts[counts['product_category'] == category]
# Sort the data by 'count' column in descending order
subset = subset.sort_values('count', ascending = False)
# Create a bar plot for each category with sorted order
sns.barplot(x = 'count', y = 'product_type', data = subset, ax = axes[i], order = subset['product_type'])
axes[i].set_title(f'Product Types in {category}')
axes[i].set_ylabel('')
axes[i].set_xlabel('')
axes[i].tick_params(axis = 'x', rotation = 45)
axes[i].grid(True)
# Adding bar labels
for idx, bar in enumerate(axes[i].patches):
axes[i].text(bar.get_width(), bar.get_y() + bar.get_height() / 2, subset.iloc[idx]['count'], ha = 'left', va = 'center')
# Hide extra subplots if there are fewer categories than subplots
for j in range(len(categories), len(axes)):
axes[j].axis('off')
plt.tight_layout()
plt.show()
# =============================================================================
# Category VS Total revenue
# =============================================================================
data["Total revenue"] = data["transaction_qty"] * data["unit_price"]
# grouped product_type' and 'Total revenue'
category_revenue = data.groupby("product_type")["Total revenue"].sum().reset_index(name = "revenue")
# sort category by revenue
category_revenue = category_revenue.sort_values(by = 'revenue', ascending = False)
# set figure size
plt.figure(figsize = (15,6))
# create bar plot
sns.barplot(x = "product_type", y = "revenue", data = category_revenue, palette = "RdBu")
# set x-axis rotation
plt.xticks(rotation = 90)
plt.tight_layout()
plt.show()
plt.title("Category VS Total Revenue")