-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcyclistic_2023_analysis.sql
318 lines (290 loc) · 7.32 KB
/
cyclistic_2023_analysis.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
CREATE TABLE cyclistic_2023 (
ride_id varchar(20) NOT NULL,
bike_type varchar(15),
start_time timestamp(2),
end_time timestamp(2),
starting_station text,
ending_station text,
user_type text,
ride_length numeric,
day_of_week text,
month text
);
-- Returning all columns:
SELECT *
FROM cyclistic_2023
LIMIT 100;
-- Returning all columns and order rows by start time:
SELECT *
FROM cyclistic_2023
ORDER BY start_time
LIMIT 100;
-- Confirming row count:
SELECT COUNT(*)
FROM cyclistic_2023;
-- Number of users by user type:
SELECT
COUNT(CASE WHEN user_type = 'member' THEN 1 END) AS number_of_members,
COUNT(CASE WHEN user_type = 'casual' THEN 1 END) AS number_of_casuals
FROM cyclistic_2023;
-- Calculating percentage of users by type:
SELECT
user_type,
COUNT(*) AS users,
ROUND(COUNT(*) * 100.0 / (SELECT COUNT(*) FROM cyclistic_2023), 1)
FROM cyclistic_2023
WHERE user_type IN ('casual', 'member')
GROUP BY user_type;
-- Number of bikes used in 2023 by bike type:
SELECT
bike_type,
COUNT(*) AS bikes_used
FROM cyclistic_2023
WHERE bike_type IN ('classic_bike', 'electric_bike')
GROUP BY 1;
-- Bikes used by user type:
SELECT
user_type,
bike_type,
COUNT(*) AS bikes_used
FROM cyclistic_2023
WHERE bike_type IN ('classic_bike', 'electric_bike')
GROUP BY 1, 2;
-- The distribution of bikes among members by count and percentage:
WITH members_cte AS (
SELECT *
FROM cyclistic_2023
WHERE user_type = 'member'
)
SELECT
user_type,
bike_type,
COUNT(*) AS bikes_used,
ROUND(COUNT(*) * 100.0 / (SELECT COUNT(*) FROM members_cte), 1) AS percentage
FROM members_cte
WHERE bike_type IN ('classic_bike', 'electric_bike')
GROUP BY 1, 2;
-- The distribution of bikes among casual riders by count and percentage:
WITH casuals_cte AS (
SELECT *
FROM cyclistic_2023
WHERE user_type = 'casual'
)
SELECT
user_type,
bike_type,
COUNT(*) AS bikes_used,
ROUND(COUNT(*) * 100.0 / (SELECT COUNT(*) FROM casuals_cte), 1) AS percentage
FROM casuals_cte
WHERE bike_type IN ('classic_bike', 'electric_bike')
GROUP BY 1, 2;
-- The total number of rides by day in 2023:
SELECT
day_of_week,
COUNT(*) AS total_rides
FROM cyclistic_2023
GROUP BY day_of_week
ORDER BY
CASE
WHEN day_of_week = 'Sunday' THEN 1
WHEN day_of_week = 'Monday' THEN 2
WHEN day_of_week = 'Tuesday' THEN 3
WHEN day_of_week = 'Wednesday' THEN 4
WHEN day_of_week = 'Thursday' THEN 5
WHEN day_of_week = 'Friday' THEN 6
WHEN day_of_week = 'Saturday' THEN 7
END ASC;
-- What was the most active day for members? What was the most active day for casual riders?:
SELECT
user_type,
day_of_week,
COUNT(*) AS total_rides
FROM cyclistic_2023
GROUP BY 1, 2
ORDER BY
CASE
WHEN day_of_week = 'Sunday' THEN 1
WHEN day_of_week = 'Monday' THEN 2
WHEN day_of_week = 'Tuesday' THEN 3
WHEN day_of_week = 'Wednesday' THEN 4
WHEN day_of_week = 'Thursday' THEN 5
WHEN day_of_week = 'Friday' THEN 6
WHEN day_of_week = 'Saturday' THEN 7
END ASC;
--- Restructuring the previous query using a CTE and window function:
WITH ranked_days AS (
SELECT
user_type,
day_of_week,
COUNT(*) AS total_rides,
RANK() OVER(PARTITION BY user_type ORDER BY COUNT(*) DESC) AS rank
FROM cyclistic_2023
GROUP BY 1, 2
)
SELECT
user_type,
day_of_week,
total_rides
FROM ranked_days
WHERE rank = 1;
-- How many rides were recorded each month?:
SELECT
month,
COUNT(*) AS total_rides
FROM cyclistic_2023
GROUP BY 1
ORDER BY
CASE
WHEN month = 'Jan' THEN 1
WHEN month = 'Feb' THEN 2
WHEN month = 'Mar' THEN 3
WHEN month = 'Apr' THEN 4
WHEN month = 'May' THEN 5
WHEN month = 'Jun' THEN 6
WHEN month = 'Jul' THEN 7
WHEN month = 'Aug' THEN 8
WHEN month = 'Sep' THEN 9
WHEN month = 'Oct' THEN 10
WHEN month = 'Nov' THEN 11
WHEN month = 'Dec' THEN 12
END ASC; -- The month of August had the most rides, totaling 752,073.
-- In which month did members have the highest number of rides? In which month did casual riders have the highest number of rides?:
WITH ranked_months AS (
SELECT
user_type,
month,
COUNT(*) AS total_rides,
RANK() OVER (PARTITION BY user_type ORDER BY COUNT(*) DESC) AS rank
FROM cyclistic_2023
GROUP BY 1, 2
)
SELECT
user_type,
month,
total_rides
FROM ranked_months
WHERE rank = 1; -- The most active month for casual riders was July, while August was the most active month for members.
-- Calculate the total ride length (duration) by user type:
SELECT
user_type,
ROUND(SUM(ride_length), 2) AS "total_duration(minutes)"
FROM cyclistic_2023
GROUP BY 1;
-- Calculate the average ride length for all users:
SELECT
ROUND(AVG(ride_length), 2) AS "avg_ride_length(minutes)"
FROM cyclistic_2023;
--- Calcuate the average ride length by user type:
SELECT
user_type,
ROUND(AVG(ride_length), 2) AS "avg_ride_length(minutes)"
FROM cyclistic_2023
GROUP BY 1;
-- Calculate the average ride length by month:
SELECT
month,
ROUND(AVG(ride_length), 2) AS "avg_ride_length(minutes)"
FROM cyclistic_2023
GROUP BY 1
ORDER BY
CASE
WHEN month = 'Jan' THEN 1
WHEN month = 'Feb' THEN 2
WHEN month = 'Mar' THEN 3
WHEN month = 'Apr' THEN 4
WHEN month = 'May' THEN 5
WHEN month = 'Jun' THEN 6
WHEN month = 'Jul' THEN 7
WHEN month = 'Aug' THEN 8
WHEN month = 'Sep' THEN 9
WHEN month = 'Oct' THEN 10
WHEN month = 'Nov' THEN 11
WHEN month = 'Dec' THEN 12
END ASC;
-- Calculate the average ride length by month and user type:
SELECT
month,
user_type,
ROUND(AVG(ride_length), 2) AS "avg_ride_length(minutes)"
FROM cyclistic_2023
GROUP BY 1, 2
ORDER BY
CASE
WHEN month = 'Jan' THEN 1
WHEN month = 'Feb' THEN 2
WHEN month = 'Mar' THEN 3
WHEN month = 'Apr' THEN 4
WHEN month = 'May' THEN 5
WHEN month = 'Jun' THEN 6
WHEN month = 'Jul' THEN 7
WHEN month = 'Aug' THEN 8
WHEN month = 'Sep' THEN 9
WHEN month = 'Oct' THEN 10
WHEN month = 'Nov' THEN 11
WHEN month = 'Dec' THEN 12
END ASC;
-- Calculate the average ride length by bike type and user type:
SELECT
user_type,
bike_type,
ROUND(AVG(ride_length), 2) AS "avg_ride_length(minutes)"
FROM cyclistic_2023
GROUP BY 1, 2;
-- What were the top 10 starting stations?:
SELECT
starting_station,
COUNT(*) AS number_of_starts
FROM cyclistic_2023
WHERE starting_station != 'NA'
GROUP BY 1
ORDER BY 2 DESC
LIMIT 10;
-- What were the top 10 ending stations?:
SELECT
ending_station,
COUNT(*) AS number_of_stops
FROM cyclistic_2023
WHERE ending_station != 'NA'
GROUP BY 1
ORDER BY 2 DESC
LIMIT 10;
-- What were the top 10 starting stations for casual riders?
SELECT
starting_station,
COUNT(*) AS number_of_starts
FROM cyclistic_2023
WHERE starting_station != 'NA'
AND user_type = 'casual'
GROUP BY 1
ORDER BY 2 DESC
LIMIT 10;
-- What were the top 10 starting stations for members?
SELECT
starting_station,
COUNT(*) AS number_of_starts
FROM cyclistic_2023
WHERE starting_station != 'NA'
AND user_type = 'member'
GROUP BY 1
ORDER BY 2 DESC
LIMIT 10;
-- What were the top 10 ending stations for casual riders?:
SELECT
ending_station,
COUNT(*) AS number_of_stops
FROM cyclistic_2023
WHERE ending_station != 'NA'
AND user_type = 'casual'
GROUP BY 1
ORDER BY 2 DESC
LIMIT 10;
-- What were the top 10 ending stations for members?:
SELECT
ending_station,
COUNT(*) AS number_of_stops
FROM cyclistic_2023
WHERE ending_station != 'NA'
AND user_type = 'member'
GROUP BY 1
ORDER BY 2 DESC
LIMIT 10;