|
7 | 7 | "lang": "en" |
8 | 8 | }, |
9 | 9 | "source": [ |
10 | | - "# Data Analysis and Visualization in Python\n", |
| 10 | + "# Data Analysis with Python\n", |
11 | 11 | "## Combining DataFrames with pandas\n", |
12 | 12 | "Questions\n", |
13 | 13 | "* Can I work with data from multiple sources?\n", |
|
25 | 25 | "lang": "en" |
26 | 26 | }, |
27 | 27 | "source": [ |
28 | | - "## Loading our data" |
| 28 | + "## List data files" |
29 | 29 | ] |
30 | 30 | }, |
31 | 31 | { |
|
37 | 37 | }, |
38 | 38 | "outputs": [], |
39 | 39 | "source": [ |
40 | | - "# First make sure pandas is loaded\n", |
41 | | - "import pandas as pd\n", |
| 40 | + "# Function for \"globbing\" (searching by file name pattern)\n", |
| 41 | + "from glob import glob\n", |
42 | 42 | "\n", |
43 | 43 | "# List a collection of CSV files\n", |
44 | | - "from glob import glob\n", |
45 | | - "glob('../data/by_year/*.csv')" |
| 44 | + "csv_files = glob('../data/by_year/*.csv')\n", |
| 45 | + "csv_files[-5:]" |
46 | 46 | ] |
47 | 47 | }, |
48 | 48 | { |
|
64 | 64 | }, |
65 | 65 | "outputs": [], |
66 | 66 | "source": [ |
| 67 | + "import pandas as pd\n", |
| 68 | + "\n", |
67 | 69 | "year2001 = pd.read_csv('../data/by_year/surveys_2001.csv')\n", |
68 | 70 | "year2002 = pd.read_csv('../data/by_year/surveys_2002.csv')\n", |
69 | 71 | "\n", |
|
109 | 111 | "outputs": [], |
110 | 112 | "source": [ |
111 | 113 | "# Accumulate data from all files in the collection\n", |
112 | | - "surveys_df = pd.DataFrame() # Empty DataFrame\n", |
| 114 | + "df_list = []\n", |
113 | 115 | "\n", |
114 | 116 | "for filename in glob('../data/by_year/*.csv'):\n", |
115 | | - " df_year = pd.read_csv(filename)\n", |
116 | | - " surveys_df = pd.concat([surveys_df, df_year], axis='index')\n", |
| 117 | + " df_by_year = pd.read_csv(filename)\n", |
| 118 | + " df_list.append(df_by_year)\n", |
117 | 119 | "\n", |
| 120 | + "surveys_df = pd.concat(df_list, axis='index')\n", |
118 | 121 | "surveys_df = surveys_df.reset_index(drop=True)\n", |
119 | 122 | "surveys_df" |
120 | 123 | ] |
|
128 | 131 | "source": [ |
129 | 132 | "## Exercise - Concatenating DataFrames\n", |
130 | 133 | "* Load the data from all CSV files in the directory\n", |
131 | | - " `../data/by_species_id/` and accumulate them in `surveys_sp`.\n", |
132 | | - "* Reset the index while dropping the accumulated one.\n", |
| 134 | + " `../data/by_species_id/` and accumulate them in a list.\n", |
| 135 | + "* Concatenate the DataFrames of that list.\n", |
| 136 | + "* Reset the index, dropping the old one instead of keeping it as a column.\n", |
133 | 137 | "\n", |
134 | 138 | "(4 min.)" |
135 | 139 | ] |
|
146 | 150 | }, |
147 | 151 | "outputs": [], |
148 | 152 | "source": [ |
149 | | - "surveys_sp = pd.DataFrame() # Empty DataFrame\n", |
| 153 | + "df_list = []\n", |
150 | 154 | "\n", |
151 | 155 | "for filename in ###('../data/by_species_id/*.csv'):\n", |
152 | | - " new_df = pd.read_csv(filename)\n", |
153 | | - " surveys_sp = pd.###([###, new_df], ###='index')\n", |
| 156 | + " df_list.###(pd.read_csv(filename))\n", |
154 | 157 | "\n", |
155 | | - "surveys_sp = surveys_sp.###(drop=###)\n", |
| 158 | + "surveys_sp = pd.###(###, ###='index').###(drop=###)\n", |
156 | 159 | "surveys_sp" |
157 | 160 | ] |
158 | 161 | }, |
|
239 | 242 | "species_sub" |
240 | 243 | ] |
241 | 244 | }, |
| 245 | + { |
| 246 | + "cell_type": "code", |
| 247 | + "execution_count": null, |
| 248 | + "id": "e9729aee-b862-4229-ade9-8ca2a52ce273", |
| 249 | + "metadata": { |
| 250 | + "lang": "en" |
| 251 | + }, |
| 252 | + "outputs": [], |
| 253 | + "source": [ |
| 254 | + "# The first ten records\n", |
| 255 | + "head10 = surveys_df.head(10)\n", |
| 256 | + "head10" |
| 257 | + ] |
| 258 | + }, |
242 | 259 | { |
243 | 260 | "cell_type": "markdown", |
244 | 261 | "id": "8886c399-91c5-49f9-afe5-7095d303ed78", |
|
252 | 269 | { |
253 | 270 | "cell_type": "code", |
254 | 271 | "execution_count": null, |
255 | | - "id": "f2b0fb1d-0a90-47d1-b016-caf51a192033", |
| 272 | + "id": "07441fe7-284b-4458-8561-fa91b50e32d1", |
256 | 273 | "metadata": { |
257 | | - "lang": "en,fr" |
| 274 | + "lang": "en" |
258 | 275 | }, |
259 | 276 | "outputs": [], |
260 | 277 | "source": [ |
261 | | - "surveys_df.columns" |
| 278 | + "head10.columns" |
262 | 279 | ] |
263 | 280 | }, |
264 | 281 | { |
|
302 | 319 | }, |
303 | 320 | "outputs": [], |
304 | 321 | "source": [ |
305 | | - "head10 = surveys_df.head(10)\n", |
306 | | - "\n", |
307 | 322 | "# Computing the inner join of head10 and species_sub\n", |
308 | 323 | "key = 'species_id'\n", |
309 | | - "merged_inner = pd.merge(left=head10, right=species_sub,\n", |
310 | | - " left_on=key, right_on=key)\n", |
| 324 | + "merged_inner = pd.merge(\n", |
| 325 | + " left=head10,\n", |
| 326 | + " right=species_sub,\n", |
| 327 | + " left_on=key,\n", |
| 328 | + " right_on=key\n", |
| 329 | + ")\n", |
311 | 330 | "# What's the size of the output data?\n", |
312 | 331 | "merged_inner.shape" |
313 | 332 | ] |
|
353 | 372 | }, |
354 | 373 | "outputs": [], |
355 | 374 | "source": [ |
356 | | - "merged_left = pd.merge(left=head10, right=species_sub,\n", |
357 | | - " on=key, how='left')\n", |
| 375 | + "merged_left = pd.merge(\n", |
| 376 | + " left=head10,\n", |
| 377 | + " right=species_sub,\n", |
| 378 | + " how='left',\n", |
| 379 | + " on=key\n", |
| 380 | + ")\n", |
358 | 381 | "# What's the size of the output data?\n", |
359 | 382 | "merged_left.shape" |
360 | 383 | ] |
|
411 | 434 | "species_df = pd.read_csv('../data/species.csv')\n", |
412 | 435 | "\n", |
413 | 436 | "merged_left = pd.merge(\n", |
414 | | - " left=surveys_df, right=###, on=###, how=###)\n", |
| 437 | + " left=surveys_df,\n", |
| 438 | + " right=###,\n", |
| 439 | + " how=###,\n", |
| 440 | + " on=###\n", |
| 441 | + ")\n", |
415 | 442 | "merged_left.shape" |
416 | 443 | ] |
417 | 444 | }, |
|
422 | 449 | "lang": "en" |
423 | 450 | }, |
424 | 451 | "source": [ |
425 | | - "`2`. Calculate and plot the evolution of the average\n", |
426 | | - "hindfoot length for each genus from year to year. (3 min.)" |
| 452 | + "`2`. Calculate the evolution of the average hindfoot\n", |
| 453 | + "length for each genus from year to year. Transform the\n", |
| 454 | + "result such that each genus gets its own column. (4 min.)" |
427 | 455 | ] |
428 | 456 | }, |
429 | 457 | { |
|
443 | 471 | "average_lengths.tail()" |
444 | 472 | ] |
445 | 473 | }, |
446 | | - { |
447 | | - "cell_type": "code", |
448 | | - "execution_count": null, |
449 | | - "id": "e678c22f-2a1c-4e90-ad93-f3df07a7e09d", |
450 | | - "metadata": { |
451 | | - "lang": "en" |
452 | | - }, |
453 | | - "outputs": [], |
454 | | - "source": [ |
455 | | - "average_lengths.plot(kind='line')" |
456 | | - ] |
457 | | - }, |
458 | 474 | { |
459 | 475 | "cell_type": "markdown", |
460 | 476 | "id": "706256da-5ac4-49ab-b593-92611792fcc7", |
461 | 477 | "metadata": { |
462 | 478 | "lang": "en" |
463 | 479 | }, |
464 | 480 | "source": [ |
465 | | - "`3`. Calculate and create a bar plot showing\n", |
466 | | - "the average weight per sex for each genus.\n", |
467 | | - "For this exercise, we will use a pivot table instead of `unstack()`.\n", |
468 | | - "(2 min.)" |
| 481 | + "`3`. Calculate the average weight per sex for each genus. For this\n", |
| 482 | + "exercise, we will use a pivot table instead of `groupby()` followed by `unstack()`. (3 min.)" |
469 | 483 | ] |
470 | 484 | }, |
471 | 485 | { |
472 | 486 | "cell_type": "code", |
473 | 487 | "execution_count": null, |
474 | | - "id": "f0d64bba-7946-4db8-89e1-de978eda2434", |
| 488 | + "id": "43d5c2e5-1fe8-4bef-adc8-d0782f5081cc", |
475 | 489 | "metadata": { |
476 | 490 | "lang": "en", |
477 | 491 | "tags": [ |
|
480 | 494 | }, |
481 | 495 | "outputs": [], |
482 | 496 | "source": [ |
483 | | - "weights_by_genus_sex = merged_left.groupby(\n", |
484 | | - " ['genus', 'sex'])['weight'].###()#.reset_index()\n", |
485 | | - "weights_by_genus_sex.tail()" |
486 | | - ] |
487 | | - }, |
488 | | - { |
489 | | - "cell_type": "code", |
490 | | - "execution_count": null, |
491 | | - "id": "5542192f-8c3f-4996-b07a-78d40d035f94", |
492 | | - "metadata": { |
493 | | - "lang": "en" |
494 | | - }, |
495 | | - "outputs": [], |
496 | | - "source": [ |
497 | | - "# Use pivot_table() instead of unstack()\n", |
498 | | - "pivot_weight_genus_sex = weights_by_genus_sex.pivot_table(\n", |
499 | | - " values='weight', index='genus', columns='sex')\n", |
500 | | - "pivot_weight_genus_sex" |
501 | | - ] |
502 | | - }, |
503 | | - { |
504 | | - "cell_type": "code", |
505 | | - "execution_count": null, |
506 | | - "id": "b45d6a03-66f8-429d-9fbe-d1bda91ae9f8", |
507 | | - "metadata": { |
508 | | - "lang": "en,fr" |
509 | | - }, |
510 | | - "outputs": [], |
511 | | - "source": [ |
512 | | - "pivot_weight_genus_sex.plot(kind=\"bar\")" |
| 497 | + "# Use pivot_table() instead of groupby() + unstack()\n", |
| 498 | + "merged_left.###(\n", |
| 499 | + " values=###,\n", |
| 500 | + " index=###,\n", |
| 501 | + " columns=###,\n", |
| 502 | + " aggfunc=###\n", |
| 503 | + ")" |
513 | 504 | ] |
514 | 505 | }, |
515 | 506 | { |
|
0 commit comments