Skip to content

Commit edb5df4

Browse files
Compilation de la révision af73449
1 parent af73449 commit edb5df4

4 files changed

Lines changed: 268 additions & 304 deletions

File tree

en/04-combine.ipynb

Lines changed: 65 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
"lang": "en"
88
},
99
"source": [
10-
"# Data Analysis and Visualization in Python\n",
10+
"# Data Analysis with Python\n",
1111
"## Combining DataFrames with pandas\n",
1212
"Questions\n",
1313
"* Can I work with data from multiple sources?\n",
@@ -25,7 +25,7 @@
2525
"lang": "en"
2626
},
2727
"source": [
28-
"## Loading our data"
28+
"## List data files"
2929
]
3030
},
3131
{
@@ -37,12 +37,12 @@
3737
},
3838
"outputs": [],
3939
"source": [
40-
"# First make sure pandas is loaded\n",
41-
"import pandas as pd\n",
40+
"# Function for \"globbing\" (searching by file name pattern)\n",
41+
"from glob import glob\n",
4242
"\n",
4343
"# List a collection of CSV files\n",
44-
"from glob import glob\n",
45-
"glob('../data/by_year/*.csv')"
44+
"csv_files = glob('../data/by_year/*.csv')\n",
45+
"csv_files[-5:]"
4646
]
4747
},
4848
{
@@ -64,6 +64,8 @@
6464
},
6565
"outputs": [],
6666
"source": [
67+
"import pandas as pd\n",
68+
"\n",
6769
"year2001 = pd.read_csv('../data/by_year/surveys_2001.csv')\n",
6870
"year2002 = pd.read_csv('../data/by_year/surveys_2002.csv')\n",
6971
"\n",
@@ -109,12 +111,13 @@
109111
"outputs": [],
110112
"source": [
111113
"# Accumulate data from all files in the collection\n",
112-
"surveys_df = pd.DataFrame() # Empty DataFrame\n",
114+
"df_list = []\n",
113115
"\n",
114116
"for filename in glob('../data/by_year/*.csv'):\n",
115-
" df_year = pd.read_csv(filename)\n",
116-
" surveys_df = pd.concat([surveys_df, df_year], axis='index')\n",
117+
" df_by_year = pd.read_csv(filename)\n",
118+
" df_list.append(df_by_year)\n",
117119
"\n",
120+
"surveys_df = pd.concat(df_list, axis='index')\n",
118121
"surveys_df = surveys_df.reset_index(drop=True)\n",
119122
"surveys_df"
120123
]
@@ -128,8 +131,9 @@
128131
"source": [
129132
"## Exercise - Concatenating DataFrames\n",
130133
"* Load the data from all CSV files in the directory\n",
131-
" `../data/by_species_id/` and accumulate them in `surveys_sp`.\n",
132-
"* Reset the index while dropping the accumulated one.\n",
134+
" `../data/by_species_id/` and accumulate them in a list.\n",
135+
"* Concatenate the DataFrames of that list.\n",
136+
"* Reset the index, discarding the old one.\n",
133137
"\n",
134138
"(4 min.)"
135139
]
@@ -146,13 +150,12 @@
146150
},
147151
"outputs": [],
148152
"source": [
149-
"surveys_sp = pd.DataFrame() # Empty DataFrame\n",
153+
"df_list = []\n",
150154
"\n",
151155
"for filename in ###('../data/by_species_id/*.csv'):\n",
152-
" new_df = pd.read_csv(filename)\n",
153-
" surveys_sp = pd.###([###, new_df], ###='index')\n",
156+
" df_list.###(pd.read_csv(filename))\n",
154157
"\n",
155-
"surveys_sp = surveys_sp.###(drop=###)\n",
158+
"surveys_sp = pd.###(###, ###='index').###(drop=###)\n",
156159
"surveys_sp"
157160
]
158161
},
@@ -239,6 +242,20 @@
239242
"species_sub"
240243
]
241244
},
245+
{
246+
"cell_type": "code",
247+
"execution_count": null,
248+
"id": "e9729aee-b862-4229-ade9-8ca2a52ce273",
249+
"metadata": {
250+
"lang": "en"
251+
},
252+
"outputs": [],
253+
"source": [
254+
"# The first ten records\n",
255+
"head10 = surveys_df.head(10)\n",
256+
"head10"
257+
]
258+
},
242259
{
243260
"cell_type": "markdown",
244261
"id": "8886c399-91c5-49f9-afe5-7095d303ed78",
@@ -252,13 +269,13 @@
252269
{
253270
"cell_type": "code",
254271
"execution_count": null,
255-
"id": "f2b0fb1d-0a90-47d1-b016-caf51a192033",
272+
"id": "07441fe7-284b-4458-8561-fa91b50e32d1",
256273
"metadata": {
257-
"lang": "en,fr"
274+
"lang": "en"
258275
},
259276
"outputs": [],
260277
"source": [
261-
"surveys_df.columns"
278+
"head10.columns"
262279
]
263280
},
264281
{
@@ -302,12 +319,14 @@
302319
},
303320
"outputs": [],
304321
"source": [
305-
"head10 = surveys_df.head(10)\n",
306-
"\n",
307322
"# Computing the inner join of head10 and species_sub\n",
308323
"key = 'species_id'\n",
309-
"merged_inner = pd.merge(left=head10, right=species_sub,\n",
310-
" left_on=key, right_on=key)\n",
324+
"merged_inner = pd.merge(\n",
325+
" left=head10,\n",
326+
" right=species_sub,\n",
327+
" left_on=key,\n",
328+
" right_on=key\n",
329+
")\n",
311330
"# What's the size of the output data?\n",
312331
"merged_inner.shape"
313332
]
@@ -353,8 +372,12 @@
353372
},
354373
"outputs": [],
355374
"source": [
356-
"merged_left = pd.merge(left=head10, right=species_sub,\n",
357-
" on=key, how='left')\n",
375+
"merged_left = pd.merge(\n",
376+
" left=head10,\n",
377+
" right=species_sub,\n",
378+
" how='left',\n",
379+
" on=key\n",
380+
")\n",
358381
"# What's the size of the output data?\n",
359382
"merged_left.shape"
360383
]
@@ -411,7 +434,11 @@
411434
"species_df = pd.read_csv('../data/species.csv')\n",
412435
"\n",
413436
"merged_left = pd.merge(\n",
414-
" left=surveys_df, right=###, on=###, how=###)\n",
437+
" left=surveys_df,\n",
438+
" right=###,\n",
439+
" how=###,\n",
440+
" on=###\n",
441+
")\n",
415442
"merged_left.shape"
416443
]
417444
},
@@ -422,8 +449,9 @@
422449
"lang": "en"
423450
},
424451
"source": [
425-
"`2`. Calculate and plot the evolution of the average\n",
426-
"hindfoot length for each genus from year to year. (3 min.)"
452+
"`2`. Calculate the evolution of the average hindfoot\n",
453+
"length for each genus from year to year. Transform the\n",
454+
"result such that each genus gets its own column. (4 min.)"
427455
]
428456
},
429457
{
@@ -443,35 +471,21 @@
443471
"average_lengths.tail()"
444472
]
445473
},
446-
{
447-
"cell_type": "code",
448-
"execution_count": null,
449-
"id": "e678c22f-2a1c-4e90-ad93-f3df07a7e09d",
450-
"metadata": {
451-
"lang": "en"
452-
},
453-
"outputs": [],
454-
"source": [
455-
"average_lengths.plot(kind='line')"
456-
]
457-
},
458474
{
459475
"cell_type": "markdown",
460476
"id": "706256da-5ac4-49ab-b593-92611792fcc7",
461477
"metadata": {
462478
"lang": "en"
463479
},
464480
"source": [
465-
"`3`. Calculate and create a bar plot showing\n",
466-
"the average weight per sex for each genus.\n",
467-
"For this exercise, we will use a pivot table instead of `unstack()`.\n",
468-
"(2 min.)"
481+
"`3`. Calculate the average weight per sex for each genus. For this\n",
482+
"exercise, we will use a pivot table instead of `groupby()` + `unstack()`. (3 min.)"
469483
]
470484
},
471485
{
472486
"cell_type": "code",
473487
"execution_count": null,
474-
"id": "f0d64bba-7946-4db8-89e1-de978eda2434",
488+
"id": "43d5c2e5-1fe8-4bef-adc8-d0782f5081cc",
475489
"metadata": {
476490
"lang": "en",
477491
"tags": [
@@ -480,36 +494,13 @@
480494
},
481495
"outputs": [],
482496
"source": [
483-
"weights_by_genus_sex = merged_left.groupby(\n",
484-
" ['genus', 'sex'])['weight'].###()#.reset_index()\n",
485-
"weights_by_genus_sex.tail()"
486-
]
487-
},
488-
{
489-
"cell_type": "code",
490-
"execution_count": null,
491-
"id": "5542192f-8c3f-4996-b07a-78d40d035f94",
492-
"metadata": {
493-
"lang": "en"
494-
},
495-
"outputs": [],
496-
"source": [
497-
"# Use pivot_table() instead of unstack()\n",
498-
"pivot_weight_genus_sex = weights_by_genus_sex.pivot_table(\n",
499-
" values='weight', index='genus', columns='sex')\n",
500-
"pivot_weight_genus_sex"
501-
]
502-
},
503-
{
504-
"cell_type": "code",
505-
"execution_count": null,
506-
"id": "b45d6a03-66f8-429d-9fbe-d1bda91ae9f8",
507-
"metadata": {
508-
"lang": "en,fr"
509-
},
510-
"outputs": [],
511-
"source": [
512-
"pivot_weight_genus_sex.plot(kind=\"bar\")"
497+
"# Use pivot_table() instead of groupby() + unstack()\n",
498+
"merged_left.###(\n",
499+
" values=###,\n",
500+
" index=###,\n",
501+
" columns=###,\n",
502+
" aggfunc=###\n",
503+
")"
513504
]
514505
},
515506
{

0 commit comments

Comments
 (0)