Skip to content

Commit 777b90f

Browse files
committed
Exercices de jonction avec données nettoyées
1 parent d8a599e commit 777b90f

5 files changed

Lines changed: 146 additions & 224 deletions

File tree

en/04-combine.ipynb

Lines changed: 23 additions & 34 deletions
Original file line number | Diff line number | Diff line change
@@ -252,13 +252,13 @@
252252
{
253253
"cell_type": "code",
254254
"execution_count": null,
255-
"id": "e8192e6e-ecdd-4cef-bc8c-41dcac175415",
255+
"id": "f2b0fb1d-0a90-47d1-b016-caf51a192033",
256256
"metadata": {
257-
"lang": "en"
257+
"lang": "en,fr"
258258
},
259259
"outputs": [],
260260
"source": [
261-
"head10.columns"
261+
"surveys_df.columns"
262262
]
263263
},
264264
{
@@ -302,6 +302,8 @@
302302
},
303303
"outputs": [],
304304
"source": [
305+
"head10 = surveys_df.head(10)\n",
306+
"\n",
305307
"# Computing the inner join of head10 and species_sub\n",
306308
"key = 'species_id'\n",
307309
"merged_inner = pd.merge(left=head10, right=species_sub,\n",
@@ -390,7 +392,7 @@
390392
"source": [
391393
"## Exercise - Joining all data\n",
392394
"`1`. Create a new DataFrame by joining the contents of the\n",
393-
"`surveys.csv` and `species.csv` tables. Keep all survey records.\n",
395+
"`surveys_df` DataFrame and the `species.csv` table. Keep all survey records.\n",
394396
"(3 min.)"
395397
]
396398
},
@@ -420,8 +422,8 @@
420422
"lang": "en"
421423
},
422424
"source": [
423-
"`2`. Calculate and plot the distribution of surveys (i.e. the\n",
424-
"number of `record_id`) by `taxa` for each `plot_id`. (3 min.)"
425+
"`2`. Calculate and plot the evolution of the average\n",
426+
"hindfoot length for each genus from year to year. (3 min.)"
425427
]
426428
},
427429
{
@@ -436,9 +438,9 @@
436438
},
437439
"outputs": [],
438440
"source": [
439-
"by_site_taxa = merged_left###\n",
440-
"taxa_site = by_site_taxa['record_id']###\n",
441-
"taxa_site.tail()"
441+
"average_lengths = merged_left.###(\n",
442+
" ###)['hindfoot_length']###\n",
443+
"average_lengths.tail()"
442444
]
443445
},
444446
{
@@ -450,7 +452,7 @@
450452
},
451453
"outputs": [],
452454
"source": [
453-
"taxa_site.plot(kind='bar', logy=True)"
455+
"average_lengths.plot(kind='line')"
454456
]
455457
},
456458
{
@@ -460,23 +462,10 @@
460462
"lang": "en"
461463
},
462464
"source": [
463-
"`3`. Calculate and plot the distribution\n",
464-
"of `taxa` by `sex` for each `plot_id`. (2 min.)"
465-
]
466-
},
467-
{
468-
"cell_type": "code",
469-
"execution_count": null,
470-
"id": "f15d004d-3619-46e8-9ed1-df35f2153c5a",
471-
"metadata": {
472-
"lang": "en"
473-
},
474-
"outputs": [],
475-
"source": [
476-
"# Data cleanup\n",
477-
"merged_left['sex'] = merged_left['sex'].fillna('F|M')\n",
478-
"invalid_mask = ~merged_left['sex'].isin(['F', 'F|M', 'M'])\n",
479-
"merged_left.loc[invalid_mask, 'sex'] = \"F|M\""
465+
"`3`. Calculate and create a bar plot showing\n",
466+
"the average weight per sex for each genus.\n",
467+
"For this exercise, we will use a pivot table instead of `unstack()`.\n",
468+
"(2 min.)"
480469
]
481470
},
482471
{
@@ -491,9 +480,9 @@
491480
},
492481
"outputs": [],
493482
"source": [
494-
"ntaxa_sex_site = merged_left.groupby(\n",
495-
" ['plot_id', 'sex'])[###].nunique()#.reset_index(level=1)\n",
496-
"ntaxa_sex_site.tail()"
483+
"weights_by_genus_sex = merged_left.groupby(\n",
484+
" ['genus', 'sex'])['weight'].###()#.reset_index()\n",
485+
"weights_by_genus_sex.tail()"
497486
]
498487
},
499488
{
@@ -506,9 +495,9 @@
506495
"outputs": [],
507496
"source": [
508497
"# Use pivot_table() instead of unstack()\n",
509-
"pivot_taxa_sex_site = ntaxa_sex_site.pivot_table(\n",
510-
" values='taxa', columns='sex', index=ntaxa_sex_site.index)\n",
511-
"pivot_taxa_sex_site.tail()"
498+
"pivot_weight_genus_sex = weights_by_genus_sex.pivot_table(\n",
499+
" values='weight', index='genus', columns='sex')\n",
500+
"pivot_weight_genus_sex"
512501
]
513502
},
514503
{
@@ -520,7 +509,7 @@
520509
},
521510
"outputs": [],
522511
"source": [
523-
"pivot_taxa_sex_site.plot(kind=\"bar\")"
512+
"pivot_weight_genus_sex.plot(kind=\"bar\")"
524513
]
525514
},
526515
{

fr/04-combine.ipynb

Lines changed: 23 additions & 34 deletions
Original file line number | Diff line number | Diff line change
@@ -254,11 +254,11 @@
254254
"execution_count": null,
255255
"id": "f2b0fb1d-0a90-47d1-b016-caf51a192033",
256256
"metadata": {
257-
"lang": "fr"
257+
"lang": "en,fr"
258258
},
259259
"outputs": [],
260260
"source": [
261-
"premiers10.columns"
261+
"surveys_df.columns"
262262
]
263263
},
264264
{
@@ -302,6 +302,8 @@
302302
},
303303
"outputs": [],
304304
"source": [
305+
"premiers10 = surveys_df.head(10)\n",
306+
"\n",
305307
"# Calculer l'intersection de premiers10 et trois_especes\n",
306308
"cle = 'species_id'\n",
307309
"intersection = pd.merge(left=premiers10, right=trois_especes,\n",
@@ -390,7 +392,7 @@
390392
"source": [
391393
"## Exercice - Joindre toutes les données\n",
392394
"`1`. Créez un nouveau DataFrame tel que tous les\n",
393-
"enregistrements de `surveys.csv` sont gardés dans une jonction\n",
395+
"enregistrements de `surveys_df` sont gardés dans une jonction\n",
394396
"impliquant les informations correspondantes de `species.csv`.\n",
395397
"(3 min.)"
396398
]
@@ -421,9 +423,9 @@
421423
"lang": "fr"
422424
},
423425
"source": [
424-
"`2`. Calculez et créez un graphique (*bar-plot*) montrant\n",
425-
"le nombre d'enregistrements (soit le nombre de `record_id`)\n",
426-
"par type de `taxa` pour chaque site (`plot_id`). (3 min.)"
426+
"`2`. Calculez et créez un graphique montrant l'évolution de la\n",
427+
"longueur moyenne des arrière-pieds (`'hindfoot_length'`) pour\n",
428+
"chaque genre d'espèce (`'genus'`) d'une année à l'autre. (3 min.)"
427429
]
428430
},
429431
{
@@ -438,9 +440,9 @@
438440
},
439441
"outputs": [],
440442
"source": [
441-
"par_site_taxa = jonc_gauche###\n",
442-
"nb_par_site_taxa = par_site_taxa['record_id']###\n",
443-
"nb_par_site_taxa.tail()"
443+
"longueurs_moyennes = jonc_gauche.###(\n",
444+
" ###)['hindfoot_length']###\n",
445+
"longueurs_moyennes.tail()"
444446
]
445447
},
446448
{
@@ -452,7 +454,7 @@
452454
},
453455
"outputs": [],
454456
"source": [
455-
"nb_par_site_taxa.plot(kind='bar', logy=True)"
457+
"longueurs_moyennes.plot(kind='line')"
456458
]
457459
},
458460
{
@@ -462,26 +464,13 @@
462464
"lang": "fr"
463465
},
464466
"source": [
465-
"`3`. Calculez et créez un graphique (bar-plot) montrant le nombre\n",
466-
"de différents `taxa` par type de sexe pour chaque site (`plot_id`).\n",
467+
"`3`. Calculez et créez un graphique (*bar-plot*) montrant\n",
468+
"le poids moyen selon le sexe pour chaque genre d'espèce.\n",
469+
"Pour cet exercice, nous allons utiliser une\n",
470+
"table de pivot à la place de `unstack()`.\n",
467471
"(2 min.)"
468472
]
469473
},
470-
{
471-
"cell_type": "code",
472-
"execution_count": null,
473-
"id": "fdc14b08-d825-44d1-a87b-480d8b5f2304",
474-
"metadata": {
475-
"lang": "fr"
476-
},
477-
"outputs": [],
478-
"source": [
479-
"# Nettoyage des données\n",
480-
"jonc_gauche['sex'] = jonc_gauche['sex'].fillna('F|M')\n",
481-
"invalides = ~jonc_gauche['sex'].isin(['F', 'F|M', 'M'])\n",
482-
"jonc_gauche.loc[invalides, 'sex'] = 'F|M'"
483-
]
484-
},
485474
{
486475
"cell_type": "code",
487476
"execution_count": null,
@@ -494,9 +483,9 @@
494483
},
495484
"outputs": [],
496485
"source": [
497-
"ntaxa_sex_site = jonc_gauche.groupby(\n",
498-
" ['plot_id', 'sex'])[###].nunique()#.reset_index(level=1)\n",
499-
"ntaxa_sex_site.tail()"
486+
"poids_par_genre_sexe = jonc_gauche.groupby(\n",
487+
" ['genus', 'sex'])['weight'].###()#.reset_index()\n",
488+
"poids_par_genre_sexe.tail()"
500489
]
501490
},
502491
{
@@ -509,9 +498,9 @@
509498
"outputs": [],
510499
"source": [
511500
"# Utiliser pivot_table() au lieu de unstack()\n",
512-
"pivot_taxa_sex_site = ntaxa_sex_site.pivot_table(\n",
513-
" values='taxa', columns='sex', index=ntaxa_sex_site.index)\n",
514-
"pivot_taxa_sex_site.tail()"
501+
"pivot_weight_genus_sex = poids_par_genre_sexe.pivot_table(\n",
502+
" values='weight', index='genus', columns='sex')\n",
503+
"pivot_weight_genus_sex"
515504
]
516505
},
517506
{
@@ -523,7 +512,7 @@
523512
},
524513
"outputs": [],
525514
"source": [
526-
"pivot_taxa_sex_site.plot(kind=\"bar\")"
515+
"pivot_weight_genus_sex.plot(kind=\"bar\")"
527516
]
528517
},
529518
{

solution-en/04-combine.ipynb

Lines changed: 23 additions & 34 deletions
Original file line number | Diff line number | Diff line change
@@ -252,13 +252,13 @@
252252
{
253253
"cell_type": "code",
254254
"execution_count": null,
255-
"id": "e8192e6e-ecdd-4cef-bc8c-41dcac175415",
255+
"id": "f2b0fb1d-0a90-47d1-b016-caf51a192033",
256256
"metadata": {
257-
"lang": "en"
257+
"lang": "en,fr"
258258
},
259259
"outputs": [],
260260
"source": [
261-
"head10.columns"
261+
"surveys_df.columns"
262262
]
263263
},
264264
{
@@ -302,6 +302,8 @@
302302
},
303303
"outputs": [],
304304
"source": [
305+
"head10 = surveys_df.head(10)\n",
306+
"\n",
305307
"# Computing the inner join of head10 and species_sub\n",
306308
"key = 'species_id'\n",
307309
"merged_inner = pd.merge(left=head10, right=species_sub,\n",
@@ -390,7 +392,7 @@
390392
"source": [
391393
"## Exercise - Joining all data\n",
392394
"`1`. Create a new DataFrame by joining the contents of the\n",
393-
"`surveys.csv` and `species.csv` tables. Keep all survey records.\n",
395+
"`surveys_df` DataFrame and the `species.csv` table. Keep all survey records.\n",
394396
"(3 min.)"
395397
]
396398
},
@@ -420,8 +422,8 @@
420422
"lang": "en"
421423
},
422424
"source": [
423-
"`2`. Calculate and plot the distribution of surveys (i.e. the\n",
424-
"number of `record_id`) by `taxa` for each `plot_id`. (3 min.)"
425+
"`2`. Calculate and plot the evolution of the average\n",
426+
"hindfoot length for each genus from year to year. (3 min.)"
425427
]
426428
},
427429
{
@@ -436,9 +438,9 @@
436438
},
437439
"outputs": [],
438440
"source": [
439-
"by_site_taxa = merged_left.groupby(['plot_id', 'taxa'])\n",
440-
"taxa_site = by_site_taxa['record_id'].count().unstack()\n",
441-
"taxa_site.tail()"
441+
"average_lengths = merged_left.groupby(\n",
442+
" ['year', 'genus'])['hindfoot_length'].mean().unstack()\n",
443+
"average_lengths.tail()"
442444
]
443445
},
444446
{
@@ -450,7 +452,7 @@
450452
},
451453
"outputs": [],
452454
"source": [
453-
"taxa_site.plot(kind='bar', logy=True)"
455+
"average_lengths.plot(kind='line')"
454456
]
455457
},
456458
{
@@ -460,23 +462,10 @@
460462
"lang": "en"
461463
},
462464
"source": [
463-
"`3`. Calculate and plot the distribution\n",
464-
"of `taxa` by `sex` for each `plot_id`. (2 min.)"
465-
]
466-
},
467-
{
468-
"cell_type": "code",
469-
"execution_count": null,
470-
"id": "f15d004d-3619-46e8-9ed1-df35f2153c5a",
471-
"metadata": {
472-
"lang": "en"
473-
},
474-
"outputs": [],
475-
"source": [
476-
"# Data cleanup\n",
477-
"merged_left['sex'] = merged_left['sex'].fillna('F|M')\n",
478-
"invalid_mask = ~merged_left['sex'].isin(['F', 'F|M', 'M'])\n",
479-
"merged_left.loc[invalid_mask, 'sex'] = \"F|M\""
465+
"`3`. Calculate and create a bar plot showing\n",
466+
"the average weight per sex for each genus.\n",
467+
"For this exercise, we will use a pivot table instead of `unstack()`.\n",
468+
"(2 min.)"
480469
]
481470
},
482471
{
@@ -491,9 +480,9 @@
491480
},
492481
"outputs": [],
493482
"source": [
494-
"ntaxa_sex_site = merged_left.groupby(\n",
495-
" ['plot_id', 'sex'])['taxa'].nunique().reset_index(level=1)\n",
496-
"ntaxa_sex_site.tail()"
483+
"weights_by_genus_sex = merged_left.groupby(\n",
484+
" ['genus', 'sex'])['weight'].mean().reset_index()\n",
485+
"weights_by_genus_sex.tail()"
497486
]
498487
},
499488
{
@@ -506,9 +495,9 @@
506495
"outputs": [],
507496
"source": [
508497
"# Use pivot_table() instead of unstack()\n",
509-
"pivot_taxa_sex_site = ntaxa_sex_site.pivot_table(\n",
510-
" values='taxa', columns='sex', index=ntaxa_sex_site.index)\n",
511-
"pivot_taxa_sex_site.tail()"
498+
"pivot_weight_genus_sex = weights_by_genus_sex.pivot_table(\n",
499+
" values='weight', index='genus', columns='sex')\n",
500+
"pivot_weight_genus_sex"
512501
]
513502
},
514503
{
@@ -520,7 +509,7 @@
520509
},
521510
"outputs": [],
522511
"source": [
523-
"pivot_taxa_sex_site.plot(kind=\"bar\")"
512+
"pivot_weight_genus_sex.plot(kind=\"bar\")"
524513
]
525514
},
526515
{

0 commit comments

Comments
 (0)