calculquebec
diff --git a/en/04-combine.ipynb b/en/04-combine.ipynb
Lines changed: 65 additions & 74 deletions
@@ -7,7 +7,7 @@
     "lang": "en"
    },
    "source": [
-    "# Data Analysis and Visualization in Python\n",
+    "# Data Analysis with Python\n",
     "## Combining DataFrames with pandas\n",
     "Questions\n",
     "* Can I work with data from multiple sources?\n",
@@ -25,7 +25,7 @@
     "lang": "en"
    },
    "source": [
-    "## Loading our data"
+    "## List data files"
    ]
   },
   {
@@ -37,12 +37,12 @@
    },
    "outputs": [],
    "source": [
-    "# First make sure pandas is loaded\n",
-    "import pandas as pd\n",
+    "# Function for \"globbing\" (searching by file name pattern)\n",
+    "from glob import glob\n",
     "\n",
     "# List a collection of CSV files\n",
-    "from glob import glob\n",
-    "glob('../data/by_year/*.csv')"
+    "csv_files = glob('../data/by_year/*.csv')\n",
+    "csv_files[-5:]"
    ]
   },
   {
@@ -64,6 +64,8 @@
    },
    "outputs": [],
    "source": [
+    "import pandas as pd\n",
+    "\n",
     "year2001 = pd.read_csv('../data/by_year/surveys_2001.csv')\n",
     "year2002 = pd.read_csv('../data/by_year/surveys_2002.csv')\n",
     "\n",
@@ -109,12 +111,13 @@
    "outputs": [],
    "source": [
     "# Accumulate data from all files in the collection\n",
-    "surveys_df = pd.DataFrame()  # Empty DataFrame\n",
+    "df_list = []\n",
     "\n",
     "for filename in glob('../data/by_year/*.csv'):\n",
-    "    df_year = pd.read_csv(filename)\n",
-    "    surveys_df = pd.concat([surveys_df, df_year], axis='index')\n",
+    "    df_by_year = pd.read_csv(filename)\n",
+    "    df_list.append(df_by_year)\n",
     "\n",
+    "surveys_df = pd.concat(df_list, axis='index')\n",
     "surveys_df = surveys_df.reset_index(drop=True)\n",
     "surveys_df"
    ]
@@ -128,8 +131,9 @@
    "source": [
     "## Exercise - Concatenating DataFrames\n",
     "* Load the data from all CSV files in the directory\n",
-    "  `../data/by_species_id/` and accumulate them in `surveys_sp`.\n",
-    "* Reset the index while dropping the accumulated one.\n",
+    "  `../data/by_species_id/` and accumulate them in a list.\n",
+    "* Concatenate the DataFrames of that list.\n",
+    "* Reset the index without preserving it.\n",
     "\n",
     "(4 min.)"
    ]
@@ -146,13 +150,12 @@
    },
    "outputs": [],
    "source": [
-    "surveys_sp = pd.DataFrame()  # Empty DataFrame\n",
+    "df_list = []\n",
     "\n",
     "for filename in ###('../data/by_species_id/*.csv'):\n",
-    "    new_df = pd.read_csv(filename)\n",
-    "    surveys_sp = pd.###([###, new_df], ###='index')\n",
+    "    df_list.###(pd.read_csv(filename))\n",
     "\n",
-    "surveys_sp = surveys_sp.###(drop=###)\n",
+    "surveys_sp = pd.###(###, ###='index').###(drop=###)\n",
     "surveys_sp"
    ]
   },
@@ -239,6 +242,20 @@
     "species_sub"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e9729aee-b862-4229-ade9-8ca2a52ce273",
+   "metadata": {
+    "lang": "en"
+   },
+   "outputs": [],
+   "source": [
+    "# The first ten records\n",
+    "head10 = surveys_df.head(10)\n",
+    "head10"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "8886c399-91c5-49f9-afe5-7095d303ed78",
@@ -252,13 +269,13 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "f2b0fb1d-0a90-47d1-b016-caf51a192033",
+   "id": "07441fe7-284b-4458-8561-fa91b50e32d1",
    "metadata": {
-    "lang": "en,fr"
+    "lang": "en"
    },
    "outputs": [],
    "source": [
-    "surveys_df.columns"
+    "head10.columns"
    ]
   },
   {
@@ -302,12 +319,14 @@
    },
    "outputs": [],
    "source": [
-    "head10 = surveys_df.head(10)\n",
-    "\n",
     "# Computing the inner join of head10 and species_sub\n",
     "key = 'species_id'\n",
-    "merged_inner = pd.merge(left=head10, right=species_sub,\n",
-    "                        left_on=key, right_on=key)\n",
+    "merged_inner = pd.merge(\n",
+    "    left=head10,\n",
+    "    right=species_sub,\n",
+    "    left_on=key,\n",
+    "    right_on=key\n",
+    ")\n",
     "# What's the size of the output data?\n",
     "merged_inner.shape"
    ]
@@ -353,8 +372,12 @@
    },
    "outputs": [],
    "source": [
-    "merged_left = pd.merge(left=head10, right=species_sub,\n",
-    "                       on=key, how='left')\n",
+    "merged_left = pd.merge(\n",
+    "    left=head10,\n",
+    "    right=species_sub,\n",
+    "    how='left',\n",
+    "    on=key\n",
+    ")\n",
     "# What's the size of the output data?\n",
     "merged_left.shape"
    ]
@@ -411,7 +434,11 @@
     "species_df = pd.read_csv('../data/species.csv')\n",
     "\n",
     "merged_left = pd.merge(\n",
-    "    left=surveys_df, right=###, on=###, how=###)\n",
+    "    left=surveys_df,\n",
+    "    right=###,\n",
+    "    how=###,\n",
+    "    on=###\n",
+    ")\n",
     "merged_left.shape"
    ]
   },
@@ -422,8 +449,9 @@
     "lang": "en"
    },
    "source": [
-    "`2`. Calculate and plot the evolution of the average\n",
-    "hindfoot length for each genus from year to year. (3 min.)"
+    "`2`. Calculate the evolution of the average hindfoot\n",
+    "length for each genus from year to year. Transform the\n",
+    "result such that each genus gets its own column. (4 min.)"
    ]
   },
   {
@@ -443,35 +471,21 @@
     "average_lengths.tail()"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "e678c22f-2a1c-4e90-ad93-f3df07a7e09d",
-   "metadata": {
-    "lang": "en"
-   },
-   "outputs": [],
-   "source": [
-    "average_lengths.plot(kind='line')"
-   ]
-  },
   {
    "cell_type": "markdown",
    "id": "706256da-5ac4-49ab-b593-92611792fcc7",
    "metadata": {
     "lang": "en"
    },
    "source": [
-    "`3`. Calculate and create a bar plot showing\n",
-    "the average weight per sex for each genus.\n",
-    "For this exercise, we will use a pivot table instead of `unstack()`.\n",
-    "(2 min.)"
+    "`3`. Calculate the average weight per sex for each genus. For this\n",
+    "exercise, we will use a pivot table instead of `unstack()`. (3 min.)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "f0d64bba-7946-4db8-89e1-de978eda2434",
+   "id": "43d5c2e5-1fe8-4bef-adc8-d0782f5081cc",
    "metadata": {
     "lang": "en",
     "tags": [
@@ -480,36 +494,13 @@
    },
    "outputs": [],
    "source": [
-    "weights_by_genus_sex = merged_left.groupby(\n",
-    "    ['genus', 'sex'])['weight'].###()#.reset_index()\n",
-    "weights_by_genus_sex.tail()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "5542192f-8c3f-4996-b07a-78d40d035f94",
-   "metadata": {
-    "lang": "en"
-   },
-   "outputs": [],
-   "source": [
-    "# Use pivot_table() instead of unstack()\n",
-    "pivot_weight_genus_sex = weights_by_genus_sex.pivot_table(\n",
-    "    values='weight', index='genus', columns='sex')\n",
-    "pivot_weight_genus_sex"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "b45d6a03-66f8-429d-9fbe-d1bda91ae9f8",
-   "metadata": {
-    "lang": "en,fr"
-   },
-   "outputs": [],
-   "source": [
-    "pivot_weight_genus_sex.plot(kind=\"bar\")"
+    "# Use pivot_table() instead of groupby() + unstack()\n",
+    "merged_left.###(\n",
+    "    values=###,\n",
+    "    index=###,\n",
+    "    columns=###,\n",
+    "    aggfunc=###\n",
+    ")"
    ]
   },
   {