Update cloc_code_stats notebook

oruebel · oruebel · commit a1006b8e0beb · 2025-07-04T19:23:47.000-07:00
diff --git a/notebooks/cloc_code_stats.ipynb b/notebooks/cloc_code_stats.ipynb
@@ -49,6 +49,7 @@
     "from datetime import datetime\n",
     "from matplotlib import pyplot as plt\n",
     "from matplotlib import cm as cm\n",
+    "from collections import OrderedDict\n",
     "import pandas as pd\n",
     "%matplotlib inline"
    ]
@@ -80,16 +81,7 @@
     "date_range = pd.date_range(\n",
     "            start=NWBGitInfo.NWB2_START_DATE if start_date is None else start_date,\n",
     "            end=datetime.today() if end_date is None else end_date,\n",
-    "            freq=\"D\")\n",
-    "\n",
-    "# Select the repos and their order for the summary plot with the lines of code\n",
-    "summary_plot_repos = [\n",
-    "     'PyNWB', 'HDMF', 'MatNWB',\n",
-    "     'NWB_Schema_Language', 'NWB_Schema', \n",
-    "     'HDMF_Common_Schema', 'HDMF_DocUtils', 'HDMF_Zarr',\n",
-    "     'NDX_Catalog', 'NDX_Template', 'NDX_Staged_Extensions', 'NDX_Extension_Smithy',\n",
-    "     'NWBWidgets', 'NWBInspector',\n",
-    "     'NeuroConv']"
+    "            freq=\"D\")"
    ]
   },
   {
@@ -122,12 +114,51 @@
     ")"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(\"Repository Keys:\")\n",
+    "print(summary_stats['codes'].keys().values)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## 3. Plot summary of the lines of code across all NWB repos\n",
-    "### 3.1. Plot version 1: Using default colors for repos"
+    "## 3. Plot summary of the lines of code across all NWB repos\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Define the grouping of the repos\n",
+    "summary_plot_repos_grouped = OrderedDict()\n",
+    "summary_plot_repos_grouped['NWB APIs'] = ['PyNWB', 'MatNWB', 'AqNWB']\n",
+    "summary_plot_repos_grouped['Data Modeling'] = ['HDMF', 'HDMF_Zarr', 'HDMF_Schema_Language', 'NWB_Schema_Language']\n",
+    "summary_plot_repos_grouped['Extension Tools'] = ['NDX_Catalog', 'NDX_Template', 'NDX_Extension_Smithy', 'NDX_Staged_Extensions', 'HDMF_DocUtils']\n",
+    "summary_plot_repos_grouped['Format Schema'] = ['NWB_Schema', 'HDMF_Common_Schema']\n",
+    "summary_plot_repos_grouped['Data Conversion'] = ['NeuroConv', 'NWBInspector', 'NWB_GUIDE']\n",
+    "summary_plot_repos_grouped['Cloud'] = ['LINDI', 'NWB_Benchmarks', 'NWBWidgets']\n",
+    "summary_plot_repos_grouped['Online Resources'] = ['NWB_Overview', 'NWB_Project_Analytics', 'Hackathons']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create flat list of repos\n",
+    "summary_plot_repos = [repo \n",
+    "                      for repo_type in summary_plot_repos_grouped\n",
+    "                      for repo in summary_plot_repos_grouped[repo_type]\n",
+    "                     ] "
    ]
   },
   {
@@ -136,11 +167,45 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "# Define base colors for each category\n",
+    "base_colors = {\n",
+    "    'NWB APIs':       (0.121, 0.466, 0.705, 1.0),  # Blue\n",
+    "    'Data Modeling':  (1.000, 0.843, 0.000, 1.0),  # Gold/Yellow\n",
+    "    'Data Conversion':(0.200, 0.627, 0.172, 1.0),  # Green\n",
+    "    'Extension Tools':(1.000, 0.498, 0.054, 1.0),  # Orange\n",
+    "    'Format Schema':  (0.839, 0.153, 0.157, 1.0),  # Red\n",
+    "    'Cloud':          (0.580, 0.404, 0.741, 1.0),  # Purple\n",
+    "    'Online Resources':(0.549, 0.337, 0.294, 1.0), # Brown\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 3.1. Version 1: Using default colors for repos"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create one color per repo; tab20 has only 20 entries, so some colors repeat when there are more than 20 repos\n",
     "evenly_spaced_interval = np.linspace(0, 1, len(summary_plot_repos))\n",
-    "#colors = [cm.tab20(x) for x in evenly_spaced_interval]\n",
-    "colors = [cm.Paired(x) for x in evenly_spaced_interval]\n",
+    "colors = [cm.tab20(x) for x in evenly_spaced_interval]\n",
+    "#colors = [cm.Paired(x) for x in evenly_spaced_interval]\n",
     "# mix up colors so that neighbouring areas have more dissimilar colors\n",
-    "colors = [c for i, c in enumerate(colors) if i % 2 == 0] + [c for i, c in enumerate(colors) if i % 2 == 1]\n",
+    "colors = [c for i, c in enumerate(colors) if i % 2 == 0] + [c for i, c in enumerate(colors) if i % 2 == 1]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
     "ax = summary_stats['sizes'][summary_plot_repos].plot.area(\n",
     "    figsize=(18,10), \n",
     "    stacked=True, \n",
@@ -149,13 +214,14 @@
     "    color=colors)\n",
     "ax.get_yaxis().set_major_formatter(\n",
     "    mpl.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))\n",
-    "plt.legend(loc=2, prop={'size': 20})\n",
+    "plt.legend(loc=2, prop={'size': 16})\n",
     "plt.ylabel('Lines of Code', fontsize=24)\n",
     "plt.xlabel('Date', fontsize=24)\n",
     "plt.grid(color='black', linestyle='--', linewidth=0.7, axis='both')\n",
     "plt.tight_layout()\n",
     "if save_figs:\n",
     "    plt.savefig(os.path.join(plot_dir, 'nwb_repo_sizes_all.pdf'))\n",
+    "    plt.savefig(os.path.join(plot_dir, 'nwb_repo_sizes_all.png'), dpi=300)\n",
     "plt.title('NWB Code Repository Sizes', fontsize=20)\n",
     "plt.show()"
    ]
@@ -164,9 +230,29 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### 3.1 Plot grouped summary of the lines of code across all NWB repos \n",
+    "### 3.2 Color repos by category but keep repos separate"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Repos in the same category share the category's base RGB color; repos within a category\n",
+    "# are told apart by alpha, which steps evenly from 1.0 (first repo) down to 0.3 (last repo)\n",
+    "# Return num_colors RGBA tuples built from base_color (an RGBA tuple; its own alpha is ignored)\n",
+    "def generate_colors(base_color, num_colors):\n",
+    "    r, g, b, _ = base_color\n",
+    "    alpha_step = 0.7 / (num_colors - 1) if num_colors > 1 else 0.7\n",
+    "    return [(r, g, b, max(0.3, 1.0 - i * alpha_step)) for i in range(num_colors)]\n",
     "\n",
-    "For the paper we want to group tools to ease overview."
+    "# Generate colors for each category\n",
+    "colors = []\n",
+    "for category, repos in summary_plot_repos_grouped.items():\n",
+    "    base_color = base_colors[category]\n",
+    "    category_colors = generate_colors(base_color, len(repos))\n",
+    "    colors.extend(category_colors)"
    ]
   },
   {
@@ -175,23 +261,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Sort repos so we can group them category\n",
-    "summary_plot_repos_accum = [\n",
-    "     'NWB_Schema', 'HDMF_Common_Schema', 'NWB_Schema_Language', \n",
-    "     'PyNWB', \n",
-    "     'HDMF',\n",
-    "     'MatNWB',\n",
-    "     'HDMF_DocUtils', 'NWBWidgets', 'NWBInspector',\n",
-    "     'HDMF_Zarr', 'NeuroConv',\n",
-    "     'NDX_Catalog', 'NDX_Template', 'NDX_Staged_Extensions', 'NDX_Extension_Smithy']\n",
-    "colors = [(0.7, 0.0, 0.0, 1.0), (0.7, 0.0, 0.0, 0.6), (0.7, 0.0, 0.0, 0.4),\n",
-    "          (0.0, 0.5, 0.6, 1.0),\n",
-    "          (0.0, 0.75, 0.85, 1.0),\n",
-    "          (0.4, 1.0, 1.0 ,1.0),\n",
-    "          (0.8, 0.4, 0.0, 1.0), (0.8, 0.4, 0.0, 0.7), (0.8, 0.4, 0.0, 0.5),\n",
-    "          (0.8, 0.8, 0.2, 1.0), (0.8, 0.8, 0.2, 0.5),\n",
-    "          (0.0, 0.0, 0.7, 1.0), (0.0, 0.0, 0.7, 0.85), (0.0, 0.0, 0.7, 0.7), (0.0, 0.0, 0.7, 0.55)]\n",
-    "ax = summary_stats['sizes'][summary_plot_repos_accum].plot.area(\n",
+    "ax = summary_stats['sizes'][summary_plot_repos].plot.area(\n",
     "    figsize=(18,10), \n",
     "    stacked=True, \n",
     "    linewidth=0,\n",
@@ -204,39 +274,50 @@
     "plt.xlabel('Date', fontsize=24)\n",
     "plt.grid(color='black', linestyle='--', linewidth=0.7, axis='both')\n",
     "plt.tight_layout()\n",
-    "plt.legend(loc=2, prop={'size': 20,}, facecolor=(1.0, 1.0, 1.0, 1.0), framealpha=1.0)\n",
+    "plt.legend(loc=2, prop={'size': 16,}, facecolor=(1.0, 1.0, 1.0, 1.0), framealpha=1.0)\n",
     "if save_figs:\n",
     "    plt.savefig(os.path.join(plot_dir, 'nwb_repo_sizes_all_grouped.pdf'))\n",
+    "    plt.savefig(os.path.join(plot_dir, 'nwb_repo_sizes_all_grouped.png'), dpi=300)\n",
     "plt.title('NWB Code Repository Sizes', fontsize=20)\n",
     "plt.show()"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 3.3 Combine repos into broad categories"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "repo_sizes_grouped_df = pd.DataFrame.from_dict(\n",
-    "    {'Format Schema': (summary_stats['sizes']['NWB_Schema'] + \n",
-    "                       summary_stats['sizes']['HDMF_Common_Schema'] + \n",
-    "                       summary_stats['sizes']['NWB_Schema_Language']),\n",
-    "     'HDMF': summary_stats['sizes']['HDMF'],\n",
-    "     'PyNWB': summary_stats['sizes']['PyNWB'],\n",
-    "     'MatNWB': summary_stats['sizes']['MatNWB'],\n",
-    "     'NWB Tools': (summary_stats['sizes']['HDMF_DocUtils'] + \n",
-    "                   summary_stats['sizes']['NWBWidgets'] + \n",
-    "                   summary_stats['sizes']['NWBInspector'] + \n",
-    "                   summary_stats['sizes']['HDMF_Zarr'] + \n",
-    "                   summary_stats['sizes']['NeuroConv']),\n",
-    "     'NDX Catalog': (summary_stats['sizes']['NDX_Catalog'] +\n",
-    "                     summary_stats['sizes']['NDX_Template'] +\n",
-    "                     summary_stats['sizes']['NDX_Staged_Extensions'] +\n",
-    "                     summary_stats['sizes']['NDX_Extension_Smithy']),\n",
-    "    }\n",
-    ")\n",
-    "colors = [(78, 92, 150), (81, 133, 189), (155, 187, 89), (115, 147, 49), (191, 80, 77), (207, 130, 58)]\n",
-    "colors = [ (c[0]/255.0, c[1]/255.0, c[2]/255.0, 1.0) for c in colors]\n",
+    "# Create DataFrame with the total lines of code for each category (instead of for each repo)\n",
+    "# NOTE: Build each total with `a + b` (never `+=`) so the per-repo columns in\n",
+    "# summary_stats['sizes'] are not modified in place and the cell stays safe to re-run\n",
+    "repo_sizes_grouped = OrderedDict()\n",
+    "for category, repos in summary_plot_repos_grouped.items():\n",
+    "    category_size = None\n",
+    "    for repo in repos:\n",
+    "        repo_size = summary_stats['sizes'][repo]\n",
+    "        category_size = repo_size if category_size is None else category_size + repo_size\n",
+    "    repo_sizes_grouped[category] = category_size\n",
+    "repo_sizes_grouped_df = pd.DataFrame.from_dict(repo_sizes_grouped)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Look up each category's base color by name so the colors follow the column order of\n",
+    "# repo_sizes_grouped_df (base_colors is declared in a different category order); alpha is 0.8\n",
+    "colors = [base_colors[category][:3] + (0.8,) for category in repo_sizes_grouped_df.columns]\n",
+    "\n",
     "ax = repo_sizes_grouped_df.plot.area(\n",
     "    figsize=(18,10), \n",
     "    stacked=True, \n",
@@ -253,6 +334,7 @@
     "plt.legend(loc=2, prop={'size': 24,}, facecolor=(1.0, 1.0, 1.0, 1.0), framealpha=1.0)\n",
     "if save_figs:\n",
     "    plt.savefig(os.path.join(plot_dir, 'nwb_repo_sizes_grouped.pdf'))\n",
+    "    plt.savefig(os.path.join(plot_dir, 'nwb_repo_sizes_grouped.png'), dpi=300)\n",
     "    \n",
     "plt.title('NWB Code Repository Sizes', fontsize=20)\n",
     "plt.show()"
@@ -262,7 +344,8 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## 4. Plot per-repo total lines of code statistics broken down by: code, blank, comment"
+    "## 4. Plot per-repo stats\n",
+    "### 4.1 Plot total lines of code statistics broken down by: code, blank, comment"
    ]
   },
   {
@@ -295,7 +378,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## 4. Per-repo total lines of code statistics broken down by language type"
+    "### 4.2 Per-repo total lines of code statistics broken down by language type"
    ]
   },
   {
@@ -365,7 +448,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.7"
+   "version": "3.13.5"
   }
  },
  "nbformat": 4,