Enhance FAQ page layout, add package info parsing utility, and update workflow schedule

OllyButters · OllyButters · commit ab82675e1d13 · 2026-06-09T23:34:59.000+01:00
diff --git a/.github/workflows/build_packages_info.yml b/.github/workflows/build_packages_info.yml
@@ -1,12 +1,12 @@
 # Start with a list of packages in packages.csv, and get external data about them (e.g. last commit date) to generate an HTML page with the package information. This workflow is triggered manually from the Actions tab.
 # Olly Butters
-# 15/5/26
+# 9/6/26
 name: Package list
 
 # Controls when the workflow will run
 on:
   schedule:
-    - cron: '30 5 * * 0'
+    - cron: '30 5 * * *'
       timezone: "Europe/London"
   push:
     branches: [ "main" ]
@@ -89,7 +89,7 @@ jobs:
                 fi
               fi
   
-              # Use the GitHub API to pull in info about the repoisitory
+              # Use the GitHub API to pull in info about the repository
               echo "\nGitHub API information"
               gh repo view --json codeOfConduct,description,homepageUrl,latestRelease,licenseInfo,owner,parent,updatedAt > gh_repo_info.json
               echo "API response:"
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,4 @@
 /output/*
 output.csv
-/cache/*
+/cache/*
+*.pyc
diff --git a/build_web_pages/build_packages_long_page.py b/build_web_pages/build_packages_long_page.py
@@ -1,9 +1,9 @@
-import csv
 import datetime
 import shutil
 import os
 import re
 from parse_DESCRIPTION_file import parse_description_file
+from utils import parse_package_info
 
 def main(package_file_path, functions_file_path, html_file_path):
     """
@@ -23,28 +23,8 @@ def main(package_file_path, functions_file_path, html_file_path):
     if not os.path.exists("output"):
         os.makedirs("output")
 
-    # Initialize a dictionary to hold package information
-    package_info = {}
-
-    # Parse the package information from the CSV file
-    with open(package_file_path, 'r', encoding='utf-8') as csv_file:
-        reader = csv.reader(csv_file)
-        next(reader)
-
-        for row in reader:
-            package_name = row[0]
-            package_info[package_name] = {
-                "short_description": row[1],
-                "cran_link": row[2],
-                "cran_version": row[3],
-                "cran_license": row[4],
-                "github_link": row[5],
-                "github_last_update": row[6],
-                "github_version": row[7],
-                "github_license": row[9],
-                "github_owner": row[10],
-                "status": row[11]
-            }
+    # Load the package information, keyed by package name, from the package file
+    package_info = parse_package_info(package_file_path)
 
     # Parse the functions.txt file
     with open(functions_file_path, 'r', encoding='utf-8') as file:
@@ -57,6 +37,10 @@ def main(package_file_path, functions_file_path, html_file_path):
                 functions = match.group(2).split(',')
                 package_info[package_name]["functions"] = functions
 
+    with open(functions_file_path, 'r', encoding='utf-8') as functions_file:
+        full_function_file_text = functions_file.read()
+        number_of_functions = full_function_file_text.count(",")
+
     # Parse the DESCRIPTION files
     for package_name in package_info.keys():
         try:
@@ -88,10 +72,13 @@ def main(package_file_path, functions_file_path, html_file_path):
 
         top_content = '<div class="top-content">'
         top_content += '<h1>DataSHIELD packages</h1>'
-        top_content += '<p>This page lists all the packages that are used in the DataSHIELD ecosystem. It includes packages that are in production, in development, retired, and unknown status.</p>'
+        top_content += '<p>This page lists all the packages that have been developed in the <a href="https://www.datashield.org">DataSHIELD</a> ecosystem. It includes packages that are in production, in development, retired, and unknown status. More info is in the <a href="./faq.html">FAQ</a>.</p>'
 
         html_file.write(top_content)
 
+        stats = f'<p>There are {len(package_info)} packages and {number_of_functions} functions listed on these pages.</p>'
+        html_file.write(stats)
+
         for this_package_name, this_package_info in package_info.items():
             try:
                 html_content = f"""
@@ -102,7 +89,7 @@ def main(package_file_path, functions_file_path, html_file_path):
                     <tr><td class="label">Short description</td><td class="left">{this_package_info.get('short_description', 'No short description available.')}</td></tr>
                     <tr><td class="label">Long description</td><td class="left">{this_package_info.get('DESCRIPTION', {}).get('Description', 'No long description available.')}</td></tr>
                 """
-                
+
                 if this_package_info.get('cran_link'):
                     html_content += f"""
                     <tr><td class="label">CRAN link</td><td class="left"><a href="{this_package_info.get('cran_link')}" target="_blank">{this_package_info.get('cran_link')}</a></td></tr>
@@ -123,8 +110,14 @@ def main(package_file_path, functions_file_path, html_file_path):
                 else:
                     html_content += '<tr><td class="label">CRAN link</td><td class="left">N/A</td></tr>'
 
+                if this_package_info.get('github_version_url') == 'null':
+                    html_content += '<tr><td class="label">GitHub version</td><td class="left"></td></tr>'
+                else:
+                    html_content += f"""
+                    <tr><td class="label">GitHub version</td><td class="left"><a href="{this_package_info.get('github_version_url')}" target="_blank">{this_package_info.get('github_version')}</a></td></tr>
+                    """
+
                 html_content += f"""
-                    <tr><td class="label">GitHub version</td><td class="left">{this_package_info.get('github_version', 'No GitHub version available.')}</td></tr>
                     <tr><td class="label">GitHub license</td><td class="left">{this_package_info.get('github_license', 'No GitHub license available.')}</td></tr>
                     <tr><td class="label">GitHub owner</td><td class="left">{this_package_info.get('github_owner', 'No GitHub owner available.')}</td></tr>
                     <tr><td class="label">Status</td><td class="left">{this_package_info.get('status', 'Unknown')}</td></tr> 
@@ -138,6 +131,7 @@ def main(package_file_path, functions_file_path, html_file_path):
                 continue
 
         # Footer
+        html_file.write('<hr/>')
         html_file.write('Generated on ' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
         html_file.write('&nbsp;&nbsp;Made by the <a href="https://github.com/FederatedMethods">Federated Methods team</a>')
 
@@ -146,3 +140,4 @@ def main(package_file_path, functions_file_path, html_file_path):
 if __name__ == '__main__':
     main('cache/output.csv', 'cache/functions.txt', './output/packages.html')
     shutil.copy('./build_web_pages/template/style_main.css', './output/style_main.css')
+    shutil.copy('./build_web_pages/template/faq.html', './output/faq.html')
diff --git a/build_web_pages/build_packages_summary_page.py b/build_web_pages/build_packages_summary_page.py
@@ -1,7 +1,7 @@
-import csv
 import datetime
 import shutil
 import os
+from utils import parse_package_info
 
 def main(csv_file_path, html_file_path, functions_file_path):
     print(f"Current working directory: {os.getcwd()}")
@@ -32,109 +32,115 @@ def main(csv_file_path, html_file_path, functions_file_path):
 
         top_content = '<div class="top-content">'
         top_content += '<h1>DataSHIELD packages</h1>'
-        top_content += '<p>This page lists all the packages that have been developed in the DataSHIELD ecosystem. It includes packages that are in production, in development, retired, and unknown status.</p>'
+        top_content += '<p>This page lists all the packages that have been developed in the <a href="https://www.datashield.org">DataSHIELD</a> ecosystem. It includes packages that are in production, in development, retired, and unknown status.</p>'
 
         html_file.write(top_content)
 
-        with open(csv_file_path, 'r', encoding='utf-8') as csv_file:
-            package_list = csv.reader(csv_file)
-
-            # Count the number of lines in the CSV file then go back to the beginning
-            row_count = sum(1 for row in package_list)  # fileObject is your csv.reader
-            csv_file.seek(0)
-
-            stats = f'<p>There are {row_count - 1} packages and {number_of_functions} functions listed on these pages.</p>'
-            html_file.write(stats)
-
-            # Skip header
-            next(package_list)
-
-            html_file.write('<table border="1">\n')
-
-            html_file.write('<tr><td></td>')
-            html_file.write('<th>Name</th>')
-            html_file.write('<th>Description</th>')
-            html_file.write('<th>CRAN version</th>')
-            html_file.write('<th>CRAN license</th>')
-            html_file.write('<th>GitHub last update</th>')
-            html_file.write('<th>GitHub version</th>')
-            html_file.write('<th>GitHub license</th>')
-            html_file.write('<th>GitHub owner</th>')
-            html_file.write('</tr>')
-
-            production_rows = ""
-            development_rows = ""
-            retired_rows = ""
-            unknown_rows = ""
-
-            # Counts for each rows
-            production_rows_count = 1
-            development_rows_count = 1
-            retired_rows_count = 1
-            unknown_rows_count = 1
-
-            for row in package_list:
-                print(row)
-                this_row = ""
-                
-                # Add GitHub link if available, otherwise just the name
-                this_row += '<td class="left"><a href="packages.html#' + row[0] + '">' + row[0] + '</a></td>'
-
-                # Description
-                if len(row[1]) > 0:
-                    this_row += '<td class="left">' + row[1] + '</td>'
-                else:
-                    this_row += '<td></td>'
-                
-                # CRAN version with link
-                if len(row[2]) > 0:
-                    this_row += '<td><a href="' + row[2] + '" target="blank">' + row[3] + '</a></td>' 
-                else:
-                    this_row += '<td></td>'
-
-                this_row += '<td>' + row[4] + '</td>' # CRAN license
-                this_row += '<td>' + row[6] + '</td>' # GH last update
-                this_row += '<td>' + row[7] + '</td>' # GH version
-                this_row += '<td>' + row[9] + '</td>' # GH license
-                this_row += '<td class="left">' + row[10] + '</td>' # GH owner
-
-                # Group rows by status and add put a row number in
-                if row[11].strip() == 'production':
-                    production_rows += '<tr>'
-                    production_rows += '<td>' + str(production_rows_count) + '</td>' # Row number
-                    production_rows += this_row
-                    production_rows += '</tr>'
-                    production_rows_count += 1
-                elif row[11].strip() == 'development':
-                    development_rows += '<tr>'
-                    development_rows += '<td>' + str(development_rows_count) + '</td>'
-                    development_rows += this_row
-                    development_rows += '</tr>'
-                    development_rows_count += 1
-                elif row[11].strip() == 'retired':
-                    retired_rows += '<tr>'
-                    retired_rows += '<td>' + str(retired_rows_count) + '</td>'
-                    retired_rows += this_row
-                    retired_rows += '</tr>'
-                    retired_rows_count += 1
-                else:
-                    unknown_rows += '<tr>'
-                    unknown_rows += '<td>' + str(unknown_rows_count) + '</td>'
-                    unknown_rows += this_row
-                    unknown_rows += '</tr>'
-                    unknown_rows_count += 1
-
-            # Build the table with the rows grouped by status
-            html_file.write('<tr><td colspan="9">Production</td></tr>')
-            html_file.write(production_rows)
-            html_file.write('<tr><td colspan="9">Development</td></tr>')
-            html_file.write(development_rows)
-            html_file.write('<tr><td colspan="9">Retired</td></tr>')
-            html_file.write(retired_rows)
-            html_file.write('<tr><td colspan="9">Unknown</td></tr>')
-            html_file.write(unknown_rows)
-            html_file.write('</table>')
-
+        package_info = parse_package_info(csv_file_path)
+
+        stats = f'<p>There are {len(package_info)} packages and {number_of_functions} functions listed on these pages.</p>'
+        html_file.write(stats)
+
+        html_file.write('<table border="1">\n')
+
+        html_file.write('<tr><td></td>')
+        html_file.write('<th>Name</th>')
+        html_file.write('<th>Description</th>')
+        html_file.write('<th>CRAN version</th>')
+        html_file.write('<th>CRAN license</th>')
+        html_file.write('<th>GitHub last update</th>')
+        html_file.write('<th>GitHub version</th>')
+        html_file.write('<th>GitHub license</th>')
+        html_file.write('<th>GitHub owner</th>')
+        html_file.write('</tr>')
+
+        production_rows = ""
+        development_rows = ""
+        retired_rows = ""
+        unknown_rows = ""
+
+        # Running row numbers for each status group (start at 1)
+        production_rows_count = 1
+        development_rows_count = 1
+        retired_rows_count = 1
+        unknown_rows_count = 1
+
+        #for row in package_list:
+        for this_package_name, this_package_info in package_info.items():
+            print(this_package_name)
+            print(this_package_info)
+            this_row = ""
+
+            # Package name, linked to its entry on the detailed packages.html page
+            this_row += '<td class="left"><a href="packages.html#' + this_package_name + '">' + this_package_name + '</a></td>'
+
+            # Description
+            if len(this_package_info['short_description']) > 0:
+                this_row += '<td class="left">' + this_package_info['short_description'] + '</td>'
+            else:
+                this_row += '<td></td>'
+
+            # CRAN version with link
+            if len(this_package_info.get('cran_link')) > 0:
+                this_row += '<td><a href="' + this_package_info['cran_link'] + '" target="blank">' + this_package_info['cran_version'] + '</a></td>' 
+            else:
+                this_row += '<td></td>'
+
+            this_row += '<td>' + this_package_info.get('cran_license', "") + '</td>' # CRAN license
+            this_row += '<td>' + this_package_info.get('github_last_update', "") + '</td>' # GH last update
+            
+            # GH version with link
+            if this_package_info.get('github_version_url') == "null":
+                this_row += '<td></td>'
+            else:
+                this_row += '<td><a href="' + this_package_info['github_version_url'] + '" target="_blank">' + this_package_info['github_version'] + '</a></td>'
+            
+            # GH license
+            if this_package_info.get('github_license') == "null":
+                this_row += '<td></td>'
+            else:
+                this_row += '<td>' + this_package_info.get('github_license', "") + '</td>'
+            
+            this_row += '<td class="left"><a href="https://github.com/' + this_package_info['github_owner'] + '" target="_blank">' + this_package_info['github_owner'] + '</a></td>' # GH owner
+
+            # Group rows by status and prefix each one with its row number
+            if this_package_info['status'].strip() == 'production':
+                production_rows += '<tr>'
+                production_rows += '<td>' + str(production_rows_count) + '</td>' # Row number
+                production_rows += this_row
+                production_rows += '</tr>'
+                production_rows_count += 1
+            elif this_package_info['status'].strip() == 'development':
+                development_rows += '<tr>'
+                development_rows += '<td>' + str(development_rows_count) + '</td>'
+                development_rows += this_row
+                development_rows += '</tr>'
+                development_rows_count += 1
+            elif this_package_info['status'].strip() == 'retired':
+                retired_rows += '<tr>'
+                retired_rows += '<td>' + str(retired_rows_count) + '</td>'
+                retired_rows += this_row
+                retired_rows += '</tr>'
+                retired_rows_count += 1
+            else:
+                unknown_rows += '<tr>'
+                unknown_rows += '<td>' + str(unknown_rows_count) + '</td>'
+                unknown_rows += this_row
+                unknown_rows += '</tr>'
+                unknown_rows_count += 1
+
+        # Build the table with the rows grouped by status
+        html_file.write('<tr><td colspan="9">Production</td></tr>')
+        html_file.write(production_rows)
+        html_file.write('<tr><td colspan="9">Development</td></tr>')
+        html_file.write(development_rows)
+        html_file.write('<tr><td colspan="9">Retired</td></tr>')
+        html_file.write(retired_rows)
+        html_file.write('<tr><td colspan="9">Unknown</td></tr>')
+        html_file.write(unknown_rows)
+        html_file.write('</table>')
+
+        html_file.write('<hr/>')
         html_file.write('Generated on ' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
         html_file.write('&nbsp;&nbsp;Made by the <a href="https://github.com/FederatedMethods">Federated Methods team</a>')
 
diff --git a/build_web_pages/template/faq.html b/build_web_pages/template/faq.html
@@ -4,12 +4,14 @@
 </head>
 <body>
     <h1>Frequently Asked Questions</h1>
-<p>
-What is a DataSHIELD package?
-</p>
 
-<p>
-How can I add my package to this list?
+    
+<h2>What is a DataSHIELD package?</h2>
+<p></p>
+
+
+<h2>How can I add my package to this list or update some information?</h2>
+<p>You can submit an issue or a pull request to update the CSV file at <a href="https://github.com/FederatedMethods/packages/blob/main/package_list.csv">https://github.com/FederatedMethods/packages/blob/main/package_list.csv</a>. Most of this information is gathered from the package info on GitHub or CRAN, so you may need to update the information there; the change will then appear here after the next daily rebuild.
 </p>
 
 <h2>Why are retired packages listed?</h2>
diff --git a/build_web_pages/utils.py b/build_web_pages/utils.py