Merge pull request #32 from spacetelescope/ASB-29533_db_outputs

astrojimig · web-flow · commit f539100eebe3 · 2026-01-26T09:30:22.000-05:00
ASB-29533: Adding option to save filename checker output in alternate format (csv, fits, excel, html)
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,8 @@
 
 ## [Unreleased]
 ### Added
+[PR # 32](https://github.com/spacetelescope/mast_contributor_tools/pull/32)
+    - Adding option to save filename checker output in altnerate format (csv, fits, excel, html)
 
 ### Changed
 
diff --git a/TUTORIAL/tutorial_readme.md b/TUTORIAL/tutorial_readme.md
@@ -164,6 +164,16 @@ mct check_filenames mct-tutorial --directory='tutorial-data/'  --exclude='*_spec
 ```
 The output message will report that this command only checked 2 files, but both files passed.
 
+### Step 4e. Exporting results to alternate format
+
+If you're not used to working with SQLite `.db` files, you can also save the output to an alternate format using the `--output_format` flag. Currently supported values include "csv", "fits", "html", and "excel". The `.db` file is always written by default, but this option will write an additional file in the chosen format.
+
+For example, this command will save out a html file with the filename checker results in a color-coded table:
+
+```shell
+mct check_filenames mct-tutorial --directory='tutorial-data/' --output_format='html'
+```
+
 
 # Additional Resources
 Congratulations! You have completed this tutorial and now know the basic usage of the MAST Contributor's Tools Filename Checker.
diff --git a/docs/filename_check_readme.md b/docs/filename_check_readme.md
@@ -22,6 +22,7 @@ The various options for this command are described below:
 | `-e` or `--exclude`     | File pattern to exclude from testing, for example '*.jpg' to test all files except the jpgs | None                 |
 | `-n` or `--max_n`       | Maximum number of files to check, for testing purposes.                       | None (all files)                   |
 | `-db` or `--dbFile`     | Name of Results database file                                                 | `results_<hlsp_name>.db`           |
+| `-f` or `--output_format`     | Write output to alternate format. Currently supports "csv", "fits", "html" or "excel"                                                 | `db`           |
 | `-v` or `--verbose`     | Enables verbose output for more information                                   | `False`                            |
 | `--help`                | Prints information about this command                                         |                                    |
 
diff --git a/mast_contributor_tools/filename_check/fc_app.py b/mast_contributor_tools/filename_check/fc_app.py
@@ -92,7 +92,7 @@ def get_file_paths(
     return file_list
 
 
-def check_filenames(hlsp_name: str, file_list: list[Path], dbFile: str) -> None:
+def check_filenames(hlsp_name: str, file_list: list[Path], dbFile: str, output_format: str = "db") -> None:
     """Recursively check filenames in a directory tree of HLSP products
 
     Parameters
@@ -103,6 +103,8 @@ def check_filenames(hlsp_name: str, file_list: list[Path], dbFile: str) -> None:
         List of files to check, typically output from get_file_paths()
     dbFile : str, optional
         Name of SQLite database file to contain results
+    output_format : str, optional
+       Alternate format to save results to: 'csv', 'fits', 'html', or 'excel'. Default: "db"
     """
     # Make sure hlsp name is valid
     if not FieldRule.match_pattern(hlsp_name, HLSPNAME_REGEX):
@@ -153,6 +155,14 @@ def check_filenames(hlsp_name: str, file_list: list[Path], dbFile: str) -> None:
             logger.debug(f"Verdict for {f.name}: '{file_rec['final_verdict']}'")
 
     logger.critical(db.print_summary())  # print summary information on how many files passed
+    logger.critical(f"\nResults written to {dbFile}")
+
+    # Write ouput to alternate format if specified
+    if output_format != "db":
+        logger.debug(f"Also writing to alternate format '{output_format}'")
+        ouput_files = db.write_to_alternate_format(output_format)
+        logger.critical(f"Written to {ouput_files}")
+
     db.close_db()
     logger.critical(f"\nFilename checking complete. Results written to {dbFile}")
 
diff --git a/mast_contributor_tools/filename_check/fc_db.py b/mast_contributor_tools/filename_check/fc_db.py
@@ -2,6 +2,10 @@
 
 import sqlite3
 
+import astropy.io.fits as fits
+import pandas as pd
+from astropy.table import Table
+
 # The following SQL will create am SQLite database
 FILENAME_TABLE = """
         CREATE TABLE IF NOT EXISTS filename (
@@ -133,3 +137,113 @@ def print_summary(self) -> str:
             # Add more detail here later? - could break down by fields, etc.
 
         return summary_message
+
+    def write_to_alternate_format(self, save_format: str) -> list[str]:
+        """
+        Write out the SQLite DB as an alternate format.
+
+        Parameters
+        ----------
+        save_format : str
+            Format to save output: 'csv', 'excel', 'html', or 'fits'
+
+        Returns
+        --------
+        files_written: list[str]: List of file names written out
+        """
+        # Check that input is a valid option
+        supported_formats = ["csv", "excel", "fits", "html"]
+        if save_format.lower() not in supported_formats:
+            msg = f"Save Format '{save_format}' is not supported."
+            msg += f"Please choose from {supported_formats}"
+            raise ValueError(msg)
+
+        # Construct new filename to save to
+        fileroot = self.db_file.strip(".db")
+
+        # Read DB file as pandas dataframe
+        self.conn = sqlite3.connect(self.db_file)
+        filename_data = pd.read_sql_query("SELECT * FROM filename", self.conn)
+        fields_data = pd.read_sql_query("SELECT * FROM fields", self.conn)
+        # Close connection
+        self.conn.close()
+
+        # Save out data in new format
+        # CSV files
+        if save_format == "csv":
+            ouput_filename1 = f"{fileroot}_filenames.csv"
+            ouput_filename2 = f"{fileroot}_fields.csv"
+            filename_data.to_csv(ouput_filename1)
+            fields_data.to_csv(ouput_filename2)
+            files_written = [ouput_filename1, ouput_filename2]
+        # Excel spreadsheet
+        elif save_format == "excel":
+            output_filename = f"{fileroot}.xlsx"
+            # Style dataframe: change color of cell depending on verdict
+            filename_data = filename_data.style.map(color_formatter)
+            fields_data = fields_data.style.map(color_formatter)
+            # Save as one Excel document with two sheets:
+            # One for filenames table and one for fields
+            with pd.ExcelWriter(output_filename) as excel_writer:
+                filename_data.to_excel(excel_writer, sheet_name="FileNames")
+                fields_data.to_excel(excel_writer, sheet_name="Fields")
+            files_written = [output_filename]
+
+        # Fits table
+        elif save_format == "fits":
+            # Save as one fits file with two table extensions:
+            # One for filenames table and one for fields
+            output_filename = f"{fileroot}.fits"
+            hdu_list = fits.HDUList(
+                [
+                    fits.PrimaryHDU(),  # TODO: add some metadata?
+                    fits.table_to_hdu(Table.from_pandas(filename_data), name="FILENAMES"),
+                    fits.table_to_hdu(Table.from_pandas(fields_data), name="FIELDS"),
+                ]
+            )
+            hdu_list.writeto(output_filename, overwrite=True)
+            files_written = [output_filename]
+
+        # Html table
+        elif save_format == "html":
+            ouput_filename1 = f"{fileroot}_filenames.html"
+            ouput_filename2 = f"{fileroot}_fields.html"
+            # Style dataframe: change color of cell depending on verdict
+            filename_data = filename_data.style.map(color_formatter)
+            fields_data = fields_data.style.map(color_formatter)
+            # Then write to html file
+            filename_data.to_html(
+                ouput_filename1,
+                header=True,
+            )
+            fields_data.to_html(
+                ouput_filename2,
+                header=True,
+            )
+            files_written = [ouput_filename1, ouput_filename2]
+
+        else:
+            # No alternate format - only DB file
+            files_written = [self.db_file]
+
+        return files_written
+
+
+def color_formatter(value: str) -> str:
+    """Color mapping for use in write_to_alternate_format().
+    Color-codes table cells based on verdict: green for "PASS", red for "FAIL", etc.
+
+    Parameters
+    ------
+    value: str
+
+    """
+    if str(value).upper() == "PASS":
+        color = "lightgreen"
+    elif str(value).upper() == "FAIL":
+        color = "red"
+    elif str(value).upper() == "NEEDS REVIEW":
+        color = "yellow"
+    else:
+        color = None
+    return "background-color: %s" % color
diff --git a/mast_contributor_tools/mast_cli.py b/mast_contributor_tools/mast_cli.py
@@ -47,6 +47,12 @@ def cli() -> None:
 @click.option("-e", "--exclude", default="", help="File pattern to exclude from testing, for example '\\*.png'")
 @click.option("-n", "--max_n", default=None, help="Maximum number of files to check, for testing purposes.")
 @click.option("-db", "--dbFile", default="", help="Results database filename (defaults to: results_<hlsp_name>.db)")
+@click.option(
+    "-f",
+    "--output_format",
+    default="db",
+    help="Write output to alternate format (csv, fits, html, excel)",
+)
 @click.option("-v", "--verbose", default=False, flag_value=True, help="Enable verbose output")
 def filenames_cli(
     hlsp_name: str,
@@ -56,6 +62,7 @@ def filenames_cli(
     exclude: str = "",
     max_n: Union[int, None] = None,
     dbfile: str = "",
+    output_format: str = "db",
     verbose: bool = False,
 ) -> None:
     """
@@ -102,7 +109,7 @@ def filenames_cli(
     )
 
     # Perform the file name check
-    check_filenames(hlsp_name, file_list, dbFile=dbfile)
+    check_filenames(hlsp_name, file_list, dbFile=dbfile, output_format=output_format)
 
 
 @cli.command("check_filename", short_help="Check a single file name against MAST HLSP naming standards")
diff --git a/mast_contributor_tools/tests/filename_check/test_fc_db.py b/mast_contributor_tools/tests/filename_check/test_fc_db.py
@@ -171,6 +171,27 @@ def test_add_fields_xfail(field_record) -> None:
     test_db.close_db()
 
 
+# Test write_to_alternate_format() function
+@pytest.mark.parametrize(
+    "format, output_filename",
+    [
+        ("csv", "test_file_fields.csv"),
+        ("fits", "test_file.fits"),
+        ("excel", "test_file.xlsx"),
+        ("html", "test_file_filenames.html"),
+    ],
+)
+def test_write_to_alternate_format(format, output_filename):
+    """Test write_to_alternate_format() function"""
+    test_db = Hlsp_SQLiteDb(TEST_DB_FILE)
+    test_db.write_to_alternate_format(format)
+    # Assert new file format was written
+    output_filename = TEST_DB_FILE.replace("test_file.db", output_filename)
+    assert os.path.exists(output_filename), TEST_DB_FILE  # f"File {output_filename} not found"
+    # Delete the file after test complete
+    os.remove(output_filename)
+
+
 # Remove the test.db file once the tests are complete
 def test_remove_test_db_file():
     """Delete the test_file.db now that the tests are complete"""
diff --git a/mast_contributor_tools/tests/mast_cli/test_mast_cli.py b/mast_contributor_tools/tests/mast_cli/test_mast_cli.py
@@ -53,9 +53,9 @@ def test_filenames_cli_defaults(mock_checkfiles, mock_filepaths) -> None:
     # Assert logging level is correct
     assert logger.level == logging.getLevelNamesMapping()["INFO"]
     # Assert get_file_paths called with right arguments
-    mock_filepaths.assert_called_with(".", from_file='', search_pattern="*.*", exclude_pattern="", max_n=None)
+    mock_filepaths.assert_called_with(".", from_file="", search_pattern="*.*", exclude_pattern="", max_n=None)
     # Assert check_filenames was called with right arguments
-    mock_checkfiles.assert_called_with("my-hlsp", mock_filepaths(), dbFile="results_my-hlsp.db")
+    mock_checkfiles.assert_called_with("my-hlsp", mock_filepaths(), dbFile="results_my-hlsp.db", output_format="db")
 
 
 def test_filenames_cli_logging(mock_checkfiles, mock_filepaths, mock_singlefile) -> None:
@@ -87,26 +87,32 @@ def test_filenames_cli_fileparams(mock_checkfiles, mock_singlefile, mock_filepat
     # Assert it ran successfully
     assert output.exit_code == 0
     # Assert get_file_paths called with right arguments
-    mock_filepaths.assert_called_with(".", from_file='', search_pattern="*.fits", exclude_pattern="*.png", max_n="2")
+    mock_filepaths.assert_called_with(".", from_file="", search_pattern="*.fits", exclude_pattern="*.png", max_n="2")
     # Assert check_filenames was called with right arguments
-    mock_checkfiles.assert_called_with("my-hlsp", mock_filepaths(), dbFile="results_my-hlsp.db")
+    mock_checkfiles.assert_called_with("my-hlsp", mock_filepaths(), dbFile="results_my-hlsp.db", output_format="db")
     # Assert check_single_filename was not called
     mock_singlefile.assert_not_called()
 
+
 def test_filenames_cli_fromfile(mock_checkfiles, mock_singlefile, mock_filepaths) -> None:
     # Test multiple file names from a file list
     # equivalent to command "mct check_filenames --from_file='file_list.txt'"
     runner = CliRunner()
-    output = runner.invoke(filenames_cli, ["my-hlsp", "--from_file=file_list.txt", "--pattern=*.fits", "--exclude=*.png", "--max_n=2"])
+    output = runner.invoke(
+        filenames_cli, ["my-hlsp", "--from_file=file_list.txt", "--pattern=*.fits", "--exclude=*.png", "--max_n=2"]
+    )
     # Assert it ran successfully
     assert output.exit_code == 0
     # Assert get_file_paths called with right arguments
-    mock_filepaths.assert_called_with(".", from_file='file_list.txt', search_pattern="*.fits", exclude_pattern="*.png", max_n="2")
+    mock_filepaths.assert_called_with(
+        ".", from_file="file_list.txt", search_pattern="*.fits", exclude_pattern="*.png", max_n="2"
+    )
     # Assert check_filenames was called with right arguments
-    mock_checkfiles.assert_called_with("my-hlsp", mock_filepaths(), dbFile="results_my-hlsp.db")
+    mock_checkfiles.assert_called_with("my-hlsp", mock_filepaths(), dbFile="results_my-hlsp.db", output_format="db")
     # Assert check_single_filename was not called
     mock_singlefile.assert_not_called()
 
+
 def test_filenames_cli_singlefile(mock_checkfiles, mock_singlefile, mock_filepaths) -> None:
     """Test different flags are working as expected for the single filename checker CLI"""
     # Test single file
diff --git a/pyproject.toml b/pyproject.toml
@@ -8,10 +8,13 @@ authors = [
   { name = "Mikulski Archive for Space Telescopes", email = "mast_contrib@stsci.edu" },
 ]
 dependencies = [
+    "astropy >= 7.2.0",
     "click >= 8.1.0",
+    "openpyxl >= 3.1.5",
+    "pandas >= 2.3.3",
     "pyyaml > 6.0.1",
+    "setuptools-scm >= 8.3.1",
     "tqdm >= 4.67.1",
-    "setuptools-scm >= 8.3.1"
 ]
 dynamic = ["version"]
 

Original file line number	Diff line number	Diff line change
`@@ -8,10 +8,13 @@ authors = [`
`8`	`8`	`{ name = "Mikulski Archive for Space Telescopes", email = "mast_contrib@stsci.edu" },`
`9`	`9`	`]`
`10`	`10`	`dependencies = [`
	`11`	`+ "astropy >= 7.2.0",`
`11`	`12`	`"click >= 8.1.0",`
	`13`	`+ "openpyxl >= 3.1.5",`
	`14`	`+ "pandas >= 2.3.3",`
`12`	`15`	`"pyyaml > 6.0.1",`
	`16`	`+ "setuptools-scm >= 8.3.1",`
`13`	`17`	`"tqdm >= 4.67.1",`
`14`		`- "setuptools-scm >= 8.3.1"`
`15`	`18`	`]`
`16`	`19`	`dynamic = ["version"]`
`17`	`20`