diff --git a/CHANGELOG.md b/CHANGELOG.md index 2f4963b..82123ce 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,8 @@ ## [Unreleased] ### Added +[PR # 32](https://github.com/spacetelescope/mast_contributor_tools/pull/32) + - Adding option to save filename checker output in altnerate format (csv, fits, excel, html) ### Changed diff --git a/TUTORIAL/tutorial_readme.md b/TUTORIAL/tutorial_readme.md index 14c8b1d..e97810c 100644 --- a/TUTORIAL/tutorial_readme.md +++ b/TUTORIAL/tutorial_readme.md @@ -164,6 +164,16 @@ mct check_filenames mct-tutorial --directory='tutorial-data/' --exclude='*_spec ``` The output message will report that this command only checked 2 files, but both files passed. +### Step 4e. Exporting results to alternate format + +If you're not used to working with SQLite `.db` files, you can also save the output to an alternate format using the `--output_format` flag. Currently supported values include "csv", "fits", "html", and "excel". The `.db` file is always written by default, but this option will write an additional file in the chosen format. + +For example, this command will save out a html file with the filename checker results in a color-coded table: + +```shell +mct check_filenames mct-tutorial --directory='tutorial-data/' --output_format='html' +``` + # Additional Resources Congratulations! You have completed this tutorial and now know the basic usage of the MAST Contributor's Tools Filename Checker. diff --git a/docs/filename_check_readme.md b/docs/filename_check_readme.md index 2a87611..469ed4e 100644 --- a/docs/filename_check_readme.md +++ b/docs/filename_check_readme.md @@ -22,6 +22,7 @@ The various options for this command are described below: | `-e` or `--exclude` | File pattern to exclude from testing, for example '*.jpg' to test all files except the jpgs | None | | `-n` or `--max_n` | Maximum number of files to check, for testing purposes. | None (all files) | | `-db` or `--dbFile` | Name of Results database file | `results_.db` | +| `-f` or `--output_format` | Write output to alternate format. Currently supports "csv", "fits", "html" or "excel" | `db` | | `-v` or `--verbose` | Enables verbose output for more information | `False` | | `--help` | Prints information about this command | | diff --git a/mast_contributor_tools/filename_check/fc_app.py b/mast_contributor_tools/filename_check/fc_app.py index 74c398f..aaefab2 100644 --- a/mast_contributor_tools/filename_check/fc_app.py +++ b/mast_contributor_tools/filename_check/fc_app.py @@ -92,7 +92,7 @@ def get_file_paths( return file_list -def check_filenames(hlsp_name: str, file_list: list[Path], dbFile: str) -> None: +def check_filenames(hlsp_name: str, file_list: list[Path], dbFile: str, output_format: str = "db") -> None: """Recursively check filenames in a directory tree of HLSP products Parameters @@ -103,6 +103,8 @@ def check_filenames(hlsp_name: str, file_list: list[Path], dbFile: str) -> None: List of files to check, typically output from get_file_paths() dbFile : str, optional Name of SQLite database file to contain results + output_format : str, optional + Alternate format to save results to: 'csv', 'fits', 'html', or 'excel'. Default: "db" """ # Make sure hlsp name is valid if not FieldRule.match_pattern(hlsp_name, HLSPNAME_REGEX): @@ -153,6 +155,14 @@ def check_filenames(hlsp_name: str, file_list: list[Path], dbFile: str) -> None: logger.debug(f"Verdict for {f.name}: '{file_rec['final_verdict']}'") logger.critical(db.print_summary()) # print summary information on how many files passed + logger.critical(f"\nResults written to {dbFile}") + + # Write ouput to alternate format if specified + if output_format != "db": + logger.debug(f"Also writing to alternate format '{output_format}'") + ouput_files = db.write_to_alternate_format(output_format) + logger.critical(f"Written to {ouput_files}") + db.close_db() logger.critical(f"\nFilename checking complete. Results written to {dbFile}") diff --git a/mast_contributor_tools/filename_check/fc_db.py b/mast_contributor_tools/filename_check/fc_db.py index 9933bb4..b595e05 100644 --- a/mast_contributor_tools/filename_check/fc_db.py +++ b/mast_contributor_tools/filename_check/fc_db.py @@ -2,6 +2,10 @@ import sqlite3 +import astropy.io.fits as fits +import pandas as pd +from astropy.table import Table + # The following SQL will create am SQLite database FILENAME_TABLE = """ CREATE TABLE IF NOT EXISTS filename ( @@ -133,3 +137,113 @@ def print_summary(self) -> str: # Add more detail here later? - could break down by fields, etc. return summary_message + + def write_to_alternate_format(self, save_format: str) -> list[str]: + """ + Write out the SQLite DB as an alternate format. + + Parameters + ---------- + save_format : str + Format to save output: 'csv', 'excel', 'html', or 'fits' + + Returns + -------- + files_written: list[str]: List of file names written out + """ + # Check that input is a valid option + supported_formats = ["csv", "excel", "fits", "html"] + if save_format.lower() not in supported_formats: + msg = f"Save Format '{save_format}' is not supported." + msg += f"Please choose from {supported_formats}" + raise ValueError(msg) + + # Construct new filename to save to + fileroot = self.db_file.strip(".db") + + # Read DB file as pandas dataframe + self.conn = sqlite3.connect(self.db_file) + filename_data = pd.read_sql_query("SELECT * FROM filename", self.conn) + fields_data = pd.read_sql_query("SELECT * FROM fields", self.conn) + # Close connection + self.conn.close() + + # Save out data in new format + # CSV files + if save_format == "csv": + ouput_filename1 = f"{fileroot}_filenames.csv" + ouput_filename2 = f"{fileroot}_fields.csv" + filename_data.to_csv(ouput_filename1) + fields_data.to_csv(ouput_filename2) + files_written = [ouput_filename1, ouput_filename2] + # Excel spreadsheet + elif save_format == "excel": + output_filename = f"{fileroot}.xlsx" + # Style dataframe: change color of cell depending on verdict + filename_data = filename_data.style.map(color_formatter) + fields_data = fields_data.style.map(color_formatter) + # Save as one Excel document with two sheets: + # One for filenames table and one for fields + with pd.ExcelWriter(output_filename) as excel_writer: + filename_data.to_excel(excel_writer, sheet_name="FileNames") + fields_data.to_excel(excel_writer, sheet_name="Fields") + files_written = [output_filename] + + # Fits table + elif save_format == "fits": + # Save as one fits file with two table extensions: + # One for filenames table and one for fields + output_filename = f"{fileroot}.fits" + hdu_list = fits.HDUList( + [ + fits.PrimaryHDU(), # TODO: add some metadata? + fits.table_to_hdu(Table.from_pandas(filename_data), name="FILENAMES"), + fits.table_to_hdu(Table.from_pandas(fields_data), name="FIELDS"), + ] + ) + hdu_list.writeto(output_filename, overwrite=True) + files_written = [output_filename] + + # Html table + elif save_format == "html": + ouput_filename1 = f"{fileroot}_filenames.html" + ouput_filename2 = f"{fileroot}_fields.html" + # Style dataframe: change color of cell depending on verdict + filename_data = filename_data.style.map(color_formatter) + fields_data = fields_data.style.map(color_formatter) + # Then write to html file + filename_data.to_html( + ouput_filename1, + header=True, + ) + fields_data.to_html( + ouput_filename2, + header=True, + ) + files_written = [ouput_filename1, ouput_filename2] + + else: + # No alternate format - only DB file + files_written = [self.db_file] + + return files_written + + +def color_formatter(value: str) -> str: + """Color mapping for use in write_to_alternate_format(). + Color-codes table cells based on verdict: green for "PASS", red for "FAIL", etc. + + Parameters + ------ + value: str + + """ + if str(value).upper() == "PASS": + color = "lightgreen" + elif str(value).upper() == "FAIL": + color = "red" + elif str(value).upper() == "NEEDS REVIEW": + color = "yellow" + else: + color = None + return "background-color: %s" % color diff --git a/mast_contributor_tools/mast_cli.py b/mast_contributor_tools/mast_cli.py index 192f584..382cc2c 100644 --- a/mast_contributor_tools/mast_cli.py +++ b/mast_contributor_tools/mast_cli.py @@ -47,6 +47,12 @@ def cli() -> None: @click.option("-e", "--exclude", default="", help="File pattern to exclude from testing, for example '\\*.png'") @click.option("-n", "--max_n", default=None, help="Maximum number of files to check, for testing purposes.") @click.option("-db", "--dbFile", default="", help="Results database filename (defaults to: results_.db)") +@click.option( + "-f", + "--output_format", + default="db", + help="Write output to alternate format (csv, fits, html, excel)", +) @click.option("-v", "--verbose", default=False, flag_value=True, help="Enable verbose output") def filenames_cli( hlsp_name: str, @@ -56,6 +62,7 @@ def filenames_cli( exclude: str = "", max_n: Union[int, None] = None, dbfile: str = "", + output_format: str = "db", verbose: bool = False, ) -> None: """ @@ -102,7 +109,7 @@ def filenames_cli( ) # Perform the file name check - check_filenames(hlsp_name, file_list, dbFile=dbfile) + check_filenames(hlsp_name, file_list, dbFile=dbfile, output_format=output_format) @cli.command("check_filename", short_help="Check a single file name against MAST HLSP naming standards") diff --git a/mast_contributor_tools/tests/filename_check/test_fc_db.py b/mast_contributor_tools/tests/filename_check/test_fc_db.py index fe18e9a..ab7c42d 100644 --- a/mast_contributor_tools/tests/filename_check/test_fc_db.py +++ b/mast_contributor_tools/tests/filename_check/test_fc_db.py @@ -171,6 +171,27 @@ def test_add_fields_xfail(field_record) -> None: test_db.close_db() +# Test write_to_alternate_format() function +@pytest.mark.parametrize( + "format, output_filename", + [ + ("csv", "test_file_fields.csv"), + ("fits", "test_file.fits"), + ("excel", "test_file.xlsx"), + ("html", "test_file_filenames.html"), + ], +) +def test_write_to_alternate_format(format, output_filename): + """Test write_to_alternate_format() function""" + test_db = Hlsp_SQLiteDb(TEST_DB_FILE) + test_db.write_to_alternate_format(format) + # Assert new file format was written + output_filename = TEST_DB_FILE.replace("test_file.db", output_filename) + assert os.path.exists(output_filename), TEST_DB_FILE # f"File {output_filename} not found" + # Delete the file after test complete + os.remove(output_filename) + + # Remove the test.db file once the tests are complete def test_remove_test_db_file(): """Delete the test_file.db now that the tests are complete""" diff --git a/mast_contributor_tools/tests/mast_cli/test_mast_cli.py b/mast_contributor_tools/tests/mast_cli/test_mast_cli.py index c814530..c731f5c 100644 --- a/mast_contributor_tools/tests/mast_cli/test_mast_cli.py +++ b/mast_contributor_tools/tests/mast_cli/test_mast_cli.py @@ -53,9 +53,9 @@ def test_filenames_cli_defaults(mock_checkfiles, mock_filepaths) -> None: # Assert logging level is correct assert logger.level == logging.getLevelNamesMapping()["INFO"] # Assert get_file_paths called with right arguments - mock_filepaths.assert_called_with(".", from_file='', search_pattern="*.*", exclude_pattern="", max_n=None) + mock_filepaths.assert_called_with(".", from_file="", search_pattern="*.*", exclude_pattern="", max_n=None) # Assert check_filenames was called with right arguments - mock_checkfiles.assert_called_with("my-hlsp", mock_filepaths(), dbFile="results_my-hlsp.db") + mock_checkfiles.assert_called_with("my-hlsp", mock_filepaths(), dbFile="results_my-hlsp.db", output_format="db") def test_filenames_cli_logging(mock_checkfiles, mock_filepaths, mock_singlefile) -> None: @@ -87,26 +87,32 @@ def test_filenames_cli_fileparams(mock_checkfiles, mock_singlefile, mock_filepat # Assert it ran successfully assert output.exit_code == 0 # Assert get_file_paths called with right arguments - mock_filepaths.assert_called_with(".", from_file='', search_pattern="*.fits", exclude_pattern="*.png", max_n="2") + mock_filepaths.assert_called_with(".", from_file="", search_pattern="*.fits", exclude_pattern="*.png", max_n="2") # Assert check_filenames was called with right arguments - mock_checkfiles.assert_called_with("my-hlsp", mock_filepaths(), dbFile="results_my-hlsp.db") + mock_checkfiles.assert_called_with("my-hlsp", mock_filepaths(), dbFile="results_my-hlsp.db", output_format="db") # Assert check_single_filename was not called mock_singlefile.assert_not_called() + def test_filenames_cli_fromfile(mock_checkfiles, mock_singlefile, mock_filepaths) -> None: # Test multiple file names from a file list # equivalent to command "mct check_filenames --from_file='file_list.txt'" runner = CliRunner() - output = runner.invoke(filenames_cli, ["my-hlsp", "--from_file=file_list.txt", "--pattern=*.fits", "--exclude=*.png", "--max_n=2"]) + output = runner.invoke( + filenames_cli, ["my-hlsp", "--from_file=file_list.txt", "--pattern=*.fits", "--exclude=*.png", "--max_n=2"] + ) # Assert it ran successfully assert output.exit_code == 0 # Assert get_file_paths called with right arguments - mock_filepaths.assert_called_with(".", from_file='file_list.txt', search_pattern="*.fits", exclude_pattern="*.png", max_n="2") + mock_filepaths.assert_called_with( + ".", from_file="file_list.txt", search_pattern="*.fits", exclude_pattern="*.png", max_n="2" + ) # Assert check_filenames was called with right arguments - mock_checkfiles.assert_called_with("my-hlsp", mock_filepaths(), dbFile="results_my-hlsp.db") + mock_checkfiles.assert_called_with("my-hlsp", mock_filepaths(), dbFile="results_my-hlsp.db", output_format="db") # Assert check_single_filename was not called mock_singlefile.assert_not_called() + def test_filenames_cli_singlefile(mock_checkfiles, mock_singlefile, mock_filepaths) -> None: """Test different flags are working as expected for the single filename checker CLI""" # Test single file diff --git a/pyproject.toml b/pyproject.toml index 681fcee..2155005 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,10 +8,13 @@ authors = [ { name = "Mikulski Archive for Space Telescopes", email = "mast_contrib@stsci.edu" }, ] dependencies = [ + "astropy >= 7.2.0", "click >= 8.1.0", + "openpyxl >= 3.1.5", + "pandas >= 2.3.3", "pyyaml > 6.0.1", + "setuptools-scm >= 8.3.1", "tqdm >= 4.67.1", - "setuptools-scm >= 8.3.1" ] dynamic = ["version"]