Skip to content

Feat/add dataset csv file type #31

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 24 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
a410e14
(HP-2029) add dependencies to pyproject.toml
george42-ctds Apr 23, 2025
2c3aa92
(HP-2029) add new test data file
george42-ctds Apr 23, 2025
7ce019d
(HP-2029) add 'typesets' and 'sync_fields'
george42-ctds Apr 23, 2025
c372e8d
(HP-2029) add new fixtures to conftest
george42-ctds Apr 23, 2025
033f94d
(HP-2029) include new allowed file_types
george42-ctds Apr 23, 2025
9858cfa
(HP-2029) do not validate dataset input files
george42-ctds Apr 23, 2025
4a97f95
(HP-2029) add new conversion logic
george42-ctds Apr 23, 2025
67ab7dd
(HP-2029) add dataset conversion module
george42-ctds Apr 23, 2025
e0c5e13
(HP-2029) add new extract logic
george42-ctds Apr 23, 2025
d8b35c0
(HP-2029) add test dataset data
george42-ctds Apr 23, 2025
0052caf
(HP-2029) Add file_type options to extract CLI
george42-ctds Apr 23, 2025
afecf4b
(HP-2029) add csv-header check to validate
george42-ctds Apr 24, 2025
7c92e48
(HP-2029) add unit tests for validating dataset as dictionary
george42-ctds Apr 24, 2025
77b8060
(HP-2029) swap order of fields lines
george42-ctds Apr 24, 2025
e3bd56a
(HP-2029) re-lock to upgrade dependencies
george42-ctds Apr 24, 2025
676ee59
Merge branch 'master' into feat/add-dataset-csv-file-type
george42-ctds Apr 24, 2025
dbe5c76
(HP-2029) Remove some debug log messages
george42-ctds Apr 24, 2025
19b2293
(HP-2029) Add row to test REDCap dictionary
george42-ctds Apr 29, 2025
854b6e4
(HP-2029) Add invalid REDCap test dictionary
george42-ctds Apr 29, 2025
4f1a086
(HP-2029) skip fallback to dataset for invalid REDCap dictionaries
george42-ctds Apr 30, 2025
ec20d00
(HP-2029): remove some debug statements
george42-ctds Apr 30, 2025
dfb77c2
(HP-2029) re-order imports
george42-ctds Apr 30, 2025
c75492c
(HP-2029) move 'test_title' variable to local fixture
george42-ctds Apr 30, 2025
47cfd9f
(HP-2029) add docstrings to tests
george42-ctds Apr 30, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions heal/cli/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,14 @@
required=True,
type=click.Path(writable=True),
)
@click.option(
"--file_type",
"file_type",
help="Type of input file: auto, csv, json, tsv, dataset_csv, dataset_tsv, redcap",
default="auto",
type=str,
show_default=True,
)
@click.option(
"--title",
"title",
Expand All @@ -29,15 +37,16 @@
type=click.Path(writable=True),
show_default=True,
)
def extract(input_file, title, output_dir):
def extract(input_file, title, file_type, output_dir):
"""Extract HEAL-compliant VLMD file from input file"""

logging.info(f"Extracting VLMD from {input_file}")

try:
vlmd_extract(
input_file,
title,
title=title,
file_type=file_type,
output_dir=output_dir,
)
except Exception as e:
Expand Down
10 changes: 9 additions & 1 deletion heal/vlmd/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,15 @@

# file suffixes
ALLOWED_INPUT_TYPES = ["csv", "tsv", "json"]
ALLOWED_FILE_TYPES = ["auto", "csv", "tsv", "json", "redcap"]
ALLOWED_FILE_TYPES = [
"auto",
"csv",
"dataset_csv",
"dataset_tsv",
"json",
"redcap",
"tsv",
]
ALLOWED_SCHEMA_TYPES = ["auto", "csv", "json", "tsv"]
ALLOWED_OUTPUT_TYPES = ["csv", "json"]

Expand Down
48 changes: 41 additions & 7 deletions heal/vlmd/extract/conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,18 @@
from cdislogging import get_logger

from heal.vlmd import mappings
from heal.vlmd.config import JSON_SCHEMA, TOP_LEVEL_PROPS
from heal.vlmd.config import CSV_SCHEMA, JSON_SCHEMA, TOP_LEVEL_PROPS
from heal.vlmd.extract.csv_data_conversion import convert_dataset_csv
from heal.vlmd.extract.csv_dict_conversion import convert_datadict_csv
from heal.vlmd.extract.json_dict_conversion import convert_template_json
from heal.vlmd.extract.redcap_csv_dict_conversion import convert_redcap_csv
from heal.vlmd.extract.utils import sync_fields
from heal.vlmd.utils import clean_json_fields

logger = get_logger("vlmd-conversion", log_level="debug")

choice_fxn = {
"csv-data-set": convert_dataset_csv,
"csv-data-dict": partial(
convert_datadict_csv,
rename_map=mappings.rename_map,
Expand All @@ -36,20 +39,25 @@ def _detect_input_type(filepath, ext_to_input_type=ext_map):

def convert_to_vlmd(
input_filepath,
input_type=None,
data_dictionary_props=None,
input_type: str = None,
data_dictionary_props: dict = None,
include_all_fields: bool = True,
) -> dict:
"""
Converts a data dictionary to HEAL compliant json or csv format.
Converts a data dictionary or data file to HEAL compliant json or csv format.

Args
input_filepath (str): Path to input file. Currently converts data dictionaries in csv, json, and tsv.
input_filepath (str): Path to input file. Currently converts data
dictionaries in csv, json, and tsv.
input_type (str): The input type. See keys of 'choice_fxn' dict for options, currently:
csv-data-dict, json-template.
csv-data-dict, csv-data-set, json-template, redcap-data-dict.
data_dictionary_props (dict):
The other data-dictionary level properties. By default,
will give the data_dictionary `title` property as the file name stem.

include_all_fields (bool): If true then csv dictionaries extracted from
csv datasets will include columns for all fields in the schema.
Useful for generating a template that can be manually updated.
Default = True.
Returns
Dictionary with:
1. csvtemplated array of fields.
Expand Down Expand Up @@ -81,9 +89,35 @@ def convert_to_vlmd(
package = data_dictionary_package

# add schema version
schema_version = {"schemaVersion": JSON_SCHEMA["version"]}
for field in package["template_csv"]["fields"]:
field.update({"schemaVersion": JSON_SCHEMA["version"], **field})

if input_type == "csv-data-set":
# add a value placeholder for description field
description_placeholder = "description required"
for field in package["template_json"]["fields"]:
if not field.get("description"):
field.update({"description": description_placeholder, **field})
for field in package["template_csv"]["fields"]:
if not field.get("description"):
field.update({"description": description_placeholder, **field})

if include_all_fields:
# include schema fields that were not present in data file, don't include schema flags
field_list = [
key
for key in CSV_SCHEMA["properties"].keys()
if isinstance(CSV_SCHEMA["properties"][key], dict)
] + [
key
for key in CSV_SCHEMA["patternProperties"].keys()
if isinstance(CSV_SCHEMA["patternProperties"][key], dict)
]
package["template_csv"]["fields"] = sync_fields(
package["template_csv"]["fields"], field_list
)

# remove empty json fields, add schema version (in TOP_LEVEL_PROPS)
package["template_json"]["fields"] = clean_json_fields(
package["template_json"]["fields"]
Expand Down
27 changes: 27 additions & 0 deletions heal/vlmd/extract/csv_data_conversion.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
"""
CSV data to HEAL VLMD conversion
"""

from heal.vlmd.extract.json_dict_conversion import convert_template_json
from heal.vlmd.mappings import typesets
from heal.vlmd.validate.utils import read_delim


def convert_dataset_csv(file_path, data_dictionary_props=None):
    """
    Infer partial HEAL variable level metadata from a data file.

    Takes a CSV file containing data (not metadata) and infers each
    of its variables' names and data types. These inferred properties
    are then output as partially-completed HEAL variable level metadata.
    That is, it outputs the `name` and `type` properties.

    NOTE: this will be an invalid file as `description` is required
    for each variable. However, this serves as a great way to start
    the basis of a VLMD submission.

    Args
        file_path: Path to the data file; handed to `read_delim`.
        data_dictionary_props (dict): Optional data-dictionary level
            properties. The input is copied, never modified. `None`
            (the default) is treated as an empty dictionary.

    Returns
        The package returned by `convert_template_json` for the
        data dictionary built from the inferred fields.
    """
    df = read_delim(file_path)
    # Shallow copy so the caller's dict is left untouched; `or {}` makes
    # None a valid argument instead of failing on `None.copy()`.
    data_dictionary = dict(data_dictionary_props or {})
    data_dictionary["fields"] = typesets.infer_frictionless_fields(df)

    return convert_template_json(data_dictionary)
16 changes: 12 additions & 4 deletions heal/vlmd/extract/csv_dict_conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,11 @@
from heal.vlmd.utils import has_redcap_headers
from heal.vlmd.validate.utils import read_delim


class RedcapExtractionError(Exception):
    """Raised when converting a file with REDCap headers to a dictionary fails."""


logger = get_logger("csv-conversion", log_level="debug")


Expand Down Expand Up @@ -129,17 +134,20 @@ def infer_delim(series: pd.Series, char_list: list, firstmatch: bool):
return inferred_delim

if isinstance(csv_template, (str, PathLike)):
logger.debug("Getting data from path to CSV file")
template_tbl = read_delim(str(Path(csv_template)))
else:
logger.debug("Getting data from input dataframe")
template_tbl = pd.DataFrame(csv_template)

# If REDCap then get dictionary and return.
# If REDCap then get dictionary and return or raise RedcapExtractionError.
column_names = template_tbl.columns
if has_redcap_headers(column_names):
logger.debug("File appears to have REDCap headers. Ready to convert.")
converted_dict = convert_redcap_csv(template_tbl)
try:
converted_dict = convert_redcap_csv(template_tbl)
except Exception as err:
logger.error("Error in extracting REDCap dictionary")
logger.error(err)
raise RedcapExtractionError(str(err))
return converted_dict
else:
logger.debug("File is CSV dictionary, not REDCap dictionary.")
Expand Down
Loading