uc-cdis · george42-ctds · May 20, 2025 · Apr 23, 2025 · Apr 23, 2025 · Apr 23, 2025
diff --git a/heal/cli/extract.py b/heal/cli/extract.py
@@ -14,6 +14,14 @@
     required=True,
     type=click.Path(writable=True),
 )
+@click.option(
+    "--file_type",
+    "file_type",
+    help="Type of input file: auto, csv, json, tsv, dataset_csv, dataset_tsv, redcap",
+    default="auto",
+    type=str,
+    show_default=True,
+)
 @click.option(
     "--title",
     "title",
@@ -29,15 +37,16 @@
     type=click.Path(writable=True),
     show_default=True,
 )
-def extract(input_file, title, output_dir):
+def extract(input_file, title, file_type, output_dir):
     """Extract HEAL-compliant VLMD file from input file"""
 
     logging.info(f"Extracting VLMD from {input_file}")
 
     try:
         vlmd_extract(
             input_file,
-            title,
+            title=title,
+            file_type=file_type,
             output_dir=output_dir,
         )
     except Exception as e:

diff --git a/heal/vlmd/config.py b/heal/vlmd/config.py
@@ -6,7 +6,15 @@
 
 # file suffixes
 ALLOWED_INPUT_TYPES = ["csv", "tsv", "json"]
-ALLOWED_FILE_TYPES = ["auto", "csv", "tsv", "json", "redcap"]
+ALLOWED_FILE_TYPES = [
+    "auto",
+    "csv",
+    "dataset_csv",
+    "dataset_tsv",
+    "json",
+    "redcap",
+    "tsv",
+]
 ALLOWED_SCHEMA_TYPES = ["auto", "csv", "json", "tsv"]
 ALLOWED_OUTPUT_TYPES = ["csv", "json"]
 

diff --git a/heal/vlmd/extract/conversion.py b/heal/vlmd/extract/conversion.py
@@ -4,15 +4,18 @@
 from cdislogging import get_logger
 
 from heal.vlmd import mappings
-from heal.vlmd.config import JSON_SCHEMA, TOP_LEVEL_PROPS
+from heal.vlmd.config import CSV_SCHEMA, JSON_SCHEMA, TOP_LEVEL_PROPS
+from heal.vlmd.extract.csv_data_conversion import convert_dataset_csv
 from heal.vlmd.extract.csv_dict_conversion import convert_datadict_csv
 from heal.vlmd.extract.json_dict_conversion import convert_template_json
 from heal.vlmd.extract.redcap_csv_dict_conversion import convert_redcap_csv
+from heal.vlmd.extract.utils import sync_fields
 from heal.vlmd.utils import clean_json_fields
 
-logger = get_logger("vlmd-conversion", log_level="debug")
+logger = get_logger("vlmd-conversion", log_level="info")
 
 choice_fxn = {
+    "csv-data-set": convert_dataset_csv,
     "csv-data-dict": partial(
         convert_datadict_csv,
         rename_map=mappings.rename_map,
@@ -36,20 +39,25 @@ def _detect_input_type(filepath, ext_to_input_type=ext_map):
 
 def convert_to_vlmd(
     input_filepath,
-    input_type=None,
-    data_dictionary_props=None,
+    input_type: str = None,
+    data_dictionary_props: dict = None,
+    include_all_fields: bool = True,
 ) -> dict:
     """
-    Converts a data dictionary to HEAL compliant json or csv format.
+    Converts a data dictionary or data file to HEAL compliant json or csv format.
 
     Args
-        input_filepath (str): Path to input file. Currently converts data dictionaries in csv, json, and tsv.
+        input_filepath (str): Path to input file. Currently converts data
+            dictionaries in csv, json, and tsv.
         input_type (str): The input type. See keys of 'choice_fxn' dict for options, currently:
-            csv-data-dict, json-template.
+            csv-data-dict, csv-data-set, json-template, redcap-data-dict.
         data_dictionary_props (dict):
             The other data-dictionary level properties. By default,
             will give the data_dictionary `title` property as the file name stem.
-
+        include_all_fields (bool): If true then csv dictionaries extracted from
+            csv datasets will include columns for all fields in the schema.
+            Useful for generating a template that can be manually updated.
+            Default = True.
     Returns
         Dictionary with:
          1. csvtemplated array of fields.
@@ -81,9 +89,35 @@ def convert_to_vlmd(
     package = data_dictionary_package
 
     # add schema version
+    schema_version = {"schemaVersion": JSON_SCHEMA["version"]}
     for field in package["template_csv"]["fields"]:
         field.update({"schemaVersion": JSON_SCHEMA["version"], **field})
 
+    if input_type == "csv-data-set":
+        # add a value placeholder for description field
+        description_placeholder = "description required"
+        for field in package["template_json"]["fields"]:
+            if not field.get("description"):
+                field.update({"description": description_placeholder, **field})
+        for field in package["template_csv"]["fields"]:
+            if not field.get("description"):
+                field.update({"description": description_placeholder, **field})
+
+        if include_all_fields:
+            # include schema fields that were not present in data file, don't include schema flags
+            field_list = [
+                key
+                for key in CSV_SCHEMA["properties"].keys()
+                if isinstance(CSV_SCHEMA["properties"][key], dict)
+            ] + [
+                key
+                for key in CSV_SCHEMA["patternProperties"].keys()
+                if isinstance(CSV_SCHEMA["patternProperties"][key], dict)
+            ]
+            package["template_csv"]["fields"] = sync_fields(
+                package["template_csv"]["fields"], field_list
+            )
+
     # remove empty json fields, add schema version (in TOP_LEVEL_PROPS)
     package["template_json"]["fields"] = clean_json_fields(
         package["template_json"]["fields"]

diff --git a/heal/vlmd/extract/csv_data_conversion.py b/heal/vlmd/extract/csv_data_conversion.py
@@ -0,0 +1,27 @@
+"""
+CSV data to HEAL VLMD conversion
+"""
+
+from heal.vlmd.extract.json_dict_conversion import convert_template_json
+from heal.vlmd.mappings import typesets
+from heal.vlmd.validate.utils import read_delim
+
+
+def convert_dataset_csv(file_path, data_dictionary_props=None):
+    """
+    Build a partially-completed HEAL VLMD package from a CSV data file.
+
+    The file holds data (not metadata); each variable's `name` and `type` is
+    inferred. The output stays invalid until each `description` is filled in.
+
+    Args
+        file_path (str): Path to the delimited data file.
+        data_dictionary_props (dict): Optional dictionary-level properties.
+            Copied, not mutated; None is treated as an empty dict.
+    Returns
+        The package produced by `convert_template_json`.
+    """
+    df = read_delim(file_path)
+    data_dictionary = (data_dictionary_props or {}).copy()
+    data_dictionary["fields"] = typesets.infer_frictionless_fields(df)
+    return convert_template_json(data_dictionary)
diff --git a/heal/vlmd/extract/csv_dict_conversion.py b/heal/vlmd/extract/csv_dict_conversion.py
@@ -11,7 +11,12 @@
 from heal.vlmd.utils import has_redcap_headers
 from heal.vlmd.validate.utils import read_delim
 
-logger = get_logger("csv-conversion", log_level="debug")
+
+class RedcapExtractionError(Exception):  # raised when converting a CSV with REDCap headers fails
+    pass
+
+
+logger = get_logger("csv-conversion", log_level="info")
 
 
 def _parse_string_objects(
@@ -129,17 +134,20 @@ def infer_delim(series: pd.Series, char_list: list, firstmatch: bool):
         return inferred_delim
 
     if isinstance(csv_template, (str, PathLike)):
-        logger.debug("Getting data from path to CSV file")
         template_tbl = read_delim(str(Path(csv_template)))
     else:
-        logger.debug("Getting data from input dataframe")
         template_tbl = pd.DataFrame(csv_template)
 
-    # If REDCap then get dictionary and return.
+    # If REDCap then get dictionary and return or raise RedcapExtractionError.
     column_names = template_tbl.columns
     if has_redcap_headers(column_names):
         logger.debug("File appears to have REDCap headers. Ready to convert.")
-        converted_dict = convert_redcap_csv(template_tbl)
+        try:
+            converted_dict = convert_redcap_csv(template_tbl)
+        except Exception as err:
+            logger.error("Error in extracting REDCap dictionary")
+            logger.error(err)
+            raise RedcapExtractionError(str(err))
         return converted_dict
     else:
         logger.debug("File is CSV dictionary, not REDCap dictionary.")
@@ -197,6 +205,10 @@ def infer_delim(series: pd.Series, char_list: list, firstmatch: bool):
                 )
             elif field_prop["type"] == "number":
                 tbl_csv[new_column_name] = tbl_csv[new_column_name].astype(float)
+            elif field_prop["type"] == ["integer", "number"]:
+                tbl_csv[new_column_name] = tbl_csv[new_column_name].apply(
+                    lambda s: float(s) if s else s
+                )
             elif field_prop["type"] == "object":
                 possible_key_val = ["=", ":"]
                 possible_list = [";", "|"]