Add --infer-enum-from-integers flag for low-cardinality integer enum detection

Sigfried · web-flow · commit 59f27296b02c · 2026-03-23T10:51:17.000-05:00
diff --git a/schema_automator/cli.py b/schema_automator/cli.py
@@ -72,6 +72,7 @@
 infer_foreign_keys_option = click.option('--infer-foreign-keys/--no-infer-foreign-keys', default=False, help='infer ranges/foreign keys')
 infer_optional_option = click.option('--infer-optional/--no-infer-optional', default=False, help='mark slots as not required when columns have null or empty values (ignored in pandera mode)')
 infer_mixed_types_option = click.option('--infer-mixed-types/--no-infer-mixed-types', default=False, help='use any_of to represent columns with mixed types')
+infer_enum_from_integers_option = click.option('--infer-enum-from-integers/--no-infer-enum-from-integers', default=False, help='treat low-cardinality integer columns as enum candidates')
 enum_columns_option = click.option('--enum-columns', '-E', multiple=True, help='column(s) that is forced to be an enum')
 enum_mask_columns_option = click.option('--enum-mask-columns', multiple=True, help='column(s) that are excluded from being enums')
 max_enum_size_option = click.option('--max-enum-size', default=50, help='do not create an enum if more than max distinct members')
@@ -123,6 +124,7 @@ def main(verbose: int, quiet: bool):
 @max_enum_size_option
 @infer_optional_option
 @infer_mixed_types_option
+@infer_enum_from_integers_option
 @click.option('--data-dictionary-row-count',
               type=click.INT,
               help='rows that provide metadata about columns')
@@ -163,6 +165,7 @@ def generalize_tsv(tsvfile, output, class_name, schema_name, pandera: bool, anno
 @max_enum_size_option
 @infer_optional_option
 @infer_mixed_types_option
+@infer_enum_from_integers_option
 @click.option('--robot/--no-robot', default=False, help='set if the TSV is a ROBOT template')
 def generalize_tsvs(tsvfiles, output, schema_name, **kwargs):
     """
@@ -193,6 +196,7 @@ def generalize_tsvs(tsvfiles, output, schema_name, **kwargs):
 @max_enum_size_option
 @infer_optional_option
 @infer_mixed_types_option
+@infer_enum_from_integers_option
 @click.option('--class-name', '-c', default=DEFAULT_CLASS_NAME, help='Core class name in schema')
 @click.option('--pandera/--no-pandera', default=False, help='set to use panderas as inference engine')
 @click.option('--data-output', help='Path to file of downloaded data')
diff --git a/schema_automator/generalizers/csv_data_generalizer.py b/schema_automator/generalizers/csv_data_generalizer.py
@@ -116,6 +116,9 @@ class CsvDataGeneralizer(Generalizer):
     infer_mixed_types: bool = False
     """If true, use any_of to represent columns with mixed types instead of collapsing to string"""
 
+    infer_enum_from_integers: bool = False
+    """If true, convert integer columns to enums when their distinct-value ratio is below enum_threshold and they have at most max_enum_size distinct values"""
+
     def infer_linkages(self, files: List[str], **kwargs) -> List[ForeignKey]:
         """
         Heuristic procedure for determining which tables are linked to others via implicit foreign keys
@@ -456,6 +459,21 @@ def convert_dicts(self,
                 logging.info(f"Slot {sn} has range {s['range']}")
             if self.infer_optional and sn in slot_has_nulls and not s.get('identifier'):
                 s['required'] = False
+            if (self.infer_enum_from_integers
+                    and s.get('range') == 'integer'
+                    and sn not in enum_mask_columns
+                    and not s.get('identifier')):
+                n_distinct = len(vals)
+                n_total = len(slot_values[sn]) + 1
+                if (sn in enum_columns
+                        or ((n_distinct / n_total) < self.enum_threshold
+                            and 0 < n_distinct <= self.max_enum_size)):
+                    enum_name = sn.replace(' ', '_').replace('(s)', '') + '_enum'
+                    s['range'] = enum_name
+                    enums[enum_name] = {
+                        'permissible_values': {str(v): {'description': str(v)} for v in vals}
+                    }
+                    logging.info(f"Slot {sn}: low-cardinality integers treated as enum {enum_name}")
             if 'any_of' not in s and (s.get('range') == 'string' or sn in enum_columns) and sn not in enum_mask_columns:
                 filtered_vals = \
                     [v
diff --git a/tests/test_generalizers/test_csv_data_generalizer.py b/tests/test_generalizers/test_csv_data_generalizer.py
@@ -170,6 +170,50 @@ def test_infer_mixed_types_off_by_default(self):
         self.assertEqual(schema.slots["score"].range, "string")
         self.assertEqual(len(schema.slots["score"].any_of), 0)
 
+    def test_infer_enum_from_integers(self):
+        rows = [
+            {"id": "1", "name": "Alice", "status": "1"},
+            {"id": "2", "name": "Bob", "status": "2"},
+            {"id": "3", "name": "Carol", "status": "1"},
+            {"id": "4", "name": "Dave", "status": "2"},
+            {"id": "5", "name": "Eve", "status": "1"},
+            {"id": "6", "name": "Frank", "status": "2"},
+            {"id": "7", "name": "Grace", "status": "1"},
+            {"id": "8", "name": "Hank", "status": "2"},
+            {"id": "9", "name": "Ivy", "status": "1"},
+            {"id": "10", "name": "Jack", "status": "2"},
+        ]
+        ie = CsvDataGeneralizer(infer_enum_from_integers=True, enum_threshold=0.5)
+        schema = ie.convert_dicts(rows, "test", "Pet")
+        # status has 2 distinct values over 10 rows; ratio is 2 / (10 + 1) ~= 0.18 < 0.5 threshold
+        self.assertEqual(schema.slots["status"].range, "status_enum")
+        pvs = list(schema.enums["status_enum"].permissible_values.keys())
+        self.assertCountEqual(pvs, ["1", "2"])
+
+    def test_infer_enum_from_integers_high_cardinality_stays_integer(self):
+        rows = [{"id": str(i), "val": str(i)} for i in range(1, 21)]
+        ie = CsvDataGeneralizer(infer_enum_from_integers=True, enum_threshold=0.1)
+        schema = ie.convert_dicts(rows, "test", "Thing")
+        # 20 distinct values over 20 rows; ratio is 20 / (20 + 1) ~= 0.95, well above the 0.1 threshold
+        self.assertEqual(schema.slots["val"].range, "integer")
+
+    def test_infer_enum_from_integers_off_by_default(self):
+        rows = [
+            {"id": "1", "status": "1"},
+            {"id": "2", "status": "2"},
+            {"id": "3", "status": "1"},
+            {"id": "4", "status": "2"},
+            {"id": "5", "status": "1"},
+            {"id": "6", "status": "2"},
+            {"id": "7", "status": "1"},
+            {"id": "8", "status": "2"},
+            {"id": "9", "status": "1"},
+            {"id": "10", "status": "2"},
+        ]
+        ie = CsvDataGeneralizer()
+        schema = ie.convert_dicts(rows, "test", "Pet")
+        self.assertEqual(schema.slots["status"].range, "integer")
+
     def _convert(self, base_name: str, cn='Example', index_slot='examples') -> SchemaDefinition:
         ie = CsvDataGeneralizer()
         fn = f'{base_name}.tsv'