Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,957 changes: 1,957 additions & 0 deletions a.patch

Large diffs are not rendered by default.

7 changes: 5 additions & 2 deletions autokeras/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from autokeras.blocks import ConvBlock
from autokeras.blocks import DenseBlock
from autokeras.blocks import EfficientNetBlock
from autokeras.blocks import Embedding
from autokeras.blocks import Flatten
from autokeras.blocks import ImageAugmentation
from autokeras.blocks import ImageBlock
Expand All @@ -28,6 +29,7 @@
from autokeras.blocks import ResNetBlock
from autokeras.blocks import RNNBlock
from autokeras.blocks import SpatialReduction
from autokeras.blocks import StructuredDataBlock
from autokeras.blocks import TemporalReduction
from autokeras.blocks import TextBlock
from autokeras.blocks import XceptionBlock
Expand All @@ -38,9 +40,12 @@
from autokeras.keras_layers import ExpandLastDim
from autokeras.nodes import ImageInput
from autokeras.nodes import Input
from autokeras.nodes import StructuredDataInput
from autokeras.nodes import TextInput
from autokeras.tasks import ImageClassifier
from autokeras.tasks import ImageRegressor
from autokeras.tasks import StructuredDataClassifier
from autokeras.tasks import StructuredDataRegressor
from autokeras.tasks import TextClassifier
from autokeras.tasks import TextRegressor
from autokeras.tuners import BayesianOptimization
Expand All @@ -51,8 +56,6 @@
__version__ = "2.1.0dev"

CUSTOM_OBJECTS = {
"BertPreprocessor": keras_nlp.models.BertPreprocessor,
"BertBackbone": keras_nlp.models.BertBackbone,
"CastToFloat32": CastToFloat32,
"ExpandLastDim": ExpandLastDim,
}
1 change: 1 addition & 0 deletions autokeras/adapters/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

from autokeras.adapters.input_adapters import ImageAdapter
from autokeras.adapters.input_adapters import InputAdapter
from autokeras.adapters.input_adapters import StructuredDataAdapter
from autokeras.adapters.input_adapters import TextAdapter
from autokeras.adapters.output_adapters import ClassificationAdapter
from autokeras.adapters.output_adapters import RegressionAdapter
9 changes: 9 additions & 0 deletions autokeras/adapters/input_adapters.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,3 +55,12 @@ def check(self, x):
"Expect the data to TextInput to be numpy.ndarray or "
"data.Dataset, but got {type}.".format(type=type(x))
)


class StructuredDataAdapter(adapter_module.Adapter):
    """Adapter that validates structured (tabular) data input."""

    def check(self, x):
        """Raise TypeError unless `x` is a numpy array."""
        if isinstance(x, np.ndarray):
            return
        raise TypeError(
            "Unsupported type {type} for "
            "{name}.".format(type=type(x), name=self.__class__.__name__)
        )
18 changes: 18 additions & 0 deletions autokeras/adapters/input_adapters_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@


import numpy as np
import pandas as pd
import pytest

from autokeras import test_utils
Expand Down Expand Up @@ -74,3 +75,20 @@ def test_text_input_type_error():
with pytest.raises(TypeError) as info:
x = adapter.adapt(x)
assert "Expect the data to TextInput to be numpy" in str(info.value)


def test_structured_data_input_unsupported_type_error():
    """Adapting a plain string must raise a TypeError naming the type."""
    with pytest.raises(TypeError) as info:
        input_adapters.StructuredDataAdapter().adapt("unknown")

    assert "Unsupported type" in str(info.value)


def test_structured_data_input_transform_to_dataset():
    """Adapting a string ndarray yields a numpy array back."""
    raw = pd.read_csv(test_utils.TRAIN_CSV_PATH).to_numpy().astype(str)

    adapted = input_adapters.StructuredDataAdapter().adapt(raw)

    assert isinstance(adapted, np.ndarray)
3 changes: 3 additions & 0 deletions autokeras/analysers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from autokeras.analysers.input_analysers import CATEGORICAL
from autokeras.analysers.input_analysers import NUMERICAL
from autokeras.analysers.input_analysers import ImageAnalyser
from autokeras.analysers.input_analysers import InputAnalyser
from autokeras.analysers.input_analysers import StructuredDataAnalyser
from autokeras.analysers.input_analysers import TextAnalyser
from autokeras.analysers.output_analysers import ClassificationAnalyser
from autokeras.analysers.output_analysers import RegressionAnalyser
109 changes: 109 additions & 0 deletions autokeras/analysers/input_analysers.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,13 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np

from autokeras.engine import analyser

CATEGORICAL = "categorical"
NUMERICAL = "numerical"


class InputAnalyser(analyser.Analyser):
def finalize(self):
Expand Down Expand Up @@ -52,3 +57,107 @@ def finalize(self):
"Expect the data to TextInput to be strings, but got "
"{type}.".format(type=self.dtype)
)


class StructuredDataAnalyser(InputAnalyser):
    """Analyser that infers per-column types of tabular data.

    For every column it counts how many values parse as floats and how
    many do not, then labels each column CATEGORICAL or NUMERICAL in
    `infer_column_types()`. User-provided `column_types` entries are
    never overwritten.
    """

    def __init__(self, column_names=None, column_types=None, **kwargs):
        # column_names: optional list of names, one per feature column.
        # column_types: optional dict mapping column name to CATEGORICAL
        #   or NUMERICAL; missing entries are inferred from the data.
        super().__init__(**kwargs)
        self.column_names = column_names
        self.column_types = column_types
        # Variables for inferring column types.
        self.count_numerical = None  # per-column count of float-parseable values
        self.count_categorical = None  # per-column count of non-numeric values
        self.count_unique_numerical = []  # per-column {value: occurrences}
        self.num_col = None  # number of columns, set on first row seen

    def update(self, data):
        """Accumulate column statistics from one batch of rows."""
        # TODO: update this to treat data as a np.ndarray containing all the
        # data. Currently, it is a numpy array containing one batch of data.
        super().update(data)
        # The super class set self.dtype to "string" based on the input numpy
        # array. However, the preprocessor will encode it to float32. So, set
        # self.dtype to "float32", which would be propagated to the Keras Input
        # node.
        self.dtype = "float32"
        if len(self.shape) != 2:
            # Wrong rank: skip counting; check() reports this in finalize().
            return
        # data is a numpy array
        for instance in data:
            self._update_instance(instance)

    def _update_instance(self, x):
        """Update the numeric/categorical counters with one row `x`."""
        if self.num_col is None:
            # First row seen: size the counters to the number of columns.
            self.num_col = len(x)
            self.count_numerical = np.zeros(self.num_col)
            self.count_categorical = np.zeros(self.num_col)
            for _ in range(len(x)):
                self.count_unique_numerical.append({})
        for i in range(self.num_col):
            x_i = x[i]
            if isinstance(x_i, bytes):
                # Decode bytes so float() below can parse the text.
                x_i = x_i.decode("utf-8")
            try:
                tmp_num = float(x_i)
                self.count_numerical[i] += 1
                # Track the number of distinct numeric values per column;
                # used by the cardinality heuristic in infer_column_types().
                if tmp_num not in self.count_unique_numerical[i]:
                    self.count_unique_numerical[i][tmp_num] = 1
                else:
                    self.count_unique_numerical[i][tmp_num] += 1
            except ValueError:
                # Not parseable as a number: counts as categorical evidence.
                self.count_categorical[i] += 1

    def finalize(self):
        """Validate the collected data and infer any missing column types."""
        self.check()
        self.infer_column_types()

    def get_input_name(self):
        # Name used in error messages to identify the input node.
        return "StructuredDataInput"

    def check(self):
        """Validate shape and column_names; default missing column names.

        Raises:
            ValueError: If the data is not rank 2, if column_types is
                given without column_names, or if column_names does not
                match the number of columns.
        """
        if len(self.shape) != 2:
            raise ValueError(
                "Expect the data to {input_name} to have shape "
                "(batch_size, num_features), but "
                "got input shape {shape}.".format(
                    input_name=self.get_input_name(), shape=self.shape
                )
            )

        # Fill in the column_names
        if self.column_names is None:
            if self.column_types:
                raise ValueError(
                    "column_names must be specified, if "
                    "column_types is specified."
                )
            # Default to stringified column indices: "0", "1", ...
            self.column_names = [str(index) for index in range(self.shape[1])]

        # Check if column_names has the correct length.
        if len(self.column_names) != self.shape[1]:
            raise ValueError(
                "Expect column_names to have length {expect} "
                "but got {actual}.".format(
                    expect=self.shape[1], actual=len(self.column_names)
                )
            )

    def infer_column_types(self):
        """Label each column CATEGORICAL or NUMERICAL from the counters.

        A column is CATEGORICAL if any value failed to parse as a float,
        or if fewer than 5% of its numeric values are distinct (low
        cardinality); otherwise it is NUMERICAL. User-specified entries
        in self.column_types take precedence.
        """
        column_types = {}

        for i in range(self.num_col):
            if self.count_categorical[i] > 0:
                column_types[self.column_names[i]] = CATEGORICAL
            elif (
                len(self.count_unique_numerical[i]) / self.count_numerical[i]
                < 0.05
            ):
                column_types[self.column_names[i]] = CATEGORICAL
            else:
                column_types[self.column_names[i]] = NUMERICAL

        # Partial column_types is provided.
        if self.column_types is None:
            self.column_types = {}
        # Only fill in types the user did not specify.
        for key, value in column_types.items():
            if key not in self.column_types:
                self.column_types[key] = value
66 changes: 66 additions & 0 deletions autokeras/analysers/input_analysers_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,78 @@
# limitations under the License.


import copy

import numpy as np
import pandas as pd
import pytest

from autokeras import test_utils
from autokeras.analysers import input_analysers


def test_structured_data_input_less_col_name_error():
    """Fewer column names than data columns raises a ValueError."""
    with pytest.raises(ValueError) as info:
        analyser = input_analysers.StructuredDataAnalyser(
            column_names=list(range(8))
        )
        analyser.update(np.random.rand(20, 10))
        analyser.finalize()

    assert "Expect column_names to have length" in str(info.value)


def test_structured_data_infer_col_types():
    """Inferred column types for the CSV match the known answer."""
    frame = pd.read_csv(test_utils.TRAIN_CSV_PATH)
    frame.pop("survived")

    analyser = input_analysers.StructuredDataAnalyser(
        column_names=test_utils.COLUMN_NAMES,
        column_types=None,
    )
    analyser.update(frame.values.astype(str))
    analyser.finalize()

    assert analyser.column_types == test_utils.COLUMN_TYPES


def test_dont_infer_specified_column_types():
    """User-specified column types survive inference unchanged."""
    column_types = copy.copy(test_utils.COLUMN_TYPES)
    column_types.pop("sex")
    column_types["age"] = "categorical"

    frame = pd.read_csv(test_utils.TRAIN_CSV_PATH)
    frame.pop("survived")

    analyser = input_analysers.StructuredDataAnalyser(
        column_names=test_utils.COLUMN_NAMES,
        column_types=column_types,
    )
    analyser.update(frame.values.astype(str))
    analyser.finalize()

    assert analyser.column_types["age"] == "categorical"


def test_structured_data_input_with_illegal_dim():
    """Rank-3 input is rejected: structured data must be 2D."""
    analyser = input_analysers.StructuredDataAnalyser(
        column_names=test_utils.COLUMN_NAMES,
        column_types=None,
    )

    with pytest.raises(ValueError) as info:
        analyser.update(np.random.rand(100, 32, 32))
        analyser.finalize()

    assert "Expect the data to StructuredDataInput to have shape" in str(
        info.value
    )


def test_image_input_analyser_shape_is_list_of_int():
analyser = input_analysers.ImageAnalyser()
dataset = np.random.rand(100, 32, 32, 3)
Expand Down
6 changes: 2 additions & 4 deletions autokeras/analysers/output_analysers_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

def test_clf_head_one_hot_shape_error():
analyser = output_analysers.ClassificationAnalyser(name="a", num_classes=9)
dataset = test_utils.generate_one_hot_labels(dtype="np", num_classes=10)
dataset = test_utils.generate_one_hot_labels(num_classes=10)

with pytest.raises(ValueError) as info:
analyser.update(dataset)
Expand Down Expand Up @@ -67,9 +67,7 @@ def test_one_class_error():

def test_infer_ten_classes():
analyser = output_analysers.ClassificationAnalyser(name="a")
dataset = test_utils.generate_one_hot_labels(
dtype="dataset", num_classes=10
)
dataset = test_utils.generate_one_hot_labels(num_classes=10)

analyser.update(dataset)
analyser.finalize()
Expand Down
1 change: 1 addition & 0 deletions autokeras/blocks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
from autokeras.blocks.reduction import TemporalReduction
from autokeras.blocks.wrapper import GeneralBlock
from autokeras.blocks.wrapper import ImageBlock
from autokeras.blocks.wrapper import StructuredDataBlock
from autokeras.blocks.wrapper import TextBlock
from autokeras.utils import utils

Expand Down
2 changes: 1 addition & 1 deletion autokeras/blocks/heads_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ def test_clf_head_build_with_zero_dropout_return_tensor():


def test_clf_head_hpps_with_uint8_contain_cast_to_int32():
dataset = test_utils.generate_one_hot_labels(100, 10, "dataset")
dataset = test_utils.generate_one_hot_labels(100, 10)
dataset = dataset.astype("uint8")
head = head_module.ClassificationHead(shape=(8,))
analyser = head.get_analyser()
Expand Down
6 changes: 6 additions & 0 deletions autokeras/blocks/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.

"""Blocks for data preprocessing.

They are built into keras preprocessing layers and will be part of the Keras
model.

"""
from typing import Optional
from typing import Tuple
from typing import Union
Expand Down
Loading
Loading