Skip to content

Commit 0b2c363

Browse files
authored
Bug fixes for python notebooks (#1955)
* bug fixes
* update
* use torch backend for all notebooks
* test coverage
1 parent 3880104 commit 0b2c363

17 files changed

+185
-121
lines changed

autokeras/engine/analyser.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,9 @@ def update(self, data):
4141
data: np.ndarray. The entire dataset.
4242
"""
4343
if self.dtype is None:
44-
if np.issubdtype(data.dtype, np.str_):
44+
if np.issubdtype(data.dtype, np.str_) or np.issubdtype(
45+
data.dtype, np.bytes_
46+
):
4547
self.dtype = "string"
4648
else:
4749
self.dtype = str(data.dtype)

autokeras/engine/analyser_test.py

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
# Copyright 2020 The AutoKeras Authors.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import numpy as np
16+
import pytest
17+
18+
from autokeras.engine.analyser import Analyser
19+
20+
21+
def test_analyser_update_unicode_string_dtype():
22+
analyser = Analyser()
23+
data = np.array(["hello", "world"], dtype="U10")
24+
25+
analyser.update(data)
26+
27+
assert analyser.dtype == "string"
28+
assert analyser.shape == [2]
29+
assert analyser.batch_size == 2
30+
assert analyser.num_samples == 2
31+
32+
33+
def test_analyser_update_byte_string_dtype():
34+
analyser = Analyser()
35+
data = np.array([b"hello", b"world"], dtype="S10")
36+
37+
analyser.update(data)
38+
39+
assert analyser.dtype == "string"
40+
assert analyser.shape == [2]
41+
assert analyser.batch_size == 2
42+
assert analyser.num_samples == 2
43+
44+
45+
def test_analyser_update_numeric_dtype():
46+
analyser = Analyser()
47+
data = np.array([1, 2, 3], dtype=np.int32)
48+
49+
analyser.update(data)
50+
51+
assert analyser.dtype == "int32"
52+
assert analyser.shape == [3]
53+
assert analyser.batch_size == 3
54+
assert analyser.num_samples == 3
55+
56+
57+
def test_analyser_update_float_dtype():
58+
analyser = Analyser()
59+
data = np.array([1.0, 2.0, 3.0], dtype=np.float64)
60+
61+
analyser.update(data)
62+
63+
assert analyser.dtype == "float64"
64+
assert analyser.shape == [3]
65+
assert analyser.batch_size == 3
66+
assert analyser.num_samples == 3
67+
68+
69+
def test_analyser_finalize_not_implemented():
70+
analyser = Analyser()
71+
72+
with pytest.raises(NotImplementedError):
73+
analyser.finalize()

autokeras/graph.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -296,7 +296,9 @@ def _compile_keras_model(self, hp, model):
296296
elif optimizer_name == "sgd":
297297
optimizer = keras.optimizers.SGD(learning_rate=learning_rate)
298298
elif optimizer_name == "adam_weight_decay":
299-
steps_per_epoch = int(self.num_samples / self.batch_size)
299+
steps_per_epoch = max(
300+
1, int(self.num_samples / (self.batch_size or 32))
301+
)
300302
num_train_steps = steps_per_epoch * self.epochs
301303

302304
lr_schedule = keras.optimizers.schedules.PolynomialDecay(
@@ -335,6 +337,7 @@ def set_fit_args(self, validation_split, epochs=None):
335337
# Epochs not specified by the user
336338
if self.epochs is None:
337339
self.epochs = 1
340+
validation_split = validation_split or 0
338341
# num_samples from analysers are before split
339342
self.num_samples = self.inputs[0].num_samples * (1 - validation_split)
340343

autokeras/graph_test.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,9 +89,10 @@ def test_adamw_optimizer():
8989
hp.Choice("optimizer", ["adam", "sgd", "adam_weight_decay"], default="adam")
9090
hp.values["optimizer"] = "adam_weight_decay"
9191
graph = graph_module.Graph(inputs=input_node, outputs=output_node)
92-
graph.num_samples = 10000
92+
graph.inputs[0].num_samples = 100
9393
graph.inputs[0].batch_size = 32
9494
graph.epochs = 10
95+
graph.set_fit_args(0, epochs=10)
9596
model = graph.build(hp)
9697
assert model.input_shape == (None, 30)
9798
assert model.output_shape == (None, 1)
@@ -168,3 +169,16 @@ def test_graph_can_init_with_one_missing_output():
168169
ak.ClassificationHead()(output_node)
169170

170171
graph_module.Graph(input_node, output_node)
172+
173+
174+
def test_set_fit_args_with_none_validation_split():
175+
input_node = ak.Input(shape=(30,))
176+
output_node = input_node
177+
output_node = ak.DenseBlock()(output_node)
178+
output_node = ak.RegressionHead(shape=(1,))(output_node)
179+
180+
graph = graph_module.Graph(inputs=input_node, outputs=output_node)
181+
graph.inputs[0].num_samples = 100
182+
graph.inputs[0].batch_size = 32
183+
graph.set_fit_args(None, epochs=1)
184+
assert graph.num_samples == 100 # Should handle None as 0

autokeras/nodes.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -145,9 +145,12 @@ def get_block(self):
145145

146146
def get_hyper_preprocessors(self):
147147
return [
148+
hyper_preprocessors.DefaultHyperPreprocessor(
149+
preprocessors.CastToString()
150+
),
148151
hyper_preprocessors.DefaultHyperPreprocessor(
149152
preprocessors.TextTokenizer()
150-
)
153+
),
151154
]
152155

153156

autokeras/preprocessors/common.py

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15+
from collections import Counter
16+
1517
import keras
1618
import numpy as np
1719

@@ -39,28 +41,35 @@ class CastToString(preprocessor.Preprocessor):
3941
"""Cast the dataset shape to string."""
4042

4143
def transform(self, dataset):
42-
return dataset.astype("str")
44+
if np.issubdtype(dataset.dtype, np.bytes_):
45+
return np.array(
46+
[x.decode("utf-8", errors="ignore") for x in dataset]
47+
)
48+
else:
49+
return dataset.astype("str")
4350

4451

4552
@keras.utils.register_keras_serializable(package="autokeras")
4653
class TextTokenizer(preprocessor.Preprocessor):
4754
"""Simple text tokenizer that converts strings to integer sequences."""
4855

49-
def __init__(self, max_len=100, vocab=None, **kwargs):
56+
def __init__(self, max_len=100, vocab=None, max_vocab=500, **kwargs):
5057
super().__init__(**kwargs)
5158
self.max_len = max_len
5259
self.vocab = vocab
60+
self.max_vocab = max_vocab
5361

5462
def fit(self, dataset):
5563
# Build vocab from unique words in the dataset
56-
unique_words = set()
64+
unique_words = []
5765
for text in dataset:
5866
words = text.split()
59-
unique_words.update(words)
60-
# Sort for consistency
61-
sorted_words = sorted(unique_words)
67+
unique_words.extend(words)
68+
word_counts = Counter(unique_words)
69+
sorted_words = sorted(word_counts, key=word_counts.get, reverse=True)
6270
self.vocab = {
63-
word: idx + 1 for idx, word in enumerate(sorted_words)
71+
word: idx + 1
72+
for idx, word in enumerate(sorted_words[: self.max_vocab])
6473
} # Start from 1, 0 for padding
6574

6675
def transform(self, dataset):
@@ -80,6 +89,7 @@ def get_config(self):
8089
{
8190
"max_len": self.max_len,
8291
"vocab": self.vocab,
92+
"max_vocab": self.max_vocab,
8393
}
8494
)
8595
return config

autokeras/preprocessors/common_test.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15+
import numpy as np
16+
1517
from autokeras import test_utils
1618
from autokeras.preprocessors import common
1719

@@ -21,3 +23,39 @@ def test_cast_to_int32_return_int32():
2123
x = x.astype("uint8")
2224
x = common.CastToInt32().transform(x)
2325
assert x.dtype == "int32"
26+
27+
28+
def test_cast_to_string_with_bytes():
29+
x = np.array([b"hello", b"world"])
30+
result = common.CastToString().transform(x)
31+
assert result.dtype.kind in ["U", "S"] # Unicode or byte string
32+
assert result[0] == "hello"
33+
assert result[1] == "world"
34+
35+
36+
def test_cast_to_string_with_strings():
37+
x = np.array(["hello", "world"])
38+
result = common.CastToString().transform(x)
39+
assert result.dtype.kind in ["U", "S"]
40+
assert result[0] == "hello"
41+
assert result[1] == "world"
42+
43+
44+
def test_text_tokenizer_vocab_limit():
45+
x = np.array(["word1 word2 word3", "word1 word4 word5"])
46+
tokenizer = common.TextTokenizer(max_vocab=2)
47+
tokenizer.fit(x)
48+
assert len(tokenizer.vocab) <= 3  # vocab stores at most max_vocab (=2) words; index 0 is reserved as the padding/unknown fallback, not stored in the dict
49+
# word1 should be most frequent
50+
assert "word1" in tokenizer.vocab
51+
assert tokenizer.vocab["word1"] == 1
52+
53+
54+
def test_text_tokenizer_transform():
55+
x = np.array(["hello world", "hello"])
56+
tokenizer = common.TextTokenizer(max_vocab=10)
57+
tokenizer.fit(x)
58+
result = tokenizer.transform(x)
59+
assert result.shape == (2, 100) # max_len=100
60+
assert result.dtype == np.int32
61+
assert result[0][0] == tokenizer.vocab.get("hello", 0)

docs/py/customized.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
"""shell
2+
export KERAS_BACKEND="torch"
23
pip install autokeras
34
"""
45

docs/py/export.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
"""shell
2+
export KERAS_BACKEND="torch"
23
pip install autokeras
34
"""
45

docs/py/image_classification.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
"""shell
2+
export KERAS_BACKEND="torch"
23
pip install autokeras
34
"""
45

0 commit comments

Comments (0)