Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,957 changes: 1,957 additions & 0 deletions a.patch

Large diffs are not rendered by default.

7 changes: 5 additions & 2 deletions autokeras/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from autokeras.blocks import ConvBlock
from autokeras.blocks import DenseBlock
from autokeras.blocks import EfficientNetBlock
from autokeras.blocks import Embedding
from autokeras.blocks import Flatten
from autokeras.blocks import ImageAugmentation
from autokeras.blocks import ImageBlock
Expand All @@ -28,6 +29,7 @@
from autokeras.blocks import ResNetBlock
from autokeras.blocks import RNNBlock
from autokeras.blocks import SpatialReduction
from autokeras.blocks import StructuredDataBlock
from autokeras.blocks import TemporalReduction
from autokeras.blocks import TextBlock
from autokeras.blocks import XceptionBlock
Expand All @@ -38,9 +40,12 @@
from autokeras.keras_layers import ExpandLastDim
from autokeras.nodes import ImageInput
from autokeras.nodes import Input
from autokeras.nodes import StructuredDataInput
from autokeras.nodes import TextInput
from autokeras.tasks import ImageClassifier
from autokeras.tasks import ImageRegressor
from autokeras.tasks import StructuredDataClassifier
from autokeras.tasks import StructuredDataRegressor
from autokeras.tasks import TextClassifier
from autokeras.tasks import TextRegressor
from autokeras.tuners import BayesianOptimization
Expand All @@ -51,8 +56,6 @@
__version__ = "2.1.0dev"

CUSTOM_OBJECTS = {
"BertPreprocessor": keras_nlp.models.BertPreprocessor,
"BertBackbone": keras_nlp.models.BertBackbone,
"CastToFloat32": CastToFloat32,
"ExpandLastDim": ExpandLastDim,
}
1 change: 1 addition & 0 deletions autokeras/adapters/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

from autokeras.adapters.input_adapters import ImageAdapter
from autokeras.adapters.input_adapters import InputAdapter
from autokeras.adapters.input_adapters import StructuredDataAdapter
from autokeras.adapters.input_adapters import TextAdapter
from autokeras.adapters.output_adapters import ClassificationAdapter
from autokeras.adapters.output_adapters import RegressionAdapter
9 changes: 9 additions & 0 deletions autokeras/adapters/input_adapters.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,3 +55,12 @@ def check(self, x):
"Expect the data to TextInput to be numpy.ndarray or "
"data.Dataset, but got {type}.".format(type=type(x))
)


class StructuredDataAdapter(adapter_module.Adapter):
    """Adapter that validates structured (tabular) data input."""

    def check(self, x):
        """Raise TypeError unless `x` is a numpy array."""
        if isinstance(x, np.ndarray):
            return
        raise TypeError(
            "Unsupported type {type} for "
            "{name}.".format(type=type(x), name=self.__class__.__name__)
        )
18 changes: 18 additions & 0 deletions autokeras/adapters/input_adapters_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@


import numpy as np
import pandas as pd
import pytest

from autokeras import test_utils
Expand Down Expand Up @@ -74,3 +75,20 @@ def test_text_input_type_error():
with pytest.raises(TypeError) as info:
x = adapter.adapt(x)
assert "Expect the data to TextInput to be numpy" in str(info.value)


def test_structured_data_input_unsupported_type_error():
    """Adapting a plain string must raise a TypeError naming the type."""
    with pytest.raises(TypeError) as info:
        input_adapters.StructuredDataAdapter().adapt("unknown")

    assert "Unsupported type" in str(info.value)


def test_structured_data_input_transform_to_dataset():
    """Adapting a string ndarray yields a numpy array back."""
    raw = pd.read_csv(test_utils.TRAIN_CSV_PATH).to_numpy().astype(str)

    adapted = input_adapters.StructuredDataAdapter().adapt(raw)

    assert isinstance(adapted, np.ndarray)
3 changes: 3 additions & 0 deletions autokeras/analysers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from autokeras.analysers.input_analysers import CATEGORICAL
from autokeras.analysers.input_analysers import NUMERICAL
from autokeras.analysers.input_analysers import ImageAnalyser
from autokeras.analysers.input_analysers import InputAnalyser
from autokeras.analysers.input_analysers import StructuredDataAnalyser
from autokeras.analysers.input_analysers import TextAnalyser
from autokeras.analysers.output_analysers import ClassificationAnalyser
from autokeras.analysers.output_analysers import RegressionAnalyser
109 changes: 109 additions & 0 deletions autokeras/analysers/input_analysers.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,13 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np

from autokeras.engine import analyser

CATEGORICAL = "categorical"
NUMERICAL = "numerical"


class InputAnalyser(analyser.Analyser):
def finalize(self):
Expand Down Expand Up @@ -52,3 +57,107 @@ def finalize(self):
"Expect the data to TextInput to be strings, but got "
"{type}.".format(type=self.dtype)
)


class StructuredDataAnalyser(InputAnalyser):
    """Analyser that infers per-column types of tabular data.

    For every column it counts how many values parse as floats and how
    many do not, then labels each column CATEGORICAL or NUMERICAL in
    `infer_column_types()`. User-provided `column_types` entries are
    never overwritten.
    """

    def __init__(self, column_names=None, column_types=None, **kwargs):
        # column_names: optional list of names, one per feature column.
        # column_types: optional dict mapping column name to CATEGORICAL
        #   or NUMERICAL; missing entries are inferred from the data.
        super().__init__(**kwargs)
        self.column_names = column_names
        self.column_types = column_types
        # Variables for inferring column types.
        self.count_numerical = None  # per-column count of float-parseable values
        self.count_categorical = None  # per-column count of non-numeric values
        self.count_unique_numerical = []  # per-column {value: occurrences}
        self.num_col = None  # number of columns, set on first row seen

    def update(self, data):
        """Accumulate column statistics from one batch of rows."""
        # TODO: update this to treat data as a np.ndarray containing all the
        # data. Currently, it is a numpy array containing one batch of data.
        super().update(data)
        # The super class set self.dtype to "string" based on the input numpy
        # array. However, the preprocessor will encode it to float32. So, set
        # self.dtype to "float32", which would be propagated to the Keras Input
        # node.
        self.dtype = "float32"
        if len(self.shape) != 2:
            # Wrong rank: skip counting; check() reports this in finalize().
            return
        # data is a numpy array
        for instance in data:
            self._update_instance(instance)

    def _update_instance(self, x):
        """Update the numeric/categorical counters with one row `x`."""
        if self.num_col is None:
            # First row seen: size the counters to the number of columns.
            self.num_col = len(x)
            self.count_numerical = np.zeros(self.num_col)
            self.count_categorical = np.zeros(self.num_col)
            for _ in range(len(x)):
                self.count_unique_numerical.append({})
        for i in range(self.num_col):
            x_i = x[i]
            if isinstance(x_i, bytes):
                # Decode bytes so float() below can parse the text.
                x_i = x_i.decode("utf-8")
            try:
                tmp_num = float(x_i)
                self.count_numerical[i] += 1
                # Track the number of distinct numeric values per column;
                # used by the cardinality heuristic in infer_column_types().
                if tmp_num not in self.count_unique_numerical[i]:
                    self.count_unique_numerical[i][tmp_num] = 1
                else:
                    self.count_unique_numerical[i][tmp_num] += 1
            except ValueError:
                # Not parseable as a number: counts as categorical evidence.
                self.count_categorical[i] += 1

    def finalize(self):
        """Validate the collected data and infer any missing column types."""
        self.check()
        self.infer_column_types()

    def get_input_name(self):
        # Name used in error messages to identify the input node.
        return "StructuredDataInput"

    def check(self):
        """Validate shape and column_names; default missing column names.

        Raises:
            ValueError: If the data is not rank 2, if column_types is
                given without column_names, or if column_names does not
                match the number of columns.
        """
        if len(self.shape) != 2:
            raise ValueError(
                "Expect the data to {input_name} to have shape "
                "(batch_size, num_features), but "
                "got input shape {shape}.".format(
                    input_name=self.get_input_name(), shape=self.shape
                )
            )

        # Fill in the column_names
        if self.column_names is None:
            if self.column_types:
                raise ValueError(
                    "column_names must be specified, if "
                    "column_types is specified."
                )
            # Default to stringified column indices: "0", "1", ...
            self.column_names = [str(index) for index in range(self.shape[1])]

        # Check if column_names has the correct length.
        if len(self.column_names) != self.shape[1]:
            raise ValueError(
                "Expect column_names to have length {expect} "
                "but got {actual}.".format(
                    expect=self.shape[1], actual=len(self.column_names)
                )
            )

    def infer_column_types(self):
        """Label each column CATEGORICAL or NUMERICAL from the counters.

        A column is CATEGORICAL if any value failed to parse as a float,
        or if fewer than 5% of its numeric values are distinct (low
        cardinality); otherwise it is NUMERICAL. User-specified entries
        in self.column_types take precedence.
        """
        column_types = {}

        for i in range(self.num_col):
            if self.count_categorical[i] > 0:
                column_types[self.column_names[i]] = CATEGORICAL
            elif (
                len(self.count_unique_numerical[i]) / self.count_numerical[i]
                < 0.05
            ):
                column_types[self.column_names[i]] = CATEGORICAL
            else:
                column_types[self.column_names[i]] = NUMERICAL

        # Partial column_types is provided.
        if self.column_types is None:
            self.column_types = {}
        # Only fill in types the user did not specify.
        for key, value in column_types.items():
            if key not in self.column_types:
                self.column_types[key] = value
66 changes: 66 additions & 0 deletions autokeras/analysers/input_analysers_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,78 @@
# limitations under the License.


import copy

import numpy as np
import pandas as pd
import pytest

from autokeras import test_utils
from autokeras.analysers import input_analysers


def test_structured_data_input_less_col_name_error():
    """Fewer column names than data columns raises a ValueError."""
    with pytest.raises(ValueError) as info:
        analyser = input_analysers.StructuredDataAnalyser(
            column_names=list(range(8))
        )
        analyser.update(np.random.rand(20, 10))
        analyser.finalize()

    assert "Expect column_names to have length" in str(info.value)


def test_structured_data_infer_col_types():
    """Inferred column types for the CSV match the known answer."""
    frame = pd.read_csv(test_utils.TRAIN_CSV_PATH)
    frame.pop("survived")

    analyser = input_analysers.StructuredDataAnalyser(
        column_names=test_utils.COLUMN_NAMES,
        column_types=None,
    )
    analyser.update(frame.values.astype(str))
    analyser.finalize()

    assert analyser.column_types == test_utils.COLUMN_TYPES


def test_dont_infer_specified_column_types():
    """User-specified column types survive inference unchanged."""
    column_types = copy.copy(test_utils.COLUMN_TYPES)
    column_types.pop("sex")
    column_types["age"] = "categorical"

    frame = pd.read_csv(test_utils.TRAIN_CSV_PATH)
    frame.pop("survived")

    analyser = input_analysers.StructuredDataAnalyser(
        column_names=test_utils.COLUMN_NAMES,
        column_types=column_types,
    )
    analyser.update(frame.values.astype(str))
    analyser.finalize()

    assert analyser.column_types["age"] == "categorical"


def test_structured_data_input_with_illegal_dim():
    """Rank-3 input is rejected: structured data must be 2D."""
    analyser = input_analysers.StructuredDataAnalyser(
        column_names=test_utils.COLUMN_NAMES,
        column_types=None,
    )

    with pytest.raises(ValueError) as info:
        analyser.update(np.random.rand(100, 32, 32))
        analyser.finalize()

    assert "Expect the data to StructuredDataInput to have shape" in str(
        info.value
    )


def test_image_input_analyser_shape_is_list_of_int():
analyser = input_analysers.ImageAnalyser()
dataset = np.random.rand(100, 32, 32, 3)
Expand Down
6 changes: 2 additions & 4 deletions autokeras/analysers/output_analysers_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

def test_clf_head_one_hot_shape_error():
analyser = output_analysers.ClassificationAnalyser(name="a", num_classes=9)
dataset = test_utils.generate_one_hot_labels(dtype="np", num_classes=10)
dataset = test_utils.generate_one_hot_labels(num_classes=10)

with pytest.raises(ValueError) as info:
analyser.update(dataset)
Expand Down Expand Up @@ -67,9 +67,7 @@ def test_one_class_error():

def test_infer_ten_classes():
analyser = output_analysers.ClassificationAnalyser(name="a")
dataset = test_utils.generate_one_hot_labels(
dtype="dataset", num_classes=10
)
dataset = test_utils.generate_one_hot_labels(num_classes=10)

analyser.update(dataset)
analyser.finalize()
Expand Down
1 change: 1 addition & 0 deletions autokeras/blocks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
from autokeras.blocks.reduction import TemporalReduction
from autokeras.blocks.wrapper import GeneralBlock
from autokeras.blocks.wrapper import ImageBlock
from autokeras.blocks.wrapper import StructuredDataBlock
from autokeras.blocks.wrapper import TextBlock
from autokeras.utils import utils

Expand Down
2 changes: 1 addition & 1 deletion autokeras/blocks/heads_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ def test_clf_head_build_with_zero_dropout_return_tensor():


def test_clf_head_hpps_with_uint8_contain_cast_to_int32():
dataset = test_utils.generate_one_hot_labels(100, 10, "dataset")
dataset = test_utils.generate_one_hot_labels(100, 10)
dataset = dataset.astype("uint8")
head = head_module.ClassificationHead(shape=(8,))
analyser = head.get_analyser()
Expand Down
6 changes: 6 additions & 0 deletions autokeras/blocks/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.

"""Blocks for data preprocessing.

They are built into keras preprocessing layers and will be part of the Keras
model.

"""
from typing import Optional
from typing import Tuple
from typing import Union
Expand Down
Loading
Loading