
Commit e60cbe2

Merge pull request #49 from mindsdb/staging

Release 0.0.14

2 parents: 4f471a0 + 5c643da

8 files changed: 121 additions, 117 deletions


.flake8

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 [flake8]
 max-line-length = 120
-ignore = E275,E402,F821,W503,W504,C408,W391
+ignore = E275,E402,F821,W503,W504,C408,W391,E721
 exclude = .git,__pycache__,docs,docssrc
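
Note: E721 is pycodestyle's "do not compare types" check, newly added to the ignore list above. For context, a minimal illustrative sketch (not from this repo) of the pattern it flags:

def same_kind(x, y):
    # flake8 flags direct type equality comparisons like this with E721
    return type(x) == type(y)  # noqa: E721

def same_kind_preferred(x, y):
    # the style E721 prefers: identity checks (or isinstance() for subclasses)
    return type(x) is type(y)

print(same_kind(1, 2), same_kind_preferred(1.0, 'a'))  # True False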

.github/workflows/python-package.yml

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@ jobs:
     strategy:
       matrix:
         os: [ubuntu-latest]
-        python-version: [3.7,3.8,3.9]
+        python-version: ["3.7","3.8","3.9","3.10","3.11"]
     steps:
       - uses: actions/checkout@v2
       - name: Set up Python ${{ matrix.python-version }}
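
Note: the quoting matters here. YAML resolves a bare 3.10 as the float 3.1, which is why the new entries are strings. A quick demonstration, assuming PyYAML is installed:

import yaml

# Unquoted scalars resolve as floats, so 3.10 silently collapses to 3.1.
print(yaml.safe_load('versions: [3.7, 3.8, 3.9, 3.10]'))
# {'versions': [3.7, 3.8, 3.9, 3.1]}
print(yaml.safe_load('versions: ["3.10", "3.11"]'))
# {'versions': ['3.10', '3.11']}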

poetry.lock

Lines changed: 71 additions & 103 deletions
Some generated files are not rendered by default.

pyproject.toml

Lines changed: 2 additions & 2 deletions
@@ -1,14 +1,14 @@
 [tool.poetry]
 name = "type_infer"
-version = "0.0.13"
+version = "0.0.14"
 description = "Automated type inference for Machine Learning pipelines."
 authors = ["MindsDB Inc. <[email protected]>"]
 license = "GPL-3.0"
 readme = "README.md"
 packages = [{include = "type_infer"}]

 [tool.poetry.dependencies]
-python = ">=3.7,<3.10"
+python = ">=3.7,<3.12"
 python-dateutil = "^2.1"
 scipy = "^1"
 numpy = "^1.15"
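
Note: the loosened interpreter constraint is what admits the new 3.10 and 3.11 CI targets above. A small sketch of how the specifier behaves, using the third-party packaging library purely for illustration:

from packaging.specifiers import SpecifierSet

spec = SpecifierSet(">=3.7,<3.12")
for v in ('3.9', '3.11', '3.12'):
    # 3.12 falls outside the exclusive upper bound and is rejected
    print(v, v in spec)  # 3.9 True / 3.11 True / 3.12 False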

tests/integration_tests/test_type_infer.py

Lines changed: 32 additions & 0 deletions
@@ -1,6 +1,10 @@
+from uuid import uuid4
 import unittest
+import numpy as np
 import pandas as pd
+from datetime import datetime, timedelta

+from type_infer.dtype import dtype
 from type_infer.infer import infer_types


@@ -71,3 +75,31 @@ def test_1_stack_overflow_survey(self):

         for col in expected_ids:
             self.assertTrue(expected_ids[col], inferred_types.identifiers[col])
+
+    def test_2_simple(self):
+        n_points = 50
+        n_corrupted = 2
+        df = pd.DataFrame({
+            'date': [(datetime.now() - timedelta(days=i)).strftime('%Y-%m-%d') for i in range(n_points)],
+            'datetime': [(datetime.now() - timedelta(days=i)).strftime('%Y-%m-%dT%H:%M') for i in range(n_points)],
+            'integer': [*range(n_points)],
+            'float': np.linspace(0, n_points, n_points),
+            'uuid': [str(uuid4()) for i in range(n_points)],
+        })
+
+        # manual tinkering
+        df['float'].iloc[-n_corrupted:] = 'random string'
+
+        inferred_types = infer_types(df, pct_invalid=100 * (n_corrupted) / n_points)
+        expected_types = {
+            'date': dtype.date,
+            'datetime': dtype.datetime,
+            'integer': dtype.integer,
+            'float': dtype.float,
+            'uuid': dtype.categorical,
+        }
+        self.assertEqual(expected_types, inferred_types.dtypes)  # check type inference is correct
+        self.assertTrue(inferred_types.additional_info['date']['dtype_dist']['date'] == n_points)  # no dropped rows (pct_invalid is 0)  # noqa
+        self.assertTrue(inferred_types.additional_info['float']['dtype_dist']['float'] == n_points - 2)  # due to str injection  # noqa
+        self.assertTrue('uuid' in inferred_types.identifiers)
+        self.assertTrue(inferred_types.identifiers['uuid'] == 'UUID')
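
Note: pct_invalid=100 * n_corrupted / n_points works out to 4.0 here, i.e. up to 4% of a column's values may fail to parse, which is exactly what tolerates the two injected strings in 'float'. A minimal usage sketch of the API the test exercises (the frame and printed values are illustrative, not from the repo):

import pandas as pd
from type_infer.infer import infer_types

df = pd.DataFrame({'age': [21, 35, 44], 'city': ['NY', 'SF', 'NY']})
info = infer_types(df, pct_invalid=0)
print(info.dtypes)       # per-column inferred dtypes
print(info.identifiers)  # columns flagged as identifiers (e.g. 'UUID')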

type_infer/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@
 from type_infer import helpers


-__version__ = '0.0.13'
+__version__ = '0.0.14'


 __all__ = ['base', 'dtype', 'infer', 'helpers', '__version__']

type_infer/helpers.py

Lines changed: 3 additions & 2 deletions
@@ -94,7 +94,7 @@ def get_identifier_description(data: Iterable, column_name: str, data_dtype: dty
     unique_pct = nr_unique / len(data)

     spaces = [len(str(x).split(' ')) - 1 for x in data]
-    mean_spaces = np.mean(spaces)
+    mean_spaces = np.mean(spaces) if len(spaces) > 0 else 0.0

     # Detect hash
     all_same_length = all(len(str(data[0])) == len(str(x)) for x in data)
@@ -113,7 +113,8 @@ def get_identifier_description(data: Iterable, column_name: str, data_dtype: dty
         else:
             randomness_per_index.append(S / np.log(N))

-    if np.mean(randomness_per_index) > 0.95:
+    mean_randomness = np.mean(randomness_per_index) if len(randomness_per_index) > 0 else 0
+    if mean_randomness > 0.95:
         return 'Hash-like identifier'

     # Detect foreign key
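
Note: both changes in this file guard the same failure mode: np.mean([]) emits a RuntimeWarning and returns nan, so an empty input would previously have polluted logs and produced nan comparisons. A minimal sketch of the guard pattern:

import numpy as np

spaces = []  # e.g. a column that is empty after preprocessing
# Without the guard, np.mean(spaces) would warn and yield nan; the commit
# substitutes a neutral 0.0 so downstream comparisons stay well-defined.
mean_spaces = np.mean(spaces) if len(spaces) > 0 else 0.0
print(mean_spaces)  # 0.0 rather than nan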

type_infer/infer.py

Lines changed: 10 additions & 7 deletions
@@ -233,10 +233,12 @@ def get_column_data_type(data: Union[np.ndarray, list], full_data: pd.DataFrame,
     if all(isinstance(x, str) for x in data):
         can_be_tags = True

+    mean_lenghts = np.mean(lengths) if len(lengths) > 0 else 0
+
     # If more than 30% of the samples contain more than 1 category and there's more than 6 and less than 30 of them and they are shared between the various cells  # noqa
-    if (can_be_tags and np.mean(lengths) > 1.3 and
+    if (can_be_tags and mean_lenghts > 1.3 and
             6 <= len(unique_tokens) <= 30 and
-            len(unique_tokens) / np.mean(lengths) < (len(data) / 4)):
+            len(unique_tokens) / mean_lenghts < (len(data) / 4)):
         curr_dtype = dtype.tags

     # Categorical based on unique values
@@ -392,9 +394,10 @@ def infer_types(
              f'from a total population of {population_size}, this is equivalent to {round(sample_size*100/population_size, 1)}% of your data.')  # noqa

     nr_procs = get_nr_procs(df=sample_df)
-    if data.size > mp_cutoff and nr_procs > 1:
-        log.info(f'Using {nr_procs} processes to deduct types.')
-        pool = mp.Pool(processes=nr_procs)
+    pool_size = min(nr_procs, len(sample_df.columns.values))
+    if data.size > mp_cutoff and pool_size > 1:
+        log.info(f'Using {pool_size} processes to deduct types.')
+        pool = mp.Pool(processes=pool_size)
         # column-wise parallelization  # TODO: evaluate switching to row-wise split instead
         answer_arr = pool.starmap(get_column_data_type, [
             (sample_df[x].dropna(), data[x], x, pct_invalid) for x in sample_df.columns.values
@@ -422,8 +425,8 @@ def infer_types(
             'dtype_dist': data_dtype_dist
         }

-    if data.size > mp_cutoff and nr_procs > 1:
-        pool = mp.Pool(processes=nr_procs)
+    if data.size > mp_cutoff and pool_size > 1:
+        pool = mp.Pool(processes=pool_size)
         answer_arr = pool.map(get_identifier_description_mp, [
             (data[x], x, type_information.dtypes[x])
             for x in sample_df.columns
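
Note: since the parallelization is column-wise, a pool larger than the column count would only spawn idle workers; min(nr_procs, n_columns) caps it. A self-contained sketch of the idea (infer_column is a hypothetical stand-in for get_column_data_type):

import multiprocessing as mp

def infer_column(name):
    # hypothetical stand-in: the real code runs type inference per column
    return name, len(name)

if __name__ == '__main__':
    columns = ['date', 'float', 'uuid']
    # Cap the pool at the number of columns so no worker sits idle.
    pool_size = min(mp.cpu_count(), len(columns))
    with mp.Pool(processes=pool_size) as pool:
        print(pool.map(infer_column, columns))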

0 commit comments