
Commit e60cbe2

Merge pull request #49 from mindsdb/staging

Release 0.0.14

2 parents: 4f471a0 + 5c643da

8 files changed: 121 additions, 117 deletions


.flake8

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 [flake8]
 max-line-length = 120
-ignore = E275,E402,F821,W503,W504,C408,W391
+ignore = E275,E402,F821,W503,W504,C408,W391,E721
 exclude = .git,__pycache__,docs,docssrc
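
Note: E721 is pycodestyle's "do not compare types" check, newly added to the ignore list above. For context, a minimal illustrative sketch (not from this repo) of the pattern it flags:

def same_kind(x, y):
    # flake8 flags direct type equality comparisons like this with E721
    return type(x) == type(y)  # noqa: E721

def same_kind_preferred(x, y):
    # the style E721 prefers: identity checks (or isinstance() for subclasses)
    return type(x) is type(y)

print(same_kind(1, 2), same_kind_preferred(1.0, 'a'))  # True False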

.github/workflows/python-package.yml

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@ jobs:
     strategy:
       matrix:
         os: [ubuntu-latest]
-        python-version: [3.7,3.8,3.9]
+        python-version: ["3.7","3.8","3.9","3.10","3.11"]
     steps:
       - uses: actions/checkout@v2
       - name: Set up Python ${{ matrix.python-version }}
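
Note: the quoting matters here. YAML resolves a bare 3.10 as the float 3.1, which is why the new entries are strings. A quick demonstration, assuming PyYAML is installed:

import yaml

# Unquoted scalars resolve as floats, so 3.10 silently collapses to 3.1.
print(yaml.safe_load('versions: [3.7, 3.8, 3.9, 3.10]'))
# {'versions': [3.7, 3.8, 3.9, 3.1]}
print(yaml.safe_load('versions: ["3.10", "3.11"]'))
# {'versions': ['3.10', '3.11']}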

poetry.lock

Lines changed: 71 additions & 103 deletions
Some generated files are not rendered by default.

pyproject.toml

Lines changed: 2 additions & 2 deletions
@@ -1,14 +1,14 @@
 [tool.poetry]
 name = "type_infer"
-version = "0.0.13"
+version = "0.0.14"
 description = "Automated type inference for Machine Learning pipelines."
 authors = ["MindsDB Inc. <[email protected]>"]
 license = "GPL-3.0"
 readme = "README.md"
 packages = [{include = "type_infer"}]

 [tool.poetry.dependencies]
-python = ">=3.7,<3.10"
+python = ">=3.7,<3.12"
 python-dateutil = "^2.1"
 scipy = "^1"
 numpy = "^1.15"
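
Note: the loosened interpreter constraint is what admits the new 3.10 and 3.11 CI targets above. A small sketch of how the specifier behaves, using the third-party packaging library purely for illustration:

from packaging.specifiers import SpecifierSet

spec = SpecifierSet(">=3.7,<3.12")
for v in ('3.9', '3.11', '3.12'):
    # 3.12 falls outside the exclusive upper bound and is rejected
    print(v, v in spec)  # 3.9 True / 3.11 True / 3.12 False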

tests/integration_tests/test_type_infer.py

Lines changed: 32 additions & 0 deletions
@@ -1,6 +1,10 @@
+from uuid import uuid4
 import unittest
+import numpy as np
 import pandas as pd
+from datetime import datetime, timedelta

+from type_infer.dtype import dtype
 from type_infer.infer import infer_types


@@ -71,3 +75,31 @@ def test_1_stack_overflow_survey(self):

         for col in expected_ids:
             self.assertTrue(expected_ids[col], inferred_types.identifiers[col])
+
+    def test_2_simple(self):
+        n_points = 50
+        n_corrupted = 2
+        df = pd.DataFrame({
+            'date': [(datetime.now() - timedelta(days=i)).strftime('%Y-%m-%d') for i in range(n_points)],
+            'datetime': [(datetime.now() - timedelta(days=i)).strftime('%Y-%m-%dT%H:%M') for i in range(n_points)],
+            'integer': [*range(n_points)],
+            'float': np.linspace(0, n_points, n_points),
+            'uuid': [str(uuid4()) for i in range(n_points)],
+        })
+
+        # manual tinkering
+        df['float'].iloc[-n_corrupted:] = 'random string'
+
+        inferred_types = infer_types(df, pct_invalid=100 * (n_corrupted) / n_points)
+        expected_types = {
+            'date': dtype.date,
+            'datetime': dtype.datetime,
+            'integer': dtype.integer,
+            'float': dtype.float,
+            'uuid': dtype.categorical,
+        }
+        self.assertEqual(expected_types, inferred_types.dtypes)  # check type inference is correct
+        self.assertTrue(inferred_types.additional_info['date']['dtype_dist']['date'] == n_points)  # no dropped rows (pct_invalid is 0)  # noqa
+        self.assertTrue(inferred_types.additional_info['float']['dtype_dist']['float'] == n_points - 2)  # due to str injection  # noqa
+        self.assertTrue('uuid' in inferred_types.identifiers)
+        self.assertTrue(inferred_types.identifiers['uuid'] == 'UUID')
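
Note: pct_invalid=100 * n_corrupted / n_points works out to 4.0 here, i.e. up to 4% of a column's values may fail to parse, which is exactly what tolerates the two injected strings in 'float'. A minimal usage sketch of the API the test exercises (the frame and printed values are illustrative, not from the repo):

import pandas as pd
from type_infer.infer import infer_types

df = pd.DataFrame({'age': [21, 35, 44], 'city': ['NY', 'SF', 'NY']})
info = infer_types(df, pct_invalid=0)
print(info.dtypes)       # per-column inferred dtypes
print(info.identifiers)  # columns flagged as identifiers (e.g. 'UUID')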

type_infer/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@
 from type_infer import helpers


-__version__ = '0.0.13'
+__version__ = '0.0.14'


 __all__ = ['base', 'dtype', 'infer', 'helpers', '__version__']

type_infer/helpers.py

Lines changed: 3 additions & 2 deletions
@@ -94,7 +94,7 @@ def get_identifier_description(data: Iterable, column_name: str, data_dtype: dty
     unique_pct = nr_unique / len(data)

     spaces = [len(str(x).split(' ')) - 1 for x in data]
-    mean_spaces = np.mean(spaces)
+    mean_spaces = np.mean(spaces) if len(spaces) > 0 else 0.0

     # Detect hash
     all_same_length = all(len(str(data[0])) == len(str(x)) for x in data)
@@ -113,7 +113,8 @@ def get_identifier_description(data: Iterable, column_name: str, data_dtype: dty
         else:
             randomness_per_index.append(S / np.log(N))

-    if np.mean(randomness_per_index) > 0.95:
+    mean_randomness = np.mean(randomness_per_index) if len(randomness_per_index) > 0 else 0
+    if mean_randomness > 0.95:
         return 'Hash-like identifier'

     # Detect foreign key
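
Note: both changes in this file guard the same failure mode: np.mean([]) emits a RuntimeWarning and returns nan, so an empty input would previously have polluted logs and produced nan comparisons. A minimal sketch of the guard pattern:

import numpy as np

spaces = []  # e.g. a column that is empty after preprocessing
# Without the guard, np.mean(spaces) would warn and yield nan; the commit
# substitutes a neutral 0.0 so downstream comparisons stay well-defined.
mean_spaces = np.mean(spaces) if len(spaces) > 0 else 0.0
print(mean_spaces)  # 0.0 rather than nan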

type_infer/infer.py

Lines changed: 10 additions & 7 deletions
@@ -233,10 +233,12 @@ def get_column_data_type(data: Union[np.ndarray, list], full_data: pd.DataFrame,
     if all(isinstance(x, str) for x in data):
         can_be_tags = True

+    mean_lenghts = np.mean(lengths) if len(lengths) > 0 else 0
+
     # If more than 30% of the samples contain more than 1 category and there's more than 6 and less than 30 of them and they are shared between the various cells  # noqa
-    if (can_be_tags and np.mean(lengths) > 1.3 and
+    if (can_be_tags and mean_lenghts > 1.3 and
             6 <= len(unique_tokens) <= 30 and
-            len(unique_tokens) / np.mean(lengths) < (len(data) / 4)):
+            len(unique_tokens) / mean_lenghts < (len(data) / 4)):
         curr_dtype = dtype.tags

     # Categorical based on unique values
@@ -392,9 +394,10 @@ def infer_types(
              f'from a total population of {population_size}, this is equivalent to {round(sample_size*100/population_size, 1)}% of your data.')  # noqa

     nr_procs = get_nr_procs(df=sample_df)
-    if data.size > mp_cutoff and nr_procs > 1:
-        log.info(f'Using {nr_procs} processes to deduct types.')
-        pool = mp.Pool(processes=nr_procs)
+    pool_size = min(nr_procs, len(sample_df.columns.values))
+    if data.size > mp_cutoff and pool_size > 1:
+        log.info(f'Using {pool_size} processes to deduct types.')
+        pool = mp.Pool(processes=pool_size)
         # column-wise parallelization  # TODO: evaluate switching to row-wise split instead
         answer_arr = pool.starmap(get_column_data_type, [
             (sample_df[x].dropna(), data[x], x, pct_invalid) for x in sample_df.columns.values
@@ -422,8 +425,8 @@ def infer_types(
             'dtype_dist': data_dtype_dist
         }

-    if data.size > mp_cutoff and nr_procs > 1:
-        pool = mp.Pool(processes=nr_procs)
+    if data.size > mp_cutoff and pool_size > 1:
+        pool = mp.Pool(processes=pool_size)
         answer_arr = pool.map(get_identifier_description_mp, [
             (data[x], x, type_information.dtypes[x])
             for x in sample_df.columns
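
Note: since the parallelization is column-wise, a pool larger than the column count would only spawn idle workers; min(nr_procs, n_columns) caps it. A self-contained sketch of the idea (infer_column is a hypothetical stand-in for get_column_data_type):

import multiprocessing as mp

def infer_column(name):
    # hypothetical stand-in: the real code runs type inference per column
    return name, len(name)

if __name__ == '__main__':
    columns = ['date', 'float', 'uuid']
    # Cap the pool at the number of columns so no worker sits idle.
    pool_size = min(mp.cpu_count(), len(columns))
    with mp.Pool(processes=pool_size) as pool:
        print(pool.map(infer_column, columns))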

0 commit comments