Skip to content

Commit dc95382

Browse files
authored
Merge pull request #56 from mindsdb/staging
Release 0.0.16
2 parents 67f5341 + 2a2ff0c commit dc95382

File tree

9 files changed

+110
-59
lines changed

9 files changed

+110
-59
lines changed

.github/workflows/add_to_bugs_project.yml

Lines changed: 0 additions & 19 deletions
This file was deleted.

.github/workflows/add_to_docs_project.yml

Lines changed: 0 additions & 19 deletions
This file was deleted.
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
name: Add Pull Requests to PR review project
2+
3+
on:
4+
pull_request:
5+
types:
6+
- opened
7+
8+
jobs:
9+
add-to-project:
10+
name: Add pull request to project
11+
runs-on: ubuntu-latest
12+
steps:
13+
- uses: actions/[email protected]
14+
with:
15+
project-url: https://github.com/orgs/mindsdb/projects/65
16+
github-token: ${{ secrets.ADD_TO_PROJECT_PAT }}
Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,14 @@
11
name: Add issue to roadmap project
2-
32
on:
43
issues:
54
types:
65
- opened
7-
86
jobs:
97
add-to-project:
108
name: Add issue to roadmap project
119
runs-on: ubuntu-latest
1210
steps:
1311
- uses: actions/[email protected]
1412
with:
15-
# You can target a repository in a different organization
16-
# to the issue
17-
project-url: https://github.com/orgs/mindsdb/projects/54
18-
github-token: ${{ secrets.ADD_TO_PROJECT_PAT }}
19-
labeled: enhancement
13+
project-url: https://github.com/orgs/mindsdb/projects/53
14+
github-token: ${{ secrets.ADD_TO_PROJECT_PAT }}

.github/workflows/docs.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ jobs:
2222
run: |
2323
sudo apt install pandoc
2424
python -m pip install --upgrade pip
25-
pip install install 'Sphinx==4.1.2' 'sphinx-autoapi==1.8.4' 'sphinx-autodoc-typehints==1.12.0' 'sphinx-code-include==1.1.1' 'sphinx-rtd-theme==0.5.2' 'sphinxcontrib-applehelp==1.0.2' 'sphinxcontrib-devhelp==1.0.2' 'sphinxcontrib-htmlhelp==2.0.0' 'sphinxcontrib-jsmath==1.0.1' 'sphinxcontrib-napoleon==0.7' 'sphinxcontrib-qthelp==1.0.3' 'sphinxcontrib-serializinghtml==1.1.5' autoapi nbsphinx myst_parser pandoc jupyter matplotlib imblearn fsspec
25+
pip install 'Sphinx==6.2.1' 'sphinx-autoapi==3.0.0' 'sphinx-autodoc-typehints' 'sphinx-code-include' 'sphinx-rtd-theme' 'sphinxcontrib-applehelp' 'sphinxcontrib-devhelp' 'sphinxcontrib-htmlhelp' 'sphinxcontrib-jsmath' 'sphinxcontrib-napoleon' 'sphinxcontrib-qthelp' 'sphinxcontrib-serializinghtml' autoapi nbsphinx myst_parser pandoc jupyter matplotlib imblearn fsspec
2626
pip install --no-cache-dir -e .
2727
- name: Make the docs
2828
run: |

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "type_infer"
3-
version = "0.0.15"
3+
version = "0.0.16"
44
description = "Automated type inference for Machine Learning pipelines."
55
authors = ["MindsDB Inc. <[email protected]>"]
66
license = "GPL-3.0"

tests/unit_tests/test_dates.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,5 +5,43 @@
55

66

77
class TestDates(unittest.TestCase):
8+
89
def test_0_type_check_dates(self):
10+
""" Checks parsing of string containing a date to dtype 'date'.
11+
"""
912
self.assertEqual(type_check_date('31/12/2010'), dtype.date)
13+
14+
def test_1_type_check_datetime(self):
15+
""" Checks parsing of string containing a date to dtype 'datetime'.
16+
"""
17+
self.assertEqual(type_check_date('31/12/2010 23:15:41'), dtype.datetime)
18+
19+
def test_2_type_check_timestamp_unix_seconds(self):
20+
""" Checks parsing a number containing 1989-12-15T07:30:00 (as seconds
21+
since Unix epoch) to dtype 'datetime'.
22+
"""
23+
self.assertEqual(type_check_date(629721000.0), dtype.datetime)
24+
25+
def test_3_type_check_timestamp_unix_miliseconds(self):
26+
""" Checks parsing a number containing 1989-12-15T07:30:00 (as milliseconds
27+
since Unix epoch) to dtype 'datetime'.
28+
"""
29+
self.assertEqual(type_check_date(629721000000.0), dtype.datetime)
30+
31+
def test_4_type_check_timestamp_unix_microseconds(self):
32+
""" Checks parsing a number containing 1989-12-15T07:30:00 (as microseconds
33+
since Unix epoch) to dtype 'datetime'.
34+
"""
35+
self.assertEqual(type_check_date(629721000000000.0), dtype.datetime)
36+
37+
def test_5_type_check_timestamp_unix_nanoseconds(self):
38+
""" Checks parsing a number containing 1989-12-15T07:30:00 (as nanoseconds
39+
since Unix epoch) to dtype 'datetime'.
40+
"""
41+
self.assertEqual(type_check_date(629721000000000000.0), dtype.datetime)
42+
43+
def test_6_type_check_timestamp_julian_days(self):
44+
""" Checks parsing a number containing 1989-12-15T07:30:00 (as days since
45+
the Julian day-count epoch) to dtype 'datetime'.
46+
"""
47+
self.assertEqual(type_check_date(2447875.81250), dtype.datetime)

type_infer/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from type_infer import helpers
55

66

7-
__version__ = '0.0.15'
7+
__version__ = '0.0.16'
88

99

1010
__all__ = ['base', 'dtype', 'infer', 'helpers', '__version__']

type_infer/infer.py

Lines changed: 51 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -128,17 +128,57 @@ def type_check_sequence(element: object) -> str:
128128

129129

130130
def type_check_date(element: object) -> str:
131+
"""
132+
Check if element corresponds to a date-like object.
133+
"""
134+
# check if element represents a date (no hour/minute/seconds)
135+
is_date = False
136+
# check if element represents a datetime (has hour/minute/seconds)
137+
is_datetime = False
138+
# check if it makes sense to convert element to unix time-stamp by
139+
# evaluating if, when converted, the element represents a number that
140+
# is compatible with a Unix timestamp (number of seconds since 1970-01-01T00:00:00)
141+
# note that we also check the number is not larger than the "epochalypse time",
142+
# which is when the unix timestamp becomes larger than 2^31 - 1 seconds. We do
143+
# this because timestamps outside this range are likely to be unreliable and hence
144+
# rather treated as every-day numbers.
145+
min_dt = pd.to_datetime('1970-01-01 00:00:00', utc=True)
146+
max_dt = pd.to_datetime('2038-01-19 03:14:08', utc=True)
147+
valid_units = {'ns': 'unix', 'us': 'unix', 'ms': 'unix', 's': 'unix',
148+
'D': 'julian'}
149+
for unit, origin in valid_units.items():
150+
try:
151+
as_dt = pd.to_datetime(element, unit=unit, origin=origin,
152+
errors='raise')
153+
if min_dt < as_dt < max_dt:
154+
is_datetime = True
155+
break
156+
except Exception:
157+
pass
158+
# check if element represents a date-like object.
159+
# here we don't check for a validity range like with unix-timestamps
160+
# because dates as string usually represent something more general than
161+
# just the number of seconds since an epoch.
131162
try:
132-
dt = pd.to_datetime(element)
133-
134-
# Not accurate 100% for a single datetime str, but should work in aggregate
135-
if dt.hour == 0 and dt.minute == 0 and dt.second == 0 and len(str(element)) <= 16:
136-
return dtype.date
137-
else:
138-
return dtype.datetime
139-
140-
except ValueError:
141-
return None
163+
as_dt = pd.to_datetime(element, errors='raise')
164+
is_datetime = True
165+
except Exception:
166+
pass
167+
# finally, if element represents a datetime object, check if only the
168+
# date part is contained (no time information)
169+
if is_datetime:
170+
# round element day (drop hour/minute/second)
171+
dt_d = as_dt.to_period('D').to_timestamp()
172+
# if the rounded datetime equals the datetime itself, it means there was no
173+
# hour/minute/second information to begin with. Mind the 'localize' to
174+
# avoid time-zone offsets from interfering with the comparison.
175+
is_date = dt_d == as_dt.tz_localize(None)
176+
if is_date:
177+
return dtype.date
178+
if is_datetime:
179+
return dtype.datetime
180+
181+
return None
142182

143183

144184
def count_data_types_in_column(data):
@@ -391,7 +431,7 @@ def infer_types(
391431
population_size = len(data)
392432
log.info(f'Analyzing a sample of {sample_size}')
393433
log.info(
394-
f'from a total population of {population_size}, this is equivalent to {round(sample_size*100/population_size, 1)}% of your data.') # noqa
434+
f'from a total population of {population_size}, this is equivalent to {round(sample_size*100/population_size, 1)}% of your data.') # noqa
395435

396436
nr_procs = get_nr_procs(df=sample_df)
397437
pool_size = min(nr_procs, len(sample_df.columns.values))

0 commit comments

Comments
 (0)