@@ -128,17 +128,57 @@ def type_check_sequence(element: object) -> str:
 
 
 def type_check_date(element: object) -> str:
+    """
+    Check if element corresponds to a date-like object.
+    """
+    # check if element represents a date (no hour/minute/seconds)
+    is_date = False
+    # check if element represents a datetime (has hour/minute/seconds)
+    is_datetime = False
+    # check if it makes sense to convert element to a unix timestamp by
+    # evaluating whether, once converted, it represents a number that is
+    # compatible with a Unix timestamp (seconds since 1970-01-01T00:00:00).
+    # note that we also check that the number does not exceed the "epochalypse
+    # time", i.e. the point where the unix timestamp grows past 2^31 - 1 seconds.
+    # we do this because timestamps outside this range are likely to be
+    # unreliable, and are hence better treated as everyday numbers.
+    min_dt = pd.to_datetime('1970-01-01 00:00:00', utc=True)
+    max_dt = pd.to_datetime('2038-01-19 03:14:08', utc=True)
+    valid_units = {'ns': 'unix', 'us': 'unix', 'ms': 'unix', 's': 'unix',
+                   'D': 'julian'}
+    for unit, origin in valid_units.items():
+        try:
+            as_dt = pd.to_datetime(element, unit=unit, origin=origin,
+                                   errors='raise', utc=True)
+            if min_dt < as_dt < max_dt:
+                is_datetime = True
+                break
+        except Exception:
+            pass
+    # check if element represents a date-like object.
+    # here we don't check for a validity range as with unix timestamps, because
+    # dates given as strings usually represent something more general than just
+    # a number of seconds since an epoch.
     try:
-        dt = pd.to_datetime(element)
-
-        # Not accurate 100% for a single datetime str, but should work in aggregate
-        if dt.hour == 0 and dt.minute == 0 and dt.second == 0 and len(str(element)) <= 16:
-            return dtype.date
-        else:
-            return dtype.datetime
-
-    except ValueError:
-        return None
+        as_dt = pd.to_datetime(element, errors='raise')
+        is_datetime = True
+    except Exception:
+        pass
+    # finally, if element represents a datetime object, check whether only the
+    # date part is present (no time information)
+    if is_datetime:
+        # round element to the day (drop hour/minute/second)
+        dt_d = as_dt.to_period('D').to_timestamp()
+        # if the rounded datetime equals the datetime itself, there was no
+        # hour/minute/second information to begin with. Mind the 'tz_localize'
+        # call, which prevents time-zone mismatches from kicking in.
+        is_date = dt_d == as_dt.tz_localize(None)
+    if is_date:
+        return dtype.date
+    if is_datetime:
+        return dtype.datetime
+
+    return None
 
 
 def count_data_types_in_column(data):
@@ -391,7 +431,7 @@ def infer_types(
     population_size = len(data)
     log.info(f'Analyzing a sample of {sample_size}')
     log.info(
-        f'from a total population of {population_size}, this is equivalent to {round(sample_size * 100 / population_size, 1)}% of your data.')  # noqa
+        f'from a total population of {population_size}, this is equivalent to {round(sample_size * 100 / population_size, 1)}% of your data.')  # noqa
 
     nr_procs = get_nr_procs(df=sample_df)
     pool_size = min(nr_procs, len(sample_df.columns.values))
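
For reference, below is a minimal, standalone sketch of the two checks the new `type_check_date` relies on: the unix-range gate for numeric inputs and the day-rounding test that separates dates from datetimes. The `classify` helper and the `DATE`/`DATETIME` strings are hypothetical stand-ins for illustration only; the actual function returns the repository's `dtype.date`/`dtype.datetime` constants and tries a wider set of units.

```python
import pandas as pd

# Hypothetical stand-ins for the repository's dtype constants (illustration only).
DATE, DATETIME = 'date', 'datetime'


def classify(element):
    """Rough mirror of the type_check_date logic: unix-range gate, then day-rounding."""
    min_dt = pd.to_datetime('1970-01-01 00:00:00', utc=True)
    max_dt = pd.to_datetime('2038-01-19 03:14:08', utc=True)  # "epochalypse" bound

    as_dt = None
    # 1) numbers only count as timestamps if they land inside the unix range
    for unit, origin in {'s': 'unix', 'D': 'julian'}.items():
        try:
            candidate = pd.to_datetime(element, unit=unit, origin=origin,
                                       errors='raise', utc=True)
            if min_dt < candidate < max_dt:
                as_dt = candidate
                break
        except Exception:
            pass
    # 2) otherwise fall back to a plain parse (e.g. '2021-03-01', '2021-03-01 13:45')
    if as_dt is None:
        try:
            as_dt = pd.to_datetime(element, errors='raise')
        except Exception:
            return None
    # 3) it is a date (not a datetime) when rounding to the day changes nothing
    rounded = as_dt.to_period('D').to_timestamp()
    return DATE if rounded == as_dt.tz_localize(None) else DATETIME


print(classify('2021-03-01'))           # 'date'
print(classify('2021-03-01 13:45:00'))  # 'datetime'
print(classify(1_600_000_000))          # 'datetime' (plausible unix timestamp in seconds)
print(classify('not a date'))           # None
```

As a sanity check on the chosen upper bound: `pd.Timestamp('2038-01-19 03:14:08', tz='UTC')` is exactly 2^31 seconds after the epoch, i.e. one second past the largest signed 32-bit timestamp, which is what the "epochalypse" comment above refers to.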