@@ -34,9 +34,11 @@ use std::str::FromStr;
3434use std:: sync:: { Arc , LazyLock } ;
3535
3636macro_rules! cast_utf8_to_timestamp {
37- ( $array: expr, $eval_mode: expr, $array_type: ty, $cast_method: ident, $tz: expr) => { {
37+ // $tz is a Timezone:Tz object and contains the session timezone.
38+ // $to_tz_str is a string containing the to_type timezone
39+ ( $array: expr, $eval_mode: expr, $array_type: ty, $cast_method: ident, $tz: expr, $to_tz_str: expr) => { {
3840 let len = $array. len( ) ;
39- let mut cast_array = PrimitiveArray :: <$array_type>:: builder( len) . with_timezone( "UTC" ) ;
41+ let mut cast_array = PrimitiveArray :: <$array_type>:: builder( len) . with_timezone( $to_tz_str ) ;
4042 let mut cast_err: Option <SparkError > = None ;
4143 for i in 0 ..len {
4244 if $array. is_null( i) {
@@ -675,16 +677,21 @@ pub(crate) fn cast_string_to_timestamp(
675677 . downcast_ref :: < GenericStringArray < i32 > > ( )
676678 . expect ( "Expected a string array" ) ;
677679
678- let tz = & timezone:: Tz :: from_str ( timezone_str) . unwrap ( ) ;
680+ let tz = & timezone:: Tz :: from_str ( timezone_str)
681+ . map_err ( |_| SparkError :: Internal ( format ! ( "Invalid timezone string: {timezone_str}" ) ) ) ?;
679682
680683 let cast_array: ArrayRef = match to_type {
681- DataType :: Timestamp ( _, _) => cast_utf8_to_timestamp ! (
682- string_array,
683- eval_mode,
684- TimestampMicrosecondType ,
685- timestamp_parser,
686- tz
687- ) ?,
684+ DataType :: Timestamp ( _, tz_opt) => {
685+ let to_tz = tz_opt. as_deref ( ) . unwrap_or ( "UTC" ) ;
686+ cast_utf8_to_timestamp ! (
687+ string_array,
688+ eval_mode,
689+ TimestampMicrosecondType ,
690+ timestamp_parser,
691+ tz,
692+ to_tz
693+ ) ?
694+ }
688695 _ => unreachable ! ( "Invalid data type {:?} in cast from string" , to_type) ,
689696 } ;
690697 Ok ( cast_array)
@@ -967,8 +974,14 @@ fn get_timestamp_values<T: TimeZone>(
967974 timestamp_type : & str ,
968975 tz : & T ,
969976) -> SparkResult < Option < i64 > > {
970- let values: Vec < _ > = value. split ( [ 'T' , '-' , ':' , '.' ] ) . collect ( ) ;
971- let year = values[ 0 ] . parse :: < i32 > ( ) . unwrap_or_default ( ) ;
977+ // Handle negative year: strip leading '-' and remember the sign.
978+ let ( sign, date_part) = if let Some ( stripped) = value. strip_prefix ( '-' ) {
979+ ( -1i32 , stripped)
980+ } else {
981+ ( 1i32 , value)
982+ } ;
983+ let values: Vec < _ > = date_part. split ( [ 'T' , ' ' , '-' , ':' , '.' ] ) . collect ( ) ;
984+ let year = sign * values[ 0 ] . parse :: < i32 > ( ) . unwrap_or_default ( ) ;
972985
973986 // NaiveDate (used internally by chrono's with_ymd_and_hms) is bounded to ±262142.
974987 if !( -262143 ..=262142 ) . contains ( & year) {
@@ -1041,28 +1054,19 @@ fn parse_timestamp_to_micros<T: TimeZone>(
10411054 timestamp_info. second ,
10421055 ) ;
10431056
1044- // Check if datetime is not None
1045- let tz_datetime = match datetime. single ( ) {
1057+ // Spark uses the offset before daylight savings change so we need to use earliest()
1058+ // Return None for LocalResult::None which is the invalid time in a DST spring forward gap).
1059+ let tz_datetime = match datetime. earliest ( ) {
10461060 Some ( dt) => dt
10471061 . with_timezone ( tz)
10481062 . with_nanosecond ( timestamp_info. microsecond * 1000 ) ,
1049- None => {
1050- return Err ( SparkError :: Internal (
1051- "Failed to parse timestamp" . to_string ( ) ,
1052- ) ) ;
1053- }
1054- } ;
1055-
1056- let result = match tz_datetime {
1057- Some ( dt) => dt. timestamp_micros ( ) ,
1058- None => {
1059- return Err ( SparkError :: Internal (
1060- "Failed to parse timestamp" . to_string ( ) ,
1061- ) ) ;
1062- }
1063+ None => return Ok ( None ) ,
10631064 } ;
10641065
1065- Ok ( Some ( result) )
1066+ match tz_datetime {
1067+ Some ( dt) => Ok ( Some ( dt. timestamp_micros ( ) ) ) ,
1068+ None => Ok ( None ) ,
1069+ }
10661070}
10671071
10681072fn parse_str_to_year_timestamp < T : TimeZone > ( value : & str , tz : & T ) -> SparkResult < Option < i64 > > {
@@ -1096,21 +1100,6 @@ fn parse_str_to_microsecond_timestamp<T: TimeZone>(
10961100 get_timestamp_values ( value, "microsecond" , tz)
10971101}
10981102
1099- type TimestampPattern < T > = ( & ' static Regex , fn ( & str , & T ) -> SparkResult < Option < i64 > > ) ;
1100-
1101- static RE_YEAR : LazyLock < Regex > = LazyLock :: new ( || Regex :: new ( r"^\d{4,7}$" ) . unwrap ( ) ) ;
1102- static RE_MONTH : LazyLock < Regex > = LazyLock :: new ( || Regex :: new ( r"^\d{4,7}-\d{2}$" ) . unwrap ( ) ) ;
1103- static RE_DAY : LazyLock < Regex > = LazyLock :: new ( || Regex :: new ( r"^\d{4,7}-\d{2}-\d{2}$" ) . unwrap ( ) ) ;
1104- static RE_HOUR : LazyLock < Regex > =
1105- LazyLock :: new ( || Regex :: new ( r"^\d{4,7}-\d{2}-\d{2}T\d{1,2}$" ) . unwrap ( ) ) ;
1106- static RE_MINUTE : LazyLock < Regex > =
1107- LazyLock :: new ( || Regex :: new ( r"^\d{4,7}-\d{2}-\d{2}T\d{2}:\d{2}$" ) . unwrap ( ) ) ;
1108- static RE_SECOND : LazyLock < Regex > =
1109- LazyLock :: new ( || Regex :: new ( r"^\d{4,7}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}$" ) . unwrap ( ) ) ;
1110- static RE_MICROSECOND : LazyLock < Regex > =
1111- LazyLock :: new ( || Regex :: new ( r"^\d{4,7}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{1,6}$" ) . unwrap ( ) ) ;
1112- static RE_TIME_ONLY : LazyLock < Regex > = LazyLock :: new ( || Regex :: new ( r"^T\d{1,2}$" ) . unwrap ( ) ) ;
1113-
11141103fn timestamp_parser < T : TimeZone > (
11151104 value : & str ,
11161105 eval_mode : EvalMode ,
@@ -1120,15 +1109,103 @@ fn timestamp_parser<T: TimeZone>(
11201109 if value. is_empty ( ) {
11211110 return Ok ( None ) ;
11221111 }
1123- let patterns: & [ TimestampPattern < T > ] = & [
1124- ( & RE_YEAR , parse_str_to_year_timestamp) ,
1112+
1113+ // Handle Z or ±HH:MM offset suffix: strip it and parse with the explicit fixed offset.
1114+ if let Some ( ( stripped, offset_secs) ) = extract_offset_suffix ( value) {
1115+ let fixed_tz = chrono:: FixedOffset :: east_opt ( offset_secs)
1116+ . ok_or_else ( || SparkError :: Internal ( "Invalid timezone offset" . to_string ( ) ) ) ?;
1117+ return timestamp_parser_with_tz ( stripped, eval_mode, & fixed_tz) ;
1118+ }
1119+
1120+ timestamp_parser_with_tz ( value, eval_mode, tz)
1121+ }
1122+
1123+ /// If `value` ends with a UTC offset suffix (`Z`, `+HH:MM`, or `-HH:MM`), returns the
1124+ /// stripped string and the offset in seconds. Returns `None` if no offset suffix is present.
1125+ fn extract_offset_suffix ( value : & str ) -> Option < ( & str , i32 ) > {
1126+ if let Some ( stripped) = value. strip_suffix ( 'Z' ) {
1127+ return Some ( ( stripped, 0 ) ) ;
1128+ }
1129+ // Check for ±HH:MM at the end (exactly 6 chars: sign + 2 digits + ':' + 2 digits)
1130+ if value. len ( ) >= 6 {
1131+ let suffix_start = value. len ( ) - 6 ;
1132+ let suffix = & value[ suffix_start..] ;
1133+ let sign_byte = suffix. as_bytes ( ) [ 0 ] ;
1134+ if ( sign_byte == b'+' || sign_byte == b'-' ) && suffix. as_bytes ( ) [ 3 ] == b':' {
1135+ if let ( Ok ( h) , Ok ( m) ) = ( suffix[ 1 ..3 ] . parse :: < i32 > ( ) , suffix[ 4 ..6 ] . parse :: < i32 > ( ) ) {
1136+ let sign = if sign_byte == b'+' { 1i32 } else { -1i32 } ;
1137+ return Some ( ( & value[ ..suffix_start] , sign * ( h * 3600 + m * 60 ) ) ) ;
1138+ }
1139+ }
1140+ }
1141+ None
1142+ }
1143+
1144+ type TimestampParsePattern < T > = ( & ' static Regex , fn ( & str , & T ) -> SparkResult < Option < i64 > > ) ;
1145+
1146+ static RE_YEAR : LazyLock < Regex > = LazyLock :: new ( || Regex :: new ( r"^-?\d{4,7}$" ) . unwrap ( ) ) ;
1147+ static RE_MONTH : LazyLock < Regex > = LazyLock :: new ( || Regex :: new ( r"^-?\d{4,7}-\d{2}$" ) . unwrap ( ) ) ;
1148+ static RE_DAY : LazyLock < Regex > = LazyLock :: new ( || Regex :: new ( r"^-?\d{4,7}-\d{2}-\d{2}$" ) . unwrap ( ) ) ;
1149+ static RE_HOUR : LazyLock < Regex > =
1150+ LazyLock :: new ( || Regex :: new ( r"^-?\d{4,7}-\d{2}-\d{2}[T ]\d{1,2}$" ) . unwrap ( ) ) ;
1151+ static RE_MINUTE : LazyLock < Regex > =
1152+ LazyLock :: new ( || Regex :: new ( r"^-?\d{4,7}-\d{2}-\d{2}[T ]\d{2}:\d{2}$" ) . unwrap ( ) ) ;
1153+ static RE_SECOND : LazyLock < Regex > =
1154+ LazyLock :: new ( || Regex :: new ( r"^-?\d{4,7}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}$" ) . unwrap ( ) ) ;
1155+ static RE_MICROSECOND : LazyLock < Regex > =
1156+ LazyLock :: new ( || Regex :: new ( r"^-?\d{4,7}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}\.\d{1,6}$" ) . unwrap ( ) ) ;
1157+ static RE_TIME_ONLY_H : LazyLock < Regex > = LazyLock :: new ( || Regex :: new ( r"^T\d{1,2}$" ) . unwrap ( ) ) ;
1158+ static RE_TIME_ONLY_HM : LazyLock < Regex > =
1159+ LazyLock :: new ( || Regex :: new ( r"^T\d{1,2}:\d{2}$" ) . unwrap ( ) ) ;
1160+ static RE_TIME_ONLY_HMS : LazyLock < Regex > =
1161+ LazyLock :: new ( || Regex :: new ( r"^T\d{1,2}:\d{2}:\d{2}$" ) . unwrap ( ) ) ;
1162+ static RE_TIME_ONLY_HMSU : LazyLock < Regex > =
1163+ LazyLock :: new ( || Regex :: new ( r"^T\d{1,2}:\d{2}:\d{2}\.\d{1,6}$" ) . unwrap ( ) ) ;
1164+ static RE_BARE_HM : LazyLock < Regex > = LazyLock :: new ( || Regex :: new ( r"^\d{1,2}:\d{2}$" ) . unwrap ( ) ) ;
1165+ static RE_BARE_HMS : LazyLock < Regex > =
1166+ LazyLock :: new ( || Regex :: new ( r"^\d{1,2}:\d{2}:\d{2}$" ) . unwrap ( ) ) ;
1167+ static RE_BARE_HMSU : LazyLock < Regex > =
1168+ LazyLock :: new ( || Regex :: new ( r"^\d{1,2}:\d{2}:\d{2}\.\d{1,6}$" ) . unwrap ( ) ) ;
1169+
1170+ fn timestamp_parser_with_tz < T : TimeZone > (
1171+ value : & str ,
1172+ eval_mode : EvalMode ,
1173+ tz : & T ,
1174+ ) -> SparkResult < Option < i64 > > {
1175+ // Both T-separator and space-separator date-time forms are supported.
1176+ // Negative years are handled by get_timestamp_values detecting a leading '-'.
1177+ let patterns: & [ TimestampParsePattern < T > ] = & [
1178+ // Year only: 4-7 digits, optionally negative
1179+ (
1180+ & RE_YEAR ,
1181+ parse_str_to_year_timestamp as fn ( & str , & T ) -> SparkResult < Option < i64 > > ,
1182+ ) ,
1183+ // Year-month
11251184 ( & RE_MONTH , parse_str_to_month_timestamp) ,
1185+ // Year-month-day
11261186 ( & RE_DAY , parse_str_to_day_timestamp) ,
1187+ // Date T-or-space hour (1 or 2 digits)
11271188 ( & RE_HOUR , parse_str_to_hour_timestamp) ,
1189+ // Date T-or-space hour:minute
11281190 ( & RE_MINUTE , parse_str_to_minute_timestamp) ,
1191+ // Date T-or-space hour:minute:second
11291192 ( & RE_SECOND , parse_str_to_second_timestamp) ,
1193+ // Date T-or-space hour:minute:second.fraction
11301194 ( & RE_MICROSECOND , parse_str_to_microsecond_timestamp) ,
1131- ( & RE_TIME_ONLY , parse_str_to_time_only_timestamp) ,
1195+ // Time-only: T hour (1 or 2 digits, no colon)
1196+ ( & RE_TIME_ONLY_H , parse_str_to_time_only_timestamp) ,
1197+ // Time-only: T hour:minute
1198+ ( & RE_TIME_ONLY_HM , parse_str_to_time_only_timestamp) ,
1199+ // Time-only: T hour:minute:second
1200+ ( & RE_TIME_ONLY_HMS , parse_str_to_time_only_timestamp) ,
1201+ // Time-only: T hour:minute:second.fraction
1202+ ( & RE_TIME_ONLY_HMSU , parse_str_to_time_only_timestamp) ,
1203+ // Bare time-only: hour:minute (without T prefix)
1204+ ( & RE_BARE_HM , parse_str_to_time_only_timestamp) ,
1205+ // Bare time-only: hour:minute:second
1206+ ( & RE_BARE_HMS , parse_str_to_time_only_timestamp) ,
1207+ // Bare time-only: hour:minute:second.fraction
1208+ ( & RE_BARE_HMSU , parse_str_to_time_only_timestamp) ,
11321209 ] ;
11331210
11341211 let mut timestamp = None ;
@@ -1157,23 +1234,43 @@ fn timestamp_parser<T: TimeZone>(
11571234}
11581235
11591236fn parse_str_to_time_only_timestamp < T : TimeZone > ( value : & str , tz : & T ) -> SparkResult < Option < i64 > > {
1160- let values: Vec < & str > = value. split ( 'T' ) . collect ( ) ;
1161- let time_values: Vec < u32 > = values[ 1 ]
1162- . split ( ':' )
1163- . map ( |v| v. parse :: < u32 > ( ) . unwrap_or ( 0 ) )
1164- . collect ( ) ;
1237+ // The 'T' is optional in the time format; strip it if specified.
1238+ let time_part = value. strip_prefix ( 'T' ) . unwrap_or ( value) ;
1239+
1240+ // Parse time components: hour[:minute[:second[.fraction]]]
1241+ // Use splitn(3) so "12:34:56.789" splits into ["12", "34", "56.789"].
1242+ let colon_parts: Vec < & str > = time_part. splitn ( 3 , ':' ) . collect ( ) ;
1243+ let hour: u32 = colon_parts[ 0 ] . parse ( ) . unwrap_or ( 0 ) ;
1244+ let minute: u32 = colon_parts. get ( 1 ) . and_then ( |s| s. parse ( ) . ok ( ) ) . unwrap_or ( 0 ) ;
1245+ let ( second, nanosecond) = if let Some ( sec_frac) = colon_parts. get ( 2 ) {
1246+ let dot_idx = sec_frac. find ( '.' ) ;
1247+ let sec: u32 = sec_frac[ ..dot_idx. unwrap_or ( sec_frac. len ( ) ) ]
1248+ . parse ( )
1249+ . unwrap_or ( 0 ) ;
1250+ let ns: u32 = if let Some ( dot) = dot_idx {
1251+ let frac = & sec_frac[ dot + 1 ..] ;
1252+ // Interpret up to 6 digits as microseconds, padding with trailing zeros.
1253+ let trimmed = & frac[ ..frac. len ( ) . min ( 6 ) ] ;
1254+ let padded = format ! ( "{:0<6}" , trimmed) ;
1255+ padded. parse :: < u32 > ( ) . unwrap_or ( 0 ) * 1000
1256+ } else {
1257+ 0
1258+ } ;
1259+ ( sec, ns)
1260+ } else {
1261+ ( 0 , 0 )
1262+ } ;
11651263
11661264 let datetime = tz. from_utc_datetime ( & chrono:: Utc :: now ( ) . naive_utc ( ) ) ;
1167- let timestamp = datetime
1265+ let result = datetime
11681266 . with_timezone ( tz)
1169- . with_hour ( time_values. first ( ) . copied ( ) . unwrap_or_default ( ) )
1170- . and_then ( |dt| dt. with_minute ( * time_values. get ( 1 ) . unwrap_or ( & 0 ) ) )
1171- . and_then ( |dt| dt. with_second ( * time_values. get ( 2 ) . unwrap_or ( & 0 ) ) )
1172- . and_then ( |dt| dt. with_nanosecond ( * time_values. get ( 3 ) . unwrap_or ( & 0 ) * 1_000 ) )
1173- . map ( |dt| dt. timestamp_micros ( ) )
1174- . unwrap_or_default ( ) ;
1175-
1176- Ok ( Some ( timestamp) )
1267+ . with_hour ( hour)
1268+ . and_then ( |dt| dt. with_minute ( minute) )
1269+ . and_then ( |dt| dt. with_second ( second) )
1270+ . and_then ( |dt| dt. with_nanosecond ( nanosecond) )
1271+ . map ( |dt| dt. timestamp_micros ( ) ) ;
1272+
1273+ Ok ( result)
11771274}
11781275
11791276//a string to date parser - port of spark's SparkDateTimeUtils#stringToDate.
@@ -1343,7 +1440,8 @@ mod tests {
13431440 eval_mode,
13441441 TimestampMicrosecondType ,
13451442 timestamp_parser,
1346- tz
1443+ tz,
1444+ "UTC"
13471445 )
13481446 . unwrap ( ) ;
13491447
@@ -1373,7 +1471,8 @@ mod tests {
13731471 eval_mode,
13741472 TimestampMicrosecondType ,
13751473 timestamp_parser,
1376- tz
1474+ tz,
1475+ "UTC"
13771476 ) ;
13781477 assert ! (
13791478 result. is_err( ) ,
@@ -1497,6 +1596,59 @@ mod tests {
14971596 timestamp_parser( "10000-01-01T12:34:56.123456" , EvalMode :: Legacy , tz) . unwrap( ) ,
14981597 Some ( 253402346096123456 )
14991598 ) ;
1599+ // Space separator (same values as T separator)
1600+ assert_eq ! (
1601+ timestamp_parser( "2020-01-01 12" , EvalMode :: Legacy , tz) . unwrap( ) ,
1602+ Some ( 1577880000000000 )
1603+ ) ;
1604+ assert_eq ! (
1605+ timestamp_parser( "2020-01-01 12:34" , EvalMode :: Legacy , tz) . unwrap( ) ,
1606+ Some ( 1577882040000000 )
1607+ ) ;
1608+ assert_eq ! (
1609+ timestamp_parser( "2020-01-01 12:34:56" , EvalMode :: Legacy , tz) . unwrap( ) ,
1610+ Some ( 1577882096000000 )
1611+ ) ;
1612+ assert_eq ! (
1613+ timestamp_parser( "2020-01-01 12:34:56.123456" , EvalMode :: Legacy , tz) . unwrap( ) ,
1614+ Some ( 1577882096123456 )
1615+ ) ;
1616+ // Z suffix (UTC)
1617+ assert_eq ! (
1618+ timestamp_parser( "2020-01-01T12:34:56Z" , EvalMode :: Legacy , tz) . unwrap( ) ,
1619+ Some ( 1577882096000000 )
1620+ ) ;
1621+ // Positive offset suffix
1622+ assert_eq ! (
1623+ timestamp_parser( "2020-01-01T12:34:56+05:30" , EvalMode :: Legacy , tz) . unwrap( ) ,
1624+ Some ( 1577862296000000 ) // 12:34:56 UTC+5:30 = 07:04:56 UTC
1625+ ) ;
1626+ // T-prefixed time-only with colon
1627+ assert ! ( timestamp_parser( "T12:34" , EvalMode :: Legacy , tz)
1628+ . unwrap( )
1629+ . is_some( ) ) ;
1630+ assert ! ( timestamp_parser( "T12:34:56" , EvalMode :: Legacy , tz)
1631+ . unwrap( )
1632+ . is_some( ) ) ;
1633+ assert ! ( timestamp_parser( "T12:34:56.123456" , EvalMode :: Legacy , tz)
1634+ . unwrap( )
1635+ . is_some( ) ) ;
1636+ // Bare time-only (hour:minute without T prefix)
1637+ assert ! ( timestamp_parser( "12:34" , EvalMode :: Legacy , tz)
1638+ . unwrap( )
1639+ . is_some( ) ) ;
1640+ assert ! ( timestamp_parser( "12:34:56" , EvalMode :: Legacy , tz)
1641+ . unwrap( )
1642+ . is_some( ) ) ;
1643+ // Negative year
1644+ assert ! ( timestamp_parser( "-0001" , EvalMode :: Legacy , tz)
1645+ . unwrap( )
1646+ . is_some( ) ) ;
1647+ assert ! (
1648+ timestamp_parser( "-0001-01-01T12:34:56" , EvalMode :: Legacy , tz)
1649+ . unwrap( )
1650+ . is_some( )
1651+ ) ;
15001652 }
15011653
15021654 #[ test]
0 commit comments