Skip to content

Commit 98a79c7

Browse files
committed
fix: add timezone and special formats support for cast string to timestamp
1 parent 2106cef commit 98a79c7

2 files changed

Lines changed: 238 additions & 66 deletions

File tree

native/spark-expr/src/conversion_funcs/string.rs

Lines changed: 215 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,11 @@ use std::str::FromStr;
3434
use std::sync::{Arc, LazyLock};
3535

3636
macro_rules! cast_utf8_to_timestamp {
37-
($array:expr, $eval_mode:expr, $array_type:ty, $cast_method:ident, $tz:expr) => {{
37+
// $tz is a Timezone:Tz object and contains the session timezone.
38+
// $to_tz_str is a string containing the to_type timezone
39+
($array:expr, $eval_mode:expr, $array_type:ty, $cast_method:ident, $tz:expr, $to_tz_str:expr) => {{
3840
let len = $array.len();
39-
let mut cast_array = PrimitiveArray::<$array_type>::builder(len).with_timezone("UTC");
41+
let mut cast_array = PrimitiveArray::<$array_type>::builder(len).with_timezone($to_tz_str);
4042
let mut cast_err: Option<SparkError> = None;
4143
for i in 0..len {
4244
if $array.is_null(i) {
@@ -675,16 +677,21 @@ pub(crate) fn cast_string_to_timestamp(
675677
.downcast_ref::<GenericStringArray<i32>>()
676678
.expect("Expected a string array");
677679

678-
let tz = &timezone::Tz::from_str(timezone_str).unwrap();
680+
let tz = &timezone::Tz::from_str(timezone_str)
681+
.map_err(|_| SparkError::Internal(format!("Invalid timezone string: {timezone_str}")))?;
679682

680683
let cast_array: ArrayRef = match to_type {
681-
DataType::Timestamp(_, _) => cast_utf8_to_timestamp!(
682-
string_array,
683-
eval_mode,
684-
TimestampMicrosecondType,
685-
timestamp_parser,
686-
tz
687-
)?,
684+
DataType::Timestamp(_, tz_opt) => {
685+
let to_tz = tz_opt.as_deref().unwrap_or("UTC");
686+
cast_utf8_to_timestamp!(
687+
string_array,
688+
eval_mode,
689+
TimestampMicrosecondType,
690+
timestamp_parser,
691+
tz,
692+
to_tz
693+
)?
694+
}
688695
_ => unreachable!("Invalid data type {:?} in cast from string", to_type),
689696
};
690697
Ok(cast_array)
@@ -967,8 +974,14 @@ fn get_timestamp_values<T: TimeZone>(
967974
timestamp_type: &str,
968975
tz: &T,
969976
) -> SparkResult<Option<i64>> {
970-
let values: Vec<_> = value.split(['T', '-', ':', '.']).collect();
971-
let year = values[0].parse::<i32>().unwrap_or_default();
977+
// Handle negative year: strip leading '-' and remember the sign.
978+
let (sign, date_part) = if let Some(stripped) = value.strip_prefix('-') {
979+
(-1i32, stripped)
980+
} else {
981+
(1i32, value)
982+
};
983+
let values: Vec<_> = date_part.split(['T', ' ', '-', ':', '.']).collect();
984+
let year = sign * values[0].parse::<i32>().unwrap_or_default();
972985

973986
// NaiveDate (used internally by chrono's with_ymd_and_hms) is bounded to ±262142.
974987
if !(-262143..=262142).contains(&year) {
@@ -1041,28 +1054,19 @@ fn parse_timestamp_to_micros<T: TimeZone>(
10411054
timestamp_info.second,
10421055
);
10431056

1044-
// Check if datetime is not None
1045-
let tz_datetime = match datetime.single() {
1057+
// Spark uses the offset before daylight savings change so we need to use earliest()
1058+
// Return None for LocalResult::None which is the invalid time in a DST spring forward gap).
1059+
let tz_datetime = match datetime.earliest() {
10461060
Some(dt) => dt
10471061
.with_timezone(tz)
10481062
.with_nanosecond(timestamp_info.microsecond * 1000),
1049-
None => {
1050-
return Err(SparkError::Internal(
1051-
"Failed to parse timestamp".to_string(),
1052-
));
1053-
}
1054-
};
1055-
1056-
let result = match tz_datetime {
1057-
Some(dt) => dt.timestamp_micros(),
1058-
None => {
1059-
return Err(SparkError::Internal(
1060-
"Failed to parse timestamp".to_string(),
1061-
));
1062-
}
1063+
None => return Ok(None),
10631064
};
10641065

1065-
Ok(Some(result))
1066+
match tz_datetime {
1067+
Some(dt) => Ok(Some(dt.timestamp_micros())),
1068+
None => Ok(None),
1069+
}
10661070
}
10671071

10681072
fn parse_str_to_year_timestamp<T: TimeZone>(value: &str, tz: &T) -> SparkResult<Option<i64>> {
@@ -1096,21 +1100,6 @@ fn parse_str_to_microsecond_timestamp<T: TimeZone>(
10961100
get_timestamp_values(value, "microsecond", tz)
10971101
}
10981102

1099-
type TimestampPattern<T> = (&'static Regex, fn(&str, &T) -> SparkResult<Option<i64>>);
1100-
1101-
static RE_YEAR: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\d{4,7}$").unwrap());
1102-
static RE_MONTH: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\d{4,7}-\d{2}$").unwrap());
1103-
static RE_DAY: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\d{4,7}-\d{2}-\d{2}$").unwrap());
1104-
static RE_HOUR: LazyLock<Regex> =
1105-
LazyLock::new(|| Regex::new(r"^\d{4,7}-\d{2}-\d{2}T\d{1,2}$").unwrap());
1106-
static RE_MINUTE: LazyLock<Regex> =
1107-
LazyLock::new(|| Regex::new(r"^\d{4,7}-\d{2}-\d{2}T\d{2}:\d{2}$").unwrap());
1108-
static RE_SECOND: LazyLock<Regex> =
1109-
LazyLock::new(|| Regex::new(r"^\d{4,7}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}$").unwrap());
1110-
static RE_MICROSECOND: LazyLock<Regex> =
1111-
LazyLock::new(|| Regex::new(r"^\d{4,7}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{1,6}$").unwrap());
1112-
static RE_TIME_ONLY: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^T\d{1,2}$").unwrap());
1113-
11141103
fn timestamp_parser<T: TimeZone>(
11151104
value: &str,
11161105
eval_mode: EvalMode,
@@ -1120,15 +1109,103 @@ fn timestamp_parser<T: TimeZone>(
11201109
if value.is_empty() {
11211110
return Ok(None);
11221111
}
1123-
let patterns: &[TimestampPattern<T>] = &[
1124-
(&RE_YEAR, parse_str_to_year_timestamp),
1112+
1113+
// Handle Z or ±HH:MM offset suffix: strip it and parse with the explicit fixed offset.
1114+
if let Some((stripped, offset_secs)) = extract_offset_suffix(value) {
1115+
let fixed_tz = chrono::FixedOffset::east_opt(offset_secs)
1116+
.ok_or_else(|| SparkError::Internal("Invalid timezone offset".to_string()))?;
1117+
return timestamp_parser_with_tz(stripped, eval_mode, &fixed_tz);
1118+
}
1119+
1120+
timestamp_parser_with_tz(value, eval_mode, tz)
1121+
}
1122+
1123+
/// If `value` ends with a UTC offset suffix (`Z`, `+HH:MM`, or `-HH:MM`), returns the
1124+
/// stripped string and the offset in seconds. Returns `None` if no offset suffix is present.
1125+
fn extract_offset_suffix(value: &str) -> Option<(&str, i32)> {
1126+
if let Some(stripped) = value.strip_suffix('Z') {
1127+
return Some((stripped, 0));
1128+
}
1129+
// Check for ±HH:MM at the end (exactly 6 chars: sign + 2 digits + ':' + 2 digits)
1130+
if value.len() >= 6 {
1131+
let suffix_start = value.len() - 6;
1132+
let suffix = &value[suffix_start..];
1133+
let sign_byte = suffix.as_bytes()[0];
1134+
if (sign_byte == b'+' || sign_byte == b'-') && suffix.as_bytes()[3] == b':' {
1135+
if let (Ok(h), Ok(m)) = (suffix[1..3].parse::<i32>(), suffix[4..6].parse::<i32>()) {
1136+
let sign = if sign_byte == b'+' { 1i32 } else { -1i32 };
1137+
return Some((&value[..suffix_start], sign * (h * 3600 + m * 60)));
1138+
}
1139+
}
1140+
}
1141+
None
1142+
}
1143+
1144+
type TimestampParsePattern<T> = (&'static Regex, fn(&str, &T) -> SparkResult<Option<i64>>);
1145+
1146+
static RE_YEAR: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^-?\d{4,7}$").unwrap());
1147+
static RE_MONTH: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^-?\d{4,7}-\d{2}$").unwrap());
1148+
static RE_DAY: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^-?\d{4,7}-\d{2}-\d{2}$").unwrap());
1149+
static RE_HOUR: LazyLock<Regex> =
1150+
LazyLock::new(|| Regex::new(r"^-?\d{4,7}-\d{2}-\d{2}[T ]\d{1,2}$").unwrap());
1151+
static RE_MINUTE: LazyLock<Regex> =
1152+
LazyLock::new(|| Regex::new(r"^-?\d{4,7}-\d{2}-\d{2}[T ]\d{2}:\d{2}$").unwrap());
1153+
static RE_SECOND: LazyLock<Regex> =
1154+
LazyLock::new(|| Regex::new(r"^-?\d{4,7}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}$").unwrap());
1155+
static RE_MICROSECOND: LazyLock<Regex> =
1156+
LazyLock::new(|| Regex::new(r"^-?\d{4,7}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}\.\d{1,6}$").unwrap());
1157+
static RE_TIME_ONLY_H: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^T\d{1,2}$").unwrap());
1158+
static RE_TIME_ONLY_HM: LazyLock<Regex> =
1159+
LazyLock::new(|| Regex::new(r"^T\d{1,2}:\d{2}$").unwrap());
1160+
static RE_TIME_ONLY_HMS: LazyLock<Regex> =
1161+
LazyLock::new(|| Regex::new(r"^T\d{1,2}:\d{2}:\d{2}$").unwrap());
1162+
static RE_TIME_ONLY_HMSU: LazyLock<Regex> =
1163+
LazyLock::new(|| Regex::new(r"^T\d{1,2}:\d{2}:\d{2}\.\d{1,6}$").unwrap());
1164+
static RE_BARE_HM: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\d{1,2}:\d{2}$").unwrap());
1165+
static RE_BARE_HMS: LazyLock<Regex> =
1166+
LazyLock::new(|| Regex::new(r"^\d{1,2}:\d{2}:\d{2}$").unwrap());
1167+
static RE_BARE_HMSU: LazyLock<Regex> =
1168+
LazyLock::new(|| Regex::new(r"^\d{1,2}:\d{2}:\d{2}\.\d{1,6}$").unwrap());
1169+
1170+
fn timestamp_parser_with_tz<T: TimeZone>(
1171+
value: &str,
1172+
eval_mode: EvalMode,
1173+
tz: &T,
1174+
) -> SparkResult<Option<i64>> {
1175+
// Both T-separator and space-separator date-time forms are supported.
1176+
// Negative years are handled by get_timestamp_values detecting a leading '-'.
1177+
let patterns: &[TimestampParsePattern<T>] = &[
1178+
// Year only: 4-7 digits, optionally negative
1179+
(
1180+
&RE_YEAR,
1181+
parse_str_to_year_timestamp as fn(&str, &T) -> SparkResult<Option<i64>>,
1182+
),
1183+
// Year-month
11251184
(&RE_MONTH, parse_str_to_month_timestamp),
1185+
// Year-month-day
11261186
(&RE_DAY, parse_str_to_day_timestamp),
1187+
// Date T-or-space hour (1 or 2 digits)
11271188
(&RE_HOUR, parse_str_to_hour_timestamp),
1189+
// Date T-or-space hour:minute
11281190
(&RE_MINUTE, parse_str_to_minute_timestamp),
1191+
// Date T-or-space hour:minute:second
11291192
(&RE_SECOND, parse_str_to_second_timestamp),
1193+
// Date T-or-space hour:minute:second.fraction
11301194
(&RE_MICROSECOND, parse_str_to_microsecond_timestamp),
1131-
(&RE_TIME_ONLY, parse_str_to_time_only_timestamp),
1195+
// Time-only: T hour (1 or 2 digits, no colon)
1196+
(&RE_TIME_ONLY_H, parse_str_to_time_only_timestamp),
1197+
// Time-only: T hour:minute
1198+
(&RE_TIME_ONLY_HM, parse_str_to_time_only_timestamp),
1199+
// Time-only: T hour:minute:second
1200+
(&RE_TIME_ONLY_HMS, parse_str_to_time_only_timestamp),
1201+
// Time-only: T hour:minute:second.fraction
1202+
(&RE_TIME_ONLY_HMSU, parse_str_to_time_only_timestamp),
1203+
// Bare time-only: hour:minute (without T prefix)
1204+
(&RE_BARE_HM, parse_str_to_time_only_timestamp),
1205+
// Bare time-only: hour:minute:second
1206+
(&RE_BARE_HMS, parse_str_to_time_only_timestamp),
1207+
// Bare time-only: hour:minute:second.fraction
1208+
(&RE_BARE_HMSU, parse_str_to_time_only_timestamp),
11321209
];
11331210

11341211
let mut timestamp = None;
@@ -1157,23 +1234,43 @@ fn timestamp_parser<T: TimeZone>(
11571234
}
11581235

11591236
fn parse_str_to_time_only_timestamp<T: TimeZone>(value: &str, tz: &T) -> SparkResult<Option<i64>> {
1160-
let values: Vec<&str> = value.split('T').collect();
1161-
let time_values: Vec<u32> = values[1]
1162-
.split(':')
1163-
.map(|v| v.parse::<u32>().unwrap_or(0))
1164-
.collect();
1237+
// The 'T' is optional in the time format; strip it if specified.
1238+
let time_part = value.strip_prefix('T').unwrap_or(value);
1239+
1240+
// Parse time components: hour[:minute[:second[.fraction]]]
1241+
// Use splitn(3) so "12:34:56.789" splits into ["12", "34", "56.789"].
1242+
let colon_parts: Vec<&str> = time_part.splitn(3, ':').collect();
1243+
let hour: u32 = colon_parts[0].parse().unwrap_or(0);
1244+
let minute: u32 = colon_parts.get(1).and_then(|s| s.parse().ok()).unwrap_or(0);
1245+
let (second, nanosecond) = if let Some(sec_frac) = colon_parts.get(2) {
1246+
let dot_idx = sec_frac.find('.');
1247+
let sec: u32 = sec_frac[..dot_idx.unwrap_or(sec_frac.len())]
1248+
.parse()
1249+
.unwrap_or(0);
1250+
let ns: u32 = if let Some(dot) = dot_idx {
1251+
let frac = &sec_frac[dot + 1..];
1252+
// Interpret up to 6 digits as microseconds, padding with trailing zeros.
1253+
let trimmed = &frac[..frac.len().min(6)];
1254+
let padded = format!("{:0<6}", trimmed);
1255+
padded.parse::<u32>().unwrap_or(0) * 1000
1256+
} else {
1257+
0
1258+
};
1259+
(sec, ns)
1260+
} else {
1261+
(0, 0)
1262+
};
11651263

11661264
let datetime = tz.from_utc_datetime(&chrono::Utc::now().naive_utc());
1167-
let timestamp = datetime
1265+
let result = datetime
11681266
.with_timezone(tz)
1169-
.with_hour(time_values.first().copied().unwrap_or_default())
1170-
.and_then(|dt| dt.with_minute(*time_values.get(1).unwrap_or(&0)))
1171-
.and_then(|dt| dt.with_second(*time_values.get(2).unwrap_or(&0)))
1172-
.and_then(|dt| dt.with_nanosecond(*time_values.get(3).unwrap_or(&0) * 1_000))
1173-
.map(|dt| dt.timestamp_micros())
1174-
.unwrap_or_default();
1175-
1176-
Ok(Some(timestamp))
1267+
.with_hour(hour)
1268+
.and_then(|dt| dt.with_minute(minute))
1269+
.and_then(|dt| dt.with_second(second))
1270+
.and_then(|dt| dt.with_nanosecond(nanosecond))
1271+
.map(|dt| dt.timestamp_micros());
1272+
1273+
Ok(result)
11771274
}
11781275

11791276
//a string to date parser - port of spark's SparkDateTimeUtils#stringToDate.
@@ -1343,7 +1440,8 @@ mod tests {
13431440
eval_mode,
13441441
TimestampMicrosecondType,
13451442
timestamp_parser,
1346-
tz
1443+
tz,
1444+
"UTC"
13471445
)
13481446
.unwrap();
13491447

@@ -1373,7 +1471,8 @@ mod tests {
13731471
eval_mode,
13741472
TimestampMicrosecondType,
13751473
timestamp_parser,
1376-
tz
1474+
tz,
1475+
"UTC"
13771476
);
13781477
assert!(
13791478
result.is_err(),
@@ -1497,6 +1596,59 @@ mod tests {
14971596
timestamp_parser("10000-01-01T12:34:56.123456", EvalMode::Legacy, tz).unwrap(),
14981597
Some(253402346096123456)
14991598
);
1599+
// Space separator (same values as T separator)
1600+
assert_eq!(
1601+
timestamp_parser("2020-01-01 12", EvalMode::Legacy, tz).unwrap(),
1602+
Some(1577880000000000)
1603+
);
1604+
assert_eq!(
1605+
timestamp_parser("2020-01-01 12:34", EvalMode::Legacy, tz).unwrap(),
1606+
Some(1577882040000000)
1607+
);
1608+
assert_eq!(
1609+
timestamp_parser("2020-01-01 12:34:56", EvalMode::Legacy, tz).unwrap(),
1610+
Some(1577882096000000)
1611+
);
1612+
assert_eq!(
1613+
timestamp_parser("2020-01-01 12:34:56.123456", EvalMode::Legacy, tz).unwrap(),
1614+
Some(1577882096123456)
1615+
);
1616+
// Z suffix (UTC)
1617+
assert_eq!(
1618+
timestamp_parser("2020-01-01T12:34:56Z", EvalMode::Legacy, tz).unwrap(),
1619+
Some(1577882096000000)
1620+
);
1621+
// Positive offset suffix
1622+
assert_eq!(
1623+
timestamp_parser("2020-01-01T12:34:56+05:30", EvalMode::Legacy, tz).unwrap(),
1624+
Some(1577862296000000) // 12:34:56 UTC+5:30 = 07:04:56 UTC
1625+
);
1626+
// T-prefixed time-only with colon
1627+
assert!(timestamp_parser("T12:34", EvalMode::Legacy, tz)
1628+
.unwrap()
1629+
.is_some());
1630+
assert!(timestamp_parser("T12:34:56", EvalMode::Legacy, tz)
1631+
.unwrap()
1632+
.is_some());
1633+
assert!(timestamp_parser("T12:34:56.123456", EvalMode::Legacy, tz)
1634+
.unwrap()
1635+
.is_some());
1636+
// Bare time-only (hour:minute without T prefix)
1637+
assert!(timestamp_parser("12:34", EvalMode::Legacy, tz)
1638+
.unwrap()
1639+
.is_some());
1640+
assert!(timestamp_parser("12:34:56", EvalMode::Legacy, tz)
1641+
.unwrap()
1642+
.is_some());
1643+
// Negative year
1644+
assert!(timestamp_parser("-0001", EvalMode::Legacy, tz)
1645+
.unwrap()
1646+
.is_some());
1647+
assert!(
1648+
timestamp_parser("-0001-01-01T12:34:56", EvalMode::Legacy, tz)
1649+
.unwrap()
1650+
.is_some()
1651+
);
15001652
}
15011653

15021654
#[test]

0 commit comments

Comments
 (0)