diff --git a/bin/test-regex-filtering.sh b/bin/test-regex-filtering.sh index 99a91eb..6f54cc7 100755 --- a/bin/test-regex-filtering.sh +++ b/bin/test-regex-filtering.sh @@ -3,7 +3,7 @@ set -euxo pipefail export RUST_BACKTRACE=1 -export CHRONO_TZ_TIMEZONE_FILTER='(Europe/London)' +export CHRONO_TZ_TIMEZONE_FILTER='Europe/(London|Vaduz)' cd chrono-tz/tests/check-regex-filtering diff --git a/chrono-tz-build/src/lib.rs b/chrono-tz-build/src/lib.rs index db579fc..ae4d566 100644 --- a/chrono-tz-build/src/lib.rs +++ b/chrono-tz-build/src/lib.rs @@ -2,6 +2,9 @@ extern crate parse_zoneinfo; #[cfg(feature = "filter-by-regex")] extern crate regex; +mod zoneinfo_structure; +use zoneinfo_structure::{Child, Structure}; + use std::collections::BTreeSet; use std::env; use std::fs::File; @@ -9,7 +12,6 @@ use std::io::{self, BufRead, BufReader, Write}; use std::path::{Path, PathBuf}; use parse_zoneinfo::line::Line; -use parse_zoneinfo::structure::{Child, Structure}; use parse_zoneinfo::table::{Table, TableBuilder}; use parse_zoneinfo::transitions::FixedTimespan; use parse_zoneinfo::transitions::TableTransitions; @@ -78,12 +80,12 @@ fn convert_bad_chars(name: &str) -> String { // The timezone file contains impls of `Timespans` for all timezones in the // database. The `Wrap` wrapper in the `timezone_impl` module then implements // TimeZone for any contained struct that implements `Timespans`. 
-fn write_timezone_file(timezone_file: &mut File, table: &Table, uncased: bool) -> io::Result<()> { - let zones = table - .zonesets - .keys() - .chain(table.links.keys()) - .collect::<BTreeSet<_>>(); +fn write_timezone_file( + timezone_file: &mut File, + table: &Table, + zones: &BTreeSet<&str>, + uncased: bool, +) -> io::Result<()> { writeln!( timezone_file, "use core::fmt::{{self, Debug, Display, Formatter}};", @@ -107,14 +109,14 @@ fn write_timezone_file(timezone_file: &mut File, table: &Table, uncased: bool) - r#"#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]"# )?; writeln!(timezone_file, "pub enum Tz {{")?; - for zone in &zones { + for &zone in zones { let zone_name = convert_bad_chars(zone); writeln!(timezone_file, " /// {zone}\n {zone_name},")?; } writeln!(timezone_file, "}}")?; let mut map = phf_codegen::Map::new(); - for zone in &zones { + for &zone in zones { map.entry(zone, format!("Tz::{}", convert_bad_chars(zone))); } writeln!( @@ -127,7 +129,7 @@ fn write_timezone_file(timezone_file: &mut File, table: &Table, uncased: bool) - if uncased { writeln!(timezone_file, "use uncased::UncasedStr;\n",)?; let mut map = phf_codegen::Map::new(); - for zone in &zones { + for &zone in zones { map.entry( uncased::UncasedStr::new(zone), format!("Tz::{}", convert_bad_chars(zone)), @@ -169,7 +171,7 @@ impl FromStr for Tz {{ pub fn name(self) -> &'static str {{ match self {{" )?; - for zone in &zones { + for &zone in zones { let zone_name = convert_bad_chars(zone); writeln!(timezone_file, " Tz::{zone_name} => \"{zone}\",")?; } @@ -214,10 +216,11 @@ impl FromStr for Tz {{ "impl TimeSpans for Tz {{ fn timespans(&self) -> FixedTimespanSet {{" )?; - for zone in &zones { - if table.links.get(zone.as_str()).is_some() { - continue; - } + for zone in zones + .iter() + .map(|&z| table.links.get(z).map(String::as_str).unwrap_or(z)) + .collect::<BTreeSet<_>>() + { let zone_name = convert_bad_chars(zone); let timespans = table.timespans(zone).unwrap(); writeln!( @@ -240,9 +243,9 @@ impl 
FromStr for Tz {{ " )?; - for zone in &zones { + for &zone in zones { let zone_name = convert_bad_chars(zone); - let target_name = if let Some(target) = table.links.get(zone.as_str()) { + let target_name = if let Some(target) = table.links.get(zone) { convert_bad_chars(target) } else { zone_name.clone() @@ -273,7 +276,7 @@ pub static TZ_VARIANTS: [Tz; {num}] = [ ", num = zones.len() )?; - for zone in &zones { + for &zone in zones { writeln!( timezone_file, " Tz::{zone},", @@ -286,21 +289,22 @@ pub static TZ_VARIANTS: [Tz; {num}] = [ // Create a file containing nice-looking re-exports such as Europe::London // instead of having to use chrono_tz::timezones::Europe__London -fn write_directory_file(directory_file: &mut File, table: &Table, version: &str) -> io::Result<()> { +fn write_directory_file( + directory_file: &mut File, + table: &Table, + zones: &BTreeSet<&str>, + version: &str, +) -> io::Result<()> { + writeln!(directory_file, "use crate::timezones::Tz;\n")?; + // expose the underlying IANA TZDB version writeln!( directory_file, "pub const IANA_TZDB_VERSION: &str = \"{version}\";\n" )?; + // add the `loose' zone definitions first - writeln!(directory_file, "use crate::timezones::Tz;\n")?; - let zones = table - .zonesets - .keys() - .chain(table.links.keys()) - .filter(|zone| !zone.contains('/')) - .collect::<BTreeSet<_>>(); - for zone in zones { + for &zone in zones.iter().filter(|zone| !zone.contains('/')) { let zone = convert_bad_chars(zone); writeln!(directory_file, "pub const {zone}: Tz = Tz::{zone};")?; } @@ -308,7 +312,7 @@ fn write_directory_file(directory_file: &mut File, table: &Table, version: &str) // now add the `structured' zone names in submodules let mut first = true; - for entry in table.structure() { + for entry in zoneinfo_structure::build_tree(zones.iter().copied()) { if entry.name.contains('/') { continue; } @@ -320,7 +324,7 @@ fn write_directory_file(directory_file: &mut File, table: &Table, version: &str) let module_name = 
convert_bad_chars(entry.name); writeln!(directory_file, "pub mod {module_name} {{")?; - writeln!(directory_file, " use crate::timezones::Tz;\n",)?; + writeln!(directory_file, " use super::*;\n",)?; for child in entry.children { let name = match child { Child::Submodule(name) => name, @@ -365,112 +369,28 @@ fn write_directory_file(directory_file: &mut File, table: &Table, version: &str) Ok(()) } -/// Module containing code supporting filter-by-regex feature -/// -/// The "GMT" and "UTC" time zones are always included. +/// Checks the `CHRONO_TZ_TIMEZONE_FILTER` environment variable. +/// Converts it to a regex if set. Panics if the regex is not valid, as we want +/// to fail the build if that happens. #[cfg(feature = "filter-by-regex")] -mod filter { - use std::collections::HashSet; - use std::env; - - use regex::Regex; - - use crate::{Table, FILTER_ENV_VAR_NAME}; - - /// Filter `table` by applying [`FILTER_ENV_VAR_NAME`]. - pub(crate) fn maybe_filter_timezone_table(table: &mut Table) { - if let Some(filter_regex) = get_filter_regex() { - filter_timezone_table(table, filter_regex); - } - } - - /// Checks the `CHRONO_TZ_TIMEZONE_FILTER` environment variable. - /// Converts it to a regex if set. Panics if the regex is not valid, as we want - /// to fail the build if that happens. 
- fn get_filter_regex() -> Option<Regex> { - match env::var(FILTER_ENV_VAR_NAME) { - Ok(val) => { - let val = val.trim(); - if val.is_empty() { - return None; - } - match Regex::new(val) { +fn get_filter_regex() -> Option<regex::Regex> { + match std::env::var(FILTER_ENV_VAR_NAME) { + Ok(val) => { + let val = val.trim(); + if val.is_empty() { + return None; + } + match regex::Regex::new(val) { Ok(regex) => Some(regex), Err(err) => panic!( "The value '{val:?}' for environment variable {FILTER_ENV_VAR_NAME} is not a valid regex, err={err}" ), } - } - Err(env::VarError::NotPresent) => None, - Err(env::VarError::NotUnicode(s)) => panic!( - "The value '{s:?}' for environment variable {FILTER_ENV_VAR_NAME} is not valid Unicode" - ), } - } - - /// Insert a new name in the list of names to keep. If the name has 3 - /// parts, then also insert the 2-part prefix. If we don't do this we will lose - /// half of Indiana in `directory.rs`. But we *don't* want to keep one-part names, - /// otherwise we will inevitably end up with 'America' and include too much as - /// a consequence. - fn insert_keep_entry(keep: &mut HashSet<String>, new_value: &str) { - let mut parts = new_value.split('/'); - if let (Some(p1), Some(p2), Some(_), None) = - (parts.next(), parts.next(), parts.next(), parts.next()) - { - keep.insert(format!("{p1}/{p2}")); - } - - keep.insert(new_value.to_string()); - } - - /// Filter `table` by applying `filter_regex`. - fn filter_timezone_table(table: &mut Table, filter_regex: Regex) { - // Compute the transitive closure of things to keep. - // Doing this, instead of just filtering `zonesets` and `links` by the - // regex, helps to keep the `structure()` intact. 
- let mut keep = HashSet::new(); - for (k, v) in &table.links { - if filter_regex.is_match(k) || k == "GMT" || k == "UTC" { - insert_keep_entry(&mut keep, k); - } - if filter_regex.is_match(v) || k == "GMT" || k == "UTC" { - insert_keep_entry(&mut keep, v); - } - } - - let mut n = 0; - loop { - let len = keep.len(); - - for (k, v) in &table.links { - if keep.contains(k) && !keep.contains(v) { - insert_keep_entry(&mut keep, v); - } - if keep.contains(v) && !keep.contains(k) { - insert_keep_entry(&mut keep, k); - } - } - - if keep.len() == len { - break; - } - - n += 1; - if n == 50 { - println!("cargo:warning=Recursion limit reached while building filter list"); - break; - } - } - - // Actually do the filtering. - table - .links - .retain(|k, v| keep.contains(k) || keep.contains(v)); - - table - .zonesets - .retain(|k, _| filter_regex.is_match(k) || keep.iter().any(|s| k.starts_with(s))); + Err(env::VarError::NotPresent) => None, + Err(env::VarError::NotUnicode(s)) => panic!( + "The value '{s:?}' for environment variable {FILTER_ENV_VAR_NAME} is not valid Unicode" + ), } } @@ -495,7 +415,7 @@ fn detect_iana_db_version() -> String { unreachable!("no version found") } -pub fn main(dir: &Path, _filter: bool, _uncased: bool) { +pub fn main(dir: &Path, _filter: bool, uncased: bool) { let mut table = TableBuilder::new(); let root = PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap_or_else(|_| String::new())); @@ -509,19 +429,29 @@ pub fn main(dir: &Path, _filter: bool, _uncased: bool) { } } - #[allow(unused_mut)] - let mut table = table.build(); + let table = table.build(); + #[cfg(feature = "filter-by-regex")] - if _filter { - filter::maybe_filter_timezone_table(&mut table); - } + let regex = _filter.then(get_filter_regex).flatten(); + #[cfg(feature = "filter-by-regex")] + let filter = |tz: &str| regex.as_ref().is_none_or(|r| r.is_match(tz)); + #[cfg(not(feature = "filter-by-regex"))] + let filter = |_: &str| true; + + let zones = table + .zonesets + .keys() + 
.chain(table.links.keys()) + .filter(|s| filter(s)) + .map(String::as_str) + .collect::<BTreeSet<_>>(); let timezone_path = dir.join("timezones.rs"); let mut timezone_file = File::create(timezone_path).unwrap(); - write_timezone_file(&mut timezone_file, &table, _uncased).unwrap(); + write_timezone_file(&mut timezone_file, &table, &zones, uncased).unwrap(); let directory_path = dir.join("directory.rs"); let mut directory_file = File::create(directory_path).unwrap(); let version = detect_iana_db_version(); - write_directory_file(&mut directory_file, &table, &version).unwrap(); + write_directory_file(&mut directory_file, &table, &zones, &version).unwrap(); } diff --git a/parse-zoneinfo/src/structure.rs b/chrono-tz-build/src/zoneinfo_structure.rs similarity index 79% rename from parse-zoneinfo/src/structure.rs rename to chrono-tz-build/src/zoneinfo_structure.rs index 2a47c5b..6563488 100644 --- a/parse-zoneinfo/src/structure.rs +++ b/chrono-tz-build/src/zoneinfo_structure.rs @@ -32,7 +32,7 @@ use std::collections::{BTreeMap, BTreeSet}; -use crate::table::Table; +use parse_zoneinfo::table::Table; /// Trait to put the `structure` method on Tables. pub trait Structure { @@ -42,37 +42,46 @@ pub trait Structure { impl Structure for Table { fn structure(&self) -> TableStructure { - let mut mappings = BTreeMap::new(); - - for key in self.zonesets.keys().chain(self.links.keys()) { - // Extract the name from the *last* slash. So - // `America/Kentucky/Louisville` is split into - // `America/Kentucky` and `Louisville` components. - let last_slash = match key.rfind('/') { - Some(pos) => pos, - None => continue, - }; - - // Split the string around the slash, which gets removed. - let parent = &key[..last_slash]; - { - let set = mappings.entry(parent).or_insert_with(BTreeSet::new); - set.insert(Child::TimeZone(&key[last_slash + 1..])); - } - - // If the *parent* name still has a slash in it, then this is - // a time zone of the form `America/Kentucky/Louisville`. 
We - // need to make sure that `America` now has a `Kentucky` - // child, too. - if let Some(first_slash) = parent.find('/') { - let grandparent = &parent[..first_slash]; - let set = mappings.entry(grandparent).or_insert_with(BTreeSet::new); - set.insert(Child::Submodule(&parent[first_slash + 1..])); - } + build_tree( + self.zonesets + .keys() + .chain(self.links.keys()) + .map(|s| s.as_str()), + ) + } +} + +pub fn build_tree<'a>(entries: impl Iterator<Item = &'a str>) -> TableStructure<'a> { + let mut mappings = BTreeMap::new(); + + for key in entries { + // Extract the name from the *last* slash. So + // `America/Kentucky/Louisville` is split into + // `America/Kentucky` and `Louisville` components. + let last_slash = match key.rfind('/') { + Some(pos) => pos, + None => continue, + }; + + // Split the string around the slash, which gets removed. + let parent = &key[..last_slash]; + { + let set = mappings.entry(parent).or_insert_with(BTreeSet::new); + set.insert(Child::TimeZone(&key[last_slash + 1..])); } - TableStructure { mappings } + // If the *parent* name still has a slash in it, then this is + // a time zone of the form `America/Kentucky/Louisville`. We + // need to make sure that `America` now has a `Kentucky` + // child, too. + if let Some(first_slash) = parent.find('/') { + let grandparent = &parent[..first_slash]; + let set = mappings.entry(grandparent).or_insert_with(BTreeSet::new); + set.insert(Child::Submodule(&parent[first_slash + 1..])); + } } + + TableStructure { mappings } } /// The structure of a set of time zone names. 
@@ -150,7 +159,7 @@ pub enum Child<'table> { #[allow(unused_results)] mod test { use super::*; - use crate::table::Table; + use parse_zoneinfo::table::Table; #[test] fn empty() { diff --git a/chrono-tz/src/prebuilt/directory.rs b/chrono-tz/src/prebuilt/directory.rs index 2c1834f..34b2d39 100644 --- a/chrono-tz/src/prebuilt/directory.rs +++ b/chrono-tz/src/prebuilt/directory.rs @@ -1,7 +1,7 @@ -pub const IANA_TZDB_VERSION: &str = "2025b"; - use crate::timezones::Tz; +pub const IANA_TZDB_VERSION: &str = "2025b"; + pub const CET: Tz = Tz::CET; pub const CST6CDT: Tz = Tz::CST6CDT; pub const Cuba: Tz = Tz::Cuba; @@ -48,7 +48,7 @@ pub const WET: Tz = Tz::WET; pub const Zulu: Tz = Tz::Zulu; pub mod Africa { - use crate::timezones::Tz; + use super::*; pub const Abidjan: Tz = Tz::Africa__Abidjan; pub const Accra: Tz = Tz::Africa__Accra; @@ -107,7 +107,7 @@ pub mod Africa { } pub mod America { - use crate::timezones::Tz; + use super::*; pub mod Argentina { use crate::timezones::Tz; @@ -301,7 +301,7 @@ pub mod America { } pub mod Antarctica { - use crate::timezones::Tz; + use super::*; pub const Casey: Tz = Tz::Antarctica__Casey; pub const Davis: Tz = Tz::Antarctica__Davis; @@ -318,13 +318,13 @@ pub mod Antarctica { } pub mod Arctic { - use crate::timezones::Tz; + use super::*; pub const Longyearbyen: Tz = Tz::Arctic__Longyearbyen; } pub mod Asia { - use crate::timezones::Tz; + use super::*; pub const Aden: Tz = Tz::Asia__Aden; pub const Almaty: Tz = Tz::Asia__Almaty; @@ -428,7 +428,7 @@ pub mod Asia { } pub mod Atlantic { - use crate::timezones::Tz; + use super::*; pub const Azores: Tz = Tz::Atlantic__Azores; pub const Bermuda: Tz = Tz::Atlantic__Bermuda; @@ -445,7 +445,7 @@ pub mod Atlantic { } pub mod Australia { - use crate::timezones::Tz; + use super::*; pub const ACT: Tz = Tz::Australia__ACT; pub const Adelaide: Tz = Tz::Australia__Adelaide; @@ -473,7 +473,7 @@ pub mod Australia { } pub mod Brazil { - use crate::timezones::Tz; + use super::*; pub const Acre: Tz = 
Tz::Brazil__Acre; pub const DeNoronha: Tz = Tz::Brazil__DeNoronha; @@ -482,7 +482,7 @@ pub mod Brazil { } pub mod Canada { - use crate::timezones::Tz; + use super::*; pub const Atlantic: Tz = Tz::Canada__Atlantic; pub const Central: Tz = Tz::Canada__Central; @@ -495,14 +495,14 @@ pub mod Canada { } pub mod Chile { - use crate::timezones::Tz; + use super::*; pub const Continental: Tz = Tz::Chile__Continental; pub const EasterIsland: Tz = Tz::Chile__EasterIsland; } pub mod Etc { - use crate::timezones::Tz; + use super::*; pub const GMT: Tz = Tz::Etc__GMT; pub const GMTPlus0: Tz = Tz::Etc__GMTPlus0; @@ -542,7 +542,7 @@ pub mod Etc { } pub mod Europe { - use crate::timezones::Tz; + use super::*; pub const Amsterdam: Tz = Tz::Europe__Amsterdam; pub const Andorra: Tz = Tz::Europe__Andorra; @@ -611,7 +611,7 @@ pub mod Europe { } pub mod Indian { - use crate::timezones::Tz; + use super::*; pub const Antananarivo: Tz = Tz::Indian__Antananarivo; pub const Chagos: Tz = Tz::Indian__Chagos; @@ -627,7 +627,7 @@ pub mod Indian { } pub mod Mexico { - use crate::timezones::Tz; + use super::*; pub const BajaNorte: Tz = Tz::Mexico__BajaNorte; pub const BajaSur: Tz = Tz::Mexico__BajaSur; @@ -635,7 +635,7 @@ pub mod Mexico { } pub mod Pacific { - use crate::timezones::Tz; + use super::*; pub const Apia: Tz = Tz::Pacific__Apia; pub const Auckland: Tz = Tz::Pacific__Auckland; @@ -684,7 +684,7 @@ pub mod Pacific { } pub mod US { - use crate::timezones::Tz; + use super::*; pub const Alaska: Tz = Tz::US__Alaska; pub const Aleutian: Tz = Tz::US__Aleutian; diff --git a/chrono-tz/tests/check-regex-filtering/src/lib.rs b/chrono-tz/tests/check-regex-filtering/src/lib.rs index 9db6010..34cd600 100644 --- a/chrono-tz/tests/check-regex-filtering/src/lib.rs +++ b/chrono-tz/tests/check-regex-filtering/src/lib.rs @@ -1,5 +1,5 @@ /// This test is compiled by the Github workflows with the -/// filter regex set thusly: CHRONO_TZ_TIMEZONE_FILTER="(Europe/London|GMT)" +/// filter regex set thusly: 
CHRONO_TZ_TIMEZONE_FILTER="Europe/(London|Vaduz)" /// /// We use it to check two things: /// 1) That the compiled chrono-tz contains the correct timezones (a compilation @@ -12,25 +12,22 @@ #[cfg(test)] mod tests { use chrono::offset::TimeZone; - use chrono_tz::{Europe, Europe::London, Tz, TZ_VARIANTS}; + use chrono_tz::{Europe, Tz, TZ_VARIANTS}; use std::str::FromStr; #[test] fn london_compiles() { // This line will be a compilation failure if the code generation // mistakenly excluded Europe::London. - let _london_time = London.with_ymd_and_hms(2013, 12, 25, 14, 0, 0); - assert_eq!("Europe/London", London.name()); + let _london_time = Europe::London.with_ymd_and_hms(2013, 12, 25, 14, 0, 0); + assert_eq!("Europe/London", Europe::London.name()); // Since London is included, converting from the corresponding // string representation should also work. - assert_eq!(Tz::from_str("Europe/London"), Ok(London)); + assert_eq!(Tz::from_str("Europe/London"), Ok(Europe::London)); - // We did not explicitly ask for Isle Of Man or Belfast in our regex, but there is a link - // from Europe::London to Isle_of_Man and Belfast (amongst others) - // so these conversions should also work. - assert_eq!(Tz::from_str("Europe/Isle_of_Man"), Ok(Europe::Isle_of_Man)); - assert_eq!(Tz::from_str("Europe/Belfast"), Ok(Europe::Belfast)); + // Vaduz is a link to a zone we didn't ask for, check that it still works. + assert_eq!(Tz::from_str("Europe/Vaduz"), Ok(Europe::Vaduz)); } #[test] @@ -44,24 +41,22 @@ mod tests { assert!(Tz::from_str("Pacific/Kwajalein").is_err()); assert!(Tz::from_str("US/Central").is_err()); - // The link table caused us to include some extra items from the UK (see - // `london_compiles()`), but it should NOT include various other timezones - // from around Europe since there is no linkage between them. + // Similar for timezones inside Europe, including those that link + // to London, or that Vaduz links to. 
+ assert!(Tz::from_str("Europe/Isle_of_Man").is_err()); + assert!(Tz::from_str("Europe/Belfast").is_err()); + assert!(Tz::from_str("Europe/Zurich").is_err()); + assert!(Tz::from_str("Europe/Jersey").is_err()); assert!(Tz::from_str("Europe/Brussels").is_err()); assert!(Tz::from_str("Europe/Dublin").is_err()); assert!(Tz::from_str("Europe/Warsaw").is_err()); - // Also, entire continents outside Europe should be excluded. - for tz in TZ_VARIANTS.iter() { - assert!(!tz.name().starts_with("Africa")); - assert!(!tz.name().starts_with("Asia")); - assert!(!tz.name().starts_with("Australia")); - assert!(!tz.name().starts_with("Canada")); - assert!(!tz.name().starts_with("Chile")); - assert!(!tz.name().starts_with("Indian")); - assert!(!tz.name().starts_with("Mexico")); - assert!(!tz.name().starts_with("Pacific")); - assert!(!tz.name().starts_with("US")); - } + // Top-level zones, including UTC and GMT, should also be excluded. + assert!(Tz::from_str("UTC").is_err()); + assert!(Tz::from_str("GMT").is_err()); + assert!(Tz::from_str("EST5EDT").is_err()); + + // Only the two requested zones (Europe/London and Europe/Vaduz) should remain. + assert_eq!(TZ_VARIANTS.len(), 2); } } diff --git a/parse-zoneinfo/src/lib.rs b/parse-zoneinfo/src/lib.rs index 2cdd048..4dcfb87 100644 --- a/parse-zoneinfo/src/lib.rs +++ b/parse-zoneinfo/src/lib.rs @@ -35,7 +35,6 @@ #![warn(unused)] pub mod line; -pub mod structure; pub mod table; pub mod transitions;