From a804fa3bb0d360064e9f4f725ae2db42d91db82f Mon Sep 17 00:00:00 2001 From: Dominik Spicher Date: Thu, 6 Feb 2025 17:46:08 +0100 Subject: [PATCH] bytewords: replace match with precomputed perfect hash function The two giant match statements with 256 arms incur linear lookup cost. To avoid this, we have defined a simple family of hash functions linear in the first and last character of the encoded string (this lines up nicely with the fact that those two characters are identical for the default and minimal encodings). Exhaustive search has then yielded a perfect hash function (with no collisions) that only needs a domain of roughly double the theoretical minimum (which would be 256 elements). Other families of hash functions might theoretically yield smaller domains, but we don't judge this to be a relevant optimization for now. This design achieves the following goals: - Constant lookup time at runtime. - `no_std` compliance, i.e. no `OnceLock`s and no `HashMap`s. - No `unsafe` code. The recently introduced decoding test [1] suggests performance improvements of 66%. 
[1] bbf09e4 --- src/bytewords.rs | 546 ++--------------------------------------------- src/constants.rs | 80 +++++++ 2 files changed, 94 insertions(+), 532 deletions(-) diff --git a/src/bytewords.rs b/src/bytewords.rs index 0c57369..a4588bf 100644 --- a/src/bytewords.rs +++ b/src/bytewords.rs @@ -111,7 +111,7 @@ pub fn decode(encoded: &str, style: Style) -> Result, Error> { Style::Uri => '-', Style::Minimal => return decode_minimal(encoded), }; - decode_from_index(&mut encoded.split(separator), false) + decode_parts(&mut encoded.split(separator)) } fn decode_minimal(encoded: &str) -> Result, Error> { @@ -119,545 +119,27 @@ fn decode_minimal(encoded: &str) -> Result, Error> { return Err(Error::InvalidLength); } - decode_from_index( + decode_parts( &mut (0..encoded.len()) .step_by(2) .map(|idx| encoded.get(idx..idx + 2).unwrap()), - true, ) } +fn encoded_byte(str: &str) -> Option { + let mut chars = str.chars(); + let hash = + usize::try_from((25 * (chars.next()? as u32) + 11 * chars.last()? 
as u32) % 628).ok()?; + crate::constants::BYTES_INDEXED_BY_HASH[hash] +} + #[allow(clippy::too_many_lines)] -fn decode_from_index( - keys: &mut dyn Iterator, - minimal: bool, -) -> Result, Error> { +fn decode_parts(parts: &mut dyn Iterator) -> Result, Error> { strip_checksum( - keys.map(|k| { - if minimal { - match k { - "ae" => Some(0), - "ad" => Some(1), - "ao" => Some(2), - "ax" => Some(3), - "aa" => Some(4), - "ah" => Some(5), - "am" => Some(6), - "at" => Some(7), - "ay" => Some(8), - "as" => Some(9), - "bk" => Some(10), - "bd" => Some(11), - "bn" => Some(12), - "bt" => Some(13), - "ba" => Some(14), - "bs" => Some(15), - "be" => Some(16), - "by" => Some(17), - "bg" => Some(18), - "bw" => Some(19), - "bb" => Some(20), - "bz" => Some(21), - "cm" => Some(22), - "ch" => Some(23), - "cs" => Some(24), - "cf" => Some(25), - "cy" => Some(26), - "cw" => Some(27), - "ce" => Some(28), - "ca" => Some(29), - "ck" => Some(30), - "ct" => Some(31), - "cx" => Some(32), - "cl" => Some(33), - "cp" => Some(34), - "cn" => Some(35), - "dk" => Some(36), - "da" => Some(37), - "ds" => Some(38), - "di" => Some(39), - "de" => Some(40), - "dt" => Some(41), - "dr" => Some(42), - "dn" => Some(43), - "dw" => Some(44), - "dp" => Some(45), - "dm" => Some(46), - "dl" => Some(47), - "dy" => Some(48), - "eh" => Some(49), - "ey" => Some(50), - "eo" => Some(51), - "ee" => Some(52), - "ec" => Some(53), - "en" => Some(54), - "em" => Some(55), - "et" => Some(56), - "es" => Some(57), - "ft" => Some(58), - "fr" => Some(59), - "fn" => Some(60), - "fs" => Some(61), - "fm" => Some(62), - "fh" => Some(63), - "fz" => Some(64), - "fp" => Some(65), - "fw" => Some(66), - "fx" => Some(67), - "fy" => Some(68), - "fe" => Some(69), - "fg" => Some(70), - "fl" => Some(71), - "fd" => Some(72), - "ga" => Some(73), - "ge" => Some(74), - "gr" => Some(75), - "gs" => Some(76), - "gt" => Some(77), - "gl" => Some(78), - "gw" => Some(79), - "gd" => Some(80), - "gy" => Some(81), - "gm" => Some(82), - "gu" => Some(83), - "gh" 
=> Some(84), - "go" => Some(85), - "hf" => Some(86), - "hg" => Some(87), - "hd" => Some(88), - "hk" => Some(89), - "ht" => Some(90), - "hp" => Some(91), - "hh" => Some(92), - "hl" => Some(93), - "hy" => Some(94), - "he" => Some(95), - "hn" => Some(96), - "hs" => Some(97), - "id" => Some(98), - "ia" => Some(99), - "ie" => Some(100), - "ih" => Some(101), - "iy" => Some(102), - "io" => Some(103), - "is" => Some(104), - "in" => Some(105), - "im" => Some(106), - "je" => Some(107), - "jz" => Some(108), - "jn" => Some(109), - "jt" => Some(110), - "jl" => Some(111), - "jo" => Some(112), - "js" => Some(113), - "jp" => Some(114), - "jk" => Some(115), - "jy" => Some(116), - "kp" => Some(117), - "ko" => Some(118), - "kt" => Some(119), - "ks" => Some(120), - "kk" => Some(121), - "kn" => Some(122), - "kg" => Some(123), - "ke" => Some(124), - "ki" => Some(125), - "kb" => Some(126), - "lb" => Some(127), - "la" => Some(128), - "ly" => Some(129), - "lf" => Some(130), - "ls" => Some(131), - "lr" => Some(132), - "lp" => Some(133), - "ln" => Some(134), - "lt" => Some(135), - "lo" => Some(136), - "ld" => Some(137), - "le" => Some(138), - "lu" => Some(139), - "lk" => Some(140), - "lg" => Some(141), - "mn" => Some(142), - "my" => Some(143), - "mh" => Some(144), - "me" => Some(145), - "mo" => Some(146), - "mu" => Some(147), - "mw" => Some(148), - "md" => Some(149), - "mt" => Some(150), - "ms" => Some(151), - "mk" => Some(152), - "nl" => Some(153), - "ny" => Some(154), - "nd" => Some(155), - "ns" => Some(156), - "nt" => Some(157), - "nn" => Some(158), - "ne" => Some(159), - "nb" => Some(160), - "oy" => Some(161), - "oe" => Some(162), - "ot" => Some(163), - "ox" => Some(164), - "on" => Some(165), - "ol" => Some(166), - "os" => Some(167), - "pd" => Some(168), - "pt" => Some(169), - "pk" => Some(170), - "py" => Some(171), - "ps" => Some(172), - "pm" => Some(173), - "pl" => Some(174), - "pe" => Some(175), - "pf" => Some(176), - "pa" => Some(177), - "pr" => Some(178), - "qd" => Some(179), - "qz" 
=> Some(180), - "re" => Some(181), - "rp" => Some(182), - "rl" => Some(183), - "ro" => Some(184), - "rh" => Some(185), - "rd" => Some(186), - "rk" => Some(187), - "rf" => Some(188), - "ry" => Some(189), - "rn" => Some(190), - "rs" => Some(191), - "rt" => Some(192), - "se" => Some(193), - "sa" => Some(194), - "sr" => Some(195), - "ss" => Some(196), - "sk" => Some(197), - "sw" => Some(198), - "st" => Some(199), - "sp" => Some(200), - "so" => Some(201), - "sg" => Some(202), - "sb" => Some(203), - "sf" => Some(204), - "sn" => Some(205), - "to" => Some(206), - "tk" => Some(207), - "ti" => Some(208), - "tt" => Some(209), - "td" => Some(210), - "te" => Some(211), - "ty" => Some(212), - "tl" => Some(213), - "tb" => Some(214), - "ts" => Some(215), - "tp" => Some(216), - "ta" => Some(217), - "tn" => Some(218), - "uy" => Some(219), - "uo" => Some(220), - "ut" => Some(221), - "ue" => Some(222), - "ur" => Some(223), - "vt" => Some(224), - "vy" => Some(225), - "vo" => Some(226), - "vl" => Some(227), - "ve" => Some(228), - "vw" => Some(229), - "va" => Some(230), - "vd" => Some(231), - "vs" => Some(232), - "wl" => Some(233), - "wd" => Some(234), - "wm" => Some(235), - "wp" => Some(236), - "we" => Some(237), - "wy" => Some(238), - "ws" => Some(239), - "wt" => Some(240), - "wn" => Some(241), - "wz" => Some(242), - "wf" => Some(243), - "wk" => Some(244), - "yk" => Some(245), - "yn" => Some(246), - "yl" => Some(247), - "ya" => Some(248), - "yt" => Some(249), - "zs" => Some(250), - "zo" => Some(251), - "zt" => Some(252), - "zc" => Some(253), - "ze" => Some(254), - "zm" => Some(255), - _ => None, - } - } else { - match k { - "able" => Some(0), - "acid" => Some(1), - "also" => Some(2), - "apex" => Some(3), - "aqua" => Some(4), - "arch" => Some(5), - "atom" => Some(6), - "aunt" => Some(7), - "away" => Some(8), - "axis" => Some(9), - "back" => Some(10), - "bald" => Some(11), - "barn" => Some(12), - "belt" => Some(13), - "beta" => Some(14), - "bias" => Some(15), - "blue" => Some(16), - 
"body" => Some(17), - "brag" => Some(18), - "brew" => Some(19), - "bulb" => Some(20), - "buzz" => Some(21), - "calm" => Some(22), - "cash" => Some(23), - "cats" => Some(24), - "chef" => Some(25), - "city" => Some(26), - "claw" => Some(27), - "code" => Some(28), - "cola" => Some(29), - "cook" => Some(30), - "cost" => Some(31), - "crux" => Some(32), - "curl" => Some(33), - "cusp" => Some(34), - "cyan" => Some(35), - "dark" => Some(36), - "data" => Some(37), - "days" => Some(38), - "deli" => Some(39), - "dice" => Some(40), - "diet" => Some(41), - "door" => Some(42), - "down" => Some(43), - "draw" => Some(44), - "drop" => Some(45), - "drum" => Some(46), - "dull" => Some(47), - "duty" => Some(48), - "each" => Some(49), - "easy" => Some(50), - "echo" => Some(51), - "edge" => Some(52), - "epic" => Some(53), - "even" => Some(54), - "exam" => Some(55), - "exit" => Some(56), - "eyes" => Some(57), - "fact" => Some(58), - "fair" => Some(59), - "fern" => Some(60), - "figs" => Some(61), - "film" => Some(62), - "fish" => Some(63), - "fizz" => Some(64), - "flap" => Some(65), - "flew" => Some(66), - "flux" => Some(67), - "foxy" => Some(68), - "free" => Some(69), - "frog" => Some(70), - "fuel" => Some(71), - "fund" => Some(72), - "gala" => Some(73), - "game" => Some(74), - "gear" => Some(75), - "gems" => Some(76), - "gift" => Some(77), - "girl" => Some(78), - "glow" => Some(79), - "good" => Some(80), - "gray" => Some(81), - "grim" => Some(82), - "guru" => Some(83), - "gush" => Some(84), - "gyro" => Some(85), - "half" => Some(86), - "hang" => Some(87), - "hard" => Some(88), - "hawk" => Some(89), - "heat" => Some(90), - "help" => Some(91), - "high" => Some(92), - "hill" => Some(93), - "holy" => Some(94), - "hope" => Some(95), - "horn" => Some(96), - "huts" => Some(97), - "iced" => Some(98), - "idea" => Some(99), - "idle" => Some(100), - "inch" => Some(101), - "inky" => Some(102), - "into" => Some(103), - "iris" => Some(104), - "iron" => Some(105), - "item" => Some(106), - "jade" => 
Some(107), - "jazz" => Some(108), - "join" => Some(109), - "jolt" => Some(110), - "jowl" => Some(111), - "judo" => Some(112), - "jugs" => Some(113), - "jump" => Some(114), - "junk" => Some(115), - "jury" => Some(116), - "keep" => Some(117), - "keno" => Some(118), - "kept" => Some(119), - "keys" => Some(120), - "kick" => Some(121), - "kiln" => Some(122), - "king" => Some(123), - "kite" => Some(124), - "kiwi" => Some(125), - "knob" => Some(126), - "lamb" => Some(127), - "lava" => Some(128), - "lazy" => Some(129), - "leaf" => Some(130), - "legs" => Some(131), - "liar" => Some(132), - "limp" => Some(133), - "lion" => Some(134), - "list" => Some(135), - "logo" => Some(136), - "loud" => Some(137), - "love" => Some(138), - "luau" => Some(139), - "luck" => Some(140), - "lung" => Some(141), - "main" => Some(142), - "many" => Some(143), - "math" => Some(144), - "maze" => Some(145), - "memo" => Some(146), - "menu" => Some(147), - "meow" => Some(148), - "mild" => Some(149), - "mint" => Some(150), - "miss" => Some(151), - "monk" => Some(152), - "nail" => Some(153), - "navy" => Some(154), - "need" => Some(155), - "news" => Some(156), - "next" => Some(157), - "noon" => Some(158), - "note" => Some(159), - "numb" => Some(160), - "obey" => Some(161), - "oboe" => Some(162), - "omit" => Some(163), - "onyx" => Some(164), - "open" => Some(165), - "oval" => Some(166), - "owls" => Some(167), - "paid" => Some(168), - "part" => Some(169), - "peck" => Some(170), - "play" => Some(171), - "plus" => Some(172), - "poem" => Some(173), - "pool" => Some(174), - "pose" => Some(175), - "puff" => Some(176), - "puma" => Some(177), - "purr" => Some(178), - "quad" => Some(179), - "quiz" => Some(180), - "race" => Some(181), - "ramp" => Some(182), - "real" => Some(183), - "redo" => Some(184), - "rich" => Some(185), - "road" => Some(186), - "rock" => Some(187), - "roof" => Some(188), - "ruby" => Some(189), - "ruin" => Some(190), - "runs" => Some(191), - "rust" => Some(192), - "safe" => Some(193), - "saga" 
=> Some(194), - "scar" => Some(195), - "sets" => Some(196), - "silk" => Some(197), - "skew" => Some(198), - "slot" => Some(199), - "soap" => Some(200), - "solo" => Some(201), - "song" => Some(202), - "stub" => Some(203), - "surf" => Some(204), - "swan" => Some(205), - "taco" => Some(206), - "task" => Some(207), - "taxi" => Some(208), - "tent" => Some(209), - "tied" => Some(210), - "time" => Some(211), - "tiny" => Some(212), - "toil" => Some(213), - "tomb" => Some(214), - "toys" => Some(215), - "trip" => Some(216), - "tuna" => Some(217), - "twin" => Some(218), - "ugly" => Some(219), - "undo" => Some(220), - "unit" => Some(221), - "urge" => Some(222), - "user" => Some(223), - "vast" => Some(224), - "very" => Some(225), - "veto" => Some(226), - "vial" => Some(227), - "vibe" => Some(228), - "view" => Some(229), - "visa" => Some(230), - "void" => Some(231), - "vows" => Some(232), - "wall" => Some(233), - "wand" => Some(234), - "warm" => Some(235), - "wasp" => Some(236), - "wave" => Some(237), - "waxy" => Some(238), - "webs" => Some(239), - "what" => Some(240), - "when" => Some(241), - "whiz" => Some(242), - "wolf" => Some(243), - "work" => Some(244), - "yank" => Some(245), - "yawn" => Some(246), - "yell" => Some(247), - "yoga" => Some(248), - "yurt" => Some(249), - "zaps" => Some(250), - "zero" => Some(251), - "zest" => Some(252), - "zinc" => Some(253), - "zone" => Some(254), - "zoom" => Some(255), - _ => None, - } - } - }) - .collect::>>() - .ok_or(Error::InvalidWord)?, + parts + .map(encoded_byte) + .collect::>>() + .ok_or(Error::InvalidWord)?, ) } diff --git a/src/constants.rs b/src/constants.rs index f2221d4..6cd9429 100644 --- a/src/constants.rs +++ b/src/constants.rs @@ -69,3 +69,83 @@ pub const MINIMALS: [&str; 256] = [ "wt", "wn", "wz", "wf", "wk", "yk", "yn", "yl", "ya", "yt", "zs", "zo", "zt", "zc", "ze", "zm", ]; + +#[rustfmt::skip] +pub const BYTES_INDEXED_BY_HASH: [Option; 628] = [ + None, Some(101), None, None, None, None, Some(82), None, Some(41), + 
Some(89), Some(127), None, None, Some(17), Some(65), None, Some(27), + None, Some(124), None, Some(93), None, Some(57), None, Some(21), None, + None, Some(32), Some(85), None, None, None, Some(137), Some(56), None, + None, Some(59), None, Some(26), None, Some(123), Some(44), Some(96), + Some(138), None, None, None, Some(61), None, None, None, None, None, + None, Some(130), None, Some(106), Some(149), Some(58), Some(115), + Some(160), Some(75), Some(125), Some(48), Some(91), Some(141), None, + Some(105), Some(145), None, Some(111), None, Some(76), None, None, + None, None, None, Some(103), None, None, None, Some(155), Some(77), + Some(121), None, None, None, Some(50), None, None, Some(66), + Some(109), Some(159), Some(83), None, None, Some(97), None, + Some(177), None, Some(144), Some(67), Some(112), None, None, None, + None, Some(90), Some(140), None, None, None, Some(68), Some(114), + None, Some(79), Some(122), Some(162), None, None, None, Some(104), + None, Some(64), None, None, None, Some(118), None, None, None, + Some(168), None, Some(152), None, None, None, Some(81), Some(117), + None, None, Some(134), Some(175), None, None, None, Some(113), None, + None, None, None, None, Some(136), Some(176), None, None, Some(179), + Some(110), None, None, None, None, Some(94), Some(133), None, None, + Some(142), None, None, Some(153), None, Some(120), None, Some(194), + None, None, None, Some(146), None, None, None, Some(186), + Some(119), None, Some(203), Some(132), None, Some(102), None, None, + None, Some(158), Some(181), None, Some(166), None, Some(131), None, + Some(217), None, None, None, None, Some(188), None, None, None, + Some(135), Some(170), Some(214), None, None, Some(116), None, None, + None, Some(165), Some(193), Some(139), Some(174), None, Some(151), + None, Some(108), None, Some(185), None, None, Some(204), None, + Some(173), Some(210), Some(150), None, None, None, None, None, None, + Some(202), None, None, Some(211), Some(147), None, None, Some(156), + 
None, Some(230), None, None, None, None, None, None, None, None, + Some(157), Some(187), None, None, None, Some(129), None, None, + Some(148), None, Some(222), None, Some(183), None, Some(167), None, + None, None, None, None, None, None, None, None, Some(231), + Some(163), Some(197), None, Some(178), Some(208), Some(143), None, + None, None, Some(190), Some(228), None, None, None, Some(172), None, + None, None, None, None, Some(184), None, None, None, Some(234), + Some(169), Some(207), None, None, None, Some(154), Some(182), None, + None, Some(205), Some(237), None, Some(213), None, None, None, + Some(248), None, None, Some(164), Some(201), Some(243), None, None, + None, None, None, None, None, None, Some(161), Some(200), None, + None, Some(218), None, None, None, None, Some(191), None, None, + None, None, Some(4), Some(206), None, None, None, None, Some(192), + None, None, Some(195), None, Some(171), Some(216), None, None, + None, None, None, Some(227), Some(253), Some(196), None, None, + None, None, Some(14), Some(220), None, None, None, None, Some(199), + Some(244), Some(1), None, None, Some(20), None, None, None, None, + Some(254), None, Some(233), Some(0), Some(215), None, Some(180), + None, None, Some(29), Some(226), None, None, Some(235), None, + Some(209), None, Some(11), Some(223), None, Some(189), None, None, + Some(198), Some(241), None, None, None, Some(16), None, None, None, + None, None, Some(37), None, Some(5), None, None, None, Some(221), + Some(245), None, None, None, None, Some(236), None, None, None, + Some(18), None, Some(247), Some(28), Some(232), None, None, None, + None, None, None, None, None, None, Some(25), Some(224), None, + None, None, None, Some(212), None, None, None, Some(246), None, + None, None, Some(40), Some(239), None, Some(53), None, None, None, + None, Some(23), None, Some(255), None, Some(240), Some(6), None, + None, Some(10), Some(219), None, None, Some(229), None, None, None, + None, Some(52), None, None, None, None, None, 
Some(73), Some(251), + None, None, Some(2), None, None, None, Some(72), None, Some(30), + Some(225), None, Some(39), None, None, None, None, Some(12), + Some(69), None, Some(33), None, None, None, None, None, Some(49), + None, None, None, Some(249), Some(22), Some(80), None, Some(36), + Some(238), None, None, None, None, Some(70), None, Some(35), + Some(74), Some(250), Some(47), Some(242), Some(9), None, Some(99), + None, Some(63), None, None, None, Some(252), Some(46), Some(88), + Some(7), None, None, None, None, None, Some(34), None, None, + Some(43), Some(95), None, None, None, Some(15), None, None, None, + Some(84), None, None, Some(86), None, Some(55), Some(98), Some(13), + None, None, None, None, None, Some(45), Some(87), None, Some(54), + Some(100), None, Some(71), None, Some(24), None, None, None, + Some(92), Some(3), Some(51), None, None, Some(62), None, Some(31), + None, Some(126), Some(42), None, Some(8), None, None, Some(19), + Some(60), Some(107), None, Some(78), None, Some(38), None, + Some(128), +];