From ba925210a3bf30bf9b6f7155d57e4067df602384 Mon Sep 17 00:00:00 2001 From: Matthijs van Otterdijk Date: Fri, 1 Sep 2023 16:33:08 +0200 Subject: [PATCH 1/2] Working on bson support --- Cargo.toml | 3 +- src/structure/tfc/block.rs | 4 + src/structure/tfc/datatypes.rs | 113 +++++++++++++++++++++++++ src/structure/tfc/typed.rs | 150 ++++++++++++++++++++++++++++++++- 4 files changed, 268 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 2ae7bfce..4dc83bca 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -36,7 +36,8 @@ regex = "1.5" lru = "0.10" bitvec = "1.0" tempfile = "3.1" +dec = "0.4.8" [features] noreadlock = [] -eprint_log = [] \ No newline at end of file +eprint_log = [] diff --git a/src/structure/tfc/block.rs b/src/structure/tfc/block.rs index 9172a8f4..3e1cbbcd 100644 --- a/src/structure/tfc/block.rs +++ b/src/structure/tfc/block.rs @@ -751,6 +751,8 @@ fn record_size_decoding(enc: u8) -> Option { 0 => None, 3 => Some(4), 4 => Some(8), + 5 => Some(12), + 6 => Some(16), _ => panic!("Ok, this is not known"), } } @@ -760,6 +762,8 @@ fn record_size_encoding(record_size: Option) -> u8 { None => 0, Some(4) => 3 << 3, Some(8) => 4 << 3, + Some(12) => 5 << 3, + Some(16) => 6 << 3, _ => { panic!("This is really bad!") } diff --git a/src/structure/tfc/datatypes.rs b/src/structure/tfc/datatypes.rs index a9b11234..64fa291d 100644 --- a/src/structure/tfc/datatypes.rs +++ b/src/structure/tfc/datatypes.rs @@ -8,6 +8,7 @@ use base64::display::Base64Display; use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; use bytes::{Buf, BufMut, Bytes, BytesMut}; use chrono::{NaiveDateTime, NaiveTime}; +use dec::Decimal128; use num_derive::FromPrimitive; use rug::Integer; @@ -59,6 +60,14 @@ pub enum Datatype { Base64Binary, HexBinary, AnySimpleType, + + Decimal128, + BSONObjectId, + TimeStamp64, + BSONTimeStamp, + Regex, + Javascript, + BSONBinary, } impl Datatype { @@ -84,6 +93,10 @@ impl Datatype { Datatype::BigInt => None, Datatype::Token => None, Datatype::LangString => None, + Datatype::Decimal128 => Some(16), + Datatype::BSONObjectId => Some(12), + Datatype::TimeStamp64 => Some(8), + Datatype::BSONTimeStamp => Some(8), _ => None, } } @@ -998,6 +1011,67 @@ impl TdbDataType for HexBinary { } } +pub struct BSONObjectId([u8; 12]); + +impl FromLexical for BSONObjectId { + fn from_lexical(mut b: B) -> Self { + let mut result = [0; 12]; + b.copy_to_slice(&mut result); + + BSONObjectId(result) + } +} + +impl ToLexical for BSONObjectId { + fn to_lexical(&self) -> Bytes { + Bytes::copy_from_slice(&self.0) + } +} + +impl ToLexical for [u8; 12] { + fn to_lexical(&self) -> Bytes { + Bytes::copy_from_slice(self) + } +} + +impl TdbDataType for BSONObjectId { + fn datatype() -> Datatype { + Datatype::BSONObjectId + } +} + +const DEC128_SIGN_MASK: u128 = 0x8000_0000_0000_0000_0000_0000_0000_0000; +const DEC128_COMPLEMENT: u128 = 0xffff_ffff_ffff_ffff_ffff_ffff_ffff_ffff; + +impl ToLexical for Decimal128 { + fn to_lexical(&self) -> Bytes { + let bits: u128 = u128::from_be_bytes(self.to_be_bytes()); + let transformed = if bits & DEC128_SIGN_MASK > 0 { + bits ^ DEC128_COMPLEMENT + } else { + bits ^ DEC128_SIGN_MASK + }; + Bytes::copy_from_slice(&transformed.to_be_bytes()) + } +} + +impl FromLexical for Decimal128 { + fn from_lexical(mut b: B) -> Self { + let i = b.get_u128(); + if i & DEC128_SIGN_MASK > 0 { + Decimal128::from_be_bytes((i ^ DEC128_SIGN_MASK).to_be_bytes()) + } else { + Decimal128::from_be_bytes((i ^ DEC128_COMPLEMENT).to_be_bytes()) + } + } +} + +impl TdbDataType for Decimal128 { + fn datatype() -> Datatype { + Datatype::Decimal128 + } +} + macro_rules! stringy_type { ($ty:ident) => { stringy_type!($ty, $ty); @@ -1082,6 +1156,39 @@ macro_rules! biginty_type { }; } +macro_rules! u64y_type { + ($ty:ident) => { + u64y_type!($ty, $ty); + }; + ($ty:ident, $datatype:ident) => { + #[derive(PartialEq, Debug)] + pub struct $ty(pub u64); + + impl TdbDataType for $ty { + fn datatype() -> Datatype { + Datatype::$datatype + } + } + + impl FromLexical<$ty> for $ty { + fn from_lexical(b: B) -> Self { + $ty(FromLexical::::from_lexical(b)) + } + } + impl FromLexical<$ty> for u64 { + fn from_lexical(b: B) -> Self { + FromLexical::::from_lexical(b) + } + } + + impl ToLexical<$ty> for $ty { + fn to_lexical(&self) -> Bytes { + self.0.to_lexical() + } + } + }; +} + stringy_type!(LangString); stringy_type!(NCName); stringy_type!(Name); @@ -1098,7 +1205,13 @@ stringy_type!(Entity); stringy_type!(AnySimpleType); +stringy_type!(Regex); +stringy_type!(Javascript); + biginty_type!(PositiveInteger); biginty_type!(NonNegativeInteger); biginty_type!(NegativeInteger); biginty_type!(NonPositiveInteger); + +u64y_type!(TimeStamp64); +u64y_type!(BSONTimeStamp); diff --git a/src/structure/tfc/typed.rs b/src/structure/tfc/typed.rs index 2494d07d..b24859ea 100644 --- a/src/structure/tfc/typed.rs +++ b/src/structure/tfc/typed.rs @@ -500,9 +500,10 @@ impl TypedDictBufBuilder = dict.iter().collect(); + eprintln!("{entries:?}"); + eprintln!("{data:?}"); + panic!("wah"); + } + + #[test] + fn test_bson_objectid() { + let used_types_buf = BytesMut::new(); + let type_offsets_buf = BytesMut::new(); + let block_offsets_buf = BytesMut::new(); + let data_buf = BytesMut::new(); + + let mut typed_builder = TypedDictBufBuilder::new( + used_types_buf, + type_offsets_buf, + block_offsets_buf, + data_buf, + ); + + let mut vec = vec![ + BSONObjectId::make_entry(&[42; 12]), + BSONObjectId::make_entry(&[43; 12]), + BSONObjectId::make_entry(&[44; 12]), + BSONObjectId::make_entry(&[41; 12]), + BSONObjectId::make_entry(&[25; 12]), + ]; + vec.sort(); + typed_builder.add_all(vec.into_iter()); + let (b1, b2, b3, b4) = typed_builder.finalize(); + let data = b4.freeze(); + let dict = TypedDict::from_parts(b1.freeze(), b2.freeze(), b3.freeze(), data.clone()); + + let entries: Vec<_> = dict.iter().collect(); + eprintln!("{entries:?}"); + eprintln!("{data:?}"); + panic!("wah"); + } + + #[test] + fn test_bson_objectid_overlap() { + let used_types_buf = BytesMut::new(); + let type_offsets_buf = BytesMut::new(); + let block_offsets_buf = BytesMut::new(); + let data_buf = BytesMut::new(); + + let mut typed_builder = TypedDictBufBuilder::new( + used_types_buf, + type_offsets_buf, + block_offsets_buf, + data_buf, + ); + + let mut ids = [[42; 12], [43; 12], [44; 12], [41; 12], [25; 12]]; + for id in ids.iter_mut() { + id[0] = 42; + } + + let mut vec: Vec<_> = ids.iter().map(BSONObjectId::make_entry).collect(); + vec.sort(); + typed_builder.add_all(vec.into_iter()); + let (b1, b2, b3, b4) = typed_builder.finalize(); + let data = b4.freeze(); + let dict = TypedDict::from_parts(b1.freeze(), b2.freeze(), b3.freeze(), data.clone()); + + let entries: Vec<_> = dict.iter().collect(); + eprintln!("{entries:?}"); + eprintln!("{data:?}"); + panic!("wah"); + } + + #[test] + fn test_decimal128() { + let used_types_buf = BytesMut::new(); + let type_offsets_buf = BytesMut::new(); + let block_offsets_buf = BytesMut::new(); + let data_buf = BytesMut::new(); + + let mut typed_builder = TypedDictBufBuilder::new( + used_types_buf, + type_offsets_buf, + block_offsets_buf, + data_buf, + ); + + let numbers: Vec = [ + "0.1", + "2.3", + "0.00000028", + "1000000", + "4.2", + "-1.3", + "-12", + "-0.0000005", + ] + .iter() + .map(|n| n.parse().unwrap()) + .collect(); + + let mut entries: Vec<_> = numbers.iter().map(Decimal128::make_entry).collect(); + entries.sort(); + + typed_builder.add_all(entries.into_iter()); + let (b1, b2, b3, b4) = typed_builder.finalize(); + let data = b4.freeze(); + let dict = TypedDict::from_parts(b1.freeze(), b2.freeze(), b3.freeze(), data.clone()); + + let entries: Vec<_> = dict.iter().collect(); + eprintln!("{entries:?}"); + eprintln!("{data:?}"); + eprintln!( + "{:?}", + entries + .iter() + .map(|e| e.as_val()) + .collect::>() + ); + panic!("wah"); + } } From ce644bcf6a5f8a81ecc121db8da9f382d2df8461 Mon Sep 17 00:00:00 2001 From: Matthijs van Otterdijk Date: Tue, 5 Sep 2023 11:57:38 +0200 Subject: [PATCH 2/2] switch to bson decimal128 --- Cargo.toml | 2 +- src/structure/tfc/datatypes.rs | 8 ++++---- src/structure/tfc/typed.rs | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 4dc83bca..a4e3757c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -36,7 +36,7 @@ regex = "1.5" lru = "0.10" bitvec = "1.0" tempfile = "3.1" -dec = "0.4.8" +bson = "2.7" [features] noreadlock = [] diff --git a/src/structure/tfc/datatypes.rs b/src/structure/tfc/datatypes.rs index 64fa291d..d0dccf91 100644 --- a/src/structure/tfc/datatypes.rs +++ b/src/structure/tfc/datatypes.rs @@ -5,10 +5,10 @@ use super::{ TypedDictEntry, }; use base64::display::Base64Display; +use bson::Decimal128; use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; use bytes::{Buf, BufMut, Bytes, BytesMut}; use chrono::{NaiveDateTime, NaiveTime}; -use dec::Decimal128; use num_derive::FromPrimitive; use rug::Integer; @@ -1045,7 +1045,7 @@ const DEC128_COMPLEMENT: u128 = 0xffff_ffff_ffff_ffff_ffff_ffff_ffff_ffff; impl ToLexical for Decimal128 { fn to_lexical(&self) -> Bytes { - let bits: u128 = u128::from_be_bytes(self.to_be_bytes()); + let bits: u128 = u128::from_le_bytes(self.bytes()); let transformed = if bits & DEC128_SIGN_MASK > 0 { bits ^ DEC128_COMPLEMENT } else { @@ -1059,9 +1059,9 @@ impl FromLexical for Decimal128 { fn from_lexical(mut b: B) -> Self { let i = b.get_u128(); if i & DEC128_SIGN_MASK > 0 { - Decimal128::from_be_bytes((i ^ DEC128_SIGN_MASK).to_be_bytes()) + Decimal128::from_bytes((i ^ DEC128_SIGN_MASK).to_le_bytes()) } else { - Decimal128::from_be_bytes((i ^ DEC128_COMPLEMENT).to_be_bytes()) + Decimal128::from_bytes((i ^ DEC128_COMPLEMENT).to_le_bytes()) } } } diff --git a/src/structure/tfc/typed.rs b/src/structure/tfc/typed.rs index b24859ea..b3ab57c9 100644 --- a/src/structure/tfc/typed.rs +++ b/src/structure/tfc/typed.rs @@ -498,9 +498,9 @@ impl TypedDictBufBuilder>() + .map(|e| e.as_val::().to_string()) + .collect::>() ); panic!("wah"); }