Support indefinite length bytestring

qnikst · qnikst · commit 4c1a0a322a6c · 2025-11-11T11:35:24.000+01:00
We support indefinite length bytestring in the Read module. In order to do it we add two extra variants for bigint deseralization, both puts us to the slow-path to start consumption of the chunks. This serialization is considered non-canonical thus canonical read rejects is. While specification is silent about it, the cbor.me implementation always converts bigit into a fixed size byte blob. Fixes #263.
diff --git a/cborg/cborg.cabal b/cborg/cborg.cabal
@@ -135,6 +135,7 @@ test-suite cborg-tests
     Tests.Regress
     Tests.Regress.Issue160
     Tests.Regress.Issue162
+    Tests.Regress.Issue263
     Tests.Regress.FlatTerm
     Tests.Reference
     Tests.Reference.Implementation
diff --git a/cborg/src/Codec/CBOR/Read.hs b/cborg/src/Codec/CBOR/Read.hs
@@ -537,6 +537,8 @@ go_fast !bs da@(ConsumeTag64Canonical k) =
 go_fast !bs da@(ConsumeInteger k) =
     case tryConsumeInteger (BS.unsafeHead bs) bs of
       DecodedToken sz (BigIntToken _ n) -> k n >>= go_fast (BS.unsafeDrop sz bs)
+      DecodedToken sz BigNIntNeedBytes -> go_fast_end (BS.unsafeDrop sz bs) (decodeBytesIndefLen (k . nintegerFromBytes) [])
+      DecodedToken sz BigUIntNeedBytes -> go_fast_end (BS.unsafeDrop sz bs) (decodeBytesIndefLen (k . uintegerFromBytes) [])
       _                                 -> go_fast_end bs da
 
 go_fast !bs da@(ConsumeFloat k) =
@@ -593,6 +595,8 @@ go_fast !bs da@(ConsumeSimple k) =
 go_fast !bs da@(ConsumeIntegerCanonical k) =
     case tryConsumeInteger (BS.unsafeHead bs) bs of
       DecodedToken sz (BigIntToken True n) -> k n >>= go_fast (BS.unsafeDrop sz bs)
+      DecodedToken _sz BigUIntNeedBytes    -> return $ SlowFail bs "non-canonical integer encoding"
+      DecodedToken _sz BigNIntNeedBytes    -> return $ SlowFail bs "non-canonical integer encoding"
       _                                    -> go_fast_end bs da
 
 
@@ -1037,6 +1041,9 @@ go_fast_end !bs (ConsumeInteger k) =
       DecodedToken sz (BigNIntNeedBody _ len) -> return $! SlowConsumeTokenBytes (BS.unsafeDrop sz bs) (adjustContBigNIntNeedBody k) len
       DecodedToken sz  BigUIntNeedHeader    -> return $! SlowDecodeAction      (BS.unsafeDrop sz bs) (adjustContBigUIntNeedHeader k)
       DecodedToken sz  BigNIntNeedHeader    -> return $! SlowDecodeAction      (BS.unsafeDrop sz bs) (adjustContBigNIntNeedHeader k)
+      DecodedToken sz  BigUIntNeedBytes -> go_fast_end (BS.unsafeDrop sz bs) (decodeBytesIndefLen (k . uintegerFromBytes) [])
+      DecodedToken sz  BigNIntNeedBytes -> go_fast_end (BS.unsafeDrop sz bs) (decodeBytesIndefLen (k . nintegerFromBytes) [])
+
 
 go_fast_end !bs (ConsumeFloat k) =
     case tryConsumeFloat (BS.unsafeHead bs) bs of
@@ -2696,12 +2703,26 @@ readBytes64 bs = case word64ToInt (eatTailWord64 bs) of
 -- Note that canonicity information is calculated lazily. This way we don't need
 -- to concern ourselves with two distinct paths, while according to benchmarks
 -- it doesn't affect performance in the non-canonical case.
+--
+-- According to the CBOR specification, big integers can be encoded as
+-- *indefinite-length* byte strings. To support this representation, we return
+-- the 'BigUIntNeedBytes' or 'BigNIntNeedBytes' constructors, which indicate that
+-- more multiple tockens are required before the integer can be fully reconstructed.
+-- In this case we fall back to the slow path.
+--
+-- Note that this is a non-canonical encoding of big integers. The canonical
+-- representation uses a definite-length byte string. Therefore, when canonical
+-- decoding is requested, indefinite-length values are rejected immediately.
+
 
 data BigIntToken a = BigIntToken     Bool {- canonical? -} Integer
                    | BigUIntNeedBody Bool {- canonical? -} Int
                    | BigNIntNeedBody Bool {- canonical? -} Int
                    | BigUIntNeedHeader
                    | BigNIntNeedHeader
+                   | BigUIntNeedBytes
+                   | BigNIntNeedBytes
+
 
 -- So when we have to break out because we can't read the whole bytes body
 -- in one go then we need to use SlowConsumeTokenBytes but we can adjust the
@@ -2714,7 +2735,6 @@ adjustContBigUIntNeedBody, adjustContBigNIntNeedBody
 
 adjustContBigUIntNeedBody k = \bs -> k $! uintegerFromBytes bs
 adjustContBigNIntNeedBody k = \bs -> k $! nintegerFromBytes bs
-
 adjustContCanonicalBigUIntNeedBody, adjustContCanonicalBigNIntNeedBody
   :: (Integer -> ST s (DecodeAction s a))
   -> (ByteString -> ST s (DecodeAction s a))
@@ -2770,7 +2790,9 @@ readBigUInt bs
     , not (BS.null bs')
     , let !hdr = BS.unsafeHead bs'
     , BS.length bs' >= tokenSize hdr
-    = case tryConsumeBytes hdr bs' of
+    = if isInfiniteBytes hdr
+      then DecodedToken 2 BigUIntNeedBytes
+      else case tryConsumeBytes hdr bs' of
         DecodeFailure                           -> DecodeFailure
         DecodedToken sz (Fits canonical bstr)   -> DecodedToken (1+sz)
           (BigIntToken (canonical && isBigIntRepCanonical bstr)
@@ -2788,7 +2810,9 @@ readBigNInt bs
     , not (BS.null bs')
     , let !hdr = BS.unsafeHead bs'
     , BS.length bs' >= tokenSize hdr
-    = case tryConsumeBytes hdr bs' of
+    = if isInfiniteBytes hdr
+      then DecodedToken 2 BigNIntNeedBytes
+      else case tryConsumeBytes hdr bs' of
         DecodeFailure                           -> DecodeFailure
         DecodedToken sz (Fits canonical bstr)   -> DecodedToken (1+sz)
           (BigIntToken (canonical && isBigIntRepCanonical bstr)
@@ -2805,3 +2829,15 @@ readBigNInt bs
 -- representation for the number in question).
 isBigIntRepCanonical :: ByteString -> Bool
 isBigIntRepCanonical bstr = BS.length bstr > 8 && BS.unsafeHead bstr /= 0x00
+
+-- | Check if the given bytes(*) header has an infinite length.
+isInfiniteBytes :: Word8 -> Bool
+isInfiniteBytes hdr = case word8ToWord hdr of
+  0x5f -> True
+  _ -> False
+
+decodeBytesIndefLen :: (BS.ByteString -> ST s (DecodeAction s a)) -> [BS.ByteString] -> DecodeAction s a
+decodeBytesIndefLen k acc = ConsumeBreakOr $ \isBreak ->
+    if isBreak
+    then k (LBS.toStrict $! LBS.fromChunks $! reverse acc)
+    else return $! ConsumeBytes $ \bs -> return (decodeBytesIndefLen k (bs:acc))
diff --git a/cborg/tests/Tests/Reference/Implementation.hs b/cborg/tests/Tests/Reference/Implementation.hs
@@ -65,6 +65,9 @@ module Tests.Reference.Implementation (
     prop_word32ToFromNet,
     prop_word64ToFromNet,
     prop_halfToFromFloat,
+
+    -- helper functions
+    integerToBytes,
     ) where
 
 
diff --git a/cborg/tests/Tests/Regress.hs b/cborg/tests/Tests/Regress.hs
@@ -6,6 +6,7 @@ import           Test.Tasty
 
 import qualified Tests.Regress.Issue160 as Issue160
 import qualified Tests.Regress.Issue162 as Issue162
+import qualified Tests.Regress.Issue263 as Issue263
 import qualified Tests.Regress.FlatTerm as FlatTerm
 
 --------------------------------------------------------------------------------
@@ -16,4 +17,5 @@ testTree = testGroup "Regression tests"
   [ FlatTerm.testTree
   , Issue160.testTree
   , Issue162.testTree
+  , Issue263.testTree
   ]
diff --git a/cborg/tests/Tests/Regress/Issue263.hs b/cborg/tests/Tests/Regress/Issue263.hs
@@ -0,0 +1,41 @@
+module Tests.Regress.Issue263 ( testTree ) where
+
+import           Data.Word
+import qualified Data.ByteString.Lazy as LBS
+import           Codec.CBOR.Read
+import           Codec.CBOR.Term (Term(..), decodeTerm)
+import           Test.Tasty
+import           Test.Tasty.HUnit
+import qualified Tests.Reference.Implementation as Reference
+
+mkRepr :: Integer -> [Word8]
+mkRepr int =
+    [ -- Tag(2), 0xc2 — positive bigint, 0xc3 — negative bigint
+      if int>=0 then 0xc2  else 0xc3
+      -- Indefinite-length byte string
+    , 0x5f
+      -- Bytes
+    ] ++ (let b = if int >0
+                  then Reference.integerToBytes int
+                  else Reference.integerToBytes (-(int+1))
+              l = Reference.lengthUInt b
+           in Reference.encodeToken (Reference.MT2_ByteString l b)) ++
+    [ 0xff ]
+
+shouldDecode :: Integer -> IO ()
+shouldDecode int =
+  case deserialiseFromBytes decodeTerm (LBS.pack (mkRepr int)) of
+    Left err -> fail ("Deserialisation failed for " ++ (show (mkRepr int)) ++ ": " ++ show err)
+    Right (b,x)
+      | LBS.null b -> (TInteger int) @=? x
+      | otherwise -> fail "Trailing bytes"
+
+
+testTree :: TestTree
+testTree =
+    testGroup "Issue 263 - bigint with indefinite length"
+        [ testCase "small bigint" $ shouldDecode 1231
+        , testCase "big bigint" $ shouldDecode 123123123123123123123123123
+        , testCase "small negative bigint" $ shouldDecode (-123)
+        , testCase "big negative bigint" $ shouldDecode (-12312312311231231231231234)
+        ]