Skip to content

Commit e8827f2

Browse files
authored
Improve querying by skipping blocks (#173)
* Optimize queryEntry to skip records * Update test spec * Rename decodeChunkEntries to streamChunkEntries * Rename recursive function variable
1 parent 177d9ed commit e8827f2

File tree

4 files changed

+65
-53
lines changed

4 files changed

+65
-53
lines changed

scls-format/src/Cardano/SCLS/Internal/Reader.hs

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ module Cardano.SCLS.Internal.Reader (
1919
knownNamespacedData,
2020

2121
-- * Low-level functions
22-
decodeChunkEntries,
22+
streamChunkEntries,
2323
) where
2424

2525
import Cardano.SCLS.Internal.Frame
@@ -51,13 +51,11 @@ import GHC.TypeLits (KnownSymbol)
5151
import Streaming qualified as S
5252
import Streaming.Prelude qualified as S
5353

54-
{- | Decode entries from chunk data into a stream.
55-
56-
This function provides a stream of the Chunk entries
54+
{- | This function provides a stream of the Chunk entries
5755
stored in the data field of the 'Chunk'.
5856
-}
59-
decodeChunkEntries :: (Typeable u, MemPack u) => BS.ByteString -> S.Stream (S.Of u) IO ()
60-
decodeChunkEntries = go
57+
streamChunkEntries :: (Typeable u, MemPack u) => BS.ByteString -> S.Stream (S.Of u) IO ()
58+
streamChunkEntries = go
6159
where
6260
go !bs
6361
| BS.null bs = pure ()
@@ -170,7 +168,7 @@ namespacedData handle namespace = stream
170168
fetchOffsetFrame handle next_record
171169
for_ (decodeFrame dataRecord) \chunkRecord -> do
172170
when (chunkNamespace (frameViewContent (chunkRecord)) == namespace) do
173-
decodeChunkEntries (chunkData $ frameViewContent chunkRecord)
171+
streamChunkEntries (chunkData $ frameViewContent chunkRecord)
174172
go next_record
175173

176174
knownNamespacedData :: forall ns. (KnownSymbol ns, KnownNamespace ns) => Handle -> Proxy ns -> S.Stream (S.Of (ChunkEntry (NamespaceKey ns) (NamespaceEntry ns))) IO ()
Lines changed: 45 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,42 +1,68 @@
1+
{-# LANGUAGE BlockArguments #-}
12
{-# LANGUAGE LambdaCase #-}
23

34
module Cardano.SCLS.Query (queryEntry) where
45

5-
import Cardano.SCLS.Internal.Reader (withNamespacedData)
6+
import Cardano.SCLS.Internal.Frame (FrameView (frameViewContent), decodeFrame, fetchNextFrame, fetchOffsetFrame, headerOffset)
7+
import Cardano.SCLS.Internal.Reader (streamChunkEntries)
8+
import Cardano.SCLS.Internal.Record.Chunk (Chunk (chunkData, chunkNamespace))
69
import Cardano.SCLS.Internal.Serializer.HasKey (HasKey (..))
710
import Cardano.Types.Namespace (Namespace)
11+
import Control.Monad (mfilter)
12+
import Control.Monad.Fix (fix)
13+
import Data.Functor ((<&>))
814
import Data.MemPack (MemPack)
915
import Data.Typeable (Typeable)
1016
import Streaming.Prelude qualified as S
17+
import System.IO (IOMode (ReadMode), SeekMode (AbsoluteSeek), hSeek, withBinaryFile)
1118

1219
{- | Query for a specific entry by namespace and key.
1320
1421
This function searches for an entry with the given key within the specified
15-
namespace in the SCLS data directory. It leverages the assumption that entries
16-
are stored in sorted order by key, allowing for early termination once a key
17-
greater than or equal to the target is found.
22+
namespace and SCLS file. It leverages the assumption that entries are stored
23+
in sorted order by key, allowing for early termination once a key greater than
24+
or equal to the target is found.
1825
-}
1926
queryEntry ::
2027
(HasKey a, MemPack a, Typeable a) =>
21-
-- | The directory path containing the SCLS data files
28+
-- | The path for the SCLS file
2229
FilePath ->
2330
-- | The namespace to search within
2431
Namespace ->
2532
-- | The key of the entry to find
2633
Key a ->
2734
-- | Returns @Just entry@ if an entry with the exact key is found, @Nothing@ otherwise
2835
IO (Maybe a)
29-
queryEntry dir ns key = do
30-
withNamespacedData dir ns $ \stream -> do
31-
value :: Maybe a <-
32-
S.head_ $
33-
S.dropWhile
34-
( \entry ->
35-
let k = getKey entry
36-
in k < key
37-
)
38-
stream
39-
case value of
40-
Just entry | getKey entry == key -> pure $ Just entry
41-
Nothing -> pure Nothing
42-
Just _ -> pure Nothing
36+
queryEntry filePath namespace key = do
37+
withBinaryFile
38+
filePath
39+
ReadMode
40+
query
41+
where
42+
query handle = do
43+
hSeek handle AbsoluteSeek 0
44+
flip fix (headerOffset, S.each []) \go (offset, prev) -> do
45+
fetchNextFrame handle offset >>= \case
46+
Nothing ->
47+
-- No more records. If the entry exists, it must be on the previous record.
48+
findInStream prev
49+
Just nextOffset -> do
50+
dataRecord <- fetchOffsetFrame handle nextOffset
51+
case frameViewContent <$> decodeFrame dataRecord of
52+
Right chunk
53+
| chunkNamespace chunk == namespace -> do
54+
let entries = streamChunkEntries (chunkData chunk)
55+
S.head_ entries >>= \case
56+
Just entry
57+
| (getKey entry == key) -> pure $ Just entry
58+
| (getKey entry > key) ->
59+
-- The key of the current block's first entry is bigger than the queried key.
60+
-- If the entry exists, it must be on the previous record.
61+
findInStream prev
62+
| otherwise -> go (nextOffset, entries)
63+
Nothing -> go (nextOffset, prev)
64+
_ -> go (nextOffset, prev)
65+
66+
findInStream stream = do
67+
S.head_ (S.dropWhile ((< key) . getKey) stream)
68+
<&> mfilter ((== key) . getKey)

scls-format/test/QuerySpec.hs

Lines changed: 12 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -80,8 +80,7 @@ basicTests =
8080

8181
serializeTestData fileName [(ns, entries)]
8282

83-
result <- queryEntry fileName ns key
84-
result `shouldBe` Just (mkTestEntry 100 1 value)
83+
queryEntry fileName ns key `shouldReturn` Just (mkTestEntry 100 1 value)
8584

8685
it "returns Nothing when entry doesn't exist" $
8786
withSystemTempDirectory "query-test-XXXXXX" $ \tmpDir -> do
@@ -92,8 +91,7 @@ basicTests =
9291

9392
serializeTestData fileName [(ns, entries)]
9493

95-
result <- queryEntry @TestEntry fileName ns searchKey
96-
result `shouldBe` Nothing
94+
queryEntry @TestEntry fileName ns searchKey `shouldReturn` Nothing
9795

9896
it "finds entry among multiple entries" $
9997
withSystemTempDirectory "query-test-XXXXXX" $ \tmpDir -> do
@@ -109,8 +107,7 @@ basicTests =
109107

110108
serializeTestData fileName [(ns, entries)]
111109

112-
result <- queryEntry fileName ns searchKey
113-
result `shouldBe` Just (mkTestEntry 150 5 value)
110+
queryEntry fileName ns searchKey `shouldReturn` Just (mkTestEntry 150 5 value)
114111

115112
edgeCaseTests :: Spec
116113
edgeCaseTests =
@@ -123,8 +120,7 @@ edgeCaseTests =
123120

124121
serializeTestData fileName [(ns, [])]
125122

126-
result <- queryEntry @TestEntry fileName ns searchKey
127-
result `shouldBe` Nothing
123+
queryEntry @TestEntry fileName ns searchKey `shouldReturn` Nothing
128124

129125
it "returns Nothing for non-existent namespace" $
130126
withSystemTempDirectory "query-test-XXXXXX" $ \tmpDir -> do
@@ -136,8 +132,7 @@ edgeCaseTests =
136132

137133
serializeTestData fileName [(existingNs, entries)]
138134

139-
result <- queryEntry @TestEntry fileName nonExistentNs searchKey
140-
result `shouldBe` Nothing
135+
queryEntry @TestEntry fileName nonExistentNs searchKey `shouldReturn` Nothing
141136

142137
it "returns first match when there are duplicate keys" $
143138
withSystemTempDirectory "query-test-XXXXXX" $ \tmpDir -> do
@@ -153,8 +148,7 @@ edgeCaseTests =
153148

154149
serializeTestData fileName [(ns, entries)]
155150

156-
result <- queryEntry fileName ns searchKey
157-
result `shouldBe` Just (mkTestEntry 100 1 firstValue)
151+
queryEntry fileName ns searchKey `shouldReturn` Just (mkTestEntry 100 1 firstValue)
158152

159153
it "finds single entry in namespace" $
160154
withSystemTempDirectory "query-test-XXXXXX" $ \tmpDir -> do
@@ -166,8 +160,7 @@ edgeCaseTests =
166160

167161
serializeTestData fileName [(ns, entries)]
168162

169-
result <- queryEntry fileName ns searchKey
170-
result `shouldBe` Just (mkTestEntry 42 7 value)
163+
queryEntry fileName ns searchKey `shouldReturn` Just (mkTestEntry 42 7 value)
171164

172165
multiNamespaceTests :: Spec
173166
multiNamespaceTests =
@@ -185,11 +178,9 @@ multiNamespaceTests =
185178

186179
serializeTestData fileName [(ns1, entries1), (ns2, entries2)]
187180

188-
result1 <- queryEntry fileName ns1 searchKey
189-
result1 `shouldBe` Just (mkTestEntry 100 1 value1)
181+
queryEntry fileName ns1 searchKey `shouldReturn` Just (mkTestEntry 100 1 value1)
190182

191-
result2 <- queryEntry fileName ns2 searchKey
192-
result2 `shouldBe` Just (mkTestEntry 100 1 value2)
183+
queryEntry fileName ns2 searchKey `shouldReturn` Just (mkTestEntry 100 1 value2)
193184

194185
it "correctly isolates namespaces - entry exists in one but not another" $
195186
withSystemTempDirectory "query-test-XXXXXX" $ \tmpDir -> do
@@ -202,11 +193,9 @@ multiNamespaceTests =
202193

203194
serializeTestData fileName [(ns1, entries1), (ns2, entries2)]
204195

205-
result1 <- queryEntry fileName ns1 searchKey
206-
result1 `shouldBe` Just (mkTestEntry 100 1 "exists-here")
196+
queryEntry fileName ns1 searchKey `shouldReturn` Just (mkTestEntry 100 1 "exists-here")
207197

208-
result2 <- queryEntry @TestEntry fileName ns2 searchKey
209-
result2 `shouldBe` Nothing
198+
queryEntry @TestEntry fileName ns2 searchKey `shouldReturn` Nothing
210199

211200
it "handles many namespaces efficiently" $
212201
withSystemTempDirectory "query-test-XXXXXX" $ \tmpDir -> do
@@ -222,5 +211,4 @@ multiNamespaceTests =
222211

223212
serializeTestData fileName namespaces
224213

225-
result <- queryEntry fileName searchNs searchKey
226-
result `shouldBe` Just (mkTestEntry 5 1 (BS.pack [5]))
214+
queryEntry fileName searchNs searchKey `shouldReturn` Just (mkTestEntry 5 1 (BS.pack [5]))

scls-util/src/Cardano/SCLS/Util/Check.hs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ import Cardano.SCLS.CDDL (NamespaceInfo (..), namespaces)
2323
import Cardano.SCLS.Internal.Entry.CBOREntry (GenericCBOREntry (..))
2424
import Cardano.SCLS.Internal.Entry.ChunkEntry (ChunkEntry (..))
2525
import Cardano.SCLS.Internal.Hash (Digest (..), digest)
26-
import Cardano.SCLS.Internal.Reader (decodeChunkEntries, extractNamespaceList, withRecordData)
26+
import Cardano.SCLS.Internal.Reader (extractNamespaceList, streamChunkEntries, withRecordData)
2727
import Cardano.SCLS.Internal.Record.Chunk (Chunk (..))
2828
import Cardano.SCLS.Util.Result
2929
import Cardano.Types.Namespace (Namespace)
@@ -201,7 +201,7 @@ validateChunk cddlTrees Chunk{..} = do
201201
-- We do not known how to decode values inside, so we just read the data
202202
-- this way we can calculate count and check digest
203203
actualCount <-
204-
decodeChunkEntries @(Entry RawBytes) chunkData & S.length_
204+
streamChunkEntries @(Entry RawBytes) chunkData & S.length_
205205
pure $
206206
if actualCount /= fromIntegral chunkEntriesCount
207207
then [EntryCountMismatch chunkEntriesCount actualCount]
@@ -210,7 +210,7 @@ validateChunk cddlTrees Chunk{..} = do
210210
withSomeSNat keySize \(snat :: SNat n) -> do
211211
withKnownNat snat do
212212
(actualCount S.:> formatErrors S.:> ()) <-
213-
decodeChunkEntries @(GenericCBOREntry n) chunkData
213+
streamChunkEntries @(GenericCBOREntry n) chunkData
214214
& S.copy
215215
& S.zip (S.enumFrom 1)
216216
& S.mapMaybe (validateAgainst spec)

0 commit comments

Comments
 (0)