Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit ef9aec2

Browse files
simonhollis and facebook-github-bot
authored and committed Mar 24, 2025 ·
Add schema IDs to glean merged files
Summary: Ensure that all fact files merged using `glean merge` have a schema ID, and that the ID is the same across all input and output files. ## Part 1) Support consistent schema IDs when importing a facts JSON into an existing DB, which has the authoritative schema to align with. NOTE: This approach relies on JSON fact files implementing the `schema_id` metadata attribute, as implemented in T214992259. This limits which indexers' JSON fact files can be merged to those that implement `schema_id`. At the moment, C++ is OK, for example, but Python is not. ## Part 2) Merge multiple JSON files into a single JSON file type merge using the inventory mechanism # This diff implements... This diff adds schema ID assertions to implement Part 1) Reviewed By: pepeiborra Differential Revision: D71329107 fbshipit-source-id: e7276e2546edf11a41ace05eb3f69738d95d9cc6
1 parent 33ef03e commit ef9aec2

File tree

1 file changed

+20
-3
lines changed

1 file changed

+20
-3
lines changed
 

‎glean/tools/gleancli/GleanCLI/Merge.hs

+20-3
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import System.Process
2222

2323
import Control.Concurrent.Stream
2424
import Util.OptParse
25+
import Util.Log
2526
import Thrift.Protocol.Compact
2627

2728
import Glean.LocalOrRemote (loadDbSchema)
@@ -41,6 +42,7 @@ import Glean.Write (fileToBatches, schemaIdToOpts)
4142
import Glean.Write.JSON (buildJsonBatch)
4243
import System.Directory.Extra (listFiles)
4344

45+
4446
data MergeCommand = MergeCommand
4547
{ mergeFiles :: [FilePath]
4648
, mergeFileSize :: Int
@@ -82,6 +84,7 @@ instance Plugin MergeCommand where
8284
dbSchema <- Glean.withBackendWithDefaultOptions
8385
_evb _cfgAPI _svc Nothing $ \backend -> do
8486
loadDbSchema backend repo
87+
logInfo("db's schema ID is: " <> show(schemaId dbSchema))
8588
return (schemaInventory dbSchema, Just dbSchema)
8689
Right mergeInventory -> do
8790
inventory <- Inventory.deserialize <$> B.readFile mergeInventory
@@ -141,16 +144,30 @@ instance Plugin MergeCommand where
141144
where
142145
read :: FilePath -> Int -> FactSet -> IO FactOwnership
143146
read file size factSet = do
144-
hPutStrLn stderr $ "Reading " <> file <> " (" <> show size <> ")"
147+
logInfo $ "Reading " <> file <> " (" <> show size <> " bytes)"
145148
batch <- case fileFormat of
146149
JsonFormat -> do
147150
case dbSchema of
148151
Nothing -> throwIO $ ErrorCall $
149152
"No db schema to serialize json format file. "
150153
<> "Please specify the database"
151154
Just schema -> do
152-
(batches, schema_id) <- fileToBatches file
153-
buildJsonBatch schema (schemaIdToOpts schema_id) batches
155+
(batches, schema_id_file) <- fileToBatches file
156+
if Just(schemaId schema) == schema_id_file then
157+
logInfo(
158+
"Schema matches with db schema. Merging data from "
159+
<> file
160+
)
161+
else
162+
throwIO $ ErrorCall $
163+
"ERROR - ABORTING MERGE\nSchema ID mismatch:\ndb: "
164+
<> show(schemaId schema) <> "\nvs\nFile: "
165+
<> file <> " has " <> show schema_id_file
166+
let getSchemaId theschema = Just(schemaId theschema) in
167+
buildJsonBatch schema
168+
(schemaIdToOpts $ getSchemaId schema) batches
169+
170+
154171
BinaryFormat -> do
155172
bytes <- B.readFile file
156173
case deserializeCompact bytes of

0 commit comments

Comments
 (0)
Please sign in to comment.