22
22
import java .util .Arrays ;
23
23
import java .util .Map ;
24
24
import java .util .Set ;
25
+ import java .util .function .IntPredicate ;
25
26
import java .util .function .IntUnaryOperator ;
26
- import java .util .stream .Collectors ;
27
27
import java .util .stream .IntStream ;
28
+ import javax .annotation .Nullable ;
28
29
29
30
import com .google .common .annotations .VisibleForTesting ;
30
31
import com .google .common .collect .BiMap ;
34
35
35
36
import io .github .jbellis .jvector .graph .RandomAccessVectorValues ;
36
37
import io .github .jbellis .jvector .graph .disk .OrdinalMapper ;
38
+ import io .github .jbellis .jvector .util .FixedBitSet ;
37
39
import io .github .jbellis .jvector .vector .types .VectorFloat ;
38
40
import org .agrona .collections .Int2IntHashMap ;
39
41
import org .agrona .collections .Int2ObjectHashMap ;
@@ -101,12 +103,60 @@ public V5VectorPostingsWriter(RemappedPostings remappedPostings)
101
103
this .remappedPostings = remappedPostings ;
102
104
}
103
105
104
- public V5VectorPostingsWriter (Structure structure , int graphSize , Map <VectorFloat <?>, VectorPostings .CompactionVectorPostings > postingsMap )
106
+ /**
107
+ * This method describes the mapping done during construction of the graph so that we can easily create
108
+ * an appropriate V5VectorPostingsWriter. No ordinal remapping is performed because (V5) compaction writes
109
+ * vectors to disk as they are added to the graph, so there is no opportunity to reorder the way there is
110
+ * in a Memtable index.
111
+ */
112
+ public static RemappedPostings describeForCompaction (Structure structure , int graphSize , Map <VectorFloat <?>, VectorPostings .CompactionVectorPostings > postingsMap )
105
113
{
114
+ assert !postingsMap .isEmpty (); // flush+compact should skip writing an index component in this case
115
+
106
116
if (structure == Structure .ONE_TO_ONE )
107
- remappedPostings = new RemappedPostings (Structure .ONE_TO_ONE , graphSize - 1 , graphSize - 1 , null , null );
108
- else
109
- remappedPostings = remapPostings (postingsMap );
117
+ {
118
+ return new RemappedPostings (Structure .ONE_TO_ONE ,
119
+ graphSize - 1 ,
120
+ graphSize - 1 ,
121
+ null ,
122
+ null ,
123
+ new OrdinalMapper .IdentityMapper (graphSize - 1 ));
124
+ }
125
+
126
+ if (structure == Structure .ONE_TO_MANY )
127
+ {
128
+ // compute maxOldOrdinal, maxRow, and extraOrdinals from the postingsMap
129
+ int maxOldOrdinal = Integer .MIN_VALUE ;
130
+ int maxRow = Integer .MIN_VALUE ;
131
+ var extraOrdinals = new Int2IntHashMap (Integer .MIN_VALUE );
132
+ for (var entry : postingsMap .entrySet ())
133
+ {
134
+ var postings = entry .getValue ();
135
+ int ordinal = postings .getOrdinal ();
136
+
137
+ maxOldOrdinal = Math .max (maxOldOrdinal , ordinal );
138
+ var rowIds = postings .getRowIds ();
139
+ assert ordinal == rowIds .getInt (0 ); // synthetic ordinals not allowed in ONE_TO_MANY
140
+ for (int i = 0 ; i < rowIds .size (); i ++)
141
+ {
142
+ int rowId = rowIds .getInt (i );
143
+ maxRow = Math .max (maxRow , rowId );
144
+ if (i > 0 )
145
+ extraOrdinals .put (rowId , ordinal );
146
+ }
147
+ }
148
+
149
+ var skippedOrdinals = extraOrdinals .keySet ();
150
+ return new RemappedPostings (Structure .ONE_TO_MANY ,
151
+ maxOldOrdinal ,
152
+ maxRow ,
153
+ null ,
154
+ extraOrdinals ,
155
+ new OmissionAwareIdentityMapper (maxOldOrdinal , skippedOrdinals ::contains ));
156
+ }
157
+
158
+ assert structure == Structure .ZERO_OR_ONE_TO_MANY : structure ;
159
+ return createGenericIdentityMapping (postingsMap );
110
160
}
111
161
112
162
public long writePostings (SequentialWriter writer ,
@@ -167,7 +217,6 @@ private void writeOneToManyOrdinalMapping(SequentialWriter writer) throws IOExce
167
217
writer .writeInt (newOrdinal );
168
218
writer .writeInt (0 );
169
219
entries ++;
170
- assert !ordinalToExtraRowIds .containsKey (oldOrdinal );
171
220
continue ;
172
221
}
173
222
@@ -204,8 +253,8 @@ private void writeOneToManyRowIdMapping(SequentialWriter writer) throws IOExcept
204
253
writer .writeInt (rowId );
205
254
writer .writeInt (remappedPostings .ordinalMapper .oldToNew (originalOrdinal ));
206
255
// validate that we do in fact have contiguous rowids in the non-extra mapping
207
- for ( int j = lastExtraRowId + 1 ; j < rowId ; j ++ )
208
- assert remappedPostings .ordinalMap . inverse (). containsKey ( j ) ;
256
+ assert IntStream . range ( lastExtraRowId + 1 , rowId )
257
+ . allMatch ( j -> remappedPostings .ordinalMapper . newToOld ( j ) != OrdinalMapper . OMITTED ) : "Non-contiguous rowids found in non-extra mapping" ;
209
258
lastExtraRowId = rowId ;
210
259
}
211
260
@@ -314,63 +363,32 @@ public static class RemappedPostings
314
363
public final int maxNewOrdinal ;
315
364
/** the largest rowId in the postings (inclusive) */
316
365
public final int maxRowId ;
317
- /** map from original vector ordinal to rowId that will be its new, remapped ordinal */
318
- private final BiMap <Integer , Integer > ordinalMap ;
319
366
/** map from rowId to [original] vector ordinal */
367
+ @ Nullable
320
368
private final Int2IntHashMap extraPostings ;
321
369
/** public api */
322
370
public final OrdinalMapper ordinalMapper ;
323
371
324
- public RemappedPostings (Structure structure , int maxNewOrdinal , int maxRowId , BiMap <Integer , Integer > ordinalMap , Int2IntHashMap extraPostings )
372
+ /** visible for V2VectorPostingsWriter.remapPostings, everyone else should use factory methods */
373
+ public RemappedPostings (Structure structure , int maxNewOrdinal , int maxRowId , BiMap <Integer , Integer > ordinalMap , Int2IntHashMap extraPostings , OrdinalMapper ordinalMapper )
325
374
{
326
- assert structure == Structure .ONE_TO_ONE || structure == Structure .ONE_TO_MANY ;
327
375
this .structure = structure ;
328
376
this .maxNewOrdinal = maxNewOrdinal ;
329
377
this .maxRowId = maxRowId ;
330
- this .ordinalMap = ordinalMap ;
331
378
this .extraPostings = extraPostings ;
332
- ordinalMapper = new OrdinalMapper ()
333
- {
334
- @ Override
335
- public int maxOrdinal ()
336
- {
337
- return maxNewOrdinal ;
338
- }
339
-
340
- @ Override
341
- public int oldToNew (int i )
342
- {
343
- return ordinalMap .get (i );
344
- }
345
-
346
- @ Override
347
- public int newToOld (int i )
348
- {
349
- return ordinalMap .inverse ().getOrDefault (i , OMITTED );
350
- }
351
- };
352
- }
353
-
354
- public RemappedPostings (int maxNewOrdinal , int maxRowId , Int2IntHashMap sequentialMap )
355
- {
356
- this .structure = Structure .ZERO_OR_ONE_TO_MANY ;
357
- this .maxNewOrdinal = maxNewOrdinal ;
358
- this .maxRowId = maxRowId ;
359
- this .ordinalMap = null ;
360
- this .extraPostings = null ;
361
- ordinalMapper = new OrdinalMapper .MapMapper (sequentialMap );
379
+ this .ordinalMapper = ordinalMapper ;
362
380
}
363
381
}
364
382
365
383
/**
366
384
* @see RemappedPostings
367
385
*/
368
- public static <T > RemappedPostings remapPostings (Map <VectorFloat <?>, ? extends VectorPostings <T >> postingsMap )
386
+ public static <T > RemappedPostings remapForMemtable (Map <VectorFloat <?>, ? extends VectorPostings <T >> postingsMap )
369
387
{
370
388
assert V5OnDiskFormat .writeV5VectorPostings ();
371
389
372
390
BiMap <Integer , Integer > ordinalMap = HashBiMap .create ();
373
- Int2IntHashMap extraPostings = new Int2IntHashMap (- 1 );
391
+ Int2IntHashMap extraPostings = new Int2IntHashMap (Integer . MIN_VALUE );
374
392
int minRow = Integer .MAX_VALUE ;
375
393
int maxRow = Integer .MIN_VALUE ;
376
394
int maxNewOrdinal = Integer .MIN_VALUE ;
@@ -398,12 +416,13 @@ public static <T> RemappedPostings remapPostings(Map<VectorFloat<?>, ? extends V
398
416
extraPostings .put (a [i ], oldOrdinal );
399
417
}
400
418
}
419
+ assert totalRowsAssigned == 0 || totalRowsAssigned <= maxRow + 1 : "rowids are not unique -- " + totalRowsAssigned + " >= " + maxRow ;
401
420
402
421
// derive the correct structure
403
422
Structure structure ;
404
- if (totalRowsAssigned > 0 && (minRow != 0 || totalRowsAssigned != maxRow + 1 ))
423
+ if (totalRowsAssigned > 0 && (minRow != 0 || totalRowsAssigned < maxRow + 1 ))
405
424
{
406
- logger .debug ("Not all rows are assigned vectors, cannot remap" );
425
+ logger .debug ("Not all rows are assigned vectors, cannot remap one-to-many " );
407
426
structure = Structure .ZERO_OR_ONE_TO_MANY ;
408
427
}
409
428
else
@@ -419,32 +438,105 @@ public static <T> RemappedPostings remapPostings(Map<VectorFloat<?>, ? extends V
419
438
420
439
// create the mapping
421
440
if (structure == Structure .ZERO_OR_ONE_TO_MANY )
422
- return createGenericMapping (ordinalMap .keySet (), maxOldOrdinal , maxRow );
423
- return new RemappedPostings (structure , maxNewOrdinal , maxRow , ordinalMap , extraPostings );
441
+ return createGenericRenumberedMapping (ordinalMap .keySet (), maxOldOrdinal , maxRow );
442
+ var ordinalMapper = new BiMapMapper (maxNewOrdinal , ordinalMap );
443
+ return new RemappedPostings (structure , maxNewOrdinal , maxRow , ordinalMap , extraPostings , ordinalMapper );
424
444
}
425
445
426
446
/**
427
447
* return an exhaustive zero-to-many mapping with the live ordinals renumbered sequentially
428
448
*/
429
- private static RemappedPostings createGenericMapping (Set <Integer > liveOrdinals , int maxOldOrdinal , int maxRow )
449
+ private static RemappedPostings createGenericRenumberedMapping (Set <Integer > liveOrdinals , int maxOldOrdinal , int maxRow )
430
450
{
431
- var sequentialMap = new Int2IntHashMap (maxOldOrdinal , 0.65f , Integer .MIN_VALUE );
451
+ var oldToNew = new Int2IntHashMap (maxOldOrdinal , 0.65f , Integer .MIN_VALUE );
432
452
int nextOrdinal = 0 ;
433
453
for (int i = 0 ; i <= maxOldOrdinal ; i ++) {
434
454
if (liveOrdinals .contains (i ))
435
- sequentialMap .put (i , nextOrdinal ++);
455
+ oldToNew .put (i , nextOrdinal ++);
436
456
}
437
- return new RemappedPostings (nextOrdinal - 1 , maxRow , sequentialMap );
457
+ return new RemappedPostings (Structure .ZERO_OR_ONE_TO_MANY ,
458
+ nextOrdinal - 1 ,
459
+ maxRow ,
460
+ null ,
461
+ null ,
462
+ new OrdinalMapper .MapMapper (oldToNew ));
438
463
}
439
464
440
465
/**
441
- * return an exhaustive zero-to-many mapping for v2 postings, which never contain missing ordinals
442
- * since deleted vectors are only removed from the index in its next compaction
466
+ * return an exhaustive zero-to-many mapping with no renumbering
443
467
*/
444
- public static <T > RemappedPostings createGenericV2Mapping (Map <VectorFloat <?>, ? extends VectorPostings <T >> postingsMap )
468
+ public static <T > RemappedPostings createGenericIdentityMapping (Map <VectorFloat <?>, ? extends VectorPostings <T >> postingsMap )
445
469
{
446
- int maxOldOrdinal = postingsMap .size () - 1 ;
470
+ var maxOldOrdinal = postingsMap .values (). stream (). mapToInt ( VectorPostings :: getOrdinal ). max (). orElseThrow () ;
447
471
int maxRow = postingsMap .values ().stream ().flatMap (p -> p .getRowIds ().stream ()).mapToInt (i -> i ).max ().orElseThrow ();
448
- return createGenericMapping (IntStream .range (0 , postingsMap .size ()).boxed ().collect (Collectors .toSet ()), maxOldOrdinal , maxRow );
472
+ var presentOrdinals = new FixedBitSet (maxOldOrdinal + 1 );
473
+ for (var entry : postingsMap .entrySet ())
474
+ presentOrdinals .set (entry .getValue ().getOrdinal ());
475
+ return new RemappedPostings (Structure .ZERO_OR_ONE_TO_MANY ,
476
+ maxOldOrdinal ,
477
+ maxRow ,
478
+ null ,
479
+ null ,
480
+ new OmissionAwareIdentityMapper (maxOldOrdinal , i -> !presentOrdinals .get (i )));
481
+ }
482
+
483
+ public static class BiMapMapper implements OrdinalMapper
484
+ {
485
+ private final int maxOrdinal ;
486
+ private final BiMap <Integer , Integer > ordinalMap ;
487
+
488
+ public BiMapMapper (int maxNewOrdinal , BiMap <Integer , Integer > ordinalMap )
489
+ {
490
+ this .maxOrdinal = maxNewOrdinal ;
491
+ this .ordinalMap = ordinalMap ;
492
+ }
493
+
494
+ @ Override
495
+ public int maxOrdinal ()
496
+ {
497
+ return maxOrdinal ;
498
+ }
499
+
500
+ @ Override
501
+ public int oldToNew (int i )
502
+ {
503
+ return ordinalMap .get (i );
504
+ }
505
+
506
+ @ Override
507
+ public int newToOld (int i )
508
+ {
509
+ return ordinalMap .inverse ().getOrDefault (i , OMITTED );
510
+ }
511
+ }
512
+
513
+ private static class OmissionAwareIdentityMapper implements OrdinalMapper
514
+ {
515
+ private final int maxVectorOrdinal ;
516
+ private final IntPredicate toSkip ;
517
+
518
+ public OmissionAwareIdentityMapper (int maxVectorOrdinal , IntPredicate toSkip )
519
+ {
520
+ this .maxVectorOrdinal = maxVectorOrdinal ;
521
+ this .toSkip = toSkip ;
522
+ }
523
+
524
+ @ Override
525
+ public int maxOrdinal ()
526
+ {
527
+ return maxVectorOrdinal ;
528
+ }
529
+
530
+ @ Override
531
+ public int oldToNew (int i )
532
+ {
533
+ return i ;
534
+ }
535
+
536
+ @ Override
537
+ public int newToOld (int i )
538
+ {
539
+ return toSkip .test (i ) ? OrdinalMapper .OMITTED : i ;
540
+ }
449
541
}
450
542
}
0 commit comments