Skip to content

Commit 8795231

Browse files
committed
Undo end to end test changes
1 parent 0b6a2b7 commit 8795231

24 files changed

+237
-181
lines changed

src/main/java/io/anserini/search/SearchCollection.java

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1071,7 +1071,7 @@ public SearchCollection(Args args) throws IOException {
10711071
// Fix for index compatibility issue between Lucene 8 and 9: https://github.com/castorini/anserini/issues/1952
10721072
// If we detect an older index version, we turn off consistent tie-breaking, which avoids accessing docvalues,
10731073
// which is the source of the incompatibility.
1074-
if (!reader.toString().contains("lucene.version=9")) {
1074+
if (!reader.toString().contains("lucene.version=9") && !reader.toString().contains("lucene.version=10")) {
10751075
args.arbitraryScoreTieBreak = true;
10761076
args.axiom_deterministic = false;
10771077
}

src/main/java/io/anserini/search/SimpleImpactSearcher.java

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -135,7 +135,8 @@ public SimpleImpactSearcher(String indexDir, Analyzer analyzer) throws IOExcepti
135135
// Fix for index compatibility issue between Lucene 8 and 9: https://github.com/castorini/anserini/issues/1952
136136
// If we detect an older index version, we turn off consistent tie-breaking, which avoids accessing docvalues,
137137
// which is the source of the incompatibility.
138-
this.backwardsCompatibilityLucene8 = !reader.toString().contains("lucene.version=9");
138+
this.backwardsCompatibilityLucene8 = !reader.toString().contains("lucene.version=9")
139+
&& !reader.toString().contains("lucene.version=10");
139140

140141
// Default to using ImpactSimilarity.
141142
this.similarity = new ImpactSimilarity();
@@ -725,4 +726,4 @@ public String doc_raw(String docid) {
725726
return IndexReaderUtils.documentRaw(reader, docid);
726727
}
727728
}
728-
729+

src/main/java/io/anserini/search/SimpleSearcher.java

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -134,7 +134,8 @@ public SimpleSearcher(String indexDir, Analyzer analyzer) throws IOException {
134134
// Fix for index compatibility issue between Lucene 8 and 9: https://github.com/castorini/anserini/issues/1952
135135
// If we detect an older index version, we turn off consistent tie-breaking, which avoids accessing docvalues,
136136
// which is the source of the incompatibility.
137-
this.backwardsCompatibilityLucene8 = !reader.toString().contains("lucene.version=9");
137+
this.backwardsCompatibilityLucene8 = !reader.toString().contains("lucene.version=9")
138+
&& !reader.toString().contains("lucene.version=10");
138139

139140
// Default to using BM25.
140141
this.similarity = new BM25Similarity(Float.parseFloat(defaults.bm25_k1[0]), Float.parseFloat(defaults.bm25_b[0]));

src/test/java/io/anserini/collection/FineWebCollectionTest.java

Lines changed: 37 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -192,10 +192,11 @@ public void testReadStandardParquetFile() throws IOException {
192192
}
193193

194194
// Verify we read 3 documents
195-
assertEquals("Should read 2 documents from parquet file", 2, docs.size());
195+
assertEquals("Should read 3 documents from parquet file", 3, docs.size());
196196

197197
// Verify document IDs
198198
assertTrue("Should contain fineweb-doc-001", docContents.containsKey("fineweb-doc-001"));
199+
assertTrue("Should contain fineweb-doc-002", docContents.containsKey("fineweb-doc-002"));
199200
assertTrue("Should contain fineweb-doc-003", docContents.containsKey("fineweb-doc-003"));
200201

201202
// Verify content of first document
@@ -249,6 +250,40 @@ public void testReadParquetWithAlternativeFieldNames() throws IOException {
249250
assertEquals("crawl", docs.get(1).fields().get("source"));
250251
}
251252

253+
@Test
254+
public void testReadParquetWithAutoGeneratedIds() throws IOException {
255+
// This parquet file has no ID field - IDs should be auto-generated
256+
Path parquetPath = Paths.get("src/test/resources/sample_docs/fineweb/fineweb_no_id.parquet");
257+
FineWebCollection collection = new FineWebCollection(parquetPath);
258+
259+
List<FineWebCollection.Document> docs = new ArrayList<>();
260+
for (FileSegment<FineWebCollection.Document> segment : collection) {
261+
for (FineWebCollection.Document doc : segment) {
262+
docs.add(doc);
263+
}
264+
}
265+
266+
// Verify we read 3 documents
267+
assertEquals("Should read 3 documents from parquet file", 3, docs.size());
268+
269+
// All IDs should be auto-generated in format: filename_rownum
270+
for (int i = 0; i < docs.size(); i++) {
271+
String expectedIdPrefix = "fineweb_no_id_";
272+
assertTrue(
273+
"Auto-generated ID should start with '" + expectedIdPrefix + "', got: " + docs.get(i).id(),
274+
docs.get(i).id().startsWith(expectedIdPrefix));
275+
}
276+
277+
// Verify contents are still readable
278+
assertTrue(docs.get(0).contents().contains("auto-generate"));
279+
assertTrue(docs.get(1).contents().contains("auto-generated ID"));
280+
281+
// Verify metadata (domain field)
282+
assertEquals("example.com", docs.get(0).fields().get("domain"));
283+
assertEquals("test.org", docs.get(1).fields().get("domain"));
284+
assertEquals("sample.net", docs.get(2).fields().get("domain"));
285+
}
286+
252287
@Test
253288
public void testCollectionIteration() throws IOException {
254289
// Test that we can iterate through all segments properly
@@ -272,7 +307,7 @@ public void testCollectionIteration() throws IOException {
272307
}
273308

274309
assertEquals("Should have 1 segment (1 parquet file)", 1, segmentCount);
275-
assertEquals("Should have 2 documents total", 2, totalDocCount);
310+
assertEquals("Should have 3 documents total", 3, totalDocCount);
276311
}
277312

278313
@Test

src/test/java/io/anserini/integration/AclAnthologyEndToEndTest.java

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -86,8 +86,8 @@ protected void setSearchGroundTruth() {
8686

8787
testQueries.put("bm25", createDefaultSearchArgs().bm25());
8888
referenceRunOutput.put("bm25", new String[]{
89-
"1 Q0 C00-1007 1 0.293992 Anserini",
90-
"1 Q0 E17-1003 2 0.186060 Anserini",
91-
"2 Q0 C00-1003 1 0.622663 Anserini"});
89+
"1 Q0 C00-1007 1 0.294000 Anserini",
90+
"1 Q0 E17-1003 2 0.186100 Anserini",
91+
"2 Q0 C00-1003 1 0.622700 Anserini"});
9292
}
9393
}

src/test/java/io/anserini/integration/BibtexEndToEndTest.java

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -64,9 +64,9 @@ protected void setSearchGroundTruth() {
6464

6565
testQueries.put("bm25", createDefaultSearchArgs().bm25());
6666
referenceRunOutput.put("bm25", new String[]{
67-
"1 Q0 article-id 1 0.073774 Anserini",
68-
"1 Q0 proceedings-id 2 0.073774 Anserini",
69-
"1 Q0 inproceedings-id 3 0.064198 Anserini",
70-
"2 Q0 inproceedings-id 1 0.471553 Anserini"});
67+
"1 Q0 article-id 1 0.073800 Anserini",
68+
"1 Q0 proceedings-id 2 0.073799 Anserini",
69+
"1 Q0 inproceedings-id 3 0.064200 Anserini",
70+
"2 Q0 inproceedings-id 1 0.471600 Anserini"});
7171
}
7272
}

src/test/java/io/anserini/integration/C4EndToEndTest.java

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -16,12 +16,12 @@
1616

1717
package io.anserini.integration;
1818

19-
import java.util.Map;
20-
2119
import io.anserini.collection.C4Collection;
2220
import io.anserini.index.IndexCollection;
2321
import io.anserini.index.generator.C4Generator;
2422

23+
import java.util.Map;
24+
2525
public class C4EndToEndTest extends EndToEndTest {
2626
@Override
2727
protected IndexCollection.Args getIndexArgs() {
@@ -71,6 +71,6 @@ protected void setSearchGroundTruth() {
7171

7272
testQueries.put("bm25", createDefaultSearchArgs().bm25());
7373
referenceRunOutput.put("bm25", new String[]{
74-
"1 Q0 c4-0001-000000 1 0.364814 Anserini"});
74+
"1 Q0 c4-0001-000000 1 0.364800 Anserini"});
7575
}
7676
}

src/test/java/io/anserini/integration/CoreEndToEndTest.java

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -74,10 +74,10 @@ protected void setSearchGroundTruth() {
7474

7575
testQueries.put("bm25", createDefaultSearchArgs().bm25());
7676
referenceRunOutput.put("bm25", new String[]{
77-
"1 Q0 coreDoc1 1 0.243182 Anserini",
78-
"1 Q0 doi2 2 0.243182 Anserini",
79-
"2 Q0 coreDoc1 1 0.243182 Anserini",
80-
"2 Q0 doi2 2 0.243182 Anserini",
81-
"3 Q0 fullCoreDoc 1 0.534644 Anserini"});
77+
"1 Q0 coreDoc1 1 0.243200 Anserini",
78+
"1 Q0 doi2 2 0.243199 Anserini",
79+
"2 Q0 coreDoc1 1 0.243200 Anserini",
80+
"2 Q0 doi2 2 0.243199 Anserini",
81+
"3 Q0 fullCoreDoc 1 0.534600 Anserini"});
8282
}
8383
}

src/test/java/io/anserini/integration/EndToEndTest.java

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -252,14 +252,19 @@ public void checkIndex() throws IOException {
252252
assertTrue(seg.openReaderPassed);
253253

254254
assertNotNull(seg.diagnostics);
255+
255256
assertNotNull(seg.fieldNormStatus);
256257
assertNull(seg.fieldNormStatus.error);
257258
assertEquals(this.fieldNormStatusTotalFields, seg.fieldNormStatus.totFields);
258259

260+
assertNotNull(seg.termIndexStatus);
261+
assertNull(seg.termIndexStatus.error);
259262
assertEquals(this.termIndexStatusTermCount, seg.termIndexStatus.termCount);
260263
assertEquals(this.termIndexStatusTotFreq, seg.termIndexStatus.totFreq);
261264
assertEquals(this.termIndexStatusTotPos, seg.termIndexStatus.totPos);
262265

266+
assertNotNull(seg.storedFieldStatus);
267+
assertNull(seg.storedFieldStatus.error);
263268
assertEquals(this.storedFieldStatusTotalDocCounts, seg.storedFieldStatus.docCount);
264269
assertEquals(this.storedFieldStatusTotFields, seg.storedFieldStatus.totFields);
265270

src/test/java/io/anserini/integration/FineWebEndToEndTest.java

Lines changed: 24 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -34,12 +34,14 @@ protected IndexCollection.Args getIndexArgs() {
3434

3535
@Override
3636
protected void setCheckIndexGroundTruth() {
37-
docCount = 4;
37+
docCount = 8;
3838
docFieldCount = -1; // Variable field counts across documents
3939

4040
// Documents from fineweb_standard.parquet
4141
referenceDocs.put("fineweb-doc-001", Map.of(
4242
"contents", "This is the first test document for FineWeb collection testing."));
43+
referenceDocs.put("fineweb-doc-002", Map.of(
44+
"contents", "Second document contains different content for verification."));
4345
referenceDocs.put("fineweb-doc-003", Map.of(
4446
"contents", "Third document with special characters: café, naïve, 日本語."));
4547

@@ -49,12 +51,20 @@ protected void setCheckIndexGroundTruth() {
4951
referenceDocs.put("alt-doc-002", Map.of(
5052
"contents", "Another document with docid field instead of id."));
5153

54+
// Documents from fineweb_no_id.parquet (auto-generated IDs)
55+
referenceDocs.put("fineweb_no_id_0", Map.of(
56+
"contents", "Document without an ID field - should auto-generate."));
57+
referenceDocs.put("fineweb_no_id_1", Map.of(
58+
"contents", "Another document that needs an auto-generated ID."));
59+
referenceDocs.put("fineweb_no_id_2", Map.of(
60+
"contents", "Third document also missing ID field."));
61+
5262
fieldNormStatusTotalFields = 1;
53-
termIndexStatusTermCount = 26;
54-
termIndexStatusTotFreq = 31;
55-
storedFieldStatusTotalDocCounts = 4;
56-
termIndexStatusTotPos = 32;
57-
storedFieldStatusTotFields = 12;
63+
termIndexStatusTermCount = 41;
64+
termIndexStatusTotFreq = 60;
65+
storedFieldStatusTotalDocCounts = 8;
66+
termIndexStatusTotPos = 61;
67+
storedFieldStatusTotFields = 24;
5868
}
5969

6070
@Override
@@ -64,9 +74,13 @@ protected void setSearchGroundTruth() {
6474

6575
testQueries.put("bm25", createDefaultSearchArgs().bm25());
6676
referenceRunOutput.put("bm25", new String[]{
67-
"1 Q0 fineweb-doc-001 1 2.204911 Anserini",
68-
"1 Q0 alt-doc-002 2 0.056996 Anserini",
69-
"1 Q0 alt-doc-001 3 0.055453 Anserini",
70-
"1 Q0 fineweb-doc-003 4 0.052605 Anserini"});
77+
"1 Q0 fineweb-doc-001 1 3.201400 Anserini",
78+
"1 Q0 alt-doc-002 2 0.030600 Anserini",
79+
"1 Q0 fineweb-doc-002 3 0.030599 Anserini",
80+
"1 Q0 fineweb_no_id_1 4 0.030598 Anserini",
81+
"1 Q0 fineweb_no_id_2 5 0.030597 Anserini",
82+
"1 Q0 alt-doc-001 6 0.029800 Anserini",
83+
"1 Q0 fineweb_no_id_0 7 0.029799 Anserini",
84+
"1 Q0 fineweb-doc-003 8 0.028200 Anserini"});
7185
}
7286
}

0 commit comments

Comments
 (0)