diff --git a/docs/prebuilt-indexes.md b/docs/prebuilt-indexes.md
index b10e73266..75549503a 100644
--- a/docs/prebuilt-indexes.md
+++ b/docs/prebuilt-indexes.md
@@ -651,6 +651,12 @@ Detailed configuration information for the prebuilt indexes are stored in [`pyse
Other
+browsecomp-plus.bm25
+[readme]
+- Lucene index of the BrowseComp-Plus corpus. See https://texttron.github.io/BrowseComp-Plus/
+
+
+
ciral-v1.0-ha
[readme]
- Lucene index for CIRAL v1.0 (Hausa).
@@ -2163,6 +2169,15 @@ Detailed configuration information for the prebuilt indexes are stored in [`pyse
+BrowseComp-Plus
+
+browsecomp-plus.qwen3-embedding-8b
+[readme]
+- Faiss flat index of the BrowseComp-Plus corpus encoded by Qwen3-Embedding-8B. See https://texttron.github.io/BrowseComp-Plus/
+
+
+
+
Mr.TyDi
mrtydi-v1.1-arabic-mdpr-nq
diff --git a/pyserini/encode/__main__.py b/pyserini/encode/__main__.py
index 20a917673..f70d16333 100644
--- a/pyserini/encode/__main__.py
+++ b/pyserini/encode/__main__.py
@@ -24,7 +24,7 @@
from pyserini.encode.optional import FaissRepresentationWriter
-def init_encoder(encoder, encoder_class, device, pooling, l2_norm, prefix, multimodal):
+def init_encoder(encoder, encoder_class, device, pooling, l2_norm, prefix, multimodal, fp16=False, padding_side='right'):
_encoder_class = encoder_class
# determine encoder_class
@@ -51,7 +51,7 @@ def init_encoder(encoder, encoder_class, device, pooling, l2_norm, prefix, multi
if _encoder_class == 'contriever' or 'contriever' in encoder:
kwargs.update(dict(pooling='mean', l2_norm=False))
if _encoder_class == 'auto':
- kwargs.update(dict(pooling=pooling, l2_norm=l2_norm, prefix=prefix))
+ kwargs.update(dict(pooling=pooling, l2_norm=l2_norm, prefix=prefix, fp16=fp16, padding_side=padding_side))
if _encoder_class == 'clip' or 'clip' in encoder:
kwargs.update(dict(l2_norm=True, prefix=prefix, multimodal=multimodal))
if _encoder_class == 'uniir':
@@ -116,15 +116,16 @@ def parse_args(parser, commands):
default='cuda:0', required=False)
encoder_parser.add_argument('--fp16', action='store_true', default=False)
encoder_parser.add_argument('--add-sep', action='store_true', default=False)
- encoder_parser.add_argument('--pooling', type=str, default='cls', help='for auto classes, allow the ability to dictate pooling strategy', choices=['cls', 'mean'], required=False)
+ encoder_parser.add_argument('--pooling', type=str, default='cls', help='for auto classes, allow the ability to dictate pooling strategy', choices=['cls', 'mean', 'eos'], required=False)
encoder_parser.add_argument('--l2-norm', action='store_true', help='whether to normalize embedding', default=False, required=False)
encoder_parser.add_argument('--prefix', type=str, help='prefix of document input', default=None, required=False)
+ encoder_parser.add_argument('--padding-side', type=str, default='right', choices=['left', 'right'], help='padding side for the tokenizer', required=False)
encoder_parser.add_argument('--use-openai', help='use OpenAI text-embedding-ada-002 to retreive embeddings', action='store_true', default=False)
encoder_parser.add_argument('--rate-limit', type=int, help='rate limit of the requests per minute for OpenAI embeddings', default=3500, required=False)
args = parse_args(parser, commands)
delimiter = args.input.delimiter.replace("\\n", "\n") # argparse would add \ prior to the passed '\n\n'
- encoder = init_encoder(args.encoder.encoder, args.encoder.encoder_class, device=args.encoder.device, pooling=args.encoder.pooling, l2_norm=args.encoder.l2_norm, prefix=args.encoder.prefix, multimodal=args.encoder.multimodal)
+ encoder = init_encoder(args.encoder.encoder, args.encoder.encoder_class, device=args.encoder.device, pooling=args.encoder.pooling, l2_norm=args.encoder.l2_norm, prefix=args.encoder.prefix, multimodal=args.encoder.multimodal, fp16=args.encoder.fp16, padding_side=getattr(args.encoder, 'padding_side', 'right'))
if args.output.to_faiss:
embedding_writer = FaissRepresentationWriter(args.output.embeddings, dimension=args.encoder.dimension)
diff --git a/pyserini/encode/_auto.py b/pyserini/encode/_auto.py
index 209ad1985..f6a394597 100644
--- a/pyserini/encode/_auto.py
+++ b/pyserini/encode/_auto.py
@@ -15,6 +15,8 @@
#
import numpy as np
+import torch
+from contextlib import nullcontext
from sklearn.preprocessing import normalize
from transformers import AutoModel, AutoTokenizer
@@ -22,9 +24,11 @@
class AutoDocumentEncoder(DocumentEncoder):
- def __init__(self, model_name, tokenizer_name=None, device='cuda:0', pooling='cls', l2_norm=False, prefix=None):
+ def __init__(self, model_name, tokenizer_name=None, device='cuda:0', pooling='cls', l2_norm=False, prefix=None, fp16=False, padding_side='right'):
self.device = device
- self.model = AutoModel.from_pretrained(model_name)
+ self.fp16 = fp16
+ torch_dtype = torch.float16 if fp16 else torch.float32
+ self.model = AutoModel.from_pretrained(model_name, torch_dtype=torch_dtype if device.startswith('cuda') else None)
self.model.to(self.device)
try:
self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name or model_name,
@@ -33,6 +37,9 @@ def __init__(self, model_name, tokenizer_name=None, device='cuda:0', pooling='cl
self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name or model_name,
use_fast=False,
clean_up_tokenization_spaces=True)
+ if self.tokenizer.pad_token_id is None:
+ self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
+ self.tokenizer.padding_side = padding_side
self.has_model = True
self.pooling = pooling
self.l2_norm = l2_norm
@@ -62,9 +69,22 @@ def encode(self, texts, titles=None, max_length=256, add_sep=False, **kwargs):
inputs = self.tokenizer(**input_kwargs, **shared_tokenizer_kwargs)
inputs.to(self.device)
- outputs = self.model(**inputs)
+ autocast_context = torch.amp.autocast('cuda') if self.fp16 and self.device.startswith('cuda') else nullcontext()
+ with autocast_context:
+ with torch.no_grad():
+ outputs = self.model(**inputs)
if self.pooling == "mean":
embeddings = self._mean_pooling(outputs[0], inputs['attention_mask']).detach().cpu().numpy()
+ elif self.pooling == "eos":
+ attention_mask = inputs['attention_mask']
+ last_hidden_state = outputs[0]
+ left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
+ if left_padding:
+ embeddings = last_hidden_state[:, -1].detach().cpu().numpy()
+ else:
+ sequence_lengths = attention_mask.sum(dim=1) - 1
+ batch_size = last_hidden_state.shape[0]
+ embeddings = last_hidden_state[torch.arange(batch_size, device=last_hidden_state.device), sequence_lengths].detach().cpu().numpy()
else:
embeddings = outputs[0][:, 0, :].detach().cpu().numpy()
if self.l2_norm:
@@ -75,11 +95,13 @@ def encode(self, texts, titles=None, max_length=256, add_sep=False, **kwargs):
class AutoQueryEncoder(QueryEncoder):
def __init__(self, encoder_dir: str = None, tokenizer_name: str = None,
encoded_query_dir: str = None, device: str = 'cpu',
- pooling: str = 'cls', l2_norm: bool = False, prefix=None, **kwargs):
+ pooling: str = 'cls', l2_norm: bool = False, prefix=None, fp16=False, padding_side='right', **kwargs):
super().__init__(encoded_query_dir)
if encoder_dir:
self.device = device
- self.model = AutoModel.from_pretrained(encoder_dir)
+ self.fp16 = fp16
+ torch_dtype = torch.float16 if fp16 else torch.float32
+ self.model = AutoModel.from_pretrained(encoder_dir, torch_dtype=torch_dtype if device.startswith('cuda') else None)
self.model.to(self.device)
try:
self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name or encoder_dir,
@@ -88,6 +110,9 @@ def __init__(self, encoder_dir: str = None, tokenizer_name: str = None,
self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name or encoder_dir,
use_fast=False,
clean_up_tokenization_spaces=True)
+ if self.tokenizer.pad_token_id is None:
+ self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
+ self.tokenizer.padding_side = padding_side
self.has_model = True
self.pooling = pooling
self.l2_norm = l2_norm
@@ -95,23 +120,41 @@ def __init__(self, encoder_dir: str = None, tokenizer_name: str = None,
if (not self.has_model) and (not self.has_encoded_query):
raise Exception('Neither query encoder model nor encoded queries provided. Please provide at least one')
- def encode(self, query: str):
+ def encode(self, query: str, max_length=None):
if self.has_model:
if self.prefix:
query = f'{self.prefix} {query}'
- inputs = self.tokenizer(
- query,
+ tokenizer_kwargs = dict(
add_special_tokens=True,
return_tensors='pt',
truncation='only_first',
padding='longest',
return_token_type_ids=False,
+ return_attention_mask=True,
)
+ if max_length is not None:
+ tokenizer_kwargs['max_length'] = max_length
+ inputs = self.tokenizer(query, **tokenizer_kwargs)
inputs.to(self.device)
- outputs = self.model(**inputs)[0].detach().cpu().numpy()
+ autocast_context = torch.amp.autocast('cuda') if self.fp16 and self.device.startswith('cuda') else nullcontext()
+ with autocast_context:
+ with torch.no_grad():
+ model_outputs = self.model(**inputs)
+ last_hidden_state = model_outputs[0]
if self.pooling == "mean":
+ outputs = last_hidden_state.detach().cpu().numpy()
embeddings = np.average(outputs, axis=-2)
+ elif self.pooling == "eos":
+ attention_mask = inputs['attention_mask']
+ left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
+ if left_padding:
+ embeddings = last_hidden_state[:, -1].detach().cpu().numpy()
+ else:
+ sequence_lengths = attention_mask.sum(dim=1) - 1
+ batch_size = last_hidden_state.shape[0]
+ embeddings = last_hidden_state[torch.arange(batch_size, device=last_hidden_state.device), sequence_lengths].detach().cpu().numpy()
else:
+ outputs = last_hidden_state.detach().cpu().numpy()
embeddings = outputs[:, 0, :]
if self.l2_norm:
embeddings = normalize(embeddings, norm='l2')
diff --git a/pyserini/encode/query.py b/pyserini/encode/query.py
index c610b56cc..ddf888813 100644
--- a/pyserini/encode/query.py
+++ b/pyserini/encode/query.py
@@ -24,7 +24,7 @@
from tqdm import tqdm
-def init_encoder(encoder, device, pooling, l2_norm, prefix):
+def init_encoder(encoder, device, pooling, l2_norm, prefix, fp16=False, padding_side='right'):
if 'dpr' in encoder.lower():
return DprQueryEncoder(encoder, device=device)
elif 'tct' in encoder.lower():
@@ -44,7 +44,7 @@ def init_encoder(encoder, device, pooling, l2_norm, prefix):
elif 'arctic' in encoder.lower():
return ArcticQueryEncoder(encoder, device=device)
else:
- return AutoQueryEncoder(encoder, device=device, pooling=pooling, l2_norm=l2_norm, prefix=prefix)
+ return AutoQueryEncoder(encoder, device=device, pooling=pooling, l2_norm=l2_norm, prefix=prefix, fp16=fp16, padding_side=padding_side)
if __name__ == '__main__':
@@ -56,12 +56,14 @@ def init_encoder(encoder, device, pooling, l2_norm, prefix):
parser.add_argument('--output', type=str, help='path to stored encoded queries', required=True)
parser.add_argument('--device', type=str, help='device cpu or cuda [cuda:0, cuda:1...]', default='cpu', required=False)
parser.add_argument('--max-length', type=int, help='max length', default=256, required=False)
- parser.add_argument('--pooling', type=str, help='pooling strategy', default='cls', choices=['cls', 'mean'], required=False)
+ parser.add_argument('--pooling', type=str, help='pooling strategy', default='cls', choices=['cls', 'mean', 'eos'], required=False)
parser.add_argument('--l2-norm', action='store_true', help='whether to normalize embedding', default=False, required=False)
parser.add_argument('--prefx', type=str, help='prefix query input', default=None, required=False)
+ parser.add_argument('--fp16', action='store_true', help='use fp16 for query embeddings', default=False, required=False)
+ parser.add_argument('--padding-side', type=str, default='right', choices=['left', 'right'], help='padding side for the tokenizer', required=False)
args = parser.parse_args()
- encoder = init_encoder(args.encoder, device=args.device, pooling=args.pooling, l2_norm=args.l2_norm, prefix=args.prefx)
+ encoder = init_encoder(args.encoder, device=args.device, pooling=args.pooling, l2_norm=args.l2_norm, prefix=args.prefx, fp16=args.fp16, padding_side=args.padding_side)
query_iterator = DefaultQueryIterator.from_topics(args.topics)
is_sparse = False
diff --git a/pyserini/prebuilt_index_info.py b/pyserini/prebuilt_index_info.py
index 24b2f1937..209322ffb 100644
--- a/pyserini/prebuilt_index_info.py
+++ b/pyserini/prebuilt_index_info.py
@@ -1418,6 +1418,23 @@ def import_from_lucene(enum):
"core18": TF_INDEX_INFO_OTHER["wapo.v2"],
}
+TF_INDEX_INFO_BROWSECOMP_PLUS = {  # prebuilt Lucene index entries for the BrowseComp-Plus corpus, keyed by index name
+ "browsecomp-plus.bm25": {
+ "description": "Lucene index of the BrowseComp-Plus corpus. See https://texttron.github.io/BrowseComp-Plus/",
+ "filename": "lucene-inverted.browsecomp-plus.bm25.20250810.tar.gz",  # same tarball name as the last path segment of the URL below
+ "readme": "lucene-inverted.browsecomp-plus.bm25.20250810.README.md",
+ "urls": [
+ "https://huggingface.co/datasets/castorini/prebuilt-indexes-browsecomp-plus/resolve/main/lucene-inverted/lucene-inverted.browsecomp-plus.bm25.20250810.tar.gz"
+ ],
+ "md5": "c9c3a69fe2725016f35e8c8523bdf828",  # NOTE(review): presumably the checksum of the tarball named in "filename" - confirm
+ "size compressed (bytes)": 1783466040,
+ "total_terms": 373478034,
+ "documents": 100195,  # same document count as the Faiss entry for this corpus
+ "unique_terms": 4152345,
+ "downloaded": False  # NOTE(review): presumably updated at runtime once the index is fetched - confirm against the download code
+ },
+}
+
TF_INDEX_INFO = {**TF_INDEX_INFO_MSMARCO,
**TF_INDEX_INFO_MSMARCO_ALIASES,
**TF_INDEX_INFO_BEIR,
@@ -1426,6 +1443,7 @@ def import_from_lucene(enum):
**TF_INDEX_INFO_MRTYDI_ALIASES,
**TF_INDEX_INFO_MIRACL,
**TF_INDEX_INFO_CIRAL,
+ **TF_INDEX_INFO_BROWSECOMP_PLUS,
**TF_INDEX_INFO_OTHER,
**TF_INDEX_INFO_OTHER_ALIASES}
@@ -6010,6 +6028,22 @@ def import_from_lucene(enum):
}
}
+FAISS_INDEX_INFO_BROWSECOMP_PLUS = {  # prebuilt Faiss index entries for the BrowseComp-Plus corpus, keyed by index name
+ "browsecomp-plus.qwen3-embedding-8b": {
+ "description": "Faiss flat index of the BrowseComp-Plus corpus encoded by Qwen3-Embedding-8B. See https://texttron.github.io/BrowseComp-Plus/",
+ "filename": "faiss-flat.browsecomp-plus.qwen3-embedding-8b.20250810.tar.gz",  # same tarball name as the last path segment of the URL below
+ "readme": "faiss-flat.browsecomp-plus.qwen3-embedding-8b.20250810.README.md",
+ "urls": [
+ "https://huggingface.co/datasets/castorini/prebuilt-indexes-browsecomp-plus/resolve/main/faiss-flat/faiss-flat.browsecomp-plus.qwen3-embedding-8b.20250810.tar.gz"
+ ],
+ "md5": "2b3faad654414787c2438c9dfcd5500c",  # NOTE(review): presumably the checksum of the tarball named in "filename" - confirm
+ "size compressed (bytes)": 1521405052,
+ "documents": 100195,  # same document count as the Lucene entry for this corpus
+ "downloaded": False,  # NOTE(review): presumably updated at runtime once the index is fetched - confirm against the download code
+ "texts": "browsecomp-plus.bm25"  # name of the Lucene index entry for the same corpus; NOTE(review): presumably used to look up raw document text - confirm
+ },
+}
+
FAISS_INDEX_INFO_OTHER = {
"cast2019-tct_colbert-v2.hnsw": {
"description": "Faiss HNSW index of the CAsT2019 passage corpus encoded by the tct_colbert-v2 passage encoder",
@@ -6347,4 +6381,5 @@ def import_from_lucene(enum):
**FAISS_INDEX_INFO_WIKIPEDIA,
**FAISS_INDEX_INFO_CIRAL,
**FAISS_INDEX_INFO_M_BEIR,
+ **FAISS_INDEX_INFO_BROWSECOMP_PLUS,
**FAISS_INDEX_INFO_OTHER}
diff --git a/pyserini/resources/index-metadata/faiss-flat.browsecomp-plus.qwen3-embedding-8b.20250810.README.md b/pyserini/resources/index-metadata/faiss-flat.browsecomp-plus.qwen3-embedding-8b.20250810.README.md
new file mode 100644
index 000000000..1abcfaaee
--- /dev/null
+++ b/pyserini/resources/index-metadata/faiss-flat.browsecomp-plus.qwen3-embedding-8b.20250810.README.md
@@ -0,0 +1,5 @@
+# browsecomp-plus.qwen3-embedding-8b
+
+Faiss flat index of the [BrowseComp-Plus](https://texttron.github.io/BrowseComp-Plus/) corpus encoded by Qwen3-Embedding-8B.
+
+This was generated on 2025/08/10 following [this guide](https://github.com/texttron/tevatron/tree/main/examples/BrowseComp-Plus).
diff --git a/pyserini/resources/index-metadata/lucene-inverted.browsecomp-plus.bm25.20250810.README.md b/pyserini/resources/index-metadata/lucene-inverted.browsecomp-plus.bm25.20250810.README.md
new file mode 100644
index 000000000..ada2a1947
--- /dev/null
+++ b/pyserini/resources/index-metadata/lucene-inverted.browsecomp-plus.bm25.20250810.README.md
@@ -0,0 +1,12 @@
+# browsecomp-plus.bm25
+
+BM25 index of the [BrowseComp-Plus](https://texttron.github.io/BrowseComp-Plus/) corpus.
+
+This was generated on 2025/08/10 at commit [a6a45da](https://github.com/castorini/pyserini/commit/a6a45dac5651b1236ae2f3e2ee3b0e71dd403e23) on `basilisk` with the following command:
+
+```
+python -m pyserini.index.lucene --collection JsonCollection --input corpus/browsecomp-plus/ --index indexes/bm25 --generator DefaultLuceneDocumentGenerator --threads 32 --storeRaw
+```
+
+where the corpus can be found at https://huggingface.co/datasets/Tevatron/browsecomp-plus-corpus.
+
diff --git a/pyserini/search/faiss/__main__.py b/pyserini/search/faiss/__main__.py
index 3554b6ae9..916465915 100644
--- a/pyserini/search/faiss/__main__.py
+++ b/pyserini/search/faiss/__main__.py
@@ -89,7 +89,7 @@ def define_dsearch_args(parser):
metavar="pooling strategy",
required=False,
default="cls",
- choices=["cls", "mean"],
+ choices=["cls", "mean", "eos"],
help="Pooling strategy for query encoder",
)
parser.add_argument(
@@ -226,6 +226,15 @@ def define_dsearch_args(parser):
default=None,
help="Set efSearch for HNSW index",
)
+ parser.add_argument(
+ "--padding-side",
+ type=str,
+ metavar="left or right",
+ required=False,
+ default="right",
+ choices=["left", "right"],
+ help="Padding side for the tokenizer",
+ )
def init_query_encoder(
@@ -242,6 +251,7 @@ def init_query_encoder(
multimodal=False,
instruction_config=None,
fp16=False,
+ padding_side='right',
):
encoded_queries_map = {
"msmarco-passage-dev-subset": "tct_colbert-msmarco-passage-dev-subset",
@@ -289,7 +299,7 @@ def init_query_encoder(
if _encoder_class == "openai-api" or "openai" in encoder:
kwargs.update(dict(max_length=max_length))
if _encoder_class == "auto":
- kwargs.update(dict(pooling=pooling, l2_norm=l2_norm, prefix=prefix))
+ kwargs.update(dict(pooling=pooling, l2_norm=l2_norm, prefix=prefix, fp16=fp16, padding_side=padding_side))  # forward fp16 as well; otherwise the encoder falls back to its fp16=False default
if _encoder_class == "clip" or "clip" in encoder:
kwargs.update(dict(l2_norm=True, prefix=prefix, multimodal=multimodal))
if _encoder_class == "uniir":
@@ -434,7 +444,8 @@ def init_query_encoder(
args.query_prefix,
args.multimodal,
args.instruction_config,
- args.fp16
+ args.fp16,
+ args.padding_side
)
if args.pca_model:
query_encoder = PcaEncoder(query_encoder, args.pca_model)
diff --git a/scripts/generate_docs_from_prebuilt_indexes.py b/scripts/generate_docs_from_prebuilt_indexes.py
index a771d42c0..81db1c715 100644
--- a/scripts/generate_docs_from_prebuilt_indexes.py
+++ b/scripts/generate_docs_from_prebuilt_indexes.py
@@ -125,6 +125,7 @@ def generate_prebuilt(index):
print('')
print('Other
')
+ generate_prebuilt(TF_INDEX_INFO_BROWSECOMP_PLUS)
generate_prebuilt(TF_INDEX_INFO_CIRAL)
generate_prebuilt(TF_INDEX_INFO_OTHER)
print(' ')
@@ -187,6 +188,11 @@ def generate_prebuilt(index):
generate_prebuilt(FAISS_INDEX_INFO_BRIGHT)
print('
')
+ print('')
+ print('BrowseComp-Plus
')
+ generate_prebuilt(FAISS_INDEX_INFO_BROWSECOMP_PLUS)
+ print(' ')
+
print('')
print('Mr.TyDi
')
generate_prebuilt(FAISS_INDEX_INFO_MRTYDI)
diff --git a/tests/core/test_prebuilt_index.py b/tests/core/test_prebuilt_index.py
index 2414b41b1..23115d794 100644
--- a/tests/core/test_prebuilt_index.py
+++ b/tests/core/test_prebuilt_index.py
@@ -114,6 +114,18 @@ def test_lucene_tf_ciral(self):
self.assertEqual(cnt, 8)
self._test_urls(urls)
+    def test_lucene_tf_browsecomp_plus(self):
+        """Expect one 'browsecomp-plus' key in TF_INDEX_INFO; pass its URLs to _test_urls."""
+        matched = [name for name in TF_INDEX_INFO if 'browsecomp-plus' in name]
+        links = []
+        for name in matched:
+            # Gather every URL listed for the matching index entry.
+            links.extend(TF_INDEX_INFO[name]['urls'])
+
+        # Exactly one registered Lucene index name may contain 'browsecomp-plus'.
+        self.assertEqual(len(matched), 1)
+        self._test_urls(links)
+
def test_lucene_impact_msmarco(self):
urls = []
cnt = 0
@@ -237,6 +249,18 @@ def test_faiss_bright(self):
self.assertEqual(cnt, 12)
self._test_urls(urls)
+    def test_faiss_browsecomp_plus(self):
+        """Expect one 'browsecomp-plus' key in FAISS_INDEX_INFO; pass its URLs to _test_urls."""
+        matched = [name for name in FAISS_INDEX_INFO if 'browsecomp-plus' in name]
+        links = []
+        for name in matched:
+            # Gather every URL listed for the matching index entry.
+            links.extend(FAISS_INDEX_INFO[name]['urls'])
+
+        # Exactly one registered Faiss index name may contain 'browsecomp-plus'.
+        self.assertEqual(len(matched), 1)
+        self._test_urls(links)
+
def test_faiss_mrtydi(self):
urls = []
cnt = 0