Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions docs/prebuilt-indexes.md
Original file line number Diff line number Diff line change
Expand Up @@ -651,6 +651,12 @@ Detailed configuration information for the prebuilt indexes are stored in [`pyse
<details>
<summary>Other</summary>
<dl>
<dt></dt><b><code>browsecomp-plus.bm25</code></b>
[<a href="../pyserini/resources/index-metadata/lucene-inverted.browsecomp-plus.bm25.20250810.README.md">readme</a>]
<dd>Lucene index of the BrowseComp-Plus corpus. See https://texttron.github.io/BrowseComp-Plus/
</dd>
</dl>
<dl>
<dt></dt><b><code>ciral-v1.0-ha</code></b>
[<a href="../pyserini/resources/index-metadata/lucene-index.ciral-v1.0.20230721.e850ea.README.md">readme</a>]
<dd>Lucene index for CIRAL v1.0 (Hausa).
Expand Down Expand Up @@ -2163,6 +2169,15 @@ Detailed configuration information for the prebuilt indexes are stored in [`pyse
</dl>
</details>
<details>
<summary>BrowseComp-Plus</summary>
<dl>
<dt></dt><b><code>browsecomp-plus.qwen3-embedding-8b</code></b>
[<a href="../pyserini/resources/index-metadata/faiss-flat.browsecomp-plus.qwen3-embedding-8b.20250810.README.md">readme</a>]
<dd>Faiss flat index of the BrowseComp-Plus corpus encoded by Qwen3-Embedding-8B. See https://texttron.github.io/BrowseComp-Plus/
</dd>
</dl>
</details>
<details>
<summary>Mr.TyDi</summary>
<dl>
<dt></dt><b><code>mrtydi-v1.1-arabic-mdpr-nq</code></b>
Expand Down
9 changes: 5 additions & 4 deletions pyserini/encode/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
from pyserini.encode.optional import FaissRepresentationWriter


def init_encoder(encoder, encoder_class, device, pooling, l2_norm, prefix, multimodal):
def init_encoder(encoder, encoder_class, device, pooling, l2_norm, prefix, multimodal, fp16=False, padding_side='right'):
_encoder_class = encoder_class

# determine encoder_class
Expand All @@ -51,7 +51,7 @@ def init_encoder(encoder, encoder_class, device, pooling, l2_norm, prefix, multi
if _encoder_class == 'contriever' or 'contriever' in encoder:
kwargs.update(dict(pooling='mean', l2_norm=False))
if _encoder_class == 'auto':
kwargs.update(dict(pooling=pooling, l2_norm=l2_norm, prefix=prefix))
kwargs.update(dict(pooling=pooling, l2_norm=l2_norm, prefix=prefix, fp16=fp16, padding_side=padding_side))
if _encoder_class == 'clip' or 'clip' in encoder:
kwargs.update(dict(l2_norm=True, prefix=prefix, multimodal=multimodal))
if _encoder_class == 'uniir':
Expand Down Expand Up @@ -116,15 +116,16 @@ def parse_args(parser, commands):
default='cuda:0', required=False)
encoder_parser.add_argument('--fp16', action='store_true', default=False)
encoder_parser.add_argument('--add-sep', action='store_true', default=False)
encoder_parser.add_argument('--pooling', type=str, default='cls', help='for auto classes, allow the ability to dictate pooling strategy', choices=['cls', 'mean'], required=False)
encoder_parser.add_argument('--pooling', type=str, default='cls', help='for auto classes, allow the ability to dictate pooling strategy', choices=['cls', 'mean', 'eos'], required=False)
encoder_parser.add_argument('--l2-norm', action='store_true', help='whether to normalize embedding', default=False, required=False)
encoder_parser.add_argument('--prefix', type=str, help='prefix of document input', default=None, required=False)
encoder_parser.add_argument('--padding-side', type=str, default='right', choices=['left', 'right'], help='padding side for the tokenizer', required=False)
encoder_parser.add_argument('--use-openai', help='use OpenAI text-embedding-ada-002 to retreive embeddings', action='store_true', default=False)
encoder_parser.add_argument('--rate-limit', type=int, help='rate limit of the requests per minute for OpenAI embeddings', default=3500, required=False)

args = parse_args(parser, commands)
delimiter = args.input.delimiter.replace("\\n", "\n") # argparse would add \ prior to the passed '\n\n'
encoder = init_encoder(args.encoder.encoder, args.encoder.encoder_class, device=args.encoder.device, pooling=args.encoder.pooling, l2_norm=args.encoder.l2_norm, prefix=args.encoder.prefix, multimodal=args.encoder.multimodal)
encoder = init_encoder(args.encoder.encoder, args.encoder.encoder_class, device=args.encoder.device, pooling=args.encoder.pooling, l2_norm=args.encoder.l2_norm, prefix=args.encoder.prefix, multimodal=args.encoder.multimodal, fp16=args.encoder.fp16, padding_side=getattr(args.encoder, 'padding_side', 'right'))

if args.output.to_faiss:
embedding_writer = FaissRepresentationWriter(args.output.embeddings, dimension=args.encoder.dimension)
Expand Down
61 changes: 52 additions & 9 deletions pyserini/encode/_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,20 @@
#

import numpy as np
import torch
from contextlib import nullcontext
from sklearn.preprocessing import normalize
from transformers import AutoModel, AutoTokenizer

from pyserini.encode import DocumentEncoder, QueryEncoder


class AutoDocumentEncoder(DocumentEncoder):
def __init__(self, model_name, tokenizer_name=None, device='cuda:0', pooling='cls', l2_norm=False, prefix=None):
def __init__(self, model_name, tokenizer_name=None, device='cuda:0', pooling='cls', l2_norm=False, prefix=None, fp16=False, padding_side='right'):
self.device = device
self.model = AutoModel.from_pretrained(model_name)
self.fp16 = fp16
torch_dtype = torch.float16 if fp16 else torch.float32
self.model = AutoModel.from_pretrained(model_name, torch_dtype=torch_dtype if device.startswith('cuda') else None)
self.model.to(self.device)
try:
self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name or model_name,
Expand All @@ -33,6 +37,9 @@ def __init__(self, model_name, tokenizer_name=None, device='cuda:0', pooling='cl
self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name or model_name,
use_fast=False,
clean_up_tokenization_spaces=True)
if self.tokenizer.pad_token_id is None:
self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
self.tokenizer.padding_side = padding_side
self.has_model = True
self.pooling = pooling
self.l2_norm = l2_norm
Expand Down Expand Up @@ -62,9 +69,22 @@ def encode(self, texts, titles=None, max_length=256, add_sep=False, **kwargs):

inputs = self.tokenizer(**input_kwargs, **shared_tokenizer_kwargs)
inputs.to(self.device)
outputs = self.model(**inputs)
autocast_context = torch.amp.autocast('cuda') if self.fp16 and self.device.startswith('cuda') else nullcontext()
with autocast_context:
with torch.no_grad():
outputs = self.model(**inputs)
if self.pooling == "mean":
embeddings = self._mean_pooling(outputs[0], inputs['attention_mask']).detach().cpu().numpy()
elif self.pooling == "eos":
attention_mask = inputs['attention_mask']
last_hidden_state = outputs[0]
left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
if left_padding:
embeddings = last_hidden_state[:, -1].detach().cpu().numpy()
else:
sequence_lengths = attention_mask.sum(dim=1) - 1
batch_size = last_hidden_state.shape[0]
embeddings = last_hidden_state[torch.arange(batch_size, device=last_hidden_state.device), sequence_lengths].detach().cpu().numpy()
else:
embeddings = outputs[0][:, 0, :].detach().cpu().numpy()
if self.l2_norm:
Expand All @@ -75,11 +95,13 @@ def encode(self, texts, titles=None, max_length=256, add_sep=False, **kwargs):
class AutoQueryEncoder(QueryEncoder):
def __init__(self, encoder_dir: str = None, tokenizer_name: str = None,
encoded_query_dir: str = None, device: str = 'cpu',
pooling: str = 'cls', l2_norm: bool = False, prefix=None, **kwargs):
pooling: str = 'cls', l2_norm: bool = False, prefix=None, fp16=False, padding_side='right', **kwargs):
super().__init__(encoded_query_dir)
if encoder_dir:
self.device = device
self.model = AutoModel.from_pretrained(encoder_dir)
self.fp16 = fp16
torch_dtype = torch.float16 if fp16 else torch.float32
self.model = AutoModel.from_pretrained(encoder_dir, torch_dtype=torch_dtype if device.startswith('cuda') else None)
self.model.to(self.device)
try:
self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name or encoder_dir,
Expand All @@ -88,30 +110,51 @@ def __init__(self, encoder_dir: str = None, tokenizer_name: str = None,
self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name or encoder_dir,
use_fast=False,
clean_up_tokenization_spaces=True)
if self.tokenizer.pad_token_id is None:
self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
self.tokenizer.padding_side = padding_side
self.has_model = True
self.pooling = pooling
self.l2_norm = l2_norm
self.prefix = prefix
if (not self.has_model) and (not self.has_encoded_query):
raise Exception('Neither query encoder model nor encoded queries provided. Please provide at least one')

def encode(self, query: str):
def encode(self, query: str, max_length=None):
if self.has_model:
if self.prefix:
query = f'{self.prefix} {query}'
inputs = self.tokenizer(
query,
tokenizer_kwargs = dict(
add_special_tokens=True,
return_tensors='pt',
truncation='only_first',
padding='longest',
return_token_type_ids=False,
return_attention_mask=True,
)
if max_length is not None:
tokenizer_kwargs['max_length'] = max_length
inputs = self.tokenizer(query, **tokenizer_kwargs)
inputs.to(self.device)
outputs = self.model(**inputs)[0].detach().cpu().numpy()
autocast_context = torch.amp.autocast('cuda') if self.fp16 and self.device.startswith('cuda') else nullcontext()
with autocast_context:
with torch.no_grad():
model_outputs = self.model(**inputs)
last_hidden_state = model_outputs[0]
if self.pooling == "mean":
outputs = last_hidden_state.detach().cpu().numpy()
embeddings = np.average(outputs, axis=-2)
elif self.pooling == "eos":
attention_mask = inputs['attention_mask']
left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
if left_padding:
embeddings = last_hidden_state[:, -1].detach().cpu().numpy()
else:
sequence_lengths = attention_mask.sum(dim=1) - 1
batch_size = last_hidden_state.shape[0]
embeddings = last_hidden_state[torch.arange(batch_size, device=last_hidden_state.device), sequence_lengths].detach().cpu().numpy()
else:
outputs = last_hidden_state.detach().cpu().numpy()
embeddings = outputs[:, 0, :]
if self.l2_norm:
embeddings = normalize(embeddings, norm='l2')
Expand Down
10 changes: 6 additions & 4 deletions pyserini/encode/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
from tqdm import tqdm


def init_encoder(encoder, device, pooling, l2_norm, prefix):
def init_encoder(encoder, device, pooling, l2_norm, prefix, fp16=False, padding_side='right'):
if 'dpr' in encoder.lower():
return DprQueryEncoder(encoder, device=device)
elif 'tct' in encoder.lower():
Expand All @@ -44,7 +44,7 @@ def init_encoder(encoder, device, pooling, l2_norm, prefix):
elif 'arctic' in encoder.lower():
return ArcticQueryEncoder(encoder, device=device)
else:
return AutoQueryEncoder(encoder, device=device, pooling=pooling, l2_norm=l2_norm, prefix=prefix)
return AutoQueryEncoder(encoder, device=device, pooling=pooling, l2_norm=l2_norm, prefix=prefix, fp16=fp16, padding_side=padding_side)


if __name__ == '__main__':
Expand All @@ -56,12 +56,14 @@ def init_encoder(encoder, device, pooling, l2_norm, prefix):
parser.add_argument('--output', type=str, help='path to stored encoded queries', required=True)
parser.add_argument('--device', type=str, help='device cpu or cuda [cuda:0, cuda:1...]', default='cpu', required=False)
parser.add_argument('--max-length', type=int, help='max length', default=256, required=False)
parser.add_argument('--pooling', type=str, help='pooling strategy', default='cls', choices=['cls', 'mean'], required=False)
parser.add_argument('--pooling', type=str, help='pooling strategy', default='cls', choices=['cls', 'mean', 'eos'], required=False)
parser.add_argument('--l2-norm', action='store_true', help='whether to normalize embedding', default=False, required=False)
parser.add_argument('--prefx', type=str, help='prefix query input', default=None, required=False)
parser.add_argument('--fp16', action='store_true', help='use fp16 for query embeddings', default=False, required=False)
parser.add_argument('--padding-side', type=str, default='right', choices=['left', 'right'], help='padding side for the tokenizer', required=False)
args = parser.parse_args()

encoder = init_encoder(args.encoder, device=args.device, pooling=args.pooling, l2_norm=args.l2_norm, prefix=args.prefx)
encoder = init_encoder(args.encoder, device=args.device, pooling=args.pooling, l2_norm=args.l2_norm, prefix=args.prefx, fp16=args.fp16, padding_side=args.padding_side)
query_iterator = DefaultQueryIterator.from_topics(args.topics)

is_sparse = False
Expand Down
35 changes: 35 additions & 0 deletions pyserini/prebuilt_index_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -1418,6 +1418,23 @@ def import_from_lucene(enum):
"core18": TF_INDEX_INFO_OTHER["wapo.v2"],
}

# Prebuilt Lucene (BM25, term-frequency) index metadata for the BrowseComp-Plus
# corpus. Maps a prebuilt-index key to the download/verification metadata that
# pyserini's prebuilt-index machinery reads (filename, mirror URLs, md5, sizes).
# Merged into TF_INDEX_INFO below alongside the other corpus registries.
TF_INDEX_INFO_BROWSECOMP_PLUS = {
    "browsecomp-plus.bm25": {
        "description": "Lucene index of the BrowseComp-Plus corpus. See https://texttron.github.io/BrowseComp-Plus/",
        "filename": "lucene-inverted.browsecomp-plus.bm25.20250810.tar.gz",
        "readme": "lucene-inverted.browsecomp-plus.bm25.20250810.README.md",
        "urls": [
            "https://huggingface.co/datasets/castorini/prebuilt-indexes-browsecomp-plus/resolve/main/lucene-inverted/lucene-inverted.browsecomp-plus.bm25.20250810.tar.gz"
        ],
        # md5 of the compressed tarball, used to verify the download.
        "md5": "c9c3a69fe2725016f35e8c8523bdf828",
        "size compressed (bytes)": 1783466040,
        "total_terms": 373478034,
        "documents": 100195,
        "unique_terms": 4152345,
        # Flipped to True at runtime once the index has been fetched locally.
        "downloaded": False
    },
}

TF_INDEX_INFO = {**TF_INDEX_INFO_MSMARCO,
**TF_INDEX_INFO_MSMARCO_ALIASES,
**TF_INDEX_INFO_BEIR,
Expand All @@ -1426,6 +1443,7 @@ def import_from_lucene(enum):
**TF_INDEX_INFO_MRTYDI_ALIASES,
**TF_INDEX_INFO_MIRACL,
**TF_INDEX_INFO_CIRAL,
**TF_INDEX_INFO_BROWSECOMP_PLUS,
**TF_INDEX_INFO_OTHER,
**TF_INDEX_INFO_OTHER_ALIASES}

Expand Down Expand Up @@ -6010,6 +6028,22 @@ def import_from_lucene(enum):
}
}

# Prebuilt Faiss (dense) index metadata for the BrowseComp-Plus corpus,
# encoded with Qwen3-Embedding-8B. Same schema as the TF registries above;
# "texts" points at the companion Lucene index key that stores the raw
# document text for this dense index. Merged into FAISS_INDEX_INFO below.
FAISS_INDEX_INFO_BROWSECOMP_PLUS = {
    "browsecomp-plus.qwen3-embedding-8b": {
        "description": "Faiss flat index of the BrowseComp-Plus corpus encoded by Qwen3-Embedding-8B. See https://texttron.github.io/BrowseComp-Plus/",
        "filename": "faiss-flat.browsecomp-plus.qwen3-embedding-8b.20250810.tar.gz",
        "readme": "faiss-flat.browsecomp-plus.qwen3-embedding-8b.20250810.README.md",
        "urls": [
            "https://huggingface.co/datasets/castorini/prebuilt-indexes-browsecomp-plus/resolve/main/faiss-flat/faiss-flat.browsecomp-plus.qwen3-embedding-8b.20250810.tar.gz"
        ],
        # md5 of the compressed tarball, used to verify the download.
        "md5": "2b3faad654414787c2438c9dfcd5500c",
        "size compressed (bytes)": 1521405052,
        "documents": 100195,
        "downloaded": False,
        # Key of the Lucene index holding the raw texts for these vectors.
        "texts": "browsecomp-plus.bm25"
    },
}

FAISS_INDEX_INFO_OTHER = {
"cast2019-tct_colbert-v2.hnsw": {
"description": "Faiss HNSW index of the CAsT2019 passage corpus encoded by the tct_colbert-v2 passage encoder",
Expand Down Expand Up @@ -6347,4 +6381,5 @@ def import_from_lucene(enum):
**FAISS_INDEX_INFO_WIKIPEDIA,
**FAISS_INDEX_INFO_CIRAL,
**FAISS_INDEX_INFO_M_BEIR,
**FAISS_INDEX_INFO_BROWSECOMP_PLUS,
**FAISS_INDEX_INFO_OTHER}
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# browsecomp-plus.qwen3-embedding-8b

Faiss flat index of the [BrowseComp-Plus](https://texttron.github.io/BrowseComp-Plus/) corpus encoded by Qwen3-Embedding-8B.

This was generated on 2025/08/10 following [this guide](https://github.com/texttron/tevatron/tree/main/examples/BrowseComp-Plus).
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# browsecomp-plus.bm25

BM25 index of the [BrowseComp-Plus](https://texttron.github.io/BrowseComp-Plus/) corpus.

This was generated on 2025/08/10 at commit [a6a45da](https://github.com/castorini/pyserini/commit/a6a45dac5651b1236ae2f3e2ee3b0e71dd403e23) on `basilisk` with the following command:

```
python -m pyserini.index.lucene --collection JsonCollection --input corpus/browsecomp-plus/ --index indexes/bm25 --generator DefaultLuceneDocumentGenerator --threads 32 --storeRaw
```

where the corpus can be found at https://huggingface.co/datasets/Tevatron/browsecomp-plus-corpus.

17 changes: 14 additions & 3 deletions pyserini/search/faiss/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ def define_dsearch_args(parser):
metavar="pooling strategy",
required=False,
default="cls",
choices=["cls", "mean"],
choices=["cls", "mean", "eos"],
help="Pooling strategy for query encoder",
)
parser.add_argument(
Expand Down Expand Up @@ -226,6 +226,15 @@ def define_dsearch_args(parser):
default=None,
help="Set efSearch for HNSW index",
)
parser.add_argument(
"--padding-side",
type=str,
metavar="left or right",
required=False,
default="right",
choices=["left", "right"],
help="Padding side for the tokenizer",
)


def init_query_encoder(
Expand All @@ -242,6 +251,7 @@ def init_query_encoder(
multimodal=False,
instruction_config=None,
fp16=False,
padding_side='right',
):
encoded_queries_map = {
"msmarco-passage-dev-subset": "tct_colbert-msmarco-passage-dev-subset",
Expand Down Expand Up @@ -289,7 +299,7 @@ def init_query_encoder(
if _encoder_class == "openai-api" or "openai" in encoder:
kwargs.update(dict(max_length=max_length))
if _encoder_class == "auto":
kwargs.update(dict(pooling=pooling, l2_norm=l2_norm, prefix=prefix))
kwargs.update(dict(pooling=pooling, l2_norm=l2_norm, prefix=prefix, padding_side=padding_side))
if _encoder_class == "clip" or "clip" in encoder:
kwargs.update(dict(l2_norm=True, prefix=prefix, multimodal=multimodal))
if _encoder_class == "uniir":
Expand Down Expand Up @@ -434,7 +444,8 @@ def init_query_encoder(
args.query_prefix,
args.multimodal,
args.instruction_config,
args.fp16
args.fp16,
args.padding_side
)
if args.pca_model:
query_encoder = PcaEncoder(query_encoder, args.pca_model)
Expand Down
6 changes: 6 additions & 0 deletions scripts/generate_docs_from_prebuilt_indexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ def generate_prebuilt(index):

print('<details>')
print('<summary>Other</summary>')
generate_prebuilt(TF_INDEX_INFO_BROWSECOMP_PLUS)
generate_prebuilt(TF_INDEX_INFO_CIRAL)
generate_prebuilt(TF_INDEX_INFO_OTHER)
print('</details>')
Expand Down Expand Up @@ -187,6 +188,11 @@ def generate_prebuilt(index):
generate_prebuilt(FAISS_INDEX_INFO_BRIGHT)
print('</details>')

print('<details>')
print('<summary>BrowseComp-Plus</summary>')
generate_prebuilt(FAISS_INDEX_INFO_BROWSECOMP_PLUS)
print('</details>')

print('<details>')
print('<summary>Mr.TyDi</summary>')
generate_prebuilt(FAISS_INDEX_INFO_MRTYDI)
Expand Down
Loading