diff --git a/docs/prebuilt-indexes.md b/docs/prebuilt-indexes.md index b10e73266..75549503a 100644 --- a/docs/prebuilt-indexes.md +++ b/docs/prebuilt-indexes.md @@ -651,6 +651,12 @@ Detailed configuration information for the prebuilt indexes are stored in [`pyse
Other
+
browsecomp-plus.bm25 +[readme] +
Lucene index of the BrowseComp-Plus corpus. See https://texttron.github.io/BrowseComp-Plus/ +
+
+
ciral-v1.0-ha [readme]
Lucene index for CIRAL v1.0 (Hausa). @@ -2163,6 +2169,15 @@ Detailed configuration information for the prebuilt indexes are stored in [`pyse
+BrowseComp-Plus +
+
browsecomp-plus.qwen3-embedding-8b +[readme] +
Faiss flat index of the BrowseComp-Plus corpus encoded by Qwen3-Embedding-8B. See https://texttron.github.io/BrowseComp-Plus/ +
+
+
+
Mr.TyDi
mrtydi-v1.1-arabic-mdpr-nq diff --git a/pyserini/encode/__main__.py b/pyserini/encode/__main__.py index 20a917673..f70d16333 100644 --- a/pyserini/encode/__main__.py +++ b/pyserini/encode/__main__.py @@ -24,7 +24,7 @@ from pyserini.encode.optional import FaissRepresentationWriter -def init_encoder(encoder, encoder_class, device, pooling, l2_norm, prefix, multimodal): +def init_encoder(encoder, encoder_class, device, pooling, l2_norm, prefix, multimodal, fp16=False, padding_side='right'): _encoder_class = encoder_class # determine encoder_class @@ -51,7 +51,7 @@ def init_encoder(encoder, encoder_class, device, pooling, l2_norm, prefix, multi if _encoder_class == 'contriever' or 'contriever' in encoder: kwargs.update(dict(pooling='mean', l2_norm=False)) if _encoder_class == 'auto': - kwargs.update(dict(pooling=pooling, l2_norm=l2_norm, prefix=prefix)) + kwargs.update(dict(pooling=pooling, l2_norm=l2_norm, prefix=prefix, fp16=fp16, padding_side=padding_side)) if _encoder_class == 'clip' or 'clip' in encoder: kwargs.update(dict(l2_norm=True, prefix=prefix, multimodal=multimodal)) if _encoder_class == 'uniir': @@ -116,15 +116,16 @@ def parse_args(parser, commands): default='cuda:0', required=False) encoder_parser.add_argument('--fp16', action='store_true', default=False) encoder_parser.add_argument('--add-sep', action='store_true', default=False) - encoder_parser.add_argument('--pooling', type=str, default='cls', help='for auto classes, allow the ability to dictate pooling strategy', choices=['cls', 'mean'], required=False) + encoder_parser.add_argument('--pooling', type=str, default='cls', help='for auto classes, allow the ability to dictate pooling strategy', choices=['cls', 'mean', 'eos'], required=False) encoder_parser.add_argument('--l2-norm', action='store_true', help='whether to normalize embedding', default=False, required=False) encoder_parser.add_argument('--prefix', type=str, help='prefix of document input', default=None, required=False) + encoder_parser.add_argument('--padding-side', type=str, default='right', choices=['left', 'right'], help='padding side for the tokenizer', required=False) encoder_parser.add_argument('--use-openai', help='use OpenAI text-embedding-ada-002 to retreive embeddings', action='store_true', default=False) encoder_parser.add_argument('--rate-limit', type=int, help='rate limit of the requests per minute for OpenAI embeddings', default=3500, required=False) args = parse_args(parser, commands) delimiter = args.input.delimiter.replace("\\n", "\n") # argparse would add \ prior to the passed '\n\n' - encoder = init_encoder(args.encoder.encoder, args.encoder.encoder_class, device=args.encoder.device, pooling=args.encoder.pooling, l2_norm=args.encoder.l2_norm, prefix=args.encoder.prefix, multimodal=args.encoder.multimodal) + encoder = init_encoder(args.encoder.encoder, args.encoder.encoder_class, device=args.encoder.device, pooling=args.encoder.pooling, l2_norm=args.encoder.l2_norm, prefix=args.encoder.prefix, multimodal=args.encoder.multimodal, fp16=args.encoder.fp16, padding_side=getattr(args.encoder, 'padding_side', 'right')) if args.output.to_faiss: embedding_writer = FaissRepresentationWriter(args.output.embeddings, dimension=args.encoder.dimension) diff --git a/pyserini/encode/_auto.py b/pyserini/encode/_auto.py index 209ad1985..f6a394597 100644 --- a/pyserini/encode/_auto.py +++ b/pyserini/encode/_auto.py @@ -15,6 +15,8 @@ # import numpy as np +import torch +from contextlib import nullcontext from sklearn.preprocessing import normalize from transformers import AutoModel, AutoTokenizer @@ -22,9 +24,11 @@ class AutoDocumentEncoder(DocumentEncoder): - def __init__(self, model_name, tokenizer_name=None, device='cuda:0', pooling='cls', l2_norm=False, prefix=None): + def __init__(self, model_name, tokenizer_name=None, device='cuda:0', pooling='cls', l2_norm=False, prefix=None, fp16=False, padding_side='right'): self.device = device - self.model = AutoModel.from_pretrained(model_name) + self.fp16 = fp16 + torch_dtype = torch.float16 if fp16 else torch.float32 + self.model = AutoModel.from_pretrained(model_name, torch_dtype=torch_dtype if device.startswith('cuda') else None) self.model.to(self.device) try: self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name or model_name, @@ -33,6 +37,9 @@ def __init__(self, model_name, tokenizer_name=None, device='cuda:0', pooling='cl self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name or model_name, use_fast=False, clean_up_tokenization_spaces=True) + if self.tokenizer.pad_token_id is None: + self.tokenizer.pad_token_id = self.tokenizer.eos_token_id + self.tokenizer.padding_side = padding_side self.has_model = True self.pooling = pooling self.l2_norm = l2_norm @@ -62,9 +69,22 @@ def encode(self, texts, titles=None, max_length=256, add_sep=False, **kwargs): inputs = self.tokenizer(**input_kwargs, **shared_tokenizer_kwargs) inputs.to(self.device) - outputs = self.model(**inputs) + autocast_context = torch.amp.autocast('cuda') if self.fp16 and self.device.startswith('cuda') else nullcontext() + with autocast_context: + with torch.no_grad(): + outputs = self.model(**inputs) if self.pooling == "mean": embeddings = self._mean_pooling(outputs[0], inputs['attention_mask']).detach().cpu().numpy() + elif self.pooling == "eos": + attention_mask = inputs['attention_mask'] + last_hidden_state = outputs[0] + left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0]) + if left_padding: + embeddings = last_hidden_state[:, -1].detach().cpu().numpy() + else: + sequence_lengths = attention_mask.sum(dim=1) - 1 + batch_size = last_hidden_state.shape[0] + embeddings = last_hidden_state[torch.arange(batch_size, device=last_hidden_state.device), sequence_lengths].detach().cpu().numpy() else: embeddings = outputs[0][:, 0, :].detach().cpu().numpy() if self.l2_norm: @@ -75,11 +95,13 @@ def encode(self, texts, titles=None, max_length=256, add_sep=False, **kwargs): class AutoQueryEncoder(QueryEncoder): def __init__(self, encoder_dir: str = None, tokenizer_name: str = None, encoded_query_dir: str = None, device: str = 'cpu', - pooling: str = 'cls', l2_norm: bool = False, prefix=None, **kwargs): + pooling: str = 'cls', l2_norm: bool = False, prefix=None, fp16=False, padding_side='right', **kwargs): super().__init__(encoded_query_dir) if encoder_dir: self.device = device - self.model = AutoModel.from_pretrained(encoder_dir) + self.fp16 = fp16 + torch_dtype = torch.float16 if fp16 else torch.float32 + self.model = AutoModel.from_pretrained(encoder_dir, torch_dtype=torch_dtype if device.startswith('cuda') else None) self.model.to(self.device) try: self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name or encoder_dir, @@ -88,6 +110,9 @@ def __init__(self, encoder_dir: str = None, tokenizer_name: str = None, self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name or encoder_dir, use_fast=False, clean_up_tokenization_spaces=True) + if self.tokenizer.pad_token_id is None: + self.tokenizer.pad_token_id = self.tokenizer.eos_token_id + self.tokenizer.padding_side = padding_side self.has_model = True self.pooling = pooling self.l2_norm = l2_norm @@ -95,23 +120,41 @@ def __init__(self, encoder_dir: str = None, tokenizer_name: str = None, if (not self.has_model) and (not self.has_encoded_query): raise Exception('Neither query encoder model nor encoded queries provided. Please provide at least one') - def encode(self, query: str): + def encode(self, query: str, max_length=None): if self.has_model: if self.prefix: query = f'{self.prefix} {query}' - inputs = self.tokenizer( - query, + tokenizer_kwargs = dict( add_special_tokens=True, return_tensors='pt', truncation='only_first', padding='longest', return_token_type_ids=False, + return_attention_mask=True, ) + if max_length is not None: + tokenizer_kwargs['max_length'] = max_length + inputs = self.tokenizer(query, **tokenizer_kwargs) inputs.to(self.device) - outputs = self.model(**inputs)[0].detach().cpu().numpy() + autocast_context = torch.amp.autocast('cuda') if self.fp16 and self.device.startswith('cuda') else nullcontext() + with autocast_context: + with torch.no_grad(): + model_outputs = self.model(**inputs) + last_hidden_state = model_outputs[0] if self.pooling == "mean": + outputs = last_hidden_state.detach().cpu().numpy() embeddings = np.average(outputs, axis=-2) + elif self.pooling == "eos": + attention_mask = inputs['attention_mask'] + left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0]) + if left_padding: + embeddings = last_hidden_state[:, -1].detach().cpu().numpy() + else: + sequence_lengths = attention_mask.sum(dim=1) - 1 + batch_size = last_hidden_state.shape[0] + embeddings = last_hidden_state[torch.arange(batch_size, device=last_hidden_state.device), sequence_lengths].detach().cpu().numpy() else: + outputs = last_hidden_state.detach().cpu().numpy() embeddings = outputs[:, 0, :] if self.l2_norm: embeddings = normalize(embeddings, norm='l2') diff --git a/pyserini/encode/query.py b/pyserini/encode/query.py index c610b56cc..ddf888813 100644 --- a/pyserini/encode/query.py +++ b/pyserini/encode/query.py @@ -24,7 +24,7 @@ from tqdm import tqdm -def init_encoder(encoder, device, pooling, l2_norm, prefix): +def init_encoder(encoder, device, pooling, l2_norm, prefix, fp16=False, padding_side='right'): if 'dpr' in encoder.lower(): return DprQueryEncoder(encoder, device=device) elif 'tct' in encoder.lower(): @@ -44,7 +44,7 @@ def init_encoder(encoder, device, pooling, l2_norm, prefix): elif 'arctic' in encoder.lower(): return ArcticQueryEncoder(encoder, device=device) else: - return AutoQueryEncoder(encoder, device=device, pooling=pooling, l2_norm=l2_norm, prefix=prefix) + return AutoQueryEncoder(encoder, device=device, pooling=pooling, l2_norm=l2_norm, prefix=prefix, fp16=fp16, padding_side=padding_side) if __name__ == '__main__': @@ -56,12 +56,14 @@ def init_encoder(encoder, device, pooling, l2_norm, prefix): parser.add_argument('--output', type=str, help='path to stored encoded queries', required=True) parser.add_argument('--device', type=str, help='device cpu or cuda [cuda:0, cuda:1...]', default='cpu', required=False) parser.add_argument('--max-length', type=int, help='max length', default=256, required=False) - parser.add_argument('--pooling', type=str, help='pooling strategy', default='cls', choices=['cls', 'mean'], required=False) + parser.add_argument('--pooling', type=str, help='pooling strategy', default='cls', choices=['cls', 'mean', 'eos'], required=False) parser.add_argument('--l2-norm', action='store_true', help='whether to normalize embedding', default=False, required=False) parser.add_argument('--prefx', type=str, help='prefix query input', default=None, required=False) + parser.add_argument('--fp16', action='store_true', help='use fp16 for query embeddings', default=False, required=False) + parser.add_argument('--padding-side', type=str, default='right', choices=['left', 'right'], help='padding side for the tokenizer', required=False) args = parser.parse_args() - encoder = init_encoder(args.encoder, device=args.device, pooling=args.pooling, l2_norm=args.l2_norm, prefix=args.prefx) + encoder = init_encoder(args.encoder, device=args.device, pooling=args.pooling, l2_norm=args.l2_norm, prefix=args.prefx, fp16=args.fp16, padding_side=args.padding_side) query_iterator = DefaultQueryIterator.from_topics(args.topics) is_sparse = False diff --git a/pyserini/prebuilt_index_info.py b/pyserini/prebuilt_index_info.py index 24b2f1937..209322ffb 100644 --- a/pyserini/prebuilt_index_info.py +++ b/pyserini/prebuilt_index_info.py @@ -1418,6 +1418,23 @@ def import_from_lucene(enum): "core18": TF_INDEX_INFO_OTHER["wapo.v2"], } +TF_INDEX_INFO_BROWSECOMP_PLUS = { + "browsecomp-plus.bm25": { + "description": "Lucene index of the BrowseComp-Plus corpus. See https://texttron.github.io/BrowseComp-Plus/", + "filename": "lucene-inverted.browsecomp-plus.bm25.20250810.tar.gz", + "readme": "lucene-inverted.browsecomp-plus.bm25.20250810.README.md", + "urls": [ + "https://huggingface.co/datasets/castorini/prebuilt-indexes-browsecomp-plus/resolve/main/lucene-inverted/lucene-inverted.browsecomp-plus.bm25.20250810.tar.gz" + ], + "md5": "c9c3a69fe2725016f35e8c8523bdf828", + "size compressed (bytes)": 1783466040, + "total_terms": 373478034, + "documents": 100195, + "unique_terms": 4152345, + "downloaded": False + }, +} + TF_INDEX_INFO = {**TF_INDEX_INFO_MSMARCO, **TF_INDEX_INFO_MSMARCO_ALIASES, **TF_INDEX_INFO_BEIR, @@ -1426,6 +1443,7 @@ def import_from_lucene(enum): **TF_INDEX_INFO_MRTYDI_ALIASES, **TF_INDEX_INFO_MIRACL, **TF_INDEX_INFO_CIRAL, + **TF_INDEX_INFO_BROWSECOMP_PLUS, **TF_INDEX_INFO_OTHER, **TF_INDEX_INFO_OTHER_ALIASES} @@ -6010,6 +6028,22 @@ def import_from_lucene(enum): } } +FAISS_INDEX_INFO_BROWSECOMP_PLUS = { + "browsecomp-plus.qwen3-embedding-8b": { + "description": "Faiss flat index of the BrowseComp-Plus corpus encoded by Qwen3-Embedding-8B. See https://texttron.github.io/BrowseComp-Plus/", + "filename": "faiss-flat.browsecomp-plus.qwen3-embedding-8b.20250810.tar.gz", + "readme": "faiss-flat.browsecomp-plus.qwen3-embedding-8b.20250810.README.md", + "urls": [ + "https://huggingface.co/datasets/castorini/prebuilt-indexes-browsecomp-plus/resolve/main/faiss-flat/faiss-flat.browsecomp-plus.qwen3-embedding-8b.20250810.tar.gz" + ], + "md5": "2b3faad654414787c2438c9dfcd5500c", + "size compressed (bytes)": 1521405052, + "documents": 100195, + "downloaded": False, + "texts": "browsecomp-plus.bm25" + }, +} + FAISS_INDEX_INFO_OTHER = { "cast2019-tct_colbert-v2.hnsw": { "description": "Faiss HNSW index of the CAsT2019 passage corpus encoded by the tct_colbert-v2 passage encoder", @@ -6347,4 +6381,5 @@ def import_from_lucene(enum): **FAISS_INDEX_INFO_WIKIPEDIA, **FAISS_INDEX_INFO_CIRAL, **FAISS_INDEX_INFO_M_BEIR, + **FAISS_INDEX_INFO_BROWSECOMP_PLUS, **FAISS_INDEX_INFO_OTHER} diff --git a/pyserini/resources/index-metadata/faiss-flat.browsecomp-plus.qwen3-embedding-8b.20250810.README.md b/pyserini/resources/index-metadata/faiss-flat.browsecomp-plus.qwen3-embedding-8b.20250810.README.md new file mode 100644 index 000000000..1abcfaaee --- /dev/null +++ b/pyserini/resources/index-metadata/faiss-flat.browsecomp-plus.qwen3-embedding-8b.20250810.README.md @@ -0,0 +1,5 @@ +# browsecomp-plus.qwen3-embedding-8b + +Faiss flat index of the [BrowseComp-Plus](https://texttron.github.io/BrowseComp-Plus/) corpus encoded by Qwen3-Embedding-8B. + +This was generated on 2025/08/10 following [this guide](https://github.com/texttron/tevatron/tree/main/examples/BrowseComp-Plus). diff --git a/pyserini/resources/index-metadata/lucene-inverted.browsecomp-plus.bm25.20250810.README.md b/pyserini/resources/index-metadata/lucene-inverted.browsecomp-plus.bm25.20250810.README.md new file mode 100644 index 000000000..ada2a1947 --- /dev/null +++ b/pyserini/resources/index-metadata/lucene-inverted.browsecomp-plus.bm25.20250810.README.md @@ -0,0 +1,12 @@ +# browsecomp-plus.bm25 + +BM25 index of the [BrowseComp-Plus](https://texttron.github.io/BrowseComp-Plus/) corpus. + +This was generated on 2025/08/10 at commit [a6a45da](https://github.com/castorini/pyserini/commit/a6a45dac5651b1236ae2f3e2ee3b0e71dd403e23) on `basilisk` with the following command: + +``` +python -m pyserini.index.lucene --collection JsonCollection --input corpus/browsecomp-plus/ --index indexes/bm25 --generator DefaultLuceneDocumentGenerator --threads 32 --storeRaw +``` + +where the corpus can be found at https://huggingface.co/datasets/Tevatron/browsecomp-plus-corpus. + diff --git a/pyserini/search/faiss/__main__.py b/pyserini/search/faiss/__main__.py index 3554b6ae9..916465915 100644 --- a/pyserini/search/faiss/__main__.py +++ b/pyserini/search/faiss/__main__.py @@ -89,7 +89,7 @@ def define_dsearch_args(parser): metavar="pooling strategy", required=False, default="cls", - choices=["cls", "mean"], + choices=["cls", "mean", "eos"], help="Pooling strategy for query encoder", ) parser.add_argument( @@ -226,6 +226,15 @@ def define_dsearch_args(parser): default=None, help="Set efSearch for HNSW index", ) + parser.add_argument( + "--padding-side", + type=str, + metavar="left or right", + required=False, + default="right", + choices=["left", "right"], + help="Padding side for the tokenizer", + ) def init_query_encoder( @@ -242,6 +251,7 @@ def init_query_encoder( multimodal=False, instruction_config=None, fp16=False, + padding_side='right', ): encoded_queries_map = { "msmarco-passage-dev-subset": "tct_colbert-msmarco-passage-dev-subset", @@ -289,7 +299,7 @@ def init_query_encoder( if _encoder_class == "openai-api" or "openai" in encoder: kwargs.update(dict(max_length=max_length)) if _encoder_class == "auto": - kwargs.update(dict(pooling=pooling, l2_norm=l2_norm, prefix=prefix)) + kwargs.update(dict(pooling=pooling, l2_norm=l2_norm, prefix=prefix, padding_side=padding_side)) if _encoder_class == "clip" or "clip" in encoder: kwargs.update(dict(l2_norm=True, prefix=prefix, multimodal=multimodal)) if _encoder_class == "uniir": @@ -434,7 +444,8 @@ def init_query_encoder( args.query_prefix, args.multimodal, args.instruction_config, - args.fp16 + args.fp16, + args.padding_side ) if args.pca_model: query_encoder = PcaEncoder(query_encoder, args.pca_model) diff --git a/scripts/generate_docs_from_prebuilt_indexes.py b/scripts/generate_docs_from_prebuilt_indexes.py index a771d42c0..81db1c715 100644 --- a/scripts/generate_docs_from_prebuilt_indexes.py +++ b/scripts/generate_docs_from_prebuilt_indexes.py @@ -125,6 +125,7 @@ def generate_prebuilt(index): print('
') print('Other') + generate_prebuilt(TF_INDEX_INFO_BROWSECOMP_PLUS) generate_prebuilt(TF_INDEX_INFO_CIRAL) generate_prebuilt(TF_INDEX_INFO_OTHER) print('
') @@ -187,6 +188,11 @@ def generate_prebuilt(index): generate_prebuilt(FAISS_INDEX_INFO_BRIGHT) print('
') + print('
') + print('BrowseComp-Plus') + generate_prebuilt(FAISS_INDEX_INFO_BROWSECOMP_PLUS) + print('
') + print('
') print('Mr.TyDi') generate_prebuilt(FAISS_INDEX_INFO_MRTYDI) diff --git a/tests/core/test_prebuilt_index.py b/tests/core/test_prebuilt_index.py index 2414b41b1..23115d794 100644 --- a/tests/core/test_prebuilt_index.py +++ b/tests/core/test_prebuilt_index.py @@ -114,6 +114,18 @@ def test_lucene_tf_ciral(self): self.assertEqual(cnt, 8) self._test_urls(urls) + def test_lucene_tf_browsecomp_plus(self): + urls = [] + cnt = 0 + for key in TF_INDEX_INFO: + if 'browsecomp-plus' in key: + cnt += 1 + for url in TF_INDEX_INFO[key]['urls']: + urls.append(url) + + self.assertEqual(cnt, 1) + self._test_urls(urls) + def test_lucene_impact_msmarco(self): urls = [] cnt = 0 @@ -237,6 +249,18 @@ def test_faiss_bright(self): self.assertEqual(cnt, 12) self._test_urls(urls) + def test_faiss_browsecomp_plus(self): + urls = [] + cnt = 0 + for key in FAISS_INDEX_INFO: + if 'browsecomp-plus' in key: + cnt += 1 + for url in FAISS_INDEX_INFO[key]['urls']: + urls.append(url) + + self.assertEqual(cnt, 1) + self._test_urls(urls) + def test_faiss_mrtydi(self): urls = [] cnt = 0