This guide contains instructions for running BM25 baselines on HC4 (v1.0).
The HC4 corpus can be downloaded following the instructions here. After the download, verify that exactly the specified documents (no more and no fewer) have been downloaded by running the code provided here.
With the corpus downloaded, we need to create three separate folders, one for each of the three languages (Persian, Chinese, and Russian), and unpack the data into the respective folder for each language:
mkdir collections/hc4-v1.0-fa collections/hc4-v1.0-zh collections/hc4-v1.0-ru

We can now index these documents as a NeuClirCollection using the Anserini bindings from Pyserini:
python -m pyserini.index.lucene --collection NeuClirCollection \
--input collections/hc4-v1.0-zh --index indexes/lucene-index.hc4-v1.0-zh \
--generator DefaultLuceneDocumentGenerator --threads 8 \
--storePositions --storeDocvectors --storeRaw -language zh \
>& logs/log.hc4-v1.0-zh &
python -m pyserini.index.lucene --collection NeuClirCollection \
--input collections/hc4-v1.0-fa --index indexes/lucene-index.hc4-v1.0-fa \
--generator DefaultLuceneDocumentGenerator --threads 8 \
--storePositions --storeDocvectors --storeRaw -language fa \
>& logs/log.hc4-v1.0-fa &
python -m pyserini.index.lucene --collection NeuClirCollection \
--input collections/hc4-v1.0-ru --index indexes/lucene-index.hc4-v1.0-ru \
--generator DefaultLuceneDocumentGenerator --threads 8 \
--storePositions --storeDocvectors --storeRaw -language ru \
>& logs/log.hc4-v1.0-ru &

Condition: Title (retrieval, test topics)
python -m pyserini.search.lucene --index hc4-v1.0-zh \
--topics hc4-v1.0-zh-test-title \
--output runs/run.hc4-v1.0-zh.bm25.topics.hc4-v1.0-zh.test.title.txt \
--bm25 --language zh
python -m pyserini.search.lucene --index hc4-v1.0-fa \
--topics hc4-v1.0-fa-test-title \
--output runs/run.hc4-v1.0-fa.bm25.topics.hc4-v1.0-fa.test.title.txt \
--bm25 --language fa
python -m pyserini.search.lucene --index hc4-v1.0-ru \
--topics hc4-v1.0-ru-test-title \
--output runs/run.hc4-v1.0-ru.bm25.topics.hc4-v1.0-ru.test.title.txt \
--bm25 --language ru
python -m pyserini.search.lucene --index hc4-v1.0-zh \
--topics hc4-v1.0-zh-test-title \
--output runs/run.hc4-v1.0-zh.bm25-default+rm3.topics.hc4-v1.0-zh.test.title.txt \
--bm25 --rm3 --language zh
python -m pyserini.search.lucene --index hc4-v1.0-fa \
--topics hc4-v1.0-fa-test-title \
--output runs/run.hc4-v1.0-fa.bm25-default+rm3.topics.hc4-v1.0-fa.test.title.txt \
--bm25 --rm3 --language fa
python -m pyserini.search.lucene --index hc4-v1.0-ru \
--topics hc4-v1.0-ru-test-title \
--output runs/run.hc4-v1.0-ru.bm25-default+rm3.topics.hc4-v1.0-ru.test.title.txt \
--bm25 --rm3 --language ru
python -m pyserini.search.lucene --index hc4-v1.0-zh \
--topics hc4-v1.0-zh-test-title \
--output runs/run.hc4-v1.0-zh.bm25-default+rocchio.topics.hc4-v1.0-zh.test.title.txt \
--bm25 --rocchio --language zh
python -m pyserini.search.lucene --index hc4-v1.0-fa \
--topics hc4-v1.0-fa-test-title \
--output runs/run.hc4-v1.0-fa.bm25-default+rocchio.topics.hc4-v1.0-fa.test.title.txt \
--bm25 --rocchio --language fa
python -m pyserini.search.lucene --index hc4-v1.0-ru \
--topics hc4-v1.0-ru-test-title \
--output runs/run.hc4-v1.0-ru.bm25-default+rocchio.topics.hc4-v1.0-ru.test.title.txt \
--bm25 --rocchio --language ru

Condition: Description (retrieval, test topics)
python -m pyserini.search.lucene --index hc4-v1.0-zh \
--topics hc4-v1.0-zh-test-description \
--output runs/run.hc4-v1.0-zh.bm25.topics.hc4-v1.0-zh.test.description.txt \
--bm25 --language zh
python -m pyserini.search.lucene --index hc4-v1.0-fa \
--topics hc4-v1.0-fa-test-description \
--output runs/run.hc4-v1.0-fa.bm25.topics.hc4-v1.0-fa.test.description.txt \
--bm25 --language fa
python -m pyserini.search.lucene --index hc4-v1.0-ru \
--topics hc4-v1.0-ru-test-description \
--output runs/run.hc4-v1.0-ru.bm25.topics.hc4-v1.0-ru.test.description.txt \
--bm25 --language ru
python -m pyserini.search.lucene --index hc4-v1.0-zh \
--topics hc4-v1.0-zh-test-description \
--output runs/run.hc4-v1.0-zh.bm25-default+rm3.topics.hc4-v1.0-zh.test.description.txt \
--bm25 --rm3 --language zh
python -m pyserini.search.lucene --index hc4-v1.0-fa \
--topics hc4-v1.0-fa-test-description \
--output runs/run.hc4-v1.0-fa.bm25-default+rm3.topics.hc4-v1.0-fa.test.description.txt \
--bm25 --rm3 --language fa
python -m pyserini.search.lucene --index hc4-v1.0-ru \
--topics hc4-v1.0-ru-test-description \
--output runs/run.hc4-v1.0-ru.bm25-default+rm3.topics.hc4-v1.0-ru.test.description.txt \
--bm25 --rm3 --language ru
python -m pyserini.search.lucene --index hc4-v1.0-zh \
--topics hc4-v1.0-zh-test-description \
--output runs/run.hc4-v1.0-zh.bm25-default+rocchio.topics.hc4-v1.0-zh.test.description.txt \
--bm25 --rocchio --language zh
python -m pyserini.search.lucene --index hc4-v1.0-fa \
--topics hc4-v1.0-fa-test-description \
--output runs/run.hc4-v1.0-fa.bm25-default+rocchio.topics.hc4-v1.0-fa.test.description.txt \
--bm25 --rocchio --language fa
python -m pyserini.search.lucene --index hc4-v1.0-ru \
--topics hc4-v1.0-ru-test-description \
--output runs/run.hc4-v1.0-ru.bm25-default+rocchio.topics.hc4-v1.0-ru.test.description.txt \
--bm25 --rocchio --language ru

Condition: Title (evaluation, test topics)
python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.20 -m judged.20 -m recall.1000 hc4-v1.0-zh-test runs/run.hc4-v1.0-zh.bm25.topics.hc4-v1.0-zh.test.title.txt
python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.20 -m judged.20 -m recall.1000 hc4-v1.0-fa-test runs/run.hc4-v1.0-fa.bm25.topics.hc4-v1.0-fa.test.title.txt
python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.20 -m judged.20 -m recall.1000 hc4-v1.0-ru-test runs/run.hc4-v1.0-ru.bm25.topics.hc4-v1.0-ru.test.title.txt
python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.20 -m judged.20 -m recall.1000 hc4-v1.0-zh-test runs/run.hc4-v1.0-zh.bm25-default+rm3.topics.hc4-v1.0-zh.test.title.txt
python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.20 -m judged.20 -m recall.1000 hc4-v1.0-fa-test runs/run.hc4-v1.0-fa.bm25-default+rm3.topics.hc4-v1.0-fa.test.title.txt
python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.20 -m judged.20 -m recall.1000 hc4-v1.0-ru-test runs/run.hc4-v1.0-ru.bm25-default+rm3.topics.hc4-v1.0-ru.test.title.txt
python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.20 -m judged.20 -m recall.1000 hc4-v1.0-zh-test runs/run.hc4-v1.0-zh.bm25-default+rocchio.topics.hc4-v1.0-zh.test.title.txt
python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.20 -m judged.20 -m recall.1000 hc4-v1.0-fa-test runs/run.hc4-v1.0-fa.bm25-default+rocchio.topics.hc4-v1.0-fa.test.title.txt
python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.20 -m judged.20 -m recall.1000 hc4-v1.0-ru-test runs/run.hc4-v1.0-ru.bm25-default+rocchio.topics.hc4-v1.0-ru.test.title.txt

Condition: Description (evaluation, test topics)
python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.20 -m judged.20 -m recall.1000 hc4-v1.0-zh-test runs/run.hc4-v1.0-zh.bm25.topics.hc4-v1.0-zh.test.description.txt
python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.20 -m judged.20 -m recall.1000 hc4-v1.0-fa-test runs/run.hc4-v1.0-fa.bm25.topics.hc4-v1.0-fa.test.description.txt
python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.20 -m judged.20 -m recall.1000 hc4-v1.0-ru-test runs/run.hc4-v1.0-ru.bm25.topics.hc4-v1.0-ru.test.description.txt
python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.20 -m judged.20 -m recall.1000 hc4-v1.0-zh-test runs/run.hc4-v1.0-zh.bm25-default+rm3.topics.hc4-v1.0-zh.test.description.txt
python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.20 -m judged.20 -m recall.1000 hc4-v1.0-fa-test runs/run.hc4-v1.0-fa.bm25-default+rm3.topics.hc4-v1.0-fa.test.description.txt
python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.20 -m judged.20 -m recall.1000 hc4-v1.0-ru-test runs/run.hc4-v1.0-ru.bm25-default+rm3.topics.hc4-v1.0-ru.test.description.txt
python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.20 -m judged.20 -m recall.1000 hc4-v1.0-zh-test runs/run.hc4-v1.0-zh.bm25-default+rocchio.topics.hc4-v1.0-zh.test.description.txt
python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.20 -m judged.20 -m recall.1000 hc4-v1.0-fa-test runs/run.hc4-v1.0-fa.bm25-default+rocchio.topics.hc4-v1.0-fa.test.description.txt
python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.20 -m judged.20 -m recall.1000 hc4-v1.0-ru-test runs/run.hc4-v1.0-ru.bm25-default+rocchio.topics.hc4-v1.0-ru.test.description.txt

Condition: Title (retrieval, dev topics)
python -m pyserini.search.lucene --index hc4-v1.0-zh \
--topics hc4-v1.0-zh-dev-title \
--output runs/run.hc4-v1.0-zh.bm25.topics.hc4-v1.0-zh.dev.title.txt \
--bm25 --language zh
python -m pyserini.search.lucene --index hc4-v1.0-fa \
--topics hc4-v1.0-fa-dev-title \
--output runs/run.hc4-v1.0-fa.bm25.topics.hc4-v1.0-fa.dev.title.txt \
--bm25 --language fa
python -m pyserini.search.lucene --index hc4-v1.0-ru \
--topics hc4-v1.0-ru-dev-title \
--output runs/run.hc4-v1.0-ru.bm25.topics.hc4-v1.0-ru.dev.title.txt \
--bm25 --language ru
python -m pyserini.search.lucene --index hc4-v1.0-zh \
--topics hc4-v1.0-zh-dev-title \
--output runs/run.hc4-v1.0-zh.bm25-default+rm3.topics.hc4-v1.0-zh.dev.title.txt \
--bm25 --rm3 --language zh
python -m pyserini.search.lucene --index hc4-v1.0-fa \
--topics hc4-v1.0-fa-dev-title \
--output runs/run.hc4-v1.0-fa.bm25-default+rm3.topics.hc4-v1.0-fa.dev.title.txt \
--bm25 --rm3 --language fa
python -m pyserini.search.lucene --index hc4-v1.0-ru \
--topics hc4-v1.0-ru-dev-title \
--output runs/run.hc4-v1.0-ru.bm25-default+rm3.topics.hc4-v1.0-ru.dev.title.txt \
--bm25 --rm3 --language ru
python -m pyserini.search.lucene --index hc4-v1.0-zh \
--topics hc4-v1.0-zh-dev-title \
--output runs/run.hc4-v1.0-zh.bm25-default+rocchio.topics.hc4-v1.0-zh.dev.title.txt \
--bm25 --rocchio --language zh
python -m pyserini.search.lucene --index hc4-v1.0-fa \
--topics hc4-v1.0-fa-dev-title \
--output runs/run.hc4-v1.0-fa.bm25-default+rocchio.topics.hc4-v1.0-fa.dev.title.txt \
--bm25 --rocchio --language fa
python -m pyserini.search.lucene --index hc4-v1.0-ru \
--topics hc4-v1.0-ru-dev-title \
--output runs/run.hc4-v1.0-ru.bm25-default+rocchio.topics.hc4-v1.0-ru.dev.title.txt \
--bm25 --rocchio --language ru

Condition: Description (retrieval, dev topics)
python -m pyserini.search.lucene --index hc4-v1.0-zh \
--topics hc4-v1.0-zh-dev-description \
--output runs/run.hc4-v1.0-zh.bm25.topics.hc4-v1.0-zh.dev.description.txt \
--bm25 --language zh
python -m pyserini.search.lucene --index hc4-v1.0-fa \
--topics hc4-v1.0-fa-dev-description \
--output runs/run.hc4-v1.0-fa.bm25.topics.hc4-v1.0-fa.dev.description.txt \
--bm25 --language fa
python -m pyserini.search.lucene --index hc4-v1.0-ru \
--topics hc4-v1.0-ru-dev-description \
--output runs/run.hc4-v1.0-ru.bm25.topics.hc4-v1.0-ru.dev.description.txt \
--bm25 --language ru
python -m pyserini.search.lucene --index hc4-v1.0-zh \
--topics hc4-v1.0-zh-dev-description \
--output runs/run.hc4-v1.0-zh.bm25-default+rm3.topics.hc4-v1.0-zh.dev.description.txt \
--bm25 --rm3 --language zh
python -m pyserini.search.lucene --index hc4-v1.0-fa \
--topics hc4-v1.0-fa-dev-description \
--output runs/run.hc4-v1.0-fa.bm25-default+rm3.topics.hc4-v1.0-fa.dev.description.txt \
--bm25 --rm3 --language fa
python -m pyserini.search.lucene --index hc4-v1.0-ru \
--topics hc4-v1.0-ru-dev-description \
--output runs/run.hc4-v1.0-ru.bm25-default+rm3.topics.hc4-v1.0-ru.dev.description.txt \
--bm25 --rm3 --language ru
python -m pyserini.search.lucene --index hc4-v1.0-zh \
--topics hc4-v1.0-zh-dev-description \
--output runs/run.hc4-v1.0-zh.bm25-default+rocchio.topics.hc4-v1.0-zh.dev.description.txt \
--bm25 --rocchio --language zh
python -m pyserini.search.lucene --index hc4-v1.0-fa \
--topics hc4-v1.0-fa-dev-description \
--output runs/run.hc4-v1.0-fa.bm25-default+rocchio.topics.hc4-v1.0-fa.dev.description.txt \
--bm25 --rocchio --language fa
python -m pyserini.search.lucene --index hc4-v1.0-ru \
--topics hc4-v1.0-ru-dev-description \
--output runs/run.hc4-v1.0-ru.bm25-default+rocchio.topics.hc4-v1.0-ru.dev.description.txt \
--bm25 --rocchio --language ru

Condition: Title (evaluation, dev topics)
python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.20 -m judged.20 -m recall.1000 hc4-v1.0-zh-dev runs/run.hc4-v1.0-zh.bm25.topics.hc4-v1.0-zh.dev.title.txt
python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.20 -m judged.20 -m recall.1000 hc4-v1.0-fa-dev runs/run.hc4-v1.0-fa.bm25.topics.hc4-v1.0-fa.dev.title.txt
python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.20 -m judged.20 -m recall.1000 hc4-v1.0-ru-dev runs/run.hc4-v1.0-ru.bm25.topics.hc4-v1.0-ru.dev.title.txt
python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.20 -m judged.20 -m recall.1000 hc4-v1.0-zh-dev runs/run.hc4-v1.0-zh.bm25-default+rm3.topics.hc4-v1.0-zh.dev.title.txt
python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.20 -m judged.20 -m recall.1000 hc4-v1.0-fa-dev runs/run.hc4-v1.0-fa.bm25-default+rm3.topics.hc4-v1.0-fa.dev.title.txt
python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.20 -m judged.20 -m recall.1000 hc4-v1.0-ru-dev runs/run.hc4-v1.0-ru.bm25-default+rm3.topics.hc4-v1.0-ru.dev.title.txt
python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.20 -m judged.20 -m recall.1000 hc4-v1.0-zh-dev runs/run.hc4-v1.0-zh.bm25-default+rocchio.topics.hc4-v1.0-zh.dev.title.txt
python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.20 -m judged.20 -m recall.1000 hc4-v1.0-fa-dev runs/run.hc4-v1.0-fa.bm25-default+rocchio.topics.hc4-v1.0-fa.dev.title.txt
python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.20 -m judged.20 -m recall.1000 hc4-v1.0-ru-dev runs/run.hc4-v1.0-ru.bm25-default+rocchio.topics.hc4-v1.0-ru.dev.title.txt

Condition: Description (evaluation, dev topics)
python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.20 -m judged.20 -m recall.1000 hc4-v1.0-zh-dev runs/run.hc4-v1.0-zh.bm25.topics.hc4-v1.0-zh.dev.description.txt
python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.20 -m judged.20 -m recall.1000 hc4-v1.0-fa-dev runs/run.hc4-v1.0-fa.bm25.topics.hc4-v1.0-fa.dev.description.txt
python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.20 -m judged.20 -m recall.1000 hc4-v1.0-ru-dev runs/run.hc4-v1.0-ru.bm25.topics.hc4-v1.0-ru.dev.description.txt
python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.20 -m judged.20 -m recall.1000 hc4-v1.0-zh-dev runs/run.hc4-v1.0-zh.bm25-default+rm3.topics.hc4-v1.0-zh.dev.description.txt
python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.20 -m judged.20 -m recall.1000 hc4-v1.0-fa-dev runs/run.hc4-v1.0-fa.bm25-default+rm3.topics.hc4-v1.0-fa.dev.description.txt
python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.20 -m judged.20 -m recall.1000 hc4-v1.0-ru-dev runs/run.hc4-v1.0-ru.bm25-default+rm3.topics.hc4-v1.0-ru.dev.description.txt
python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.20 -m judged.20 -m recall.1000 hc4-v1.0-zh-dev runs/run.hc4-v1.0-zh.bm25-default+rocchio.topics.hc4-v1.0-zh.dev.description.txt
python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.20 -m judged.20 -m recall.1000 hc4-v1.0-fa-dev runs/run.hc4-v1.0-fa.bm25-default+rocchio.topics.hc4-v1.0-fa.dev.description.txt
python -m pyserini.eval.trec_eval -c -m map -m ndcg_cut.20 -m judged.20 -m recall.1000 hc4-v1.0-ru-dev runs/run.hc4-v1.0-ru.bm25-default+rocchio.topics.hc4-v1.0-ru.dev.description.txt

With the above commands, you should be able to reproduce the following results: