diff --git a/docs/cache-directories.md b/docs/cache-directories.md new file mode 100644 index 0000000000..96e53912e0 --- /dev/null +++ b/docs/cache-directories.md @@ -0,0 +1,40 @@ +# Cache Directories in Anserini + +Anserini uses cache directories for storing various resources, such as: + +- Pre-built indexes +- Encoder models +- Topics and qrels files + +By default, these are stored in your home directory under `~/.cache/pyserini/`. + +## Default Cache Paths + +- Indexes: `~/.cache/pyserini/indexes/` +- Encoders: `~/.cache/pyserini/encoders/` +- Topics and Qrels: `~/.cache/pyserini/topics-and-qrels/` + +## Customizing Cache Directories + +You can customize these paths using environment variables or the equivalent Java system properties (`anserini.index.cache`, `anserini.encoder.cache`, `anserini.topics.cache`), e.g., by passing `-Danserini.index.cache=/path/to/custom/index/cache` to the JVM. + +### Using Environment Variables + +```sh +# Set custom cache directory for indexes +export ANSERINI_INDEX_CACHE=/path/to/custom/index/cache + +# Set custom cache directory for encoders +export ANSERINI_ENCODER_CACHE=/path/to/custom/encoder/cache + +# Set custom cache directory for topics and qrels +export ANSERINI_TOPICS_CACHE=/path/to/custom/topics/cache +``` + +## Fallback Order + +When resolving cache directories, Anserini checks for locations in the following order: + +1. System property (e.g., `anserini.index.cache`) +2. Environment variable (e.g., `ANSERINI_INDEX_CACHE`) +3. Default location in the user home directory \ No newline at end of file diff --git a/docs/prebuilt-indexes.md b/docs/prebuilt-indexes.md index 418f156e64..c57926e203 100644 --- a/docs/prebuilt-indexes.md +++ b/docs/prebuilt-indexes.md @@ -19,8 +19,8 @@ The HNSW indexes for dense retrieval models are even larger, for example, the Ar The prebuilt indexes are automatically downloaded to `~/.cache/pyserini/indexes/`, which may not be the best location for you. (Yes, `pyserini`; this is so prebuilt indexes from both Pyserini and Anserini can live in the same location.) -Currently, this path is hard-coded (see [Anserini #2322](https://github.com/castorini/anserini/issues/2322)). 
-If you want to change the download location, the current workaround is to use symlinks, i.e., symlink `~/.cache/pyserini/indexes/` to the actual path you desire. + +You can customize the location of the cache directory using environment variables or system properties. See the [cache directories documentation](cache-directories.md) for detailed information on how to customize where Anserini stores its cached resources. ## Managing Indexes Manually diff --git a/src/main/java/io/anserini/encoder/OnnxEncoder.java b/src/main/java/io/anserini/encoder/OnnxEncoder.java index 0971e002d6..21c3961a36 100644 --- a/src/main/java/io/anserini/encoder/OnnxEncoder.java +++ b/src/main/java/io/anserini/encoder/OnnxEncoder.java @@ -31,10 +31,9 @@ import ai.onnxruntime.OrtEnvironment; import ai.onnxruntime.OrtException; import ai.onnxruntime.OrtSession; +import io.anserini.util.CacheUtils; public abstract class OnnxEncoder { - private static final String CACHE_DIR = Path.of(System.getProperty("user.home"), ".cache", "pyserini", "encoders").toString(); - protected final BertFullTokenizer tokenizer; protected final DefaultVocabulary vocab; @@ -53,11 +52,7 @@ static protected Path getVocabPath(String vocabName, String vocabURL) throws URI } static protected String getCacheDir() { - File cacheDir = new File(CACHE_DIR); - if (!cacheDir.exists()) { - cacheDir.mkdir(); - } - return cacheDir.getPath(); + return CacheUtils.getEncodersCache(); } static protected Path getModelPath(String modelName, String modelURL) throws IOException, URISyntaxException { diff --git a/src/main/java/io/anserini/eval/RelevanceJudgments.java b/src/main/java/io/anserini/eval/RelevanceJudgments.java index 430145bd60..ecab04c810 100644 --- a/src/main/java/io/anserini/eval/RelevanceJudgments.java +++ b/src/main/java/io/anserini/eval/RelevanceJudgments.java @@ -31,10 +31,10 @@ import java.util.Set; import org.apache.commons.io.FileUtils; +import io.anserini.util.CacheUtils; public class RelevanceJudgments { final 
private Map> qrels; - static private final String CACHE_DIR = Path.of(System.getProperty("user.home"), ".cache", "pyserini", "topics-and-qrels").toString(); final private static String SERVER_PATH = "https://raw.githubusercontent.com/castorini/anserini-tools/master/topics-and-qrels/"; public static RelevanceJudgments fromQrels(Qrels qrels) throws IOException { @@ -118,11 +118,7 @@ public Map getDocMap(String qid) { } private static String getCacheDir() { - File cacheDir = new File(CACHE_DIR); - if (!cacheDir.exists()) { - cacheDir.mkdir(); - } - return cacheDir.getPath(); + return CacheUtils.getTopicsAndQrelsCache(); } /** diff --git a/src/main/java/io/anserini/search/topicreader/TopicReader.java b/src/main/java/io/anserini/search/topicreader/TopicReader.java index 4a24268873..bd4c619480 100755 --- a/src/main/java/io/anserini/search/topicreader/TopicReader.java +++ b/src/main/java/io/anserini/search/topicreader/TopicReader.java @@ -38,6 +38,7 @@ import org.apache.commons.io.FileUtils; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; +import io.anserini.util.CacheUtils; /** * A reader of topics, i.e., information needs or queries, in a variety of standard formats. 
@@ -46,7 +47,6 @@ */ public abstract class TopicReader { private static final Logger LOG = LogManager.getLogger(SearchCollection.class); - private static final String CACHE_DIR = Path.of(System.getProperty("user.home"), ".cache", "pyserini", "topics-and-qrels").toString(); private static final String SERVER_PATH = "https://raw.githubusercontent.com/castorini/anserini-tools/master/topics-and-qrels/"; private static final Map> TOPIC_FILE_TO_TYPE = new HashMap<>(); @@ -211,11 +211,7 @@ public static Map> getTopicsWithStringIdsFromFileWit } private static String getCacheDir() { - File cacheDir = new File(CACHE_DIR); - if (!cacheDir.exists()) { - cacheDir.mkdir(); - } - return cacheDir.getPath(); + return CacheUtils.getTopicsAndQrelsCache(); } /** diff --git a/src/main/java/io/anserini/util/CacheUtils.java b/src/main/java/io/anserini/util/CacheUtils.java new file mode 100644 index 0000000000..b206caac30 --- /dev/null +++ b/src/main/java/io/anserini/util/CacheUtils.java @@ -0,0 +1,98 @@ +/* + * Anserini: A Lucene toolkit for reproducible information retrieval research + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.anserini.util; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Path; + +/** + * Mini utility class for handling cache directories in Anserini. + * Fallback to user's home directory for cache. 
+ */ +public class CacheUtils { + private static final String BASE_CACHE_DIR = Path.of(System.getProperty("user.home"), ".cache", "pyserini").toString(); + + public static final String INDEXES_DIR = "indexes"; + public static final String INDEXES_CACHE_PROPERTY = "anserini.index.cache"; + public static final String INDEXES_CACHE_ENV = "ANSERINI_INDEX_CACHE"; + + public static final String ENCODERS_DIR = "encoders"; + public static final String ENCODERS_CACHE_PROPERTY = "anserini.encoder.cache"; + public static final String ENCODERS_CACHE_ENV = "ANSERINI_ENCODER_CACHE"; + + public static final String TOPICS_QRELS_DIR = "topics-and-qrels"; + public static final String TOPICS_QRELS_CACHE_PROPERTY = "anserini.topics.cache"; + public static final String TOPICS_QRELS_CACHE_ENV = "ANSERINI_TOPICS_CACHE"; + + /** + * Gets the cache directory for indexes, creating it if it does not exist. + * @return Path to the indexes cache directory + * @throws java.io.UncheckedIOException if the cache directory cannot be created + */ + public static String getIndexesCache() { + return getCacheDir(INDEXES_CACHE_PROPERTY, INDEXES_CACHE_ENV, Path.of(BASE_CACHE_DIR, INDEXES_DIR).toString()); + } + + /** + * Gets the cache directory for encoders, creating it if it does not exist. + * @return Path to the encoders cache directory + * @throws java.io.UncheckedIOException if the cache directory cannot be created + */ + public static String getEncodersCache() { + return getCacheDir(ENCODERS_CACHE_PROPERTY, ENCODERS_CACHE_ENV, Path.of(BASE_CACHE_DIR, ENCODERS_DIR).toString()); + } + + /** + * Gets the cache directory for topics and qrels, creating it if it does not exist. + * @return Path to the topics and qrels cache directory + * @throws java.io.UncheckedIOException if the cache directory cannot be created + */ + public static String getTopicsAndQrelsCache() { + return getCacheDir(TOPICS_QRELS_CACHE_PROPERTY, TOPICS_QRELS_CACHE_ENV, + Path.of(BASE_CACHE_DIR, TOPICS_QRELS_DIR).toString()); + } + + /** + * Resolves a cache directory (system property, then environment variable, then default) and creates it if missing. 
+ * @param propertyName System property name to check first + * @param envVarName Environment variable name to check second + * @param defaultPath Default path to use if neither property nor env var is set (null or empty counts as unset) + * @return The resolved cache directory path + * @throws java.io.UncheckedIOException if the cache directory cannot be created (unchecked so that callers need not declare IOException) + */ + private static String getCacheDir(String propertyName, String envVarName, String defaultPath) { + String cacheDir = System.getProperty(propertyName); + + if (cacheDir == null || cacheDir.isEmpty()) { + cacheDir = System.getenv(envVarName); + } + + if (cacheDir == null || cacheDir.isEmpty()) { + cacheDir = defaultPath; + } + + File cacheDirFile = new File(cacheDir); + if (!cacheDirFile.exists()) { + if (!cacheDirFile.mkdirs() && !cacheDirFile.exists()) { // mkdirs() is also false when the directory already exists, so re-check before failing + throw new java.io.UncheckedIOException(new IOException("Failed to create cache directory: " + cacheDir + "\n Check that you have write permissions to the directory.")); + } + } + + return cacheDir; + } +} \ No newline at end of file diff --git a/src/main/java/io/anserini/util/PrebuiltIndexHandler.java b/src/main/java/io/anserini/util/PrebuiltIndexHandler.java index d807286d96..94094c4e7b 100644 --- a/src/main/java/io/anserini/util/PrebuiltIndexHandler.java +++ b/src/main/java/io/anserini/util/PrebuiltIndexHandler.java @@ -23,6 +23,7 @@ import org.apache.commons.io.input.CountingInputStream; import io.anserini.index.IndexInfo; +import io.anserini.util.CacheUtils; import java.io.FileOutputStream; import java.io.IOException; @@ -35,10 +36,6 @@ import java.nio.file.Path; public class PrebuiltIndexHandler { - private static final String DEFAULT_CACHE_DIR = Path.of(System.getProperty("user.home"), ".cache", "pyserini", "indexes").toString(); - private static final String CACHE_DIR_PROPERTY = "anserini.index.cache"; - private static final String CACHE_DIR_ENV = "ANSERINI_INDEX_CACHE"; - private String indexName; private String saveRootPath; private IndexInfo info = null; @@ -69,17 +66,7 @@ public 
PrebuiltIndexHandler(String indexName, String cacheDir) { } private String getCache() { - String cacheDir = System.getProperty(CACHE_DIR_PROPERTY); - - if (cacheDir == null || cacheDir.isEmpty()) { - cacheDir = System.getenv(CACHE_DIR_ENV); - } - - if (cacheDir == null || cacheDir.isEmpty()) { - cacheDir = DEFAULT_CACHE_DIR; - } - - return cacheDir; + return CacheUtils.getIndexesCache(); } private static boolean checkFileExist(Path path) { diff --git a/src/test/java/io/anserini/encoder/EncoderInferenceTest.java b/src/test/java/io/anserini/encoder/EncoderInferenceTest.java index 472588c592..6802244b90 100644 --- a/src/test/java/io/anserini/encoder/EncoderInferenceTest.java +++ b/src/test/java/io/anserini/encoder/EncoderInferenceTest.java @@ -25,22 +25,13 @@ import org.apache.commons.io.FileUtils; public abstract class EncoderInferenceTest { - private static final String CACHE_DIR = Path.of(System.getProperty("user.home"), ".cache", "pyserini", "encoders").toString(); protected String modelName; protected String modelUrl; protected Object[][] examples; protected Object[][] longExamples; - protected String getCacheDir() { - File cacheDir = new File(CACHE_DIR); - if (!cacheDir.exists()) { - cacheDir.mkdir(); - } - return cacheDir.getPath(); - } - protected Path getEncoderModelPath() throws IOException, URISyntaxException { - File modelFile = new File(getCacheDir(), modelName); + File modelFile = new File(OnnxEncoder.getCacheDir(), modelName); FileUtils.copyURLToFile(new URI(modelUrl).toURL(), modelFile); return modelFile.toPath(); } diff --git a/src/test/java/io/anserini/encoder/SpladeEncoderTokenizationTest.java b/src/test/java/io/anserini/encoder/SpladeEncoderTokenizationTest.java index 07d7b58886..0eb1224d36 100644 --- a/src/test/java/io/anserini/encoder/SpladeEncoderTokenizationTest.java +++ b/src/test/java/io/anserini/encoder/SpladeEncoderTokenizationTest.java @@ -33,8 +33,7 @@ import static org.junit.Assert.assertArrayEquals; public class 
SpladeEncoderTokenizationTest { - private static final String CACHE_DIR = Path.of(System.getProperty("user.home"), ".cache", "pyserini", "encoders").toString(); - private static final String VOCAB_URL = "https://rgw.cs.uwaterloo.ca/pyserini/data/wordpiece-vocab.txt"; + static private final String VOCAB_URL = "https://rgw.cs.uwaterloo.ca/pyserini/data/bert-base-uncased-vocab.txt"; Object[][] examples = new Object[][] { { "which hormone increases calcium levels in the blood?", @@ -78,20 +77,6 @@ public class SpladeEncoderTokenizationTest { 1029, 102 } }, }; - static private String getCacheDir() { - File cacheDir = new File(CACHE_DIR); - if (!cacheDir.exists()) { - cacheDir.mkdir(); - } - return cacheDir.getPath(); - } - - static private Path getVocabPath() throws IOException, URISyntaxException { - File vocabFile = new File(getCacheDir(), "UnicoilVocab.txt"); - FileUtils.copyURLToFile(new URI(VOCAB_URL).toURL(), vocabFile); - return vocabFile.toPath(); - } - @Test public void basic() throws Exception { DefaultVocabulary vocabulary = DefaultVocabulary.builder() @@ -119,4 +104,10 @@ private long[] convertTokensToIds(BertFullTokenizer tokenizer, List toke } return tokenIds; } + + static private Path getVocabPath() throws IOException, URISyntaxException { + File vocabFile = new File(OnnxEncoder.getCacheDir(), "splade-vocab.txt"); + FileUtils.copyURLToFile(new URI(VOCAB_URL).toURL(), vocabFile); + return vocabFile.toPath(); + } } \ No newline at end of file diff --git a/src/test/java/io/anserini/encoder/UniCoilEncoderTokenizationTest.java b/src/test/java/io/anserini/encoder/UniCoilEncoderTokenizationTest.java index 5915d37296..a7e8953fbc 100644 --- a/src/test/java/io/anserini/encoder/UniCoilEncoderTokenizationTest.java +++ b/src/test/java/io/anserini/encoder/UniCoilEncoderTokenizationTest.java @@ -33,7 +33,6 @@ import static org.junit.Assert.assertArrayEquals; public class UniCoilEncoderTokenizationTest { - private static final String CACHE_DIR = 
Path.of(System.getProperty("user.home"), ".cache", "pyserini", "encoders").toString(); static private final String VOCAB_URL = "https://rgw.cs.uwaterloo.ca/pyserini/data/wordpiece-vocab.txt"; Object[][] examples = new Object[][] { @@ -78,20 +77,6 @@ public class UniCoilEncoderTokenizationTest { 1029, 102 } }, }; - static private String getCacheDir() { - File cacheDir = new File(CACHE_DIR); - if (!cacheDir.exists()) { - cacheDir.mkdir(); - } - return cacheDir.getPath(); - } - - static private Path getVocabPath() throws IOException, URISyntaxException { - File vocabFile = new File(getCacheDir(), "unicoil-vocab.txt"); - FileUtils.copyURLToFile(new URI(VOCAB_URL).toURL(), vocabFile); - return vocabFile.toPath(); - } - @Test public void basic() throws Exception { DefaultVocabulary vocabulary = DefaultVocabulary.builder() @@ -119,4 +104,10 @@ private long[] convertTokensToIds(BertFullTokenizer tokenizer, List toke } return tokenIds; } + + static private Path getVocabPath() throws IOException, URISyntaxException { + File vocabFile = new File(OnnxEncoder.getCacheDir(), "unicoil-vocab.txt"); + FileUtils.copyURLToFile(new URI(VOCAB_URL).toURL(), vocabFile); + return vocabFile.toPath(); + } } \ No newline at end of file diff --git a/src/test/java/io/anserini/eval/RelevanceJudgmentsTest.java b/src/test/java/io/anserini/eval/RelevanceJudgmentsTest.java index d807ff5a9c..d4e9fafe5c 100644 --- a/src/test/java/io/anserini/eval/RelevanceJudgmentsTest.java +++ b/src/test/java/io/anserini/eval/RelevanceJudgmentsTest.java @@ -26,6 +26,8 @@ import java.io.IOException; import java.nio.file.Path; +import io.anserini.util.CacheUtils; + public class RelevanceJudgmentsTest{ public int getQrelsCount(RelevanceJudgments qrels) throws IOException{ @@ -1639,42 +1641,42 @@ public void testSymbolExpansion() throws IOException { Path expected; Path produced; - expected = Path.of(System.getProperty("user.home"), ".cache", "pyserini", "topics-and-qrels", "qrels.msmarco-passage.dev-subset.txt"); + 
expected = Path.of(CacheUtils.getTopicsAndQrelsCache(), "qrels.msmarco-passage.dev-subset.txt"); produced = RelevanceJudgments.getQrelsPath(Path.of("msmarco-passage.dev-subset")); assertNotNull(produced); assertEquals(expected, produced); - expected = Path.of(System.getProperty("user.home"), ".cache", "pyserini", "topics-and-qrels", "qrels.msmarco-v2-passage.dev2.txt"); + expected = Path.of(CacheUtils.getTopicsAndQrelsCache(), "qrels.msmarco-v2-passage.dev2.txt"); produced = RelevanceJudgments.getQrelsPath(Path.of("msmarco-v2-passage.dev2")); assertNotNull(produced); assertEquals(expected, produced); - expected = Path.of(System.getProperty("user.home"), ".cache", "pyserini", "topics-and-qrels", "qrels.miracl-v1.0-en-dev.tsv"); + expected = Path.of(CacheUtils.getTopicsAndQrelsCache(), "qrels.miracl-v1.0-en-dev.tsv"); produced = RelevanceJudgments.getQrelsPath(Path.of("miracl-v1.0-en-dev")); assertNotNull(produced); assertEquals(expected, produced); - expected = Path.of(System.getProperty("user.home"), ".cache", "pyserini", "topics-and-qrels", "qrels.covid-round3.txt"); + expected = Path.of(CacheUtils.getTopicsAndQrelsCache(), "qrels.covid-round3.txt"); produced = RelevanceJudgments.getQrelsPath(Path.of("covid-round3")); assertNotNull(produced); assertEquals(expected, produced); - expected = Path.of(System.getProperty("user.home"), ".cache", "pyserini", "topics-and-qrels", "qrels.ciral-v1.0-yo-test-a-pools.tsv"); + expected = Path.of(CacheUtils.getTopicsAndQrelsCache(), "qrels.ciral-v1.0-yo-test-a-pools.tsv"); produced = RelevanceJudgments.getQrelsPath(Path.of("ciral-v1.0-yo-test-a-pools")); assertNotNull(produced); assertEquals(expected, produced); - expected = Path.of(System.getProperty("user.home"), ".cache", "pyserini", "topics-and-qrels", "qrels.adhoc.151-200.txt"); + expected = Path.of(CacheUtils.getTopicsAndQrelsCache(), "qrels.adhoc.151-200.txt"); produced = RelevanceJudgments.getQrelsPath(Path.of("adhoc.151-200")); assertNotNull(produced); 
assertEquals(expected, produced); - expected = Path.of(System.getProperty("user.home"), ".cache", "pyserini", "topics-and-qrels", "qrels.microblog2012.txt"); + expected = Path.of(CacheUtils.getTopicsAndQrelsCache(), "qrels.microblog2012.txt"); produced = RelevanceJudgments.getQrelsPath(Path.of("microblog2012")); assertNotNull(produced); assertEquals(expected, produced); - expected = Path.of(System.getProperty("user.home"), ".cache", "pyserini", "topics-and-qrels", "qrels.terabyte04.701-750.txt"); + expected = Path.of(CacheUtils.getTopicsAndQrelsCache(), "qrels.terabyte04.701-750.txt"); produced = RelevanceJudgments.getQrelsPath(Path.of("terabyte04.701-750")); assertNotNull(produced); assertEquals(expected, produced);