Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions docs/cache-directories.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Cache Directories in Anserini

Anserini uses cache directories for storing various resources, such as:

- Pre-built indexes
- Encoder models
- Topics and qrels files

By default, these are stored in your home directory under `~/.cache/pyserini/`.

## Default Cache Paths

- Indexes: `~/.cache/pyserini/indexes/`
- Encoders: `~/.cache/pyserini/encoders/`
- Topics and Qrels: `~/.cache/pyserini/topics-and-qrels/`

## Customizing Cache Directories

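You can customize these paths using Java system properties or environment variables; if both are set, the system property takes precedence (see [Fallback Order](#fallback-order) below).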

### Using Environment Variables

```sh
# Set custom cache directory for indexes
export ANSERINI_INDEX_CACHE=/path/to/custom/index/cache

# Set custom cache directory for encoders
export ANSERINI_ENCODER_CACHE=/path/to/custom/encoder/cache

# Set custom cache directory for topics and qrels
export ANSERINI_TOPICS_CACHE=/path/to/custom/topics/cache
```

## Fallback Order

When resolving cache directories, Anserini checks for locations in the following order:

1. System property (e.g., `anserini.index.cache`)
2. Environment variable (e.g., `ANSERINI_INDEX_CACHE`)
3. Default location in the user home directory
4 changes: 2 additions & 2 deletions docs/prebuilt-indexes.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ The HNSW indexes for dense retrieval models are even larger, for example, the Ar

The prebuilt indexes are automatically downloaded to `~/.cache/pyserini/indexes/`, which may not be the best location for you.
(Yes, `pyserini`; this is so prebuilt indexes from both Pyserini and Anserini can live in the same location.)
Currently, this path is hard-coded (see [Anserini #2322](https://github.com/castorini/anserini/issues/2322)).
If you want to change the download location, the current workaround is to use symlinks, i.e., symlink `~/.cache/pyserini/indexes/` to the actual path you desire.

You can customize the location of the cache directory using environment variables or system properties. See the [cache directories documentation](cache-directories.md) for detailed information on how to customize where Anserini stores its cached resources.

## Managing Indexes Manually

Expand Down
9 changes: 2 additions & 7 deletions src/main/java/io/anserini/encoder/OnnxEncoder.java
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,9 @@
import ai.onnxruntime.OrtEnvironment;
import ai.onnxruntime.OrtException;
import ai.onnxruntime.OrtSession;
import io.anserini.util.CacheUtils;

public abstract class OnnxEncoder<T> {
private static final String CACHE_DIR = Path.of(System.getProperty("user.home"), ".cache", "pyserini", "encoders").toString();

protected final BertFullTokenizer tokenizer;

protected final DefaultVocabulary vocab;
Expand All @@ -53,11 +52,7 @@ static protected Path getVocabPath(String vocabName, String vocabURL) throws URI
}

/**
 * Returns the directory in which encoder resources are cached.
 *
 * <p>The lookup is delegated to {@link CacheUtils#getEncodersCache()}. That method declares a
 * checked {@code IOException}; it is rethrown unchecked here so that this method keeps its
 * original exception-free signature.</p>
 *
 * @return path of the encoders cache directory
 * @throws java.io.UncheckedIOException if the cache directory cannot be created
 */
static protected String getCacheDir() {
  try {
    return CacheUtils.getEncodersCache();
  } catch (IOException e) {
    throw new java.io.UncheckedIOException(e);
  }
}

static protected Path getModelPath(String modelName, String modelURL) throws IOException, URISyntaxException {
Expand Down
8 changes: 2 additions & 6 deletions src/main/java/io/anserini/eval/RelevanceJudgments.java
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,10 @@
import java.util.Set;

import org.apache.commons.io.FileUtils;
import io.anserini.util.CacheUtils;

public class RelevanceJudgments {
final private Map<String, Map<String, Integer>> qrels;
static private final String CACHE_DIR = Path.of(System.getProperty("user.home"), ".cache", "pyserini", "topics-and-qrels").toString();
final private static String SERVER_PATH = "https://raw.githubusercontent.com/castorini/anserini-tools/master/topics-and-qrels/";

public static RelevanceJudgments fromQrels(Qrels qrels) throws IOException {
Expand Down Expand Up @@ -118,11 +118,7 @@ public Map<String, Integer> getDocMap(String qid) {
}

/**
 * Returns the directory in which topics and qrels files are cached.
 *
 * <p>The lookup is delegated to {@link CacheUtils#getTopicsAndQrelsCache()}. That method
 * declares a checked {@code IOException}; it is rethrown unchecked here so that this method
 * keeps its original exception-free signature.</p>
 *
 * @return path of the topics-and-qrels cache directory
 * @throws java.io.UncheckedIOException if the cache directory cannot be created
 */
private static String getCacheDir() {
  try {
    return CacheUtils.getTopicsAndQrelsCache();
  } catch (IOException e) {
    throw new java.io.UncheckedIOException(e);
  }
}

/**
Expand Down
8 changes: 2 additions & 6 deletions src/main/java/io/anserini/search/topicreader/TopicReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
import org.apache.commons.io.FileUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import io.anserini.util.CacheUtils;

/**
* A reader of topics, i.e., information needs or queries, in a variety of standard formats.
Expand All @@ -46,7 +47,6 @@
*/
public abstract class TopicReader<K> {
private static final Logger LOG = LogManager.getLogger(SearchCollection.class);
private static final String CACHE_DIR = Path.of(System.getProperty("user.home"), ".cache", "pyserini", "topics-and-qrels").toString();
private static final String SERVER_PATH = "https://raw.githubusercontent.com/castorini/anserini-tools/master/topics-and-qrels/";
private static final Map<String, Class<? extends TopicReader>> TOPIC_FILE_TO_TYPE = new HashMap<>();

Expand Down Expand Up @@ -211,11 +211,7 @@ public static Map<String, Map<String, String>> getTopicsWithStringIdsFromFileWit
}

/**
 * Returns the directory in which topics and qrels files are cached.
 *
 * <p>The lookup is delegated to {@link CacheUtils#getTopicsAndQrelsCache()}. That method
 * declares a checked {@code IOException}; it is rethrown unchecked here so that this method
 * keeps its original exception-free signature.</p>
 *
 * @return path of the topics-and-qrels cache directory
 * @throws java.io.UncheckedIOException if the cache directory cannot be created
 */
private static String getCacheDir() {
  try {
    return CacheUtils.getTopicsAndQrelsCache();
  } catch (java.io.IOException e) {
    throw new java.io.UncheckedIOException(e);
  }
}

/**
Expand Down
98 changes: 98 additions & 0 deletions src/main/java/io/anserini/util/CacheUtils.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
/*
* Anserini: A Lucene toolkit for reproducible information retrieval research
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.anserini.util;

import java.io.File;
import java.io.IOException;
import java.nio.file.Path;

/**
 * Resolves the on-disk cache directories that Anserini uses for prebuilt indexes, encoder
 * resources, and topics/qrels files.
 *
 * <p>Each location is looked up in this order: a JVM system property, then an environment
 * variable, then a default sub-directory of {@code <user.home>/.cache/pyserini}. The resolved
 * directory is created (including missing parents) if it does not exist yet.</p>
 */
public final class CacheUtils {
  private static final String BASE_CACHE_DIR = Path.of(System.getProperty("user.home"), ".cache", "pyserini").toString();

  public static final String INDEXES_DIR = "indexes";
  public static final String INDEXES_CACHE_PROPERTY = "anserini.index.cache";
  public static final String INDEXES_CACHE_ENV = "ANSERINI_INDEX_CACHE";

  public static final String ENCODERS_DIR = "encoders";
  public static final String ENCODERS_CACHE_PROPERTY = "anserini.encoder.cache";
  public static final String ENCODERS_CACHE_ENV = "ANSERINI_ENCODER_CACHE";

  public static final String TOPICS_QRELS_DIR = "topics-and-qrels";
  public static final String TOPICS_QRELS_CACHE_PROPERTY = "anserini.topics.cache";
  public static final String TOPICS_QRELS_CACHE_ENV = "ANSERINI_TOPICS_CACHE";

  private CacheUtils() {
    // Static utility class; not meant to be instantiated.
  }

  /**
   * Gets the cache directory for indexes.
   *
   * @return Path to the indexes cache directory
   * @throws IOException if the cache directory cannot be created
   */
  public static String getIndexesCache() throws IOException {
    return getCacheDir(INDEXES_CACHE_PROPERTY, INDEXES_CACHE_ENV, Path.of(BASE_CACHE_DIR, INDEXES_DIR).toString());
  }

  /**
   * Gets the cache directory for encoders.
   *
   * @return Path to the encoders cache directory
   * @throws IOException if the cache directory cannot be created
   */
  public static String getEncodersCache() throws IOException {
    return getCacheDir(ENCODERS_CACHE_PROPERTY, ENCODERS_CACHE_ENV, Path.of(BASE_CACHE_DIR, ENCODERS_DIR).toString());
  }

  /**
   * Gets the cache directory for topics and qrels.
   *
   * @return Path to the topics and qrels cache directory
   * @throws IOException if the cache directory cannot be created
   */
  public static String getTopicsAndQrelsCache() throws IOException {
    return getCacheDir(TOPICS_QRELS_CACHE_PROPERTY, TOPICS_QRELS_CACHE_ENV,
        Path.of(BASE_CACHE_DIR, TOPICS_QRELS_DIR).toString());
  }

  /**
   * Generic method to get a cache directory with fallback options, creating it when missing.
   *
   * @param propertyName System property name to check first
   * @param envVarName Environment variable name to check second
   * @param defaultPath Default path to use if neither property nor env var is set
   * @return The resolved cache directory path, exactly as configured
   * @throws IOException if the path exists but is not a directory, or the directory cannot be created
   */
  private static String getCacheDir(String propertyName, String envVarName, String defaultPath) throws IOException {
    String cacheDir = System.getProperty(propertyName);

    if (isUnset(cacheDir)) {
      cacheDir = System.getenv(envVarName);
    }

    if (isUnset(cacheDir)) {
      cacheDir = defaultPath;
    }

    File cacheDirFile = new File(cacheDir);
    // Test isDirectory() rather than exists(): a regular file sitting at the configured path must
    // be reported here instead of being handed back to callers as if it were a usable directory.
    // The second isDirectory() check tolerates another thread or process creating the directory
    // between the first check and mkdirs().
    if (!cacheDirFile.isDirectory() && !cacheDirFile.mkdirs() && !cacheDirFile.isDirectory()) {
      throw new IOException("Failed to create cache directory: " + cacheDir
          + "\n Check that the path is not an existing file and that you have write permissions to the directory.");
    }

    return cacheDir;
  }

  /** Treats null, empty, and whitespace-only settings as "not configured". */
  private static boolean isUnset(String value) {
    return value == null || value.isBlank();
  }
}
17 changes: 2 additions & 15 deletions src/main/java/io/anserini/util/PrebuiltIndexHandler.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import org.apache.commons.io.input.CountingInputStream;

import io.anserini.index.IndexInfo;
import io.anserini.util.CacheUtils;

import java.io.FileOutputStream;
import java.io.IOException;
Expand All @@ -35,10 +36,6 @@
import java.nio.file.Path;

public class PrebuiltIndexHandler {
private static final String DEFAULT_CACHE_DIR = Path.of(System.getProperty("user.home"), ".cache", "pyserini", "indexes").toString();
private static final String CACHE_DIR_PROPERTY = "anserini.index.cache";
private static final String CACHE_DIR_ENV = "ANSERINI_INDEX_CACHE";

private String indexName;
private String saveRootPath;
private IndexInfo info = null;
Expand Down Expand Up @@ -69,17 +66,7 @@ public PrebuiltIndexHandler(String indexName, String cacheDir) {
}

/**
 * Resolves the cache directory for prebuilt indexes.
 *
 * <p>The lookup (system property, then environment variable, then the default location) is
 * delegated to {@link CacheUtils#getIndexesCache()}. That method declares a checked
 * {@code IOException}; it is rethrown unchecked here so that this method keeps its original
 * exception-free signature.</p>
 *
 * @return path of the indexes cache directory
 * @throws java.io.UncheckedIOException if the cache directory cannot be created
 */
private String getCache() {
  try {
    return CacheUtils.getIndexesCache();
  } catch (IOException e) {
    throw new java.io.UncheckedIOException(e);
  }
}

private static boolean checkFileExist(Path path) {
Expand Down
11 changes: 1 addition & 10 deletions src/test/java/io/anserini/encoder/EncoderInferenceTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,22 +25,13 @@
import org.apache.commons.io.FileUtils;

public abstract class EncoderInferenceTest {
private static final String CACHE_DIR = Path.of(System.getProperty("user.home"), ".cache", "pyserini", "encoders").toString();
protected String modelName;
protected String modelUrl;
protected Object[][] examples;
protected Object[][] longExamples;

protected String getCacheDir() {
File cacheDir = new File(CACHE_DIR);
if (!cacheDir.exists()) {
cacheDir.mkdir();
}
return cacheDir.getPath();
}

/**
 * Copies the model at {@code modelUrl} into the encoders cache directory under the name
 * {@code modelName} and returns the location of the local copy. The copy is performed on
 * every call.
 *
 * @return path of the local model file
 * @throws IOException if copying the model fails
 * @throws URISyntaxException if {@code modelUrl} is not a valid URI
 */
protected Path getEncoderModelPath() throws IOException, URISyntaxException {
  // Use the cache directory resolved by OnnxEncoder so tests and encoders share one location.
  File modelFile = new File(OnnxEncoder.getCacheDir(), modelName);
  FileUtils.copyURLToFile(new URI(modelUrl).toURL(), modelFile);
  return modelFile.toPath();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,7 @@
import static org.junit.Assert.assertArrayEquals;

public class SpladeEncoderTokenizationTest {
private static final String CACHE_DIR = Path.of(System.getProperty("user.home"), ".cache", "pyserini", "encoders").toString();
private static final String VOCAB_URL = "https://rgw.cs.uwaterloo.ca/pyserini/data/wordpiece-vocab.txt";
static private final String VOCAB_URL = "https://rgw.cs.uwaterloo.ca/pyserini/data/bert-base-uncased-vocab.txt";

Object[][] examples = new Object[][] {
{ "which hormone increases calcium levels in the blood?",
Expand Down Expand Up @@ -78,20 +77,6 @@ public class SpladeEncoderTokenizationTest {
1029, 102 } },
};

static private String getCacheDir() {
File cacheDir = new File(CACHE_DIR);
if (!cacheDir.exists()) {
cacheDir.mkdir();
}
return cacheDir.getPath();
}

static private Path getVocabPath() throws IOException, URISyntaxException {
File vocabFile = new File(getCacheDir(), "UnicoilVocab.txt");
FileUtils.copyURLToFile(new URI(VOCAB_URL).toURL(), vocabFile);
return vocabFile.toPath();
}

@Test
public void basic() throws Exception {
DefaultVocabulary vocabulary = DefaultVocabulary.builder()
Expand Down Expand Up @@ -119,4 +104,10 @@ private long[] convertTokensToIds(BertFullTokenizer tokenizer, List<String> toke
}
return tokenIds;
}

/**
 * Copies the vocabulary file at {@code VOCAB_URL} into the encoders cache directory as
 * {@code splade-vocab.txt} and returns the location of the local copy.
 *
 * @return path of the local vocabulary file
 * @throws IOException if copying the vocabulary fails
 * @throws URISyntaxException if {@code VOCAB_URL} is not a valid URI
 */
static private Path getVocabPath() throws IOException, URISyntaxException {
  final String cacheRoot = OnnxEncoder.getCacheDir();
  final File destination = new File(cacheRoot, "splade-vocab.txt");
  FileUtils.copyURLToFile(new URI(VOCAB_URL).toURL(), destination);
  return destination.toPath();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@
import static org.junit.Assert.assertArrayEquals;

public class UniCoilEncoderTokenizationTest {
private static final String CACHE_DIR = Path.of(System.getProperty("user.home"), ".cache", "pyserini", "encoders").toString();
static private final String VOCAB_URL = "https://rgw.cs.uwaterloo.ca/pyserini/data/wordpiece-vocab.txt";

Object[][] examples = new Object[][] {
Expand Down Expand Up @@ -78,20 +77,6 @@ public class UniCoilEncoderTokenizationTest {
1029, 102 } },
};

static private String getCacheDir() {
File cacheDir = new File(CACHE_DIR);
if (!cacheDir.exists()) {
cacheDir.mkdir();
}
return cacheDir.getPath();
}

static private Path getVocabPath() throws IOException, URISyntaxException {
File vocabFile = new File(getCacheDir(), "unicoil-vocab.txt");
FileUtils.copyURLToFile(new URI(VOCAB_URL).toURL(), vocabFile);
return vocabFile.toPath();
}

@Test
public void basic() throws Exception {
DefaultVocabulary vocabulary = DefaultVocabulary.builder()
Expand Down Expand Up @@ -119,4 +104,10 @@ private long[] convertTokensToIds(BertFullTokenizer tokenizer, List<String> toke
}
return tokenIds;
}

/**
 * Copies the vocabulary file at {@code VOCAB_URL} into the encoders cache directory as
 * {@code unicoil-vocab.txt} and returns the location of the local copy.
 *
 * @return path of the local vocabulary file
 * @throws IOException if copying the vocabulary fails
 * @throws URISyntaxException if {@code VOCAB_URL} is not a valid URI
 */
static private Path getVocabPath() throws IOException, URISyntaxException {
  final String cacheRoot = OnnxEncoder.getCacheDir();
  final File destination = new File(cacheRoot, "unicoil-vocab.txt");
  FileUtils.copyURLToFile(new URI(VOCAB_URL).toURL(), destination);
  return destination.toPath();
}
}
Loading
Loading