JabRef · NishantDG-SST · Apr 1, 2026 · Apr 1, 2026 · Apr 1, 2026 · Apr 1, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,8 +9,8 @@

 ## [Unreleased]
 
 ### Added
-
+- We added RefChecker logic to validate entries against DOI, CrossRef, and arXiv sources [#13604](https://github.com/JabRef/jabref/issues/13604)
 - We added support for downloading full-text PDFs from Wiley journals via the Wiley TDM API. [#13404](https://github.com/JabRef/jabref/issues/13404)
 - We added `--key-patterns` option to CLI parameters to allows users to set a citation key's pattern for a specific entry type. [#14707](https://github.com/JabRef/jabref/issues/14707)
 - We added a CLI option `--field-formatters` to the `convert` and `generate-bib-from-aux` commands to apply field formatters during export. [#11520](https://github.com/JabRef/jabref/issues/11520)

diff --git a/jablib/src/main/java/org/jabref/logic/database/DuplicateCheck.java b/jablib/src/main/java/org/jabref/logic/database/DuplicateCheck.java
@@ -3,6 +3,7 @@
 import java.util.Collection;
 import java.util.HashMap;
 import java.util.HashSet;
+import java.util.List;
 import java.util.Locale;
 import java.util.Map;
 import java.util.Objects;
@@ -22,6 +23,7 @@
 import org.jabref.model.entry.field.BibField;
 import org.jabref.model.entry.field.Field;
 import org.jabref.model.entry.field.FieldProperty;
+import org.jabref.model.entry.field.InternalField;
 import org.jabref.model.entry.field.OrFields;
 import org.jabref.model.entry.field.StandardField;
 import org.jabref.model.entry.identifier.ISBN;
@@ -33,6 +35,7 @@
 
 /// This class contains utility method for duplicate checking of entries.
 public class DuplicateCheck {
+    public static final double COMPARE_ENTRIES_THRESHOLD = 0.8; // The threshold that determines if entries are likely to be of the same publication
     private static final double DUPLICATE_THRESHOLD = 0.75; // The overall threshold to signal a duplicate pair
 
     private static final Logger LOGGER = LoggerFactory.getLogger(DuplicateCheck.class);
@@ -338,4 +341,44 @@ public Optional<BibEntry> containsDuplicate(final BibDatabase database,
                                                 final BibDatabaseMode bibDatabaseMode) {
         return database.getEntries().stream().filter(other -> isDuplicate(entry, other, bibDatabaseMode)).findFirst();
     }
+
+    /// Computes a similarity score between two entries based on their shared non-internal fields.
+    ///
+    /// It does not return early when a DOI or other identifier matches.
+    /// This means that an entry with the correct DOI but a wrong author name will still receive a low score.
+    /// The old [#isDuplicate] method would have called that a duplicate immediately, which is wrong
+    /// for reference checking purposes.it ignores JabRefs internal fields such as the citation key and entry type marker.
+    /// A fetcher never populates those fields so including them in the comparison would produce incorrect results.
+    /// For eg, if a local entry has a citation key and the fetched entry does not, including the
+    /// citation key field would skew the score.
+    ///
+    /// The returned score is a number in the range [0.0, 1.0].
+    /// A score of 1.0 means every shared non-internal field is identical.
+    /// A score at or above [#COMPARE_ENTRIES_THRESHOLD] means the entries are likely the same publication.
+    /// A score of 0.0 is returned when the two entries share no non-internal fields at all.
+    ///
+    /// @param one the first entry to compare
+    /// @param two the second entry to compare
+    /// @return similarity score in [0.0, 1.0]
+    public static double compareEntries(BibEntry one, BibEntry two) {
+        StringSimilarity stringSimilarity = new StringSimilarity();
+
+        List<Field> commonFields = one.getFields().stream()
+                                      .filter(field -> !(field instanceof InternalField))
+                                      .filter(field -> two.getField(field).isPresent())
+                                      .toList();
+
+        if (commonFields.isEmpty()) {
+            return 0.0;
+        }
+
+        return commonFields.stream()
+                           .mapToDouble(field -> {
+                               String firstValue = one.getField(field).orElse("");
+                               String secondValue = two.getField(field).orElse("");
-                               String firstValue = one.getField(field).orElse("");
-                               String secondValue = two.getField(field).orElse("");
+                               String firstValue = one.getFieldLatexFree(field).orElse("");
+                               String secondValue = two.getFieldLatexFree(field).orElse("");
-                               String firstValue = one.getField(field).orElse("");
-                               String secondValue = two.getField(field).orElse("");
+                               String firstValue = one.getFieldLatexFree(field).orElse("");
+                               String secondValue = two.getFieldLatexFree(field).orElse("");
+                               return stringSimilarity.similarity(firstValue, secondValue);
+                           })
+                           .average()
+                           .orElse(0.0);
+    }
 }
diff --git a/jablib/src/main/java/org/jabref/logic/refcheck/RefCheckResult.java b/jablib/src/main/java/org/jabref/logic/refcheck/RefCheckResult.java
@@ -0,0 +1,8 @@
+package org.jabref.logic.refcheck;
+
+import org.jabref.model.entry.BibEntry;
+
+import org.jspecify.annotations.Nullable;
+
+public record RefCheckResult(RefValidity validity, @Nullable BibEntry otherEntry, double similarityScore) {
+}
diff --git a/jablib/src/main/java/org/jabref/logic/refcheck/RefChecker.java b/jablib/src/main/java/org/jabref/logic/refcheck/RefChecker.java
@@ -0,0 +1,225 @@
+package org.jabref.logic.refcheck;
+
+import java.util.Optional;
+
+import org.jabref.logic.database.DuplicateCheck;
+import org.jabref.logic.importer.FetcherException;
+import org.jabref.logic.importer.fetcher.ArXivFetcher;
+import org.jabref.logic.importer.fetcher.CrossRef;
+import org.jabref.logic.importer.fetcher.DoiFetcher;
+import org.jabref.model.entry.BibEntry;
+import org.jabref.model.entry.identifier.ArXivIdentifier;
+import org.jabref.model.entry.identifier.DOI;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/// Validates bibliographic entries against authoritative online sources.
+///
+/// For each entry 3 sources are tried in this order:
+///
+/// 1. DOI lookup via [DoiFetcher], using the DOI already present in the entry
+/// 2. DOI discovery via [CrossRef], then fetching via [DoiFetcher]
+/// 3. arXiv identifier lookup via [ArXivFetcher]
+///
+/// The lookup stops as soon as a [RefValidity#REAL] result is found.
+/// If no source produces a real result, the best result among all attempts is returned.
+/// UNSURE is considered better than FAKE.
+/// Among results with the same validity, the one with the higher similarity score is preferred.
+///
+/// No LLM is used. Classification is based entirely on
+/// [org.jabref.logic.database.DuplicateCheck#compareEntries].
+public class RefChecker {
+
+    private static final Logger LOGGER = LoggerFactory.getLogger(RefChecker.class);
+
+    /// A score below this but above zero means we found something but it does not match well.
+    /// This boundary separates UNSURE from FAKE when a candidate was found.
+    private static final double UNSURE_LOWER_BOUND = 0.5;
+
+    private final DoiFetcher doiFetcher;
+    private final ArXivFetcher arXivFetcher;
+    private final CrossRef crossRef;
+
+    /// Creates a RefChecker using only the two main fetchers.
+    /// CrossRef is created with its default constructor.
+    public RefChecker(DoiFetcher doiFetcher, ArXivFetcher arXivFetcher) {
+        this(doiFetcher, arXivFetcher, new CrossRef());
+    }
+
+    /// Creates a RefChecker with all three dependencies supplied explicitly
+    ///
+    /// @param doiFetcher   the DOI fetcher
+    /// @param arXivFetcher the arXiv fetcher
+    /// @param crossRef     the CrossRef fetcher used for DOI discovery
+    public RefChecker(DoiFetcher doiFetcher, ArXivFetcher arXivFetcher, CrossRef crossRef) {
+        this.doiFetcher = doiFetcher;
+        this.arXivFetcher = arXivFetcher;
+        this.crossRef = crossRef;
+    }
+
+    /// Checks a single entry against authoritative online sources and returns a classification.
+    ///
+    /// The three sources are tried in order. If any of them produces a [RefValidity#REAL] result,
+    /// that result is returned immediately without trying the remaining sources.
+    ///
+    /// If none of the sources produce a real result, the best result from all three attempts is returned.
+    ///
+    /// Individual fetcher failures are logged and treated as "not found" so that
+    /// a network error from one source does not make the other sources fail
+    public RefCheckResult check(BibEntry entry) {
+        RefCheckResult doiResult = checkByDoi(entry);
+        if (doiResult.validity() == RefValidity.REAL) {
+            return doiResult;
+        }
+
+        RefCheckResult crossRefResult = checkByCrossRef(entry);
+        if (crossRefResult.validity() == RefValidity.REAL) {
+            return crossRefResult;
+        }
+
+        RefCheckResult arXivResult = checkByArXiv(entry);
+        if (arXivResult.validity() == RefValidity.REAL) {
+            return arXivResult;
+        }
+
+        return bestOf(doiResult, crossRefResult, arXivResult);
+    }
+
+    /// Tries to validate the entry using the DOI already stored in it.
+    /// Returns a FAKE result with no matched entry if the entry has no DOI
+    private RefCheckResult checkByDoi(BibEntry entry) {
+        Optional<DOI> doi = entry.getDOI();
+        if (doi.isEmpty()) {
+            return new RefCheckResult(RefValidity.FAKE, null, 0.0);
+        }
+
+        Optional<BibEntry> found;
+        try {
+            found = doiFetcher.performSearchById(doi.get().asString());
+        } catch (FetcherException e) {
+            LOGGER.warn("DOI lookup failed for {}: {}", doi.get().asString(), e.getMessage());
+            return new RefCheckResult(RefValidity.FAKE, null, 0.0);
+        }
+
+        if (found.isEmpty()) {
+            return new RefCheckResult(RefValidity.FAKE, null, 0.0);
+        }
+
+        return classify(entry, found.get());
+    }
+
+    /// Tries to validate the entry by first discovering a DOI through CrossRef,
+    /// then fetching the authoritative entry using that DOI.
+    /// Returns a FAKE result with no matched entry if CrossRef finds no DOI
+    private RefCheckResult checkByCrossRef(BibEntry entry) {
+        Optional<DOI> foundDoi;
+        try {
+            foundDoi = crossRef.findIdentifier(entry);
+        } catch (FetcherException e) {
+            LOGGER.warn("CrossRef lookup failed: {}", e.getMessage());
+            return new RefCheckResult(RefValidity.FAKE, null, 0.0);
+        }
+
+        if (foundDoi.isEmpty()) {
+            return new RefCheckResult(RefValidity.FAKE, null, 0.0);
+        }
+
+        Optional<BibEntry> found;
+        try {
+            found = doiFetcher.performSearchById(foundDoi.get().asString());
+        } catch (FetcherException e) {
+            LOGGER.warn("DOI fetch after CrossRef discovery failed for {}: {}", foundDoi.get().asString(), e.getMessage());
+            return new RefCheckResult(RefValidity.FAKE, null, 0.0);
+        }
+
+        if (found.isEmpty()) {
+            return new RefCheckResult(RefValidity.FAKE, null, 0.0);
+        }
+
+        return classify(entry, found.get());
+    }
+
+    /// Tries to validate the entry using arXiv.
+    /// First looks for an arXiv identifier, then fetches the full entry by that identifier.
+    /// Returns a FAKE result with no matched entry if no arXiv identifier is found
+    private RefCheckResult checkByArXiv(BibEntry entry) {
+        Optional<ArXivIdentifier> identifier;
+        try {
+            identifier = arXivFetcher.findIdentifier(entry);
+        } catch (FetcherException e) {
+            LOGGER.warn("arXiv identifier lookup failed: {}", e.getMessage());
+            return new RefCheckResult(RefValidity.FAKE, null, 0.0);
+        }
+
+        if (identifier.isEmpty()) {
+            return new RefCheckResult(RefValidity.FAKE, null, 0.0);
+        }
+
+        Optional<BibEntry> found;
+        try {
+            found = arXivFetcher.performSearchById(identifier.get().asString());
+        } catch (FetcherException e) {
+            LOGGER.warn("arXiv fetch failed for {}: {}", identifier.get().asString(), e.getMessage());
+            return new RefCheckResult(RefValidity.FAKE, null, 0.0);
+        }
+
+        if (found.isEmpty()) {
+            return new RefCheckResult(RefValidity.FAKE, null, 0.0);
+        }
+
+        return classify(entry, found.get());
+    }
+
+    /// Compares the local entry against the authoritative entry and returns a classified result.
+    private static RefCheckResult classify(BibEntry local, BibEntry authoritative) {
+        double score = DuplicateCheck.compareEntries(local, authoritative);
+
+        if (score >= DuplicateCheck.COMPARE_ENTRIES_THRESHOLD) {
+            return new RefCheckResult(RefValidity.REAL, authoritative, score);
+        }
+
+        if (score >= UNSURE_LOWER_BOUND) {
+            return new RefCheckResult(RefValidity.UNSURE, authoritative, score);
+        }
+
+        return new RefCheckResult(RefValidity.FAKE, authoritative, score);
+    }
+
+    /// Returns the best result from the three attempts.
+    ///
+    /// REAL > UNSURE > FAKE.
+    /// Among results with the same validity the one with the higher similarity score wins.
+    ///
+    /// This method is only called when none of the three results is REAL.
+    private static RefCheckResult bestOf(RefCheckResult doiResult, RefCheckResult crossRefResult,
+                                         RefCheckResult arXivResult) {
+        RefCheckResult best = doiResult;
+
+        if (rank(crossRefResult) > rank(best)
+                || (rank(crossRefResult) == rank(best)
+                && crossRefResult.similarityScore() > best.similarityScore())) {
+            best = crossRefResult;
+        }
+
+        if (rank(arXivResult) > rank(best)
+                || (rank(arXivResult) == rank(best)
+                && arXivResult.similarityScore() > best.similarityScore())) {
+            best = arXivResult;
+        }
+
+        return best;
+    }
+
+    /// REAL > UNSURE > FAKE
+    private static int rank(RefCheckResult result) {
+        return switch (result.validity()) {
+            case REAL ->
+                    2;
+            case UNSURE ->
+                    1;
+            case FAKE ->
+                    0;
+        };
+    }
+}
diff --git a/jablib/src/main/java/org/jabref/logic/refcheck/RefValidity.java b/jablib/src/main/java/org/jabref/logic/refcheck/RefValidity.java
@@ -0,0 +1,7 @@
+package org.jabref.logic.refcheck;
+
+public enum RefValidity {
+    REAL,
+    UNSURE,
+    FAKE
+}
diff --git a/jablib/src/test/java/org/jabref/logic/database/DuplicateCheckTest.java b/jablib/src/test/java/org/jabref/logic/database/DuplicateCheckTest.java
@@ -7,6 +7,7 @@
 import org.jabref.model.entry.BibEntry;
 import org.jabref.model.entry.BibEntryTypesManager;
 import org.jabref.model.entry.field.Field;
+import org.jabref.model.entry.field.InternalField;
 import org.jabref.model.entry.field.StandardField;
 import org.jabref.model.entry.types.StandardEntryType;
 
@@ -627,4 +628,61 @@ void differentInCollectionWithTheSameISBNAreNotDuplicates() {
 
         assertFalse(duplicateChecker.isDuplicate(entryOne, entryTwo, BibDatabaseMode.BIBTEX));
     }
+
+    @Test
+    void entriesShareNoNonInternalFields() {
+        BibEntry one = new BibEntry().withField(StandardField.TITLE, "Some Title");
+        BibEntry two = new BibEntry().withField(StandardField.AUTHOR, "Some Author");
+
+        double score = DuplicateCheck.compareEntries(one, two);
+
+        assertEquals(0.0, score);
+    }
+
+    @Test
+    void sameEntryIsComparedWithItself() {
+        BibEntry entry = getSimpleArticle();
+
+        double score = DuplicateCheck.compareEntries(entry, entry);
+
+        assertEquals(1.0, score);
+    }
+
+    @Test
+    void ignoresCitationKeyWhenDeterminingScore() {
+        BibEntry one = new BibEntry()
+                .withField(InternalField.KEY_FIELD, "citationKey1")
+                .withField(StandardField.TITLE, "Identical Title");
+        BibEntry two = new BibEntry()
+                .withField(InternalField.KEY_FIELD, "citationKey2")
+                .withField(StandardField.TITLE, "Identical Title");
+
+        double score = DuplicateCheck.compareEntries(one, two);
+
+        assertEquals(1.0, score);
+    }
+
+    @Test
+    void entriesWithIdenticalTitles() {
+        BibEntry one = new BibEntry().withField(StandardField.TITLE, "Reinforcement learning: An introduction");
+        BibEntry two = new BibEntry().withField(StandardField.TITLE, "Reinforcement learning: An introduction");
+
+        double score = DuplicateCheck.compareEntries(one, two);
+
+        assertTrue(score >= DuplicateCheck.COMPARE_ENTRIES_THRESHOLD);
+    }
+
+    @Test
+    void entriesWithCompletelyDifferentFields() {
+        BibEntry one = new BibEntry()
+                .withField(StandardField.TITLE, "Performance on a Signal")
+                .withField(StandardField.AUTHOR, "Richard Atkinson");
+        BibEntry two = new BibEntry()
+                .withField(StandardField.TITLE, "Rest in Treatment")
+                .withField(StandardField.AUTHOR, "Elizabeth Ballard");
+
+        double score = DuplicateCheck.compareEntries(one, two);
+
+        assertTrue(score < DuplicateCheck.COMPARE_ENTRIES_THRESHOLD);
+    }
 }