Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
ded5c45
refactorings
Jun 2, 2026
6649f26
refactoring, formatting
Jun 2, 2026
4e518c9
killed redundant class, added forgotten repository class
Jun 2, 2026
895017e
refactoring
Jun 3, 2026
0397b85
removed useless exception messages, stripped Serializable, unified ty…
Jun 3, 2026
a9580dc
refactoring
Jun 3, 2026
60c5aff
refactoring
Jun 3, 2026
0ae81c5
removed unused imports
Jun 3, 2026
b1b7a87
renamed variables, manual formatting
Jun 3, 2026
633a99c
renamed variables, manual formatting
Jun 3, 2026
38c30a3
wip
Jun 16, 2026
33cc422
wip
Jun 16, 2026
c7a3ad9
wip
Jun 17, 2026
539413e
wip
Jun 19, 2026
3abd4c2
added now type of GlobalId = url
Jun 19, 2026
921acb9
wip
Jun 22, 2026
e352fe1
handle contributor
Jun 23, 2026
3b8d947
handle http/https doi/handle properly
Jun 24, 2026
1fec2ec
fixed failed url for some imported datasets
Jun 24, 2026
3b0267c
fixed unicode support
Jun 24, 2026
85d465b
removed unused code, rewritten algorithm of finding publication date
Jun 25, 2026
df18d0c
fixed parsing date consisting only of year and month
Jun 25, 2026
574e5d6
made language optional
Jun 25, 2026
3fbacce
added broken doi detection, refactored GlobalId
Jun 26, 2026
646ab8b
fixed http/https identifiers
Jun 30, 2026
3621368
wip
Jul 1, 2026
9e329ce
bug fixes
Jul 1, 2026
740c0fa
fixed potential broken date issue
Jul 1, 2026
04e0f21
fixed logging, removed temporary debug helper
Jul 2, 2026
ea69afb
fixed connection timeout while receiving headers
Jul 2, 2026
a19d7ad
minor optimization
Jul 2, 2026
ba36d7f
removed obsolete code
Jul 2, 2026
0de8833
this, final , static imports, manual formatting, removed redundant va…
Jul 2, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
package edu.harvard.iq.dataverse.persistence;

import static java.util.logging.Level.INFO;
import static java.util.logging.Level.SEVERE;
import static java.util.logging.Logger.getLogger;
import static org.apache.commons.lang3.StringUtils.isNotBlank;
import static org.apache.commons.lang3.StringUtils.startsWithIgnoreCase;

import java.io.Serializable;
import java.net.MalformedURLException;
Expand All @@ -19,15 +19,68 @@ public class GlobalId implements Serializable {

public static final String DOI_PROTOCOL = "doi";
public static final String HDL_PROTOCOL = "hdl";
public static final String HTTP_PROTOCOL = "http";
public static final String HTTPS_PROTOCOL = "https";
public static final String HDL_RESOLVER_URL = "https://hdl.handle.net/";
public static final String HDL_RESOLVER_URL2 = "http://hdl.handle.net/";
public static final String DOI_RESOLVER_URL = "https://doi.org/";
public static final String DOI_RESOLVER_URL2 = "http://doi.org/";
public static final String HTTPS_RESOLVER_URL = "https://";
public static final String HTTP_RESOLVER_URL = "http://";
private static final String PID_ALLOWED_CHARACTERS_PATTERN = "^[A-Za-z0-9._/:\\\\-]*";

private static final Logger logger = getLogger(GlobalId.class.getName());

private String protocol;
private String authority;
private String identifier;
private final String protocol;
private final String authority;
private final String identifier;

/**
 * Creates a handle ({@code hdl}) global id from a handle resolver URL.
 * <p>
 * The text between the resolver prefix and the last {@code '/'} becomes the
 * authority; the text after the last {@code '/'} becomes the identifier.
 * The prefix length of {@link #HDL_RESOLVER_URL} is used when the URL starts
 * with it, otherwise the length of {@link #HDL_RESOLVER_URL2} is assumed.
 *
 * @param url the resolver URL to split
 * @return a global id with protocol {@link #HDL_PROTOCOL}
 * @throws IllegalArgumentException if there is no {@code '/'} separating
 *         authority and identifier after the resolver prefix
 * @throws NullPointerException if {@code url} is null
 */
public static GlobalId fromHDLUrl(final String url) {
    final int lastSlashIndex = url.lastIndexOf('/');
    final int authorityOffset = url.startsWith(HDL_RESOLVER_URL)
            ? HDL_RESOLVER_URL.length() : HDL_RESOLVER_URL2.length();

    // Without this guard substring() would fail with a
    // StringIndexOutOfBoundsException for urls such as
    // https://hdl.handle.net/12345 (no authority/identifier separator).
    if (lastSlashIndex < authorityOffset) {
        throw new IllegalArgumentException("Broken HDL url: ".concat(url));
    }
    return new GlobalId(HDL_PROTOCOL,
            url.substring(authorityOffset, lastSlashIndex),
            url.substring(lastSlashIndex + 1));
}

/**
 * Creates a {@code doi} global id from a DOI resolver URL.
 * <p>
 * The text between the resolver prefix and the last {@code '/'} becomes the
 * authority; the text after the last {@code '/'} becomes the identifier.
 * The prefix length of {@link #DOI_RESOLVER_URL} is used when the URL starts
 * with it, otherwise the length of {@link #DOI_RESOLVER_URL2} is assumed.
 *
 * @param url the resolver URL to split
 * @return a global id with protocol {@link #DOI_PROTOCOL}
 * @throws IllegalArgumentException if the URL has no {@code '/'} separating
 *         authority and identifier after the resolver prefix, or if it is
 *         one of the known broken DOI forms
 * @throws NullPointerException if {@code url} is null
 */
public static GlobalId fromDOIUrl(final String url) {
    final int lastSlashIndex = url.lastIndexOf('/');
    final int authorityOffset = url.startsWith(DOI_RESOLVER_URL)
            ? DOI_RESOLVER_URL.length() : DOI_RESOLVER_URL2.length();

    // Without this guard substring() would fail with a
    // StringIndexOutOfBoundsException for urls such as
    // https://doi.org/10.5072 (no authority/identifier separator).
    if (lastSlashIndex < authorityOffset) {
        throw new IllegalArgumentException("Broken DOI url: ".concat(url));
    }

    final String authority = url.substring(authorityOffset, lastSlashIndex);
    final String identifier = url.substring(lastSlashIndex + 1);

    // sometimes DOIs come in form of
    // https://doi.org/DOI: 10.19195/2353-8546.8.13 or
    // https://doi.org/DOI : 10.19195/2353-8546.8.13
    // https://doi.org/1https://doi.org/0.14746/eip.2023.2.8
    // even after fixing, these links lead to nowhere so it is
    // better to let the caller know that it needs to find another identifier
    if (startsWithIgnoreCase(authority, "DOI") || authority.startsWith("1https")) {
        throw new IllegalArgumentException("Broken DOI url: ".concat(url));
    }
    return new GlobalId(DOI_PROTOCOL, authority, identifier);
}

/**
 * Creates a global id with protocol {@link #HTTP_PROTOCOL} from a URL string.
 * <p>
 * The URL's host becomes the authority and its path becomes the identifier;
 * no other component of the URL is passed on, and the URL's own scheme is
 * not inspected.
 *
 * @param url the URL string to parse
 * @return the resulting global id
 * @throws IllegalArgumentException wrapping the {@link MalformedURLException}
 *         if {@code url} cannot be parsed
 */
public static GlobalId fromHttpUrl(final String url) throws RuntimeException {
    final URL parsed;
    try {
        parsed = new URL(url);
    } catch (final MalformedURLException malformed) {
        throw new IllegalArgumentException(malformed);
    }
    return new GlobalId(HTTP_PROTOCOL, parsed.getHost(), parsed.getPath());
}

/**
 * Creates a global id with protocol {@link #HTTPS_PROTOCOL} from a URL string.
 * <p>
 * The URL's host becomes the authority and its path becomes the identifier;
 * no other component of the URL is passed on, and the URL's own scheme is
 * not inspected.
 *
 * @param url the URL string to parse
 * @return the resulting global id
 * @throws IllegalArgumentException wrapping the {@link MalformedURLException}
 *         if {@code url} cannot be parsed
 */
public static GlobalId fromHttpsUrl(final String url) throws RuntimeException {
    final URL parsed;
    try {
        parsed = new URL(url);
    } catch (final MalformedURLException malformed) {
        throw new IllegalArgumentException(malformed);
    }
    return new GlobalId(HTTPS_PROTOCOL, parsed.getHost(), parsed.getPath());
}

public static Optional<GlobalId> parse(final String identifierString) {
try {
Expand All @@ -42,12 +95,68 @@ public static Optional<GlobalId> parse(final String identifierString) {
* @throws IllegalArgumentException if the passed string cannot be parsed.
* @throws NullPointerException if the identifier string is null
*/
public GlobalId(final String identifier) {
// set the protocol, authority, and identifier via parsePersistentId
if (!parsePersistentId(identifier)) {
throw new IllegalArgumentException("Failed to parse identifier: ".concat(identifier));
public GlobalId(final String identifierString) {
final int index1 = identifierString.indexOf(':');
if (index1 > 0) { // ':' found with one or more characters before it
if(identifierString.startsWith(HTTPS_PROTOCOL)) {

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing space between `if` and `(`.

try {
final URL url = new URL(identifierString.substring(HTTPS_PROTOCOL.length() +1));
this.protocol = HTTPS_PROTOCOL;
this.authority = url.getHost();
this.identifier = url.getPath();
if(conainsNullTerminator(this.identifier)) {
throw createException(identifierString);
}
} catch (final MalformedURLException e) {
throw new IllegalArgumentException(e);
}
} else if(identifierString.startsWith(HTTP_PROTOCOL)) {

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing space between `if` and `(`.

try {
final URL url = new URL(identifierString.substring(HTTP_PROTOCOL.length() +1));
this.protocol = HTTP_PROTOCOL;
this.authority = url.getHost();
this.identifier = url.getPath();
if(conainsNullTerminator(this.identifier)) {
throw createException(identifierString);
}
} catch (final MalformedURLException e) {
throw new IllegalArgumentException(e);
}
} else {
final int index2 = identifierString.indexOf('/', index1 + 1);
if (index2 > 0 && (index2 + 1) < identifierString.length()) { // '/' found with one or more characters
// between ':'
this.protocol = identifierString.substring(0, index1); // and '/' and there are characters after '/'
if (!DOI_PROTOCOL.equals(this.protocol)
&& !HDL_PROTOCOL.equals(this.protocol)) {
throw createException(identifierString);
}
//Strip any whitespace, ; and ' from authority (should finding them cause a failure instead?)
this.authority = formatIdentifierString(identifierString.substring(index1 + 1, index2));
if (conainsNullTerminator(this.authority)) {
throw createException(identifierString);
}
if (this.protocol.equals(DOI_PROTOCOL) && !this.checkDOIAuthority(this.authority)) {
throw createException(identifierString);
}
// Passed all checks
//Strip any whitespace, ; and ' from identifier (should finding them cause a failure instead?)
this.identifier = formatIdentifierString(identifierString.substring(index2 + 1));
if(conainsNullTerminator(this.identifier)) {
throw createException(identifierString);
}
} else {
throw createException(identifierString);
}
}
} else {
throw createException(identifierString);
}
}

/**
 * Builds (but does not throw) the exception used to report an identifier
 * string that could not be parsed.
 *
 * @param identifier the offending identifier string; appended to the message
 * @return a new {@link IllegalArgumentException} carrying the message
 */
private static IllegalArgumentException createException(final String identifier) {
    final String message = "Failed to parse identifier: ".concat(identifier);
    return new IllegalArgumentException(message);
}

public GlobalId(final String protocol, final String authority, final String identifier) {
this.protocol = protocol;
Expand Down Expand Up @@ -77,24 +186,12 @@ public String getProtocol() {
return this.protocol;
}

public void setProtocol(final String protocol) {
this.protocol = protocol;
}

public String getAuthority() {
return this.authority;
}

public void setAuthority(final String authority) {
this.authority = authority;
}
public String getAuthority() {
return this.authority;
}

public String getIdentifier() {
return this.identifier;
}

public void setIdentifier(final String identifier) {
this.identifier = identifier;
return this.identifier;
}

public String toString() {
Expand All @@ -108,9 +205,13 @@ public String toString() {
* @return The string representation of this global id.
*/
public String asString() {
return this.protocol == null || this.authority == null || this.identifier == null
? ""
: this.protocol + ':' + this.authority + '/' + this.identifier;
if(this.protocol == null || this.authority == null || this.identifier == null) {
return "";
} else if(HTTP_PROTOCOL.equals(this.protocol) || HTTPS_PROTOCOL.equals(this.protocol)) {
return this.protocol + ':' + this.protocol + "://" + this.authority + this.identifier;

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

`this.protocol` is used twice here — was this intended?

} else {
return this.protocol + ':' + this.authority + '/' + this.identifier;
}
}

public URL toURL() {
Expand All @@ -120,67 +221,43 @@ public URL toURL() {
return new URL(DOI_RESOLVER_URL + this.authority + '/' + this.identifier);
} else if (HDL_PROTOCOL.equals(this.protocol)) {
return new URL(HDL_RESOLVER_URL + this.authority + '/' + this.identifier);
}
} else if(HTTP_PROTOCOL.equals(this.protocol) || HTTPS_PROTOCOL.equals(this.protocol)) {
return new URL(this.protocol, this.authority, this.identifier);
}
} catch (final MalformedURLException ex) {
logger.log(SEVERE, null, ex);
}
}
return null;
}

/**
 * Builds a path of the form {@code <authority>/<identifier>} from this id.
 * <p>
 * Every {@code ':'} in the authority and in the identifier is replaced by
 * {@code '_'}, and leading slashes are stripped from the identifier.
 *
 * @return the authority and identifier joined by a single {@code '/'}
 */
public String getStoragePath() {
    final String authorityPart = this.authority.replace(':', '_');
    final String identifierPart = stripLeadingSlashes(this.identifier.replace(':', '_'));
    return authorityPart + '/' + identifierPart;
}

/**
 * Removes every {@code '/'} from the beginning of the given string.
 *
 * @param s the string to strip; must not be null
 * @return {@code s} without its leading slashes; an empty string when
 *         {@code s} is empty or consists of slashes only
 */
private static String stripLeadingSlashes(final String s) {
    int index = 0;
    // The length check keeps charAt() in range for "" and for strings made
    // only of slashes (e.g. the path "/" of a bare host url).
    while (index < s.length() && s.charAt(index) == '/') {
        ++index;
    }
    return s.substring(index);
}


/**
* Parse a Persistent Id and set the protocol, authority, and identifier
* <p>
* Example 1: doi:10.5072/FK2/BYM3IW
* protocol: doi
* authority: 10.5072
* identifier: FK2/BYM3IW
* <p>
* Example 2: hdl:1902.1/111012
* protocol: hdl
* authority: 1902.1
* identifier: 111012
*
* @param identifierString
* @param separator the string that separates the authority from the identifier.
* @param destination the global id that will contain the parsed data.
* @return {@code destination}, after its fields have been updated, or
* {@code null} if parsing failed.
*/
private boolean parsePersistentId(final String identifierString) {

final int index1 = identifierString.indexOf(':');
if (index1 > 0) { // ':' found with one or more characters before it
final int index2 = identifierString.indexOf('/', index1 + 1);
if (index2 > 0 && (index2 + 1) < identifierString.length()) { // '/' found with one or more characters
// between ':'
this.protocol = identifierString.substring(0, index1); // and '/' and there are characters after '/'
if (!DOI_PROTOCOL.equals(this.protocol) && !HDL_PROTOCOL.equals(this.protocol)) {
return false;
}
//Strip any whitespace, ; and ' from authority (should finding them cause a failure instead?)
this.authority = formatIdentifierString(identifierString.substring(index1 + 1, index2));
if (testforNullTerminator(this.authority)) {
return false;
}
if (this.protocol.equals(DOI_PROTOCOL) && !this.checkDOIAuthority(this.authority)) {
return false;
}
// Passed all checks
//Strip any whitespace, ; and ' from identifier (should finding them cause a failure instead?)
this.identifier = formatIdentifierString(identifierString.substring(index2 + 1));
return !testforNullTerminator(this.identifier);
} else {
logger.log(INFO, "Error parsing identifier: {0}: '':<authority>/<identifier>'' not found in string",
identifierString);
return false;
}
} else {
logger.log(INFO, "Error parsing identifier: {0}: ''<protocol>:'' not found in string",
identifierString);
return false;
}
/**
 * Tells whether the given string starts with one of the DOI resolver
 * prefixes ({@link #DOI_RESOLVER_URL} or {@link #DOI_RESOLVER_URL2}).
 *
 * @param id the string to test; must not be null
 * @return {@code true} if {@code id} begins with either prefix
 */
public static boolean isDOI(final String id) {
    if (id.startsWith(DOI_RESOLVER_URL)) {
        return true;
    }
    return id.startsWith(DOI_RESOLVER_URL2);
}

/**
 * Tells whether the given string starts with one of the handle resolver
 * prefixes ({@link #HDL_RESOLVER_URL} or {@link #HDL_RESOLVER_URL2}).
 *
 * @param id the string to test; must not be null
 * @return {@code true} if {@code id} begins with either prefix
 */
public static boolean isHDL(final String id) {
    if (id.startsWith(HDL_RESOLVER_URL)) {
        return true;
    }
    return id.startsWith(HDL_RESOLVER_URL2);
}

/**
 * Tells whether the given string starts with {@link #HTTP_RESOLVER_URL}.
 *
 * @param id the string to test; must not be null
 * @return {@code true} if {@code id} begins with that prefix
 */
public static boolean isHTTP(final String id) {
    final boolean hasHttpPrefix = id.startsWith(HTTP_RESOLVER_URL);
    return hasHttpPrefix;
}

/**
 * Tells whether the given string starts with {@link #HTTPS_RESOLVER_URL}.
 *
 * @param id the string to test; must not be null
 * @return {@code true} if {@code id} begins with that prefix
 */
public static boolean isHTTPS(final String id) {
    final boolean hasHttpsPrefix = id.startsWith(HTTPS_RESOLVER_URL);
    return hasHttpsPrefix;
}

private static String formatIdentifierString(final String str) {
Expand All @@ -202,7 +279,7 @@ private static String formatIdentifierString(final String str) {
// http://www.doi.org/doi_handbook/2_Numbering.html
}

private static boolean testforNullTerminator(final String str) {
/**
 * Tells whether the given string contains a NUL character
 * ({@code '\u005Cu0000'}) at any position, including the first one.
 *
 * @param str the string to inspect; may be null
 * @return {@code true} if {@code str} is non-null and contains a NUL
 *         character, {@code false} otherwise
 */
private static boolean conainsNullTerminator(final String str) {
    // indexOf() returns 0 for a leading NUL, so the comparison must be >= 0.
    return str != null && str.indexOf('\u0000') >= 0;
}

Expand Down
Loading
Loading