-
Notifications
You must be signed in to change notification settings - Fork 3
3289 Harvest via OAI-PMH of REIN DSpace fails #3321
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
lbownik
wants to merge
33
commits into
develop
Choose a base branch
from
3289_Harvest_via_OAI_PMH_of_REIN_DSpace_fails
base: develop
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
Show all changes
33 commits
Select commit
Hold shift + click to select a range
ded5c45
refactorings
6649f26
refactoring, formatting
4e518c9
killed redundant class, added forgotten repository class
895017e
refactoring
0397b85
removed useless exception messages, stripped Serializable, unified ty…
a9580dc
refactoring
60c5aff
refactoring
0ae81c5
removed unused imports
b1b7a87
renamed variables, manual formatting
633a99c
renamed variables, manual formatting
38c30a3
wip
33cc422
wip
c7a3ad9
wip
539413e
wip
3abd4c2
added now type of GlobalId = url
921acb9
wip
e352fe1
handle contributor
3b8d947
handle http/https doi/handle properly
1fec2ec
fixed failed url for some imported datasets
3b0267c
fixed unicode support
85d465b
removed unused code, rewritten algorithm of finding publication date
df18d0c
fixed parsing date consisting only of year and month
574e5d6
made language optional
3fbacce
added broken doi detection, refactored GlobalId
646ab8b
fixed http/https identifiers
3621368
wip
9e329ce
bug fixes
740c0fa
fixed potential broken date issue
04e0f21
fixed logging, removed temporary debug helper
ea69afb
fixed connection timeout while receiving headers
a19d7ad
minor optimization
ba36d7f
removed obsolete code
0de8833
this, final , static imports, manual formatting, removed redundant va…
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,9 +1,9 @@ | ||
| package edu.harvard.iq.dataverse.persistence; | ||
|
|
||
| import static java.util.logging.Level.INFO; | ||
| import static java.util.logging.Level.SEVERE; | ||
| import static java.util.logging.Logger.getLogger; | ||
| import static org.apache.commons.lang3.StringUtils.isNotBlank; | ||
| import static org.apache.commons.lang3.StringUtils.startsWithIgnoreCase; | ||
|
|
||
| import java.io.Serializable; | ||
| import java.net.MalformedURLException; | ||
|
|
@@ -19,15 +19,68 @@ public class GlobalId implements Serializable { | |
|
|
||
| public static final String DOI_PROTOCOL = "doi"; | ||
| public static final String HDL_PROTOCOL = "hdl"; | ||
| public static final String HTTP_PROTOCOL = "http"; | ||
| public static final String HTTPS_PROTOCOL = "https"; | ||
| public static final String HDL_RESOLVER_URL = "https://hdl.handle.net/"; | ||
| public static final String HDL_RESOLVER_URL2 = "http://hdl.handle.net/"; | ||
| public static final String DOI_RESOLVER_URL = "https://doi.org/"; | ||
| public static final String DOI_RESOLVER_URL2 = "http://doi.org/"; | ||
| public static final String HTTPS_RESOLVER_URL = "https://"; | ||
| public static final String HTTP_RESOLVER_URL = "http://"; | ||
| private static final String PID_ALLOWED_CHARACTERS_PATTERN = "^[A-Za-z0-9._/:\\\\-]*"; | ||
|
|
||
| private static final Logger logger = getLogger(GlobalId.class.getName()); | ||
|
|
||
| private String protocol; | ||
| private String authority; | ||
| private String identifier; | ||
| private final String protocol; | ||
| private final String authority; | ||
| private final String identifier; | ||
|
|
||
| public static GlobalId fromHDLUrl(final String url) { | ||
| final int lastSlashIndex = url.lastIndexOf('/'); | ||
| final int authorityOffset = url.startsWith(HDL_RESOLVER_URL) | ||
| ? HDL_RESOLVER_URL.length() : HDL_RESOLVER_URL2.length(); | ||
| return new GlobalId(HDL_PROTOCOL, | ||
| url.substring(authorityOffset, lastSlashIndex), | ||
| url.substring(lastSlashIndex + 1)); | ||
| } | ||
|
|
||
| public static GlobalId fromDOIUrl(final String url) { | ||
| final int lastSlashIndex = url.lastIndexOf('/'); | ||
| final int authorityOffset = url.startsWith(DOI_RESOLVER_URL) | ||
| ? DOI_RESOLVER_URL.length() : DOI_RESOLVER_URL2.length(); | ||
|
|
||
| final String authority = url.substring(authorityOffset, lastSlashIndex); | ||
| final String identifier = url.substring(lastSlashIndex + 1); | ||
|
|
||
| // sometimes DOIs come in form of | ||
| // https://doi.org/DOI: 10.19195/2353-8546.8.13 or | ||
| // https://doi.org/DOI : 10.19195/2353-8546.8.13 | ||
| // https://doi.org/1https://doi.org/0.14746/eip.2023.2.8 | ||
| // even after fixing, these links lead to nowhere so it is | ||
| // better to let the caller know that it needs to find another identifier | ||
| if(startsWithIgnoreCase(authority, "DOI") || authority.startsWith("1https")) { | ||
| throw new IllegalArgumentException("Bloken DOI url: ".concat(url)); | ||
| } | ||
| return new GlobalId(DOI_PROTOCOL, authority, identifier); | ||
| } | ||
|
|
||
| public static GlobalId fromHttpUrl(final String url) throws RuntimeException { | ||
| try { | ||
| final URL u = new URL(url); | ||
| return new GlobalId(HTTP_PROTOCOL, u.getHost(), u.getPath()); | ||
| } catch (final MalformedURLException e) { | ||
| throw new IllegalArgumentException(e); | ||
| } | ||
| } | ||
|
|
||
| public static GlobalId fromHttpsUrl(final String url) throws RuntimeException { | ||
| try { | ||
| final URL u = new URL(url); | ||
| return new GlobalId(HTTPS_PROTOCOL, u.getHost(), u.getPath()); | ||
| } catch (final MalformedURLException e) { | ||
| throw new IllegalArgumentException(e); | ||
| } | ||
| } | ||
|
|
||
| public static Optional<GlobalId> parse(final String identifierString) { | ||
| try { | ||
|
|
@@ -42,12 +95,68 @@ public static Optional<GlobalId> parse(final String identifierString) { | |
| * @throws IllegalArgumentException if the passed string cannot be parsed. | ||
| * @throws NullPointerException if identifier is null | ||
| */ | ||
| public GlobalId(final String identifier) { | ||
| // set the protocol, authority, and identifier via parsePersistentId | ||
| if (!parsePersistentId(identifier)) { | ||
| throw new IllegalArgumentException("Failed to parse identifier: ".concat(identifier)); | ||
| public GlobalId(final String identifierString) { | ||
| final int index1 = identifierString.indexOf(':'); | ||
| if (index1 > 0) { // ':' found with one or more characters before it | ||
| if(identifierString.startsWith(HTTPS_PROTOCOL)) { | ||
| try { | ||
| final URL url = new URL(identifierString.substring(HTTPS_PROTOCOL.length() +1)); | ||
| this.protocol = HTTPS_PROTOCOL; | ||
| this.authority = url.getHost(); | ||
| this.identifier = url.getPath(); | ||
| if(conainsNullTerminator(this.identifier)) { | ||
| throw createException(identifierString); | ||
| } | ||
| } catch (final MalformedURLException e) { | ||
| throw new IllegalArgumentException(e); | ||
| } | ||
| } else if(identifierString.startsWith(HTTP_PROTOCOL)) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Missing space between `if` and `(`. |
||
| try { | ||
| final URL url = new URL(identifierString.substring(HTTP_PROTOCOL.length() +1)); | ||
| this.protocol = HTTP_PROTOCOL; | ||
| this.authority = url.getHost(); | ||
| this.identifier = url.getPath(); | ||
| if(conainsNullTerminator(this.identifier)) { | ||
| throw createException(identifierString); | ||
| } | ||
| } catch (final MalformedURLException e) { | ||
| throw new IllegalArgumentException(e); | ||
| } | ||
| } else { | ||
| final int index2 = identifierString.indexOf('/', index1 + 1); | ||
| if (index2 > 0 && (index2 + 1) < identifierString.length()) { // '/' found with one or more characters | ||
| // between ':' | ||
| this.protocol = identifierString.substring(0, index1); // and '/' and there are characters after '/' | ||
| if (!DOI_PROTOCOL.equals(this.protocol) | ||
| && !HDL_PROTOCOL.equals(this.protocol)) { | ||
| throw createException(identifierString); | ||
| } | ||
| //Strip any whitespace, ; and ' from authority (should finding them cause a failure instead?) | ||
| this.authority = formatIdentifierString(identifierString.substring(index1 + 1, index2)); | ||
| if (conainsNullTerminator(this.authority)) { | ||
| throw createException(identifierString); | ||
| } | ||
| if (this.protocol.equals(DOI_PROTOCOL) && !this.checkDOIAuthority(this.authority)) { | ||
| throw createException(identifierString); | ||
| } | ||
| // Passed all checks | ||
| //Strip any whitespace, ; and ' from identifier (should finding them cause a failure instead?) | ||
| this.identifier = formatIdentifierString(identifierString.substring(index2 + 1)); | ||
| if(conainsNullTerminator(this.identifier)) { | ||
| throw createException(identifierString); | ||
| } | ||
| } else { | ||
| throw createException(identifierString); | ||
| } | ||
| } | ||
| } else { | ||
| throw createException(identifierString); | ||
| } | ||
| } | ||
|
|
||
| private static IllegalArgumentException createException(final String identifier) { | ||
| return new IllegalArgumentException("Failed to parse identifier: ".concat(identifier)); | ||
| } | ||
|
|
||
| public GlobalId(final String protocol, final String authority, final String identifier) { | ||
| this.protocol = protocol; | ||
|
|
@@ -77,24 +186,12 @@ public String getProtocol() { | |
| return this.protocol; | ||
| } | ||
|
|
||
| public void setProtocol(final String protocol) { | ||
| this.protocol = protocol; | ||
| } | ||
|
|
||
| public String getAuthority() { | ||
| return this.authority; | ||
| } | ||
|
|
||
| public void setAuthority(final String authority) { | ||
| this.authority = authority; | ||
| } | ||
| public String getAuthority() { | ||
| return this.authority; | ||
| } | ||
|
|
||
| public String getIdentifier() { | ||
| return this.identifier; | ||
| } | ||
|
|
||
| public void setIdentifier(final String identifier) { | ||
| this.identifier = identifier; | ||
| return this.identifier; | ||
| } | ||
|
|
||
| public String toString() { | ||
|
|
@@ -108,9 +205,13 @@ public String toString() { | |
| * @return The string representation of this global id. | ||
| */ | ||
| public String asString() { | ||
| return this.protocol == null || this.authority == null || this.identifier == null | ||
| ? "" | ||
| : this.protocol + ':' + this.authority + '/' + this.identifier; | ||
| if(this.protocol == null || this.authority == null || this.identifier == null) { | ||
| return ""; | ||
| } else if(HTTP_PROTOCOL.equals(this.protocol) || HTTPS_PROTOCOL.equals(this.protocol)) { | ||
| return this.protocol + ':' + this.protocol + "://" + this.authority + this.identifier; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. `this.protocol` appears twice here — was this intended? |
||
| } else { | ||
| return this.protocol + ':' + this.authority + '/' + this.identifier; | ||
| } | ||
| } | ||
|
|
||
| public URL toURL() { | ||
|
|
@@ -120,67 +221,43 @@ public URL toURL() { | |
| return new URL(DOI_RESOLVER_URL + this.authority + '/' + this.identifier); | ||
| } else if (HDL_PROTOCOL.equals(this.protocol)) { | ||
| return new URL(HDL_RESOLVER_URL + this.authority + '/' + this.identifier); | ||
| } | ||
| } else if(HTTP_PROTOCOL.equals(this.protocol) || HTTPS_PROTOCOL.equals(this.protocol)) { | ||
| return new URL(this.protocol, this.authority, this.identifier); | ||
| } | ||
| } catch (final MalformedURLException ex) { | ||
| logger.log(SEVERE, null, ex); | ||
| } | ||
| } | ||
| return null; | ||
| } | ||
|
|
||
| public String getStoragePath() { | ||
| return this.authority.replace(':', '_') + '/' + | ||
| stripLeadingSlashes(this.identifier.replace(':', '_')); | ||
| } | ||
|
|
||
| private static String stripLeadingSlashes(final String s) { | ||
| int index = 0; | ||
| while(s.charAt(index) == '/') { | ||
| ++index; | ||
| } | ||
| return s.substring(index); | ||
| } | ||
|
|
||
|
|
||
| /** | ||
| * Parse a Persistent Id and set the protocol, authority, and identifier | ||
| * <p> | ||
| * Example 1: doi:10.5072/FK2/BYM3IW | ||
| * protocol: doi | ||
| * authority: 10.5072 | ||
| * identifier: FK2/BYM3IW | ||
| * <p> | ||
| * Example 2: hdl:1902.1/111012 | ||
| * protocol: hdl | ||
| * authority: 1902.1 | ||
| * identifier: 111012 | ||
| * | ||
| * @param identifierString | ||
| * @param separator the string that separates the authority from the identifier. | ||
| * @param destination the global id that will contain the parsed data. | ||
| * @return {@code destination}, after its fields have been updated, or | ||
| * {@code null} if parsing failed. | ||
| */ | ||
| private boolean parsePersistentId(final String identifierString) { | ||
|
|
||
| final int index1 = identifierString.indexOf(':'); | ||
| if (index1 > 0) { // ':' found with one or more characters before it | ||
| final int index2 = identifierString.indexOf('/', index1 + 1); | ||
| if (index2 > 0 && (index2 + 1) < identifierString.length()) { // '/' found with one or more characters | ||
| // between ':' | ||
| this.protocol = identifierString.substring(0, index1); // and '/' and there are characters after '/' | ||
| if (!DOI_PROTOCOL.equals(this.protocol) && !HDL_PROTOCOL.equals(this.protocol)) { | ||
| return false; | ||
| } | ||
| //Strip any whitespace, ; and ' from authority (should finding them cause a failure instead?) | ||
| this.authority = formatIdentifierString(identifierString.substring(index1 + 1, index2)); | ||
| if (testforNullTerminator(this.authority)) { | ||
| return false; | ||
| } | ||
| if (this.protocol.equals(DOI_PROTOCOL) && !this.checkDOIAuthority(this.authority)) { | ||
| return false; | ||
| } | ||
| // Passed all checks | ||
| //Strip any whitespace, ; and ' from identifier (should finding them cause a failure instead?) | ||
| this.identifier = formatIdentifierString(identifierString.substring(index2 + 1)); | ||
| return !testforNullTerminator(this.identifier); | ||
| } else { | ||
| logger.log(INFO, "Error parsing identifier: {0}: '':<authority>/<identifier>'' not found in string", | ||
| identifierString); | ||
| return false; | ||
| } | ||
| } else { | ||
| logger.log(INFO, "Error parsing identifier: {0}: ''<protocol>:'' not found in string", | ||
| identifierString); | ||
| return false; | ||
| } | ||
| public static boolean isDOI(final String id) { | ||
| return id.startsWith(DOI_RESOLVER_URL) || id.startsWith(DOI_RESOLVER_URL2); | ||
| } | ||
|
|
||
| public static boolean isHDL(final String id) { | ||
| return id.startsWith(HDL_RESOLVER_URL) || id.startsWith(HDL_RESOLVER_URL2); | ||
| } | ||
|
|
||
| public static boolean isHTTP(final String id) { | ||
| return id.startsWith(HTTP_RESOLVER_URL); | ||
| } | ||
|
|
||
| public static boolean isHTTPS(final String id) { | ||
| return id.startsWith(HTTPS_RESOLVER_URL); | ||
| } | ||
|
|
||
| private static String formatIdentifierString(final String str) { | ||
|
|
@@ -202,7 +279,7 @@ private static String formatIdentifierString(final String str) { | |
| // http://www.doi.org/doi_handbook/2_Numbering.html | ||
| } | ||
|
|
||
| private static boolean testforNullTerminator(final String str) { | ||
| private static boolean conainsNullTerminator(final String str) { | ||
| return str != null ? str.indexOf('\u0000') > 0 : false; | ||
| } | ||
|
|
||
|
|
||
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Missing space between `if` and `(`.