Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
149 changes: 75 additions & 74 deletions src/oaipmh/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
import urllib2
from urllib import urlencode

import sys
import base64
from lxml import etree
import time
Expand Down Expand Up @@ -40,14 +39,15 @@ class BaseClient(common.OAIPMH):
'expected-errcodes': {503},
}

def __init__(self, metadata_registry=None, custom_retry_policy=None, raw_data=None):
    """Set up registry, retry policy and optional raw data for the client.

    metadata_registry - registry to use; when falsy, the module-wide
        ``metadata.global_metadata_registry`` is used instead
    custom_retry_policy - optional dict whose entries override the
        class-level ``default_retry_policy``
    raw_data - stored unchanged on the instance as ``_raw_data``
    """
    if not metadata_registry:
        metadata_registry = metadata.global_metadata_registry
    self._metadata_registry = metadata_registry
    self._ignore_bad_character_hack = 0
    self._day_granularity = False
    # Work on a copy so per-instance overrides never touch the class default.
    policy = self.default_retry_policy.copy()
    if custom_retry_policy is not None:
        policy.update(custom_retry_policy)
    self.retry_policy = policy
    self._raw_data = raw_data

def updateGranularity(self):
"""Update the granularity setting dependent on that the server says.
Expand Down Expand Up @@ -142,22 +142,21 @@ def GetMetadata_impl(self, args, tree):

def Identify_impl(self, args, tree):
namespaces = self.getNamespaces()
evaluator = etree.XPathEvaluator(tree, namespaces=namespaces)
identify_node = evaluator.evaluate(
'/oai:OAI-PMH/oai:Identify')[0]
identify_evaluator = etree.XPathEvaluator(identify_node,
namespaces=namespaces)
e = identify_evaluator.evaluate

repositoryName = e('string(oai:repositoryName/text())')
baseURL = e('string(oai:baseURL/text())')
protocolVersion = e('string(oai:protocolVersion/text())')
adminEmails = e('oai:adminEmail/text()')
identify_node = tree.xpath(
'/oai:OAI-PMH/oai:Identify',
namespaces=namespaces
)[0]
e = identify_node.xpath

repositoryName = e('string(oai:repositoryName/text())', namespaces=namespaces)
baseURL = e('string(oai:baseURL/text())', namespaces=namespaces)
protocolVersion = e('string(oai:protocolVersion/text())', namespaces=namespaces)
adminEmails = e('oai:adminEmail/text()', namespaces=namespaces)
earliestDatestamp = datestamp_to_datetime(
e('string(oai:earliestDatestamp/text())'))
deletedRecord = e('string(oai:deletedRecord/text())')
granularity = e('string(oai:granularity/text())')
compression = e('oai:compression/text()')
e('string(oai:earliestDatestamp/text())', namespaces=namespaces))
deletedRecord = e('string(oai:deletedRecord/text())', namespaces=namespaces)
granularity = e('string(oai:granularity/text())', namespaces=namespaces)
compression = e('oai:compression/text()', namespaces=namespaces)
# XXX description
identify = common.Identify(
repositoryName, baseURL, protocolVersion,
Expand All @@ -177,18 +176,16 @@ def nextBatch(token):

def ListMetadataFormats_impl(self, args, tree):
namespaces = self.getNamespaces()
evaluator = etree.XPathEvaluator(tree,
namespaces=namespaces)

metadataFormat_nodes = evaluator.evaluate(
'/oai:OAI-PMH/oai:ListMetadataFormats/oai:metadataFormat')
metadataFormat_nodes = tree.xpath(
'/oai:OAI-PMH/oai:ListMetadataFormats/oai:metadataFormat',
namespaces=namespaces
)
metadataFormats = []
for metadataFormat_node in metadataFormat_nodes:
e = etree.XPathEvaluator(metadataFormat_node,
namespaces=namespaces).evaluate
metadataPrefix = e('string(oai:metadataPrefix/text())')
schema = e('string(oai:schema/text())')
metadataNamespace = e('string(oai:metadataNamespace/text())')
e = metadataFormat_node.xpath
metadataPrefix = e('string(oai:metadataPrefix/text())', namespaces=namespaces)
schema = e('string(oai:schema/text())', namespaces=namespaces)
metadataNamespace = e('string(oai:metadataNamespace/text())', namespaces=namespaces)
metadataFormat = (metadataPrefix, schema, metadataNamespace)
metadataFormats.append(metadataFormat)

Expand Down Expand Up @@ -224,28 +221,23 @@ def nextBatch(token):

# various helper methods

def buildRecords(self,
metadata_prefix, namespaces, metadata_registry, tree):
# first find resumption token if available
evaluator = etree.XPathEvaluator(tree,
namespaces=namespaces)
token = evaluator.evaluate(
'string(/oai:OAI-PMH/*/oai:resumptionToken/text())')
if token.strip() == '':
def buildRecords(self, metadata_prefix, namespaces, metadata_registry, tree):
token = tree.xpath(
'string(/oai:OAI-PMH/*/oai:resumptionToken/text())',
namespaces=namespaces
)
if token.strip() == '' or self._raw_data:
token = None
record_nodes = evaluator.evaluate(
'/oai:OAI-PMH/*/oai:record')
record_nodes = tree.xpath('/oai:OAI-PMH/*/oai:record', namespaces=namespaces)
result = []
for record_node in record_nodes:
record_evaluator = etree.XPathEvaluator(record_node,
namespaces=namespaces)
e = record_evaluator.evaluate
e = record_node.xpath
# find header node
header_node = e('oai:header')[0]
header_node = e('oai:header', namespaces=namespaces)[0]
# create header
header = buildHeader(header_node, namespaces)
# find metadata node
metadata_list = e('oai:metadata')
metadata_list = e('oai:metadata', namespaces=namespaces)
if metadata_list:
metadata_node = metadata_list[0]
# create metadata
Expand All @@ -258,40 +250,41 @@ def buildRecords(self,
return result, token

def buildIdentifiers(self, namespaces, tree):
    """Extract the headers and resumption token from a response tree.

    namespaces - prefix -> URI mapping used for every XPath lookup
    tree - parsed response supporting ``.xpath(expr, namespaces=...)``

    Returns a ``(headers, token)`` tuple. ``token`` is None when the
    response carries no (or only a blank) resumption token, or when the
    client was constructed with raw data.
    """
    token = tree.xpath(
        'string(/oai:OAI-PMH/*/oai:resumptionToken/text())',
        namespaces=namespaces
    )
    # Same rule as buildRecords: a client fed from raw data answers every
    # request with the same document, so following a resumption token
    # found in it would never terminate.
    if token.strip() == '' or getattr(self, '_raw_data', None):
        token = None
    header_nodes = tree.xpath(
        '/oai:OAI-PMH/oai:ListIdentifiers/oai:header',
        namespaces=namespaces
    )
    result = [buildHeader(header_node, namespaces)
              for header_node in header_nodes]
    return result, token

def buildSets(self, namespaces, tree):
    """Extract the sets and resumption token from a ListSets response tree.

    namespaces - prefix -> URI mapping used for every XPath lookup
    tree - parsed response supporting ``.xpath(expr, namespaces=...)``

    Returns a ``(sets, token)`` tuple where each set is a
    ``(setSpec, setName, None)`` triple. ``token`` is None when the
    response carries no (or only a blank) resumption token, or when the
    client was constructed with raw data.
    """
    token = tree.xpath(
        'string(/oai:OAI-PMH/oai:ListSets/oai:resumptionToken/text())',
        namespaces=namespaces
    )
    # Same rule as buildRecords: a client fed from raw data answers every
    # request with the same document, so following a resumption token
    # found in it would never terminate.
    if token.strip() == '' or getattr(self, '_raw_data', None):
        token = None
    set_nodes = tree.xpath(
        '/oai:OAI-PMH/oai:ListSets/oai:set',
        namespaces=namespaces
    )
    sets = []
    for set_node in set_nodes:
        e = set_node.xpath
        # make sure we get back unicode strings instead
        # of lxml.etree._ElementUnicodeResult objects.
        setSpec = six.text_type(
            e('string(oai:setSpec/text())', namespaces=namespaces))
        setName = six.text_type(
            e('string(oai:setName/text())', namespaces=namespaces))
        # XXX setDescription nodes
        sets.append((setSpec, setName, None))
    return sets, token
Expand Down Expand Up @@ -327,22 +320,31 @@ def makeRequest(self, **kw):

class Client(BaseClient):

def __init__(
    self,
    base_url,
    metadata_registry=None,
    credentials=None,
    local_file=False,
    force_http_get=False,
    custom_retry_policy=None,
    raw_data=None,
):
    """Create a client for the repository at ``base_url``.

    base_url - repository URL, or the path that is opened when
        ``local_file`` is true
    metadata_registry, custom_retry_policy, raw_data - passed through
        to ``BaseClient.__init__``
    credentials - optional HTTP basic-auth credentials: either a
        ``(username, password)`` pair or an already joined
        ``"username:password"`` string (text or bytes)
    local_file - when true, requests read ``base_url`` as a local file
    force_http_get - stored as ``_force_http_get`` for request handling
    """
    BaseClient.__init__(
        self,
        metadata_registry,
        custom_retry_policy=custom_retry_policy,
        raw_data=raw_data,
    )
    self._base_url = base_url
    self._local_file = local_file
    self._force_http_get = force_http_get
    if credentials is None:
        self._credentials = None
    else:
        if isinstance(credentials, bytes):
            raw = credentials
        else:
            if not isinstance(credentials, str):
                # Historic API: a (username, password) pair.
                credentials = '%s:%s' % tuple(credentials)
            raw = credentials.encode('utf-8')
        # b64encode, unlike encodebytes, never inserts newlines, so the
        # value can be placed in an HTTP header whatever its length.
        self._credentials = base64.b64encode(raw).decode('ascii')

def makeRequest(self, **kw):
"""Either load a local XML file or actually retrieve XML from a server.
"""
if self._local_file:
if isinstance(self._raw_data, str):
return self._raw_data.encode('ascii', 'replace')
elif self._local_file:
with codecs.open(self._base_url, 'r', 'utf-8') as xmlfile:
text = xmlfile.read()
return text.encode('ascii', 'replace')
Expand All @@ -367,13 +369,12 @@ def makeRequest(self, **kw):
)

def buildHeader(header_node, namespaces):
    """Build a ``common.Header`` from a single header element.

    header_node - element supporting ``.xpath(expr, namespaces=...)``
    namespaces - prefix -> URI mapping used for every XPath lookup
    """
    def query(expression):
        # All lookups are relative to the header element itself.
        return header_node.xpath(expression, namespaces=namespaces)

    identifier = query('string(oai:identifier/text())')
    raw_datestamp = str(query('string(oai:datestamp/text())'))
    datestamp = datestamp_to_datetime(raw_datestamp)
    setspec = [str(spec) for spec in query('oai:setSpec/text()')]
    # XPath boolean: true when the header's status attribute is 'deleted'.
    deleted = query("@status = 'deleted'")
    return common.Header(header_node, identifier, datestamp, setspec, deleted)

def ResumptionListGenerator(firstBatch, nextBatch):
Expand Down
10 changes: 5 additions & 5 deletions src/oaipmh/common.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
import pkg_resources

from oaipmh import error

class Header(object):
Expand Down Expand Up @@ -49,7 +47,7 @@ def getField(self, name):
class Identify(object):
def __init__(self, repositoryName, baseURL, protocolVersion, adminEmails,
earliestDatestamp, deletedRecord, granularity, compression,
toolkit_description=True):
toolkit_description=False):
self._repositoryName = repositoryName
self._baseURL = baseURL
self._protocolVersion = protocolVersion
Expand All @@ -59,8 +57,10 @@ def __init__(self, repositoryName, baseURL, protocolVersion, adminEmails,
self._granularity = granularity
self._compression = compression
self._descriptions = []

if toolkit_description:
import pkg_resources

req = pkg_resources.Requirement.parse('pyoai')
egg = pkg_resources.working_set.find(req)
if egg:
Expand All @@ -77,7 +77,7 @@ def __init__(self, repositoryName, baseURL, protocolVersion, adminEmails,
'%s'
'<URL>http://infrae.com/products/oaipack</URL>'
'</toolkit>' % version)

def repositoryName(self):
return self._repositoryName

Expand Down
71 changes: 41 additions & 30 deletions src/oaipmh/metadata.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
import sys

from lxml import etree
from lxml.etree import SubElement
from oaipmh import common

if sys.version_info[0] == 3:
Expand All @@ -21,7 +19,7 @@ class MetadataRegistry(object):
def __init__(self):
self._readers = {}
self._writers = {}

def registerReader(self, metadata_prefix, reader):
self._readers[metadata_prefix] = reader

Expand All @@ -30,10 +28,10 @@ def registerWriter(self, metadata_prefix, writer):

def hasReader(self, metadata_prefix):
return metadata_prefix in self._readers

def hasWriter(self, metadata_prefix):
return metadata_prefix in self._writers

def readMetadata(self, metadata_prefix, element):
"""Turn XML into metadata object.

Expand All @@ -45,7 +43,7 @@ def readMetadata(self, metadata_prefix, element):

def writeMetadata(self, metadata_prefix, element, metadata):
"""Write metadata as XML.

element - ElementTree element to write under
metadata - metadata object to write
"""
Expand All @@ -65,28 +63,44 @@ def __init__(self, fields, namespaces=None):

def __call__(self, element):
    """Read the configured fields from ``element`` into a Metadata object.

    element - element supporting ``.xpath(expr, namespaces=...)``

    For every ``field_name -> (field_type, expr)`` entry the XPath
    expression is evaluated against ``element`` and converted:

    bytes / text - the result converted with ``str`` / ``text_type``
    bytesList / textList - a list of converted items; a non-list
        result is wrapped in a one-item list

    Raises Error for an unknown field type. Any failure while
    evaluating or converting a single field is reported on stderr and
    that field falls back to ``[]`` (list types) or ``""``.
    """
    known_types = ('bytes', 'bytesList', 'text', 'textList')
    fields = {}
    for field_name, (field_type, expr) in self._fields.items():
        # A bad field type is a configuration error, not bad input data:
        # check it outside the best-effort block so it is never swallowed.
        if field_type not in known_types:
            raise Error("Unknown field type: %s" % field_type)
        is_list = field_type.endswith('List')
        # make sure we get back plain strings instead
        # of lxml.etree._ElementUnicodeResult objects.
        convert = str if field_type.startswith('bytes') else text_type
        try:
            raw_result = element.xpath(expr, namespaces=self._namespaces)
            if is_list:
                if raw_result is None:
                    raw_result = []
                elif not isinstance(raw_result, list):
                    # The expression yielded a single scalar value.
                    raw_result = [raw_result]
                value = [convert(item) for item in raw_result]
            else:
                value = convert(raw_result)
        except Exception as ex:
            # Deliberate best-effort: one unreadable field must not
            # abort reading the remaining fields of the record.
            sys.stderr.write(
                "Warning: Error processing field '%s' with expression "
                "'%s': %s\n" % (field_name, expr, ex))
            value = [] if is_list else ""
        fields[field_name] = value
    return common.Metadata(element, fields)

oai_dc_reader = MetadataReader(
Expand All @@ -111,6 +125,3 @@ def __call__(self, element):
'oai_dc': 'http://www.openarchives.org/OAI/2.0/oai_dc/',
'dc' : 'http://purl.org/dc/elements/1.1/'}
)