diff --git a/src/oaipmh/client.py b/src/oaipmh/client.py
index fc8dba5..baf8a7a 100644
--- a/src/oaipmh/client.py
+++ b/src/oaipmh/client.py
@@ -12,7 +12,6 @@
     import urllib2
     from urllib import urlencode
 
-import sys
 import base64
 from lxml import etree
 import time
@@ -40,7 +39,7 @@ class BaseClient(common.OAIPMH):
         'expected-errcodes': {503},
     }
 
-    def __init__(self, metadata_registry=None, custom_retry_policy=None):
+    def __init__(self, metadata_registry=None, custom_retry_policy=None, raw_data=None):
         self._metadata_registry = (
             metadata_registry or metadata.global_metadata_registry)
         self._ignore_bad_character_hack = 0
@@ -48,6 +47,7 @@ def __init__(self, metadata_registry=None, custom_retry_policy=None):
         self.retry_policy = self.default_retry_policy.copy()
         if custom_retry_policy is not None:
             self.retry_policy.update(custom_retry_policy)
+        self._raw_data = raw_data
 
     def updateGranularity(self):
         """Update the granularity setting dependent on that the server says.
@@ -142,22 +142,21 @@ def GetMetadata_impl(self, args, tree):
 
     def Identify_impl(self, args, tree):
         namespaces = self.getNamespaces()
-        evaluator = etree.XPathEvaluator(tree, namespaces=namespaces)
-        identify_node = evaluator.evaluate(
-            '/oai:OAI-PMH/oai:Identify')[0]
-        identify_evaluator = etree.XPathEvaluator(identify_node,
-                                                  namespaces=namespaces)
-        e = identify_evaluator.evaluate
-
-        repositoryName = e('string(oai:repositoryName/text())')
-        baseURL = e('string(oai:baseURL/text())')
-        protocolVersion = e('string(oai:protocolVersion/text())')
-        adminEmails = e('oai:adminEmail/text()')
+        identify_node = tree.xpath(
+            '/oai:OAI-PMH/oai:Identify',
+            namespaces=namespaces
+        )[0]
+        e = identify_node.xpath
+
+        repositoryName = e('string(oai:repositoryName/text())', namespaces=namespaces)
+        baseURL = e('string(oai:baseURL/text())', namespaces=namespaces)
+        protocolVersion = e('string(oai:protocolVersion/text())', namespaces=namespaces)
+        adminEmails = e('oai:adminEmail/text()', namespaces=namespaces)
         earliestDatestamp = datestamp_to_datetime(
-            e('string(oai:earliestDatestamp/text())'))
-        deletedRecord = e('string(oai:deletedRecord/text())')
-        granularity = e('string(oai:granularity/text())')
-        compression = e('oai:compression/text()')
+            e('string(oai:earliestDatestamp/text())', namespaces=namespaces))
+        deletedRecord = e('string(oai:deletedRecord/text())', namespaces=namespaces)
+        granularity = e('string(oai:granularity/text())', namespaces=namespaces)
+        compression = e('oai:compression/text()', namespaces=namespaces)
         # XXX description
         identify = common.Identify(
             repositoryName, baseURL, protocolVersion,
@@ -177,18 +176,16 @@ def nextBatch(token):
 
     def ListMetadataFormats_impl(self, args, tree):
         namespaces = self.getNamespaces()
-        evaluator = etree.XPathEvaluator(tree,
-                                         namespaces=namespaces)
-
-        metadataFormat_nodes = evaluator.evaluate(
-            '/oai:OAI-PMH/oai:ListMetadataFormats/oai:metadataFormat')
+        metadataFormat_nodes = tree.xpath(
+            '/oai:OAI-PMH/oai:ListMetadataFormats/oai:metadataFormat',
+            namespaces=namespaces
+        )
         metadataFormats = []
         for metadataFormat_node in metadataFormat_nodes:
-            e = etree.XPathEvaluator(metadataFormat_node,
-                                     namespaces=namespaces).evaluate
-            metadataPrefix = e('string(oai:metadataPrefix/text())')
-            schema = e('string(oai:schema/text())')
-            metadataNamespace = e('string(oai:metadataNamespace/text())')
+            e = metadataFormat_node.xpath
+            metadataPrefix = e('string(oai:metadataPrefix/text())', namespaces=namespaces)
+            schema = e('string(oai:schema/text())', namespaces=namespaces)
+            metadataNamespace = e('string(oai:metadataNamespace/text())', namespaces=namespaces)
             metadataFormat = (metadataPrefix, schema, metadataNamespace)
             metadataFormats.append(metadataFormat)
 
@@ -224,28 +221,23 @@ def nextBatch(token):
 
     # various helper methods
 
-    def buildRecords(self,
-                     metadata_prefix, namespaces, metadata_registry, tree):
-        # first find resumption token if available
-        evaluator = etree.XPathEvaluator(tree,
-                                         namespaces=namespaces)
-        token = evaluator.evaluate(
-            'string(/oai:OAI-PMH/*/oai:resumptionToken/text())')
-        if token.strip() == '':
+    def buildRecords(self, metadata_prefix, namespaces, metadata_registry, tree):
+        token = tree.xpath(
+            'string(/oai:OAI-PMH/*/oai:resumptionToken/text())',
+            namespaces=namespaces
+        )
+        if token.strip() == '' or self._raw_data:
             token = None
-        record_nodes = evaluator.evaluate(
-            '/oai:OAI-PMH/*/oai:record')
+        record_nodes = tree.xpath('/oai:OAI-PMH/*/oai:record', namespaces=namespaces)
         result = []
         for record_node in record_nodes:
-            record_evaluator = etree.XPathEvaluator(record_node,
-                                                    namespaces=namespaces)
-            e = record_evaluator.evaluate
+            e = record_node.xpath
             # find header node
-            header_node = e('oai:header')[0]
+            header_node = e('oai:header', namespaces=namespaces)[0]
             # create header
             header = buildHeader(header_node, namespaces)
             # find metadata node
-            metadata_list = e('oai:metadata')
+            metadata_list = e('oai:metadata', namespaces=namespaces)
             if metadata_list:
                 metadata_node = metadata_list[0]
                 # create metadata
@@ -258,16 +250,17 @@ def buildRecords(self,
         return result, token
 
     def buildIdentifiers(self, namespaces, tree):
-        evaluator = etree.XPathEvaluator(tree,
-                                         namespaces=namespaces)
-        # first find resumption token is available
-        token = evaluator.evaluate(
-            'string(/oai:OAI-PMH/*/oai:resumptionToken/text())')
         #'string(/oai:OAI-PMH/oai:ListIdentifiers/oai:resumptionToken/text())')
+        token = tree.xpath(
+            'string(/oai:OAI-PMH/*/oai:resumptionToken/text())',
+            namespaces=namespaces
+        )
         if token.strip() == '':
             token = None
-        header_nodes = evaluator.evaluate(
-            '/oai:OAI-PMH/oai:ListIdentifiers/oai:header')
+        header_nodes = tree.xpath(
+            '/oai:OAI-PMH/oai:ListIdentifiers/oai:header',
+            namespaces=namespaces
+        )
         result = []
         for header_node in header_nodes:
             header = buildHeader(header_node, namespaces)
@@ -275,23 +268,23 @@ def buildIdentifiers(self, namespaces, tree):
         return result, token
 
     def buildSets(self, namespaces, tree):
-        evaluator = etree.XPathEvaluator(tree,
-                                         namespaces=namespaces)
-        # first find resumption token if available
-        token = evaluator.evaluate(
-            'string(/oai:OAI-PMH/oai:ListSets/oai:resumptionToken/text())')
+        token = tree.xpath(
+            'string(/oai:OAI-PMH/oai:ListSets/oai:resumptionToken/text())',
+            namespaces=namespaces
+        )
         if token.strip() == '':
             token = None
-        set_nodes = evaluator.evaluate(
-            '/oai:OAI-PMH/oai:ListSets/oai:set')
+        set_nodes = tree.xpath(
+            '/oai:OAI-PMH/oai:ListSets/oai:set',
+            namespaces=namespaces
+        )
         sets = []
         for set_node in set_nodes:
-            e = etree.XPathEvaluator(set_node,
-                                     namespaces=namespaces).evaluate
+            e = set_node.xpath
             # make sure we get back unicode strings instead
             # of lxml.etree._ElementUnicodeResult objects.
-            setSpec = six.text_type(e('string(oai:setSpec/text())'))
-            setName = six.text_type(e('string(oai:setName/text())'))
+            setSpec = six.text_type(e('string(oai:setSpec/text())', namespaces=namespaces))
+            setName = six.text_type(e('string(oai:setName/text())', namespaces=namespaces))
             # XXX setDescription nodes
             sets.append((setSpec, setName, None))
         return sets, token
@@ -327,22 +320,41 @@ def makeRequest(self, **kw):
 
 
 class Client(BaseClient):
-    def __init__(self, base_url, metadata_registry=None, credentials=None,
-                 local_file=False, force_http_get=False, custom_retry_policy=None):
-        BaseClient.__init__(self, metadata_registry,
-                            custom_retry_policy=custom_retry_policy)
+    def __init__(
+        self,
+        base_url,
+        metadata_registry=None,
+        credentials=None,
+        local_file=False,
+        force_http_get=False,
+        custom_retry_policy=None,
+        raw_data=None,
+    ):
+        BaseClient.__init__(
+            self, metadata_registry, custom_retry_policy=custom_retry_policy, raw_data=raw_data
+        )
         self._base_url = base_url
         self._local_file = local_file
         self._force_http_get = force_http_get
         if credentials is not None:
-            self._credentials = base64.encodestring('%s:%s' % credentials)
+            # Accept the (username, password) pair that the previous
+            # '%s:%s' formatting took, as well as a ready-made string.
+            if not isinstance(credentials, six.string_types):
+                credentials = '%s:%s' % tuple(credentials)
+            # b64encode never inserts line breaks; encodebytes wraps its
+            # output and appends a newline.
+            self._credentials = base64.b64encode(
+                credentials.encode('utf-8')).decode('ascii')
         else:
             self._credentials = None
 
     def makeRequest(self, **kw):
-        """Either load a local XML file or actually retrieve XML from a server.
-        """
-        if self._local_file:
+        """Use the raw data given to the constructor, load a local XML
+        file, or actually retrieve XML from a server.
+        """
+        if isinstance(self._raw_data, str):
+            return self._raw_data.encode('ascii', 'replace')
+        elif self._local_file:
             with codecs.open(self._base_url, 'r', 'utf-8') as xmlfile:
                 text = xmlfile.read()
             return text.encode('ascii', 'replace')
@@ -367,13 +379,12 @@ def makeRequest(self, **kw):
             )
 
 def buildHeader(header_node, namespaces):
-    e = etree.XPathEvaluator(header_node,
-                             namespaces=namespaces).evaluate
-    identifier = e('string(oai:identifier/text())')
+    e = header_node.xpath
+    identifier = e('string(oai:identifier/text())', namespaces=namespaces)
     datestamp = datestamp_to_datetime(
-        str(e('string(oai:datestamp/text())')))
-    setspec = [str(s) for s in e('oai:setSpec/text()')]
-    deleted = e("@status = 'deleted'")
+        str(e('string(oai:datestamp/text())', namespaces=namespaces)))
+    setspec = [str(s) for s in e('oai:setSpec/text()', namespaces=namespaces)]
+    deleted = e("@status = 'deleted'", namespaces=namespaces)
     return common.Header(header_node, identifier, datestamp, setspec, deleted)
 
 def ResumptionListGenerator(firstBatch, nextBatch):
diff --git a/src/oaipmh/common.py b/src/oaipmh/common.py
index c602ada..cec1437 100644
--- a/src/oaipmh/common.py
+++ b/src/oaipmh/common.py
@@ -1,5 +1,3 @@
-import pkg_resources
-
 from oaipmh import error
 
 class Header(object):
@@ -49,7 +47,7 @@ def getField(self, name):
 class Identify(object):
     def __init__(self, repositoryName, baseURL, protocolVersion, adminEmails,
                  earliestDatestamp, deletedRecord, granularity, compression,
-                 toolkit_description=True):
+                 toolkit_description=False):
         self._repositoryName = repositoryName
         self._baseURL = baseURL
         self._protocolVersion = protocolVersion
@@ -59,8 +57,10 @@ def __init__(self, repositoryName, baseURL, protocolVersion, adminEmails,
         self._granularity = granularity
         self._compression = compression
         self._descriptions = []
-        
+
         if toolkit_description:
+            import pkg_resources
+
             req = pkg_resources.Requirement.parse('pyoai')
             egg = pkg_resources.working_set.find(req)
             if egg:
@@ -77,7 +77,7 @@ def __init__(self, repositoryName, baseURL, protocolVersion, adminEmails,
                 '%s'
                 'http://infrae.com/products/oaipack'
                 '' % version)
-        
+
     def repositoryName(self):
         return self._repositoryName
 
diff --git a/src/oaipmh/metadata.py b/src/oaipmh/metadata.py
index 14d9ad0..0b90f68 100644
--- a/src/oaipmh/metadata.py
+++ b/src/oaipmh/metadata.py
@@ -1,7 +1,5 @@
 import sys
 
-from lxml import etree
-from lxml.etree import SubElement
 from oaipmh import common
 
 if sys.version_info[0] == 3:
@@ -21,7 +19,7 @@ class MetadataRegistry(object):
     def __init__(self):
         self._readers = {}
         self._writers = {}
-        
+
     def registerReader(self, metadata_prefix, reader):
         self._readers[metadata_prefix] = reader
 
@@ -30,10 +28,10 @@ def registerWriter(self, metadata_prefix, writer):
 
     def hasReader(self, metadata_prefix):
         return metadata_prefix in self._readers
-    
+
     def hasWriter(self, metadata_prefix):
         return metadata_prefix in self._writers
-    
+
     def readMetadata(self, metadata_prefix, element):
         """Turn XML into metadata object.
 
@@ -45,7 +43,7 @@ def readMetadata(self, metadata_prefix, element):
 
     def writeMetadata(self, metadata_prefix, element, metadata):
         """Write metadata as XML.
-        
+
         element - ElementTree element to write under
         metadata - metadata object to write
         """
@@ -65,28 +63,36 @@ def __init__(self, fields, namespaces=None):
 
     def __call__(self, element):
         map = {}
-        # create XPathEvaluator for this element
-        xpath_evaluator = etree.XPathEvaluator(element,
-                                               namespaces=self._namespaces)
-
-        e = xpath_evaluator.evaluate
-        # now extra field info according to xpath expr
+        # Alias for element.xpath
+        e = element.xpath
         for field_name, (field_type, expr) in list(self._fields.items()):
-            if field_type == 'bytes':
-                value = str(e(expr))
-            elif field_type == 'bytesList':
-                value = [str(item) for item in e(expr)]
-            elif field_type == 'text':
-                # make sure we get back unicode strings instead
-                # of lxml.etree._ElementUnicodeResult objects.
-                value = text_type(e(expr))
-            elif field_type == 'textList':
-                # make sure we get back unicode strings instead
-                # of lxml.etree._ElementUnicodeResult objects.
-                value = [text_type(v) for v in e(expr)]
-            else:
-                raise Error("Unknown field type: %s" % field_type)
-            map[field_name] = value
+            is_list = field_type in ('bytesList', 'textList')
+            if not is_list and field_type not in ('bytes', 'text'):
+                # A bad field type is a mistake in the reader definition,
+                # not in the data: report it instead of masking it.
+                raise Error("Unknown field type: %s" % field_type)
+            convert = str if field_type.startswith('bytes') else text_type
+            try:
+                raw_result = e(expr, namespaces=self._namespaces)
+                if not is_list:
+                    value = convert(raw_result)
+                elif isinstance(raw_result, list):
+                    value = [convert(item) for item in raw_result]
+                elif raw_result is None:
+                    value = []
+                else:
+                    # A scalar result, e.g. from a string(...) expression.
+                    value = [convert(raw_result)]
+            except Exception as ex:
+                # Best effort: one unreadable field must not lose the
+                # whole record. No f-string or print(file=...) here, so
+                # the module still compiles on Python 2.
+                sys.stderr.write(
+                    "Warning: Error processing field '%s' with "
+                    "expression '%s': %s\n" % (field_name, expr, ex))
+                value = [] if is_list else ""
+            map[field_name] = value
+
         return common.Metadata(element, map)
 
 oai_dc_reader = MetadataReader(
@@ -111,6 +117,3 @@ def __call__(self, element):
     'oai_dc': 'http://www.openarchives.org/OAI/2.0/oai_dc/',
     'dc' : 'http://purl.org/dc/elements/1.1/'}
     )
-
-
-