Skip to content

Commit c76bf27

Browse files
Merge pull request #30 from discoverygarden/JIM-45
JIM-45: New script to replace datastreams.
2 parents cb0b497 + 7f7ae75 commit c76bf27

File tree

1 file changed

+156
-174
lines changed

1 file changed

+156
-174
lines changed

scripts/datastream_updater.py

Lines changed: 156 additions & 174 deletions
Original file line numberDiff line numberDiff line change
@@ -1,200 +1,182 @@
1-
#!/usr/bin/env python
2-
3-
import base64
4-
import datetime
51
import argparse
6-
from lxml import etree as ET
7-
from lxml.etree import QName
2+
import base64
3+
import os
84
import mimetypes
9-
import logging
10-
11-
# Setting up basic logging
12-
logging.basicConfig(level=logging.INFO)
13-
14-
15-
def format_xml_element(element, level=0, indent=" "):
16-
"""
17-
Formats an XML element by adding appropriate spacing and indentation.
18-
19-
Args:
20-
element (Element): The XML element to format.
21-
level (int, optional): The current level of indentation. Defaults to 0.
22-
indent (str, optional): The string used for indentation. Defaults to " ".
5+
from datetime import datetime
6+
import xml.etree.ElementTree as ET
237

24-
Returns:
25-
None
26-
"""
27-
spacing = "\n" + level * indent
28-
29-
if len(element):
30-
if not element.text or not element.text.strip():
31-
element.text = spacing + indent
32-
if not element.tail or not element.tail.strip():
33-
element.tail = spacing
34-
for child in element:
35-
format_xml_element(child, level + 1, indent)
36-
else:
37-
if level and (not element.tail or not element.tail.strip()):
38-
element.tail = spacing
8+
# Prefix -> URI pairs this script relies on when searching and building FOXML.
NAMESPACES = {
    'foxml': 'info:fedora/fedora-system:def/foxml#'
}


def register_namespaces(xml_path=None):
    """Register namespace prefixes with ElementTree for clean output.

    The prefixes in NAMESPACES are always registered. When ``xml_path`` is
    given, every non-default prefix declared anywhere in that document is
    registered as well, so that writing the tree back keeps the document's
    own prefixes instead of ElementTree's generated ``ns0``, ``ns1``, ...

    Args:
        xml_path (str, optional): Path to an XML file whose namespace
            declarations should also be registered. Defaults to None, which
            registers only the NAMESPACES entries.

    Raises:
        ET.ParseError: If ``xml_path`` is not well-formed XML.
        OSError: If ``xml_path`` cannot be opened.
    """
    for prefix, uri in NAMESPACES.items():
        ET.register_namespace(prefix, uri)

    if xml_path is None:
        return

    for _, (prefix, uri) in ET.iterparse(xml_path, events=['start-ns']):
        if not prefix:
            # Default (unprefixed) declarations are left to ElementTree.
            continue
        try:
            ET.register_namespace(prefix, uri)
        except ValueError:
            # register_namespace rejects prefixes of the reserved "ns<digits>" form.
            continue


def _find_datastream(root, dsid):
    """Return the foxml:datastream child of root whose ID equals dsid, or None."""
    # Compare the attribute directly instead of building an XPath predicate,
    # so IDs containing quote characters cannot break the lookup.
    for candidate in root.findall('foxml:datastream', NAMESPACES):
        if candidate.get('ID') == dsid:
            return candidate
    return None


def _next_version_number(versions, dsid):
    """Return a version number that no existing '<dsid>.<n>' version ID uses."""
    next_num = len(versions)
    id_prefix = f"{dsid}."
    for version in versions:
        version_id = version.get('ID', '')
        suffix = version_id[len(id_prefix):]
        if version_id.startswith(id_prefix) and suffix.isdecimal():
            # Step past the highest existing number so gaps cannot cause a clash.
            next_num = max(next_num, int(suffix) + 1)
    return next_num


def update_foxml_datastream(input_path, output_path, dsid, content_file, label, mimetype, control_group):
    """
    Adds a Base64 encoded datastream version to a FOXML file.

    If a datastream with the given ID already exists directly under the root
    element, a new datastreamVersion is appended to it; otherwise a new
    datastream element is created first. The new elements are laid out with
    fixed indentation strings and the Base64 text is wrapped over several
    lines. Failures are reported with print() and the function returns
    without writing.

    Args:
        input_path (str): Path to the source FOXML file.
        output_path (str): Path to save the modified FOXML file.
        dsid (str): The ID of the datastream to add/update (e.g., 'OBJ', 'MODS').
        content_file (str): Path to the file containing the new content.
        label (str): The label for the new datastream version. A falsy value
            selects "<dsid> datastream".
        mimetype (str): The MIME type of the content file. A falsy value is
            replaced by a guess from the file name, falling back to
            'application/octet-stream'.
        control_group (str): The control group written on a newly created
            datastream (e.g., 'M', 'X'). Ignored when the datastream exists.

    Returns:
        None
    """
    # Function-scope import: the file-level import only brings in the
    # datetime class itself.
    from datetime import timezone

    if not os.path.exists(content_file):
        print(f"Error: Content file not found at '{content_file}'")
        return

    print(f"Reading content from '{content_file}'...")
    with open(content_file, 'rb') as f:
        binary_content_bytes = f.read()

    encoded_content_string = base64.b64encode(binary_content_bytes).decode('ascii')
    # SIZE must describe exactly the bytes that were encoded, so measure the
    # buffer rather than stat-ing the file a second time.
    content_size = len(binary_content_bytes)
    print(f"Content read successfully. Original size: {content_size} bytes.")

    try:
        # Register the document's own prefixes before parsing so they survive
        # the round trip through ElementTree.
        register_namespaces(input_path)
        tree = ET.parse(input_path)
        root = tree.getroot()
    except ET.ParseError as e:
        print(f"Error parsing XML file '{input_path}': {e}")
        return
    except OSError as e:
        print(f"Error reading XML file '{input_path}': {e}")
        return

    foxml_uri = NAMESPACES['foxml']

    # NOTE(review): these indent strings are reproduced as they appear in the
    # reviewed text; whitespace may have been collapsed in transit - confirm
    # the widths against the FOXML files this script targets.
    datastream_indent = ' '
    version_indent = datastream_indent + ' '
    content_indent = version_indent + ' '
    base64_indent = ' ' * 14

    datastream = _find_datastream(root, dsid)

    if datastream is None:
        print(f"Datastream with ID '{dsid}' not found. Creating a new one.")
        datastream = ET.SubElement(root, f"{{{foxml_uri}}}datastream", {
            'ID': dsid, 'STATE': 'A', 'CONTROL_GROUP': control_group, 'VERSIONABLE': 'true'
        })
        if len(root) > 1:
            # The previous last child now needs the sibling indent; its old
            # tail (the whitespace before the root's end tag) moves to the
            # new element.
            prev_sibling = root[-2]
            datastream.tail = prev_sibling.tail
            prev_sibling.tail = '\n' + datastream_indent
        else:
            root.text = '\n' + datastream_indent
            datastream.tail = '\n'

        datastream.text = '\n' + version_indent
        version_num = 0
    else:
        print(f"Found existing datastream with ID '{dsid}'. Adding a new version.")
        versions = datastream.findall('foxml:datastreamVersion', NAMESPACES)
        version_num = _next_version_number(versions, dsid)
        if versions:
            versions[-1].tail = '\n' + version_indent
        else:
            datastream.text = '\n' + version_indent

    new_version_id = f"{dsid}.{version_num}"

    # UTC timestamp with millisecond precision and a literal 'Z' suffix.
    now = datetime.now(timezone.utc)
    main_part = now.strftime('%Y-%m-%dT%H:%M:%S')
    milliseconds = f'{now.microsecond // 1000:03d}'
    created_timestamp = f'{main_part}.{milliseconds}Z'

    if not mimetype:
        mimetype, _ = mimetypes.guess_type(content_file)
        mimetype = mimetype or 'application/octet-stream'
        print(f"Guessed MIME type: '{mimetype}'")

    if not label:
        label = f"{dsid} datastream"

    ds_version_attrs = {
        'ID': new_version_id, 'LABEL': label, 'CREATED': created_timestamp,
        'MIMETYPE': mimetype, 'SIZE': str(content_size)
    }
    ds_version = ET.SubElement(datastream, f"{{{foxml_uri}}}datastreamVersion", ds_version_attrs)
    ds_version.text = '\n' + content_indent
    ds_version.tail = '\n' + datastream_indent

    binary_content_element = ET.SubElement(ds_version, f"{{{foxml_uri}}}binaryContent")

    # Wrap the Base64 text into fixed-width lines, each on its own indented row.
    LINE_WIDTH = 76
    chunks = [encoded_content_string[i:i + LINE_WIDTH] for i in range(0, len(encoded_content_string), LINE_WIDTH)]

    binary_content_element.text = (
        f"\n{base64_indent}" +
        f"\n{base64_indent}".join(chunks) +
        f"\n{content_indent}"
    )
    binary_content_element.tail = '\n' + version_indent

    try:
        tree.write(output_path, encoding='UTF-8', xml_declaration=True)
        print(f"Successfully created new version '{new_version_id}'.")
        print(f"Modified FOXML file saved to '{output_path}'")
    except IOError as e:
        print(f"Error writing to output file '{output_path}': {e}")
128+
129+
130+
if __name__ == '__main__':
    # Command-line front end: gather the FOXML paths and datastream settings
    # from the options below and pass them to update_foxml_datastream.
    cli = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        description='Add or update a datastream in a FOXML file with Base64 encoded content.',
    )

    # One (flags, keyword options) entry per option, in --help order.
    option_table = (
        (('-i', '--input-foxml'), {
            'required': True,
            'help': 'Path to the input FOXML file.',
        }),
        (('-o', '--output-foxml'), {
            'required': True,
            'help': 'Path to save the modified output FOXML file.',
        }),
        (('--dsid',), {
            'required': True,
            'help': 'The ID for the datastream (e.g., "OBJ", "MODS", "FULL_TEXT").',
        }),
        (('-f', '--file'), {
            'required': True,
            'dest': 'content_file',
            'help': 'Path to the file to be used as the new datastream content.',
        }),
        (('--label',), {
            'default': None,
            'help': 'A human-readable label for the new datastream version. \n(default: "[dsid] datastream")',
        }),
        (('--mimetype',), {
            'default': None,
            'help': 'The MIME type of the content file (e.g., "application/pdf").\n(default: auto-detected or "application/octet-stream")',
        }),
        (('--control-group',), {
            'default': 'M',
            'choices': ['M', 'X', 'R', 'E'],
            'help': "The control group for the datastream. 'M' (Managed) is typical for binary content. \n(default: M)",
        }),
    )
    for flags, options in option_table:
        cli.add_argument(*flags, **options)

    cli_args = cli.parse_args()

    update_foxml_datastream(
        input_path=cli_args.input_foxml,
        output_path=cli_args.output_foxml,
        dsid=cli_args.dsid,
        content_file=cli_args.content_file,
        label=cli_args.label,
        mimetype=cli_args.mimetype,
        control_group=cli_args.control_group,
    )

0 commit comments

Comments
 (0)