1- #!/usr/bin/env python
2-
3- import base64
4- import datetime
51import argparse
6- from lxml import etree as ET
7- from lxml . etree import QName
2+ import base64
3+ import os
84import mimetypes
9- import logging
10-
11- # Setting up basic logging
12- logging .basicConfig (level = logging .INFO )
13-
14-
15- def format_xml_element (element , level = 0 , indent = " " ):
16- """
17- Formats an XML element by adding appropriate spacing and indentation.
18-
19- Args:
20- element (Element): The XML element to format.
21- level (int, optional): The current level of indentation. Defaults to 0.
22- indent (str, optional): The string used for indentation. Defaults to " ".
5+ from datetime import datetime
6+ import xml .etree .ElementTree as ET
237
24- Returns:
25- None
26- """
27- spacing = "\n " + level * indent
28-
29- if len (element ):
30- if not element .text or not element .text .strip ():
31- element .text = spacing + indent
32- if not element .tail or not element .tail .strip ():
33- element .tail = spacing
34- for child in element :
35- format_xml_element (child , level + 1 , indent )
36- else :
37- if level and (not element .tail or not element .tail .strip ()):
38- element .tail = spacing
# Prefix -> namespace URI map. It is passed to ElementTree lookups and its
# entries are registered so the serialised document uses these prefixes.
NAMESPACES = {
    'foxml': 'info:fedora/fedora-system:def/foxml#',
}
3911
def register_namespaces():
    """Register every prefix in ``NAMESPACES`` with ElementTree.

    Registration is global to the ``ElementTree`` module and affects only
    serialisation: elements in a registered namespace are written with the
    registered prefix.

    Returns:
        None
    """
    for prefix, uri in NAMESPACES.items():
        ET.register_namespace(prefix, uri)
4016
def _foxml_tag(local_name):
    """Return *local_name* qualified with the FOXML namespace (Clark notation)."""
    # Built with %-formatting so no stray whitespace can end up inside the
    # ``{uri}`` part; a single extra space there puts the element in a
    # different namespace.
    return "{%s}%s" % (NAMESPACES['foxml'], local_name)


def _next_version_number(versions):
    """Return a version number that no element in *versions* already uses.

    The result is never lower than ``len(versions)``; it is raised past the
    highest numeric ``ID`` suffix (the part after the last ``.``) so that a
    datastream with gaps in its version history does not get a duplicate ID.
    """
    highest = -1
    for version in versions:
        suffix = version.get('ID', '').rpartition('.')[2]
        if suffix.isdecimal():
            highest = max(highest, int(suffix))
    return max(len(versions), highest + 1)


def _utc_timestamp():
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SS.mmmZ``."""
    # Imported here so the helper works whichever ``datetime`` import style
    # the module uses; ``datetime.utcnow()`` is deprecated.
    from datetime import datetime, timezone

    now = datetime.now(timezone.utc)
    return f"{now:%Y-%m-%dT%H:%M:%S}.{now.microsecond // 1000:03d}Z"


def update_foxml_datastream(input_path, output_path, dsid, content_file, label, mimetype, control_group):
    """
    Add a Base64-encoded datastream version to a FOXML file.

    If a ``foxml:datastream`` with ID ``dsid`` exists directly under the root,
    a new ``foxml:datastreamVersion`` is appended to it; otherwise a new
    datastream element is created first. Surrounding text/tail whitespace is
    set by hand so the inserted elements are indented in the output.

    Problems (missing content file, unreadable or unparsable input, write
    failure) are reported with ``print`` and the function returns early; no
    exception is raised for them.

    Args:
        input_path (str): Path to the source FOXML file.
        output_path (str): Path to save the modified FOXML file.
        dsid (str): The ID of the datastream to add/update (e.g., 'OBJ', 'MODS').
        content_file (str): Path to the file containing the new content.
        label (str): Label for the new version; falsy means "<dsid> datastream".
        mimetype (str): MIME type of the content; falsy means guess from the
            file name, falling back to 'application/octet-stream'.
        control_group (str): Control group used only when a new datastream
            element has to be created (e.g., 'M', 'X').

    Returns:
        None
    """
    if not os.path.exists(content_file):
        print(f"Error: Content file not found at '{content_file}'")
        return

    print(f"Reading content from '{content_file}'...")
    try:
        with open(content_file, 'rb') as f:
            binary_content_bytes = f.read()
    except OSError as e:
        print(f"Error reading content file '{content_file}': {e}")
        return

    encoded_content_string = base64.b64encode(binary_content_bytes).decode('ascii')
    # SIZE describes the bytes that were actually encoded, so take it from the
    # buffer rather than asking the filesystem a second time.
    content_size = len(binary_content_bytes)
    print(f"Content read successfully. Original size: {content_size} bytes.")

    register_namespaces()
    try:
        tree = ET.parse(input_path)
    except ET.ParseError as e:
        print(f"Error parsing XML file '{input_path}': {e}")
        return
    except OSError as e:
        print(f"Error reading XML file '{input_path}': {e}")
        return
    root = tree.getroot()

    # Compare the ID attribute directly instead of formatting it into an XPath
    # string, so quotes or spaces in ``dsid`` cannot corrupt the query.
    datastream = next(
        (ds for ds in root.findall('foxml:datastream', NAMESPACES) if ds.get('ID') == dsid),
        None,
    )

    # NOTE(review): these whitespace literals are carried over exactly as
    # found; confirm the widths against the layout of the target FOXML files.
    line_break = '\n '
    datastream_indent = ' '
    version_indent = datastream_indent + ' '
    content_indent = version_indent + ' '
    base64_indent = ' ' * 14

    if datastream is None:
        print(f"Datastream with ID '{dsid}' not found. Creating a new one.")
        datastream = ET.SubElement(root, _foxml_tag('datastream'), {
            'ID': dsid, 'STATE': 'A', 'CONTROL_GROUP': control_group, 'VERSIONABLE': 'true'
        })
        if len(root) > 1:
            # The new element takes over the closing whitespace of the former
            # last child, which in turn gets a normal sibling separator.
            prev_sibling = root[-2]
            datastream.tail = prev_sibling.tail
            prev_sibling.tail = line_break + datastream_indent
        else:
            root.text = line_break + datastream_indent
            datastream.tail = line_break

        datastream.text = line_break + version_indent
        version_num = 0
    else:
        print(f"Found existing datastream with ID '{dsid}'. Adding a new version.")
        versions = datastream.findall(_foxml_tag('datastreamVersion'))
        version_num = _next_version_number(versions)
        if versions:
            versions[-1].tail = line_break + version_indent
        else:
            datastream.text = line_break + version_indent

    new_version_id = f"{dsid}.{version_num}"
    created_timestamp = _utc_timestamp()

    if not mimetype:
        mimetype, _ = mimetypes.guess_type(content_file)
        mimetype = mimetype or 'application/octet-stream'
        print(f"Guessed MIME type: '{mimetype}'")

    if not label:
        label = f"{dsid} datastream"

    ds_version_attrs = {
        'ID': new_version_id, 'LABEL': label, 'CREATED': created_timestamp,
        'MIMETYPE': mimetype, 'SIZE': str(content_size)
    }
    ds_version = ET.SubElement(datastream, _foxml_tag('datastreamVersion'), ds_version_attrs)
    ds_version.text = line_break + content_indent
    ds_version.tail = line_break + datastream_indent

    binary_content_element = ET.SubElement(ds_version, _foxml_tag('binaryContent'))

    # Wrap the Base64 payload into fixed-width lines, each on its own
    # indented line, then close with the indentation of the end tag.
    LINE_WIDTH = 76
    chunks = [
        encoded_content_string[i:i + LINE_WIDTH]
        for i in range(0, len(encoded_content_string), LINE_WIDTH)
    ]
    line_prefix = line_break + base64_indent
    binary_content_element.text = (
        line_prefix + line_prefix.join(chunks) + line_break + content_indent
    )
    binary_content_element.tail = line_break + version_indent

    try:
        tree.write(output_path, encoding='UTF-8', xml_declaration=True)
    except OSError as e:
        print(f"Error writing to output file '{output_path}': {e}")
    else:
        print(f"Successfully created new version '{new_version_id}'.")
        print(f"Modified FOXML file saved to '{output_path}'")
128+
129+
# Command-line entry point: parse the options and hand them straight to
# update_foxml_datastream().
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Add or update a datastream in a FOXML file with Base64 encoded content.',
        # Raw formatter keeps the explicit newlines inside the help strings.
        formatter_class=argparse.RawTextHelpFormatter
    )
    parser.add_argument(
        '-i', '--input-foxml',
        required=True,
        help='Path to the input FOXML file.'
    )
    parser.add_argument(
        '-o', '--output-foxml',
        required=True,
        help='Path to save the modified output FOXML file.'
    )
    parser.add_argument(
        '--dsid',
        required=True,
        help='The ID for the datastream (e.g., "OBJ", "MODS", "FULL_TEXT").'
    )
    parser.add_argument(
        '-f', '--file',
        required=True,
        dest='content_file',
        help='Path to the file to be used as the new datastream content.'
    )
    parser.add_argument(
        '--label',
        default=None,
        help='A human-readable label for the new datastream version. \n (default: "[dsid] datastream")'
    )
    parser.add_argument(
        '--mimetype',
        default=None,
        help='The MIME type of the content file (e.g., "application/pdf").\n (default: auto-detected or "application/octet-stream")'
    )
    parser.add_argument(
        '--control-group',
        default='M',
        choices=['M', 'X', 'R', 'E'],
        # The quoted value must match the accepted choice exactly ("M").
        help="The control group for the datastream. 'M' (Managed) is typical for binary content. \n (default: M)"
    )
    args = parser.parse_args()

    update_foxml_datastream(
        input_path=args.input_foxml,
        output_path=args.output_foxml,
        dsid=args.dsid,
        content_file=args.content_file,
        label=args.label,
        mimetype=args.mimetype,
        control_group=args.control_group
    )
0 commit comments