29 | 29 | import datetime |
30 | 30 | import itertools |
31 | 31 | import logging |
32 | | -import openpyxl |
33 | 32 | import os |
34 | 33 | import shutil |
35 | | -import stat |
36 | 34 | import subprocess |
37 | 35 | import sys |
38 | 36 | import time |
@@ -258,174 +256,6 @@ def create_spreadsheet(args, sips, volumes, logger): |
258 | 256 |     logger.info("Description CSV created.") |
259 | 257 |
|
260 | 258 |
|
261 | | -def create_aspace_excel_sheet(args, sips, volumes, logger): |
262 | | -    """Create new copy of ASpace XLSX and append rows describing disk images.""" |
263 | | -    xlsx_path = os.path.abspath(os.path.join(args.destination, "description.xlsx")) |
264 | | -    template_path = os.path.abspath( |
265 | | -        os.path.join(THIS_DIR, "aspace_template", "aspace_import_template.xlsx") |
266 | | -    ) |
267 | | - |
268 | | -    try: |
269 | | -        shutil.copyfile(template_path, xlsx_path) |
270 | | -    except OSError as err: |
271 | | -        logger.error(f"Unable to copy ASpace template to destination: {err}") |
272 | | - |
273 | | -    # Set ASpace file permissions |
274 | | -    try: |
275 | | -        os.chmod( |
276 | | -            xlsx_path, |
277 | | -            stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IWGRP | stat.S_IROTH, |
278 | | -        ) |
279 | | -    except OSError as err: |
280 | | -        logger.error(f"Error setting permissions: {err}") |
281 | | - |
282 | | -    workbook = openpyxl.load_workbook(filename=xlsx_path) |
283 | | -    worksheet = workbook["Data"] |
284 | | - |
285 | | -    # TODO: Deduplicate with create_spreadsheet |
286 | | -    # Maybe create a separate method that builds a dict of info, and handle |
287 | | -    # opening/writing the CSV or XLSX separately (see the sketch after this function) |
288 | | -    for item in sorted(os.listdir(sips)): |
289 | | -        sip_path = os.path.join(sips, item) |
290 | | - |
291 | | -        if not os.path.isdir(sip_path): |
292 | | -            continue |
293 | | - |
294 | | -        disk_volumes = volumes[item] |
295 | | -        number_volumes = len(disk_volumes) |
296 | | - |
297 | | -        date_earliest = "" |
298 | | -        date_latest = "" |
299 | | - |
300 | | -        # Get and sum information from all DFXML files generated |
301 | | -        dfxml_files = [] |
302 | | -        subdoc_dir = os.path.join(sip_path, "metadata", "submissionDocumentation") |
303 | | -        if args.bagfiles: |
304 | | -            subdoc_dir = os.path.join( |
305 | | -                sip_path, "data", "metadata", "submissionDocumentation" |
306 | | -            ) |
307 | | -        for root, _, files in os.walk(subdoc_dir): |
308 | | -            for file in files: |
309 | | -                if file.startswith("dfxml"): |
310 | | -                    dfxml_files.append(os.path.join(root, file)) |
311 | | - |
312 | | -        dfxml_files_info = [] |
313 | | -        for dfxml_file in dfxml_files: |
314 | | -            dfxml_info = _parse_dfxml(dfxml_file, logger) |
315 | | -            if not dfxml_info: |
316 | | -                logger.warning( |
317 | | -                    "No fileobjects in DFXML file {} - possibly a file system fiwalk doesn't recognize".format( |
318 | | -                        dfxml_file |
319 | | -                    ) |
320 | | -                ) |
321 | | -                continue |
322 | | -            dfxml_files_info.append(dfxml_info) |
323 | | - |
324 | | -        file_count = sum([dfxml_info["files"] for dfxml_info in dfxml_files_info]) |
325 | | -        total_bytes = sum([dfxml_info["bytes"] for dfxml_info in dfxml_files_info]) |
326 | | -        file_systems = [volume["file_system"] for volume in disk_volumes] |
327 | | -        # Deduplicate list |
328 | | -        file_systems = list(dict.fromkeys(file_systems)) |
329 | | -        file_systems_str = ", ".join(file_systems) |
330 | | - |
331 | | -        for dfxml_info in dfxml_files_info: |
332 | | -            if not date_earliest or dfxml_info["date_earliest"] < date_earliest: |
333 | | -                date_earliest = dfxml_info["date_earliest"] |
334 | | -            if not date_latest or dfxml_info["date_latest"] > date_latest: |
335 | | -                date_latest = dfxml_info["date_latest"] |
336 | | - |
337 | | -        # Create list with empty string for each of template's columns |
338 | | -        row_to_write = [] |
339 | | -        for _ in range(173): |
340 | | -            row_to_write.append("") |
341 | | - |
342 | | -        # Column indices for fields to write |
343 | | -        INDEX_FILENAME = 6 |
344 | | -        INDEX_LEVEL_OF_DESCRIPTION = 8 |
345 | | -        INDEX_DATE_START = 23 |
346 | | -        INDEX_DATE_END = 24 |
347 | | -        INDEX_EXTENT_NUMBER = 34 |
348 | | -        INDEX_EXTENT_TYPE = 35 |
349 | | -        INDEX_SIZE = 36 |
350 | | -        INDEX_SCOPE_CONTENTS = 170 |
351 | | - |
352 | | -        # Fields that are always constant |
353 | | -        row_to_write[INDEX_FILENAME] = item |
354 | | -        row_to_write[INDEX_LEVEL_OF_DESCRIPTION] = "File" |
355 | | - |
356 | | -        if file_count == 0: |
357 | | -            row_to_write[ |
358 | | -                INDEX_SCOPE_CONTENTS |
359 | | -            ] = "Error gathering statistics from SIP directory" |
360 | | - |
361 | | -            worksheet.append(row_to_write) |
362 | | - |
363 | | -            logger.error("Unable to read DFXML files for {}".format(sip_path)) |
364 | | -            continue |
365 | | - |
366 | | -        # Get file formats from Brunnhilde |
367 | | -        file_formats = [] |
368 | | -        file_format_csv = os.path.join( |
369 | | -            sip_path, |
370 | | -            "metadata", |
371 | | -            "submissionDocumentation", |
372 | | -            "brunnhilde", |
373 | | -            "csv_reports", |
374 | | -            "formats.csv", |
375 | | -        ) |
376 | | -        if args.bagfiles: |
377 | | -            file_format_csv = os.path.join( |
378 | | -                sip_path, |
379 | | -                "data", |
380 | | -                "metadata", |
381 | | -                "submissionDocumentation", |
382 | | -                "brunnhilde", |
383 | | -                "csv_reports", |
384 | | -                "formats.csv", |
385 | | -            ) |
386 | | - |
387 | | -        try: |
388 | | -            with open(file_format_csv, "r") as f: |
389 | | -                reader = csv.reader(f) |
390 | | -                next(reader) |
391 | | -                for row in itertools.islice(reader, 5): |
392 | | -                    file_formats.append(row[0]) |
393 | | -        except Exception: |
394 | | -            file_formats.append( |
395 | | -                "ERROR! No Brunnhilde formats.csv file to pull formats from." |
396 | | -            ) |
397 | | - |
398 | | -        file_formats = [element or "Unidentified" for element in file_formats] |
399 | | -        file_formats_str = ", ".join(file_formats) |
400 | | - |
401 | | -        if number_volumes > 1: |
402 | | -            scope_content = "Files exported from {} volumes with file systems: {}. File formats: {}".format( |
403 | | -                number_volumes, file_systems_str, file_formats_str |
404 | | -            ) |
405 | | -        else: |
406 | | -            scope_content = ( |
407 | | -                "Files exported from {} file system volume. File formats: {}".format( |
408 | | -                    disk_volumes[0]["file_system"], file_formats_str |
409 | | -                ) |
410 | | -            ) |
411 | | - |
412 | | -        row_to_write[INDEX_DATE_START] = str(date_earliest[:4]) |
413 | | -        row_to_write[INDEX_DATE_END] = str(date_latest[:4]) |
414 | | -        row_to_write[INDEX_EXTENT_NUMBER] = str(file_count) |
415 | | -        row_to_write[INDEX_EXTENT_TYPE] = "digital files" |
416 | | -        row_to_write[INDEX_SIZE] = str(human_readable_size(total_bytes)) |
417 | | -        row_to_write[INDEX_SCOPE_CONTENTS] = scope_content |
418 | | - |
419 | | -        worksheet.append(row_to_write) |
420 | | - |
421 | | -        logger.info("Described %s successfully." % (sip_path)) |
422 | | - |
423 | | -    workbook.save(filename=xlsx_path) |
424 | | -    workbook.close() |
425 | | - |
426 | | -    logger.info("ArchivesSpace description XLSX created.") |
427 | | - |
428 | | - |
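
The TODO in the function above points at the refactor that would deduplicate this code with create_spreadsheet: collect each SIP's statistics once into a dict, and let the CSV and XLSX writers differ only in how they serialize it. A minimal sketch of that idea, reusing the keys _parse_dfxml already returns; the helper name _describe_sip and the returned keys are illustrative, not part of this patch:

def _describe_sip(disk_volumes, dfxml_files_info):
    """Gather per-SIP statistics shared by the CSV and XLSX writers (sketch)."""
    date_earliest = ""
    date_latest = ""
    # Same min/max scan as above; an empty string means "not yet set"
    for info in dfxml_files_info:
        if not date_earliest or info["date_earliest"] < date_earliest:
            date_earliest = info["date_earliest"]
        if not date_latest or info["date_latest"] > date_latest:
            date_latest = info["date_latest"]
    return {
        "files": sum(info["files"] for info in dfxml_files_info),
        "bytes": sum(info["bytes"] for info in dfxml_files_info),
        # dict.fromkeys deduplicates while preserving order
        "file_systems": list(dict.fromkeys(v["file_system"] for v in disk_volumes)),
        "date_earliest": date_earliest,
        "date_latest": date_latest,
    }

Each writer would then call _describe_sip once per SIP and map the dict onto its own columns, keeping the walk over os.listdir(sips) in one place.
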
429 | 259 | def _parse_dfxml(dfxml_path, logger, export_all=False): |
430 | 260 |     """Parse DFXML and return dict of information for spreadsheet.""" |
431 | 261 |     volume_info = { |
@@ -593,12 +423,6 @@ def _make_parser(): |
593 | 423 |         help="Export AppleDouble resource forks from HFS-formatted disks", |
594 | 424 |         action="store_true", |
595 | 425 |     ) |
596 | | -    parser.add_argument( |
597 | | -        "-c", |
598 | | -        "--csv", |
599 | | -        help="Write description CSV (old default) instead of ArchivesSpace XLSX", |
600 | | -        action="store_true", |
601 | | -    ) |
602 | 426 |     parser.add_argument("--quiet", action="store_true", help="Write only errors to log") |
603 | 427 |     parser.add_argument( |
604 | 428 |         "source", help="Source directory containing disk images (and related files)" |
@@ -740,16 +564,10 @@ def main(): |
740 | 564 |     ) |
741 | 565 |
|
742 | 566 |     # write description |
743 | | -    if args.csv: |
744 | | -        try: |
745 | | -            create_spreadsheet(args, sips, volumes, logger) |
746 | | -        except Exception as err: |
747 | | -            logger.error(f"Error creating description csv: {err}") |
748 | | -    else: |
749 | | -        try: |
750 | | -            create_aspace_excel_sheet(args, sips, volumes, logger) |
751 | | -        except Exception as err: |
752 | | -            logger.error(f"Error creating ArchivesSpace description xlsx: {err}") |
| 567 | +    try: |
| 568 | +        create_spreadsheet(args, sips, volumes, logger) |
| 569 | +    except Exception as err: |
| 570 | +        logger.error(f"Error creating description csv: {err}") |
753 | 571 |
|
754 | 572 |     # print unprocessed list |
755 | 573 |     if unprocessed: |
|
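The removed create_aspace_excel_sheet called human_readable_size(total_bytes), a helper defined elsewhere in this script that survives the patch. For orientation only, a sketch of the usual shape of such a helper — the 1024-based units and formatting are assumptions, not this module's actual code:

def human_readable_size(size):
    """Render a byte count as a human-readable string (illustrative sketch)."""
    for unit in ["bytes", "KiB", "MiB", "GiB", "TiB"]:
        if size < 1024 or unit == "TiB":
            # Whole bytes need no decimals; larger units get two
            return f"{size} {unit}" if unit == "bytes" else f"{size:.2f} {unit}"
        size /= 1024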