forked from cernopendata/data-curation
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcreate_file_indexes.py
executable file
·129 lines (105 loc) · 4.81 KB
/
create_file_indexes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#!/usr/bin/env python3
import json
import os
import sys
import zlib
# Make sure both output directories exist before anything is written into
# them: one receives the generated EOS file indexes, the other the rewritten
# record fixtures.
for _output_dir in ("test/eos-file-indexes", "test/records"):
    os.makedirs(_output_dir, exist_ok=True)
def get_file_size(afile):
    """Return the size of a file in bytes.

    Args:
        afile: Path of the file to inspect.

    Returns:
        The file size in bytes, as reported by the file system.
    """
    file_stat = os.stat(afile)
    return file_stat.st_size
def get_file_checksum(afile):
    """Return the ADLER32 checksum of a file.

    The file is opened in binary mode and read in fixed-size chunks, so the
    handle is always closed and large files are not loaded into memory at once.

    Args:
        afile: Path of the file to checksum.

    Returns:
        The checksum as eight lowercase, zero-padded hexadecimal digits
        without the ``0x`` prefix (e.g. ``"11e60398"``).
    """
    chunk_size = 1024 * 1024
    # 1 is the ADLER32 start value; feeding each chunk's result into the next
    # call yields the same checksum as hashing the whole content in one go.
    checksum = 1
    with open(afile, "rb") as fdesc:
        for chunk in iter(lambda: fdesc.read(chunk_size), b""):
            checksum = zlib.adler32(chunk, checksum)
    # Mask to an unsigned 32-bit value before formatting.
    return "{:08x}".format(checksum & 0xFFFFFFFF)
# Rewrite every fixture file: for each record, replace the listed per-dataset
# file lists with two generated EOS file indexes (JSON and TXT), print the
# `eos` commands needed to upload them, and store the updated records under
# test/records/.
for AFIXTUREFILE in [
    "test/atlas-hi-2024-hi-2015-data.json",
    "test/atlas-hi-2024-mc-hi-minbias.json",
    "test/atlas-hi-2024-summary.json",
]:
    with open(AFIXTUREFILE, "r", encoding="utf-8") as fdesc:
        records = json.load(fdesc)

    for record in records:
        # first, fix the license information
        record["license"]["attribution"] = "CC0"

        # second, fix the file information
        files_new = []
        for afile in record.get("files", []):
            afilename = afile["filename"]
            basename = os.path.basename(afilename).replace("_filelist.json", "")

            with open(f"test/{afilename}", "r", encoding="utf-8") as fdr:
                rootfileinfos = json.load(fdr)

            # Normalise every entry of the file list in place and collect the
            # distinct prefixes (the part of "filename" before the first ":").
            prefixes = []
            for rootfileinfo in rootfileinfos:
                rootfileinfo["checksum"] = rootfileinfo["checksum"].replace(
                    "adler32", "adler32:"
                )
                prefix = rootfileinfo["filename"].split(":", 1)[0]
                if prefix not in prefixes:
                    prefixes.append(prefix)
                del rootfileinfo["events"]
                del rootfileinfo["type"]
                # "uri_root" is replaced by "uri" with the port number dropped.
                rootfileinfo["uri"] = rootfileinfo.pop("uri_root").replace(
                    ":1094//eos/opendata", "//eos/opendata"
                )

            # The index file names and the EOS target directory are derived
            # from a single prefix, so exactly one must be present.
            if len(prefixes) > 1:
                print(f"[ERROR] Several prefixes found: {prefixes}")
                sys.exit(1)
            if not prefixes:
                print(f"[ERROR] No prefix found: test/{afilename} lists no files")
                sys.exit(1)
            prefix = prefixes[0]

            index_stem = f"{prefix}_{basename}_file_index"
            local_index_txt = f"test/eos-file-indexes/{index_stem}.txt"
            local_index_json = f"test/eos-file-indexes/{index_stem}.json"
            eos_dir = f"/eos/opendata/atlas/rucio/{prefix}/file-indexes"

            # TXT index: one URI per line.
            with open(local_index_txt, "w", encoding="utf-8") as fdw:
                for rootfileinfo in rootfileinfos:
                    fdw.write(rootfileinfo["uri"] + "\n")

            # JSON index: the normalised file list itself.
            with open(local_index_json, "w", encoding="utf-8") as fdw:
                new_content = json.dumps(
                    rootfileinfos,
                    indent=2,
                    sort_keys=True,
                    ensure_ascii=False,
                    separators=(",", ": "),
                )
                fdw.write(new_content + "\n")

            # Describe both index files in the record; checksum and size are
            # taken from the very file each entry refers to.
            for index_type, extension, local_index in (
                ("index.json", "json", local_index_json),
                ("index.txt", "txt", local_index_txt),
            ):
                files_new.append(
                    {
                        "checksum": f"adler32:{get_file_checksum(local_index)}",
                        "size": get_file_size(local_index),
                        "type": index_type,
                        "uri": f"root://eospublic.cern.ch/{eos_dir}/{index_stem}.{extension}",
                    }
                )

            # print EOS copy command statements
            print(f"eos mkdir -p {eos_dir}")
            print(f"eos cp eos-file-indexes/{index_stem}.json {eos_dir}")
            print(f"eos cp eos-file-indexes/{index_stem}.txt {eos_dir}")

        # Records that list no files are left as they are.
        if files_new:
            record["files"] = files_new

    new_content = json.dumps(
        records,
        indent=2,
        sort_keys=True,
        ensure_ascii=False,
        separators=(",", ": "),
    )
    with open(
        f"test/records/{os.path.basename(AFIXTUREFILE)}", "w", encoding="utf-8"
    ) as fdesc:
        fdesc.write(new_content + "\n")