forked from cernopendata/data-curation
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcreate_file_indexes.py
executable file
·144 lines (118 loc) · 5.54 KB
/
create_file_indexes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
#!/usr/bin/env python3
import json
import os
import sys
import zlib
# Make sure the output directories for the generated file indexes and the
# rewritten records exist before anything is written into them.
for _outdir in ("test/eos-file-indexes", "test/records"):
    os.makedirs(_outdir, exist_ok=True)
def get_file_size(afile):
    """Return the size of the file at path ``afile`` in bytes."""
    file_stat = os.stat(afile)
    return file_stat.st_size
def get_file_checksum(afile):
    """Return the ADLER32 checksum of a file.

    The file at path ``afile`` is read in binary mode, in fixed-size chunks,
    so that large files do not have to fit in memory, and the file handle is
    closed as soon as reading finishes.

    Returns the checksum as a string of exactly eight lowercase hexadecimal
    digits, zero-padded on the left and without a ``0x`` prefix.
    """
    checksum = 1  # ADLER32 starting value, as passed explicitly to zlib
    with open(afile, "rb") as fdesc:
        # iter() with a sentinel stops at the empty read that marks EOF.
        for chunk in iter(lambda: fdesc.read(1024 * 1024), b""):
            checksum = zlib.adler32(chunk, checksum)
    return f"{checksum & 0xFFFFFFFF:08x}"
# Rewrite every fixture file: fix the license and relations of each record,
# turn each referenced file list into EOS file indexes (one JSON, one TXT),
# point the record at those indexes, and print the EOS commands needed to
# upload them.  The rewritten records go to test/records/<fixture name>.
for AFIXTUREFILE in [
    "test/atlas-2024-mc-pp-boson-nominal.json",
    "test/atlas-2024-mc-pp-exotics-nominal.json",
    "test/atlas-2024-mc-pp-higgs-nominal.json",
    "test/atlas-2024-mc-pp-higgs-syst.json",
    "test/atlas-2024-mc-pp-jet-nominal.json",
    "test/atlas-2024-mc-pp-jet-syst.json",
    "test/atlas-2024-mc-pp-susy-nominal.json",
    "test/atlas-2024-mc-pp-top-nominal.json",
    "test/atlas-2024-mc-pp-top-syst.json",
    "test/atlas-2024-pp-2015-data.json",
    "test/atlas-2024-pp-2016-data.json",
    "test/atlas-2024-summary.json",
]:
    # JSON is read and written as UTF-8 explicitly so that the output (and
    # therefore the checksums computed from it) does not depend on the
    # platform's default encoding.
    with open(AFIXTUREFILE, "r", encoding="utf-8") as fdesc:
        records = json.load(fdesc)

    for record in records:
        # first, fix the license information
        record["license"]["attribution"] = "CC0"

        # second, fix relations
        if record["recid"] == "80020":  # only for summary record
            for arelation in record["relations"]:
                if arelation["type"] == "isChildOf":
                    arelation["type"] = "isParentOf"

        # third, fix the file information
        files_new = []
        for afile in record.get("files", []):
            afilename = afile["filename"]
            basename = os.path.basename(afilename).replace("_filelist.json", "")

            with open(f"test/{afilename}", "r", encoding="utf-8") as fdr:
                rootfileinfos = json.load(fdr)

            # Normalise each entry in place and collect the distinct
            # prefixes (the part of "filename" before the first colon).
            prefixes = []
            for rootfileinfo in rootfileinfos:
                rootfileinfo["checksum"] = rootfileinfo["checksum"].replace(
                    "adler32", "adler32:"
                )
                prefix = rootfileinfo["filename"].split(":", 1)[0]
                if prefix not in prefixes:
                    prefixes.append(prefix)
                del rootfileinfo["events"]
                del rootfileinfo["type"]
                rootfileinfo["uri"] = rootfileinfo["uri_root"].replace(
                    ":1094//eos/opendata", "//eos/opendata"
                )
                del rootfileinfo["uri_root"]

            # Exactly one prefix is required: it names both the index files
            # and their EOS target directory.
            if not prefixes:
                print(f"[ERROR] No prefixes found in test/{afilename}")
                sys.exit(1)
            if len(prefixes) > 1:
                print(f"[ERROR] Several prefixes found: {prefixes}")
                sys.exit(1)
            prefix = prefixes[0]

            index_name = f"{prefix}_{basename}_file_index"
            local_json = f"test/eos-file-indexes/{index_name}.json"
            local_txt = f"test/eos-file-indexes/{index_name}.txt"
            eos_dir = f"/eos/opendata/atlas/rucio/{prefix}/file-indexes"

            # TXT index: one URI per line.
            with open(local_txt, "w", encoding="utf-8") as fdw:
                for rootfileinfo in rootfileinfos:
                    fdw.write(rootfileinfo["uri"] + "\n")

            # JSON index: the normalised entries.
            with open(local_json, "w", encoding="utf-8") as fdw:
                new_content = json.dumps(
                    rootfileinfos,
                    indent=2,
                    sort_keys=True,
                    ensure_ascii=False,
                    separators=(",", ": "),
                )
                fdw.write(new_content + "\n")

            # Each index entry carries the checksum and size of its OWN
            # file; the TXT entry must not reuse the values of the JSON one.
            for index_type, local_path in (
                ("index.json", local_json),
                ("index.txt", local_txt),
            ):
                files_new.append(
                    {
                        "checksum": f"adler32:{get_file_checksum(local_path)}",
                        "size": get_file_size(local_path),
                        "type": index_type,
                        "uri": f"root://eospublic.cern.ch/{eos_dir}/{os.path.basename(local_path)}",
                    }
                )

            # print EOS copy command statements
            # Emitted once per generated index, so a record without files
            # prints nothing and a record with several file lists gets
            # commands for each of them.
            print(f"eos mkdir -p {eos_dir}")
            print(f"eos cp eos-file-indexes/{index_name}.json {eos_dir}")
            print(f"eos cp eos-file-indexes/{index_name}.txt {eos_dir}")

        record["files"] = files_new

    new_content = json.dumps(
        records,
        indent=2,
        sort_keys=True,
        ensure_ascii=False,
        separators=(",", ": "),
    )
    with open(
        f"test/records/{os.path.basename(AFIXTUREFILE)}", "w", encoding="utf-8"
    ) as fdesc:
        fdesc.write(new_content + "\n")