#!/usr/bin/env python3
# Build the per-file metadata (checksum, size, event count, EOS URI) for a
# list of ATLAS open data datasets, using rucio, and write the result to JSON.
import datetime
import json

from rucio.client.client import Client
# Grab the list of datasets that we want to run over
dataset_input = 'pp_data_p6026_GRL.txt'
# Set a postfix for the output file name, so that we can nicely version things
static_did_post = '_OpenData_v0_p6026_' + datetime.date.today().isoformat()
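# e.g. '_OpenData_v0_p6026_2025-01-31' (the date shown is illustrative; it is
# whatever day the script runs)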
# Dictionary mapping datasets to lists of file names
datasets = {}
# Dictionary mapping datasets --> dictionaries of file names, where each
# file name maps to its properties (checksum, events, uri, type, size)
file_locations = {}
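# An entry ends up looking like (values illustrative, not real data):
#   file_locations['scope:dataset']['scope:file.root'] = {
#       'checksum': 'adler32:0a1b2c3d', 'size': 1234567,
#       'events': 1000, 'type': 'DAOD_PHYSLITE', 'uri': 'root://...'}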
# Let's go over the list of files...
with open(dataset_input, 'r') as dataset_list_file:
    for bline in dataset_list_file:
        # Make sure we ignore comments - in case folks are commenting out datasets
        aline = bline.split('#')[0].strip()
        if len(aline) < 2:
            continue
        # Initialize our dataset lists and file location lists
        datasets[aline] = []
        file_locations[aline] = {}
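# Each non-comment line of the input file is expected to be a DID in
# 'scope:name' form, e.g. 'data22_13p6TeV:data22_13p6TeV.SomeDataset.DAOD_PHYSLITE'
# (example DID is illustrative)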
print(f'Read in {len(datasets)} datasets')
# Get our rucio client ready
rc = Client()
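# Client() picks up its server and account settings from the local rucio
# configuration (rucio.cfg / RUCIO_* environment variables), so no arguments
# are needed here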
# Loop over all the datasets
for dataset_number, dataset in enumerate(datasets):
    # Let the people know how we're doing
    if (dataset_number+1) % 10 == 0:
        print(f'Working on dataset {dataset_number+1} of {len(datasets)}: {dataset}')
    # Get the scope (DIDs are formatted as scope:name)
    my_scope = dataset.split(':')[0]
    # Grab the list of files from rucio - for data, we are always going to take _all_ the events
    fl = rc.list_files(scope=my_scope, name=dataset.split(':')[1])
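    # Each entry yielded by list_files is a dict of rucio file metadata;
    # the fields used below are 'name', 'adler32', 'bytes', and 'events'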
    # Note that we're stashing the full file list so we can check if we got all the files later
    for a in fl:
        # Update the map of datasets : files
        datasets[dataset] += [a['name']]
        # Get the first part of the per-file metadata; checksums follow the
        # CERN Open Data '<algorithm>:<digest>' convention
        file_locations[dataset][my_scope+':'+a['name']] = {'checksum': 'adler32:'+a['adler32'],
                                                           'size': a['bytes'],
                                                           'events': a['events'],
                                                           'type': 'DAOD_PHYSLITE'}
    # Second rucio query, needed to get the file location on eos
    replicalist = rc.list_replicas([{'scope': my_scope, 'name': dataset.split(':')[1]}])
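    # list_replicas yields one dict per file; its 'pfns' field is a dict keyed
    # by the physical file name (a URL), so iterating it below walks the PFNs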
    # Go through all the results (all the files in the dataset again)
    for areplica in replicalist:
        # Make sure we found that file before - just error checking, this should never be printed
        if areplica['scope']+':'+areplica['name'] not in file_locations[dataset]:
            print(f'Warning: did not find {areplica["scope"]} {areplica["name"]} in file_locations for {dataset}')
            continue
        # Go through the physical locations and get the one at the open data endpoint
        for a_pfn in areplica['pfns']:
            if 'opendata/atlas' in a_pfn:
                file_locations[dataset][my_scope+':'+areplica['name']]['uri'] = a_pfn
                break
        else:
            # We didn't find one on the open data endpoint
            print(f'Did not find {dataset} file {my_scope+":"+areplica["name"]} on eos in pfns {areplica["pfns"]}')
# Record the file mapping that we established
with open('data_file_mapping'+static_did_post+'.json', 'w') as file_backup:
    json.dump(obj={'file_dictionary': datasets, 'file_locations': file_locations}, fp=file_backup)
# All done!
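# To run, you need a rucio client environment and a valid grid proxy (on ATLAS
# resources, e.g. 'setupATLAS && lsetup rucio' then 'voms-proxy-init -voms atlas';
# the exact setup commands are site-dependent)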