 from settings import config, CACHE, BLOCK_SIZE, DB_FILENAME


-def extract():
+def extract(keep_files=True):
+    """
+    Given an HPSS path in the zstash database or passed via the command line,
+    extract the archived data based on the file pattern (if given).
+    """
     parser = argparse.ArgumentParser(
         usage='zstash extract [<args>] [files]',
         description='Extract files from existing archive')
-    required = parser.add_argument_group('required named arguments')
     optional = parser.add_argument_group('optional named arguments')
     optional.add_argument('--hpss', type=str, help='path to HPSS storage')
     parser.add_argument('files', nargs='*', default=['*'])
@@ -50,7 +53,8 @@ def extract():
         config.hpss = args.hpss

     # Start doing actual work
-    logging.debug('Running zstash extract')
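+    # keep_files=False means the archive is only being verified ("zstash check")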
+    cmd = 'extract' if keep_files else 'check'
+    logging.debug('Running zstash ' + cmd)
     logging.debug('Local path : %s' % (config.path))
     logging.debug('HPSS path : %s' % (config.hpss))
     logging.debug('Max size : %i' % (config.maxsize))
@@ -59,7 +63,7 @@ def extract():
     # Find matching files
     matches = []
     for file in args.files:
-        cur.execute(u"select * from files where name GLOB ?", (file,))
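+        # Match against either the file name or the name of its tar archive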
+        cur.execute(u"select * from files where name GLOB ? or tar GLOB ?", (file, file))
         matches = matches + cur.fetchall()

     # Remove duplicates
@@ -69,15 +73,32 @@ def extract():
     matches = sorted(matches, key=lambda x: (x[5], x[6]))

     # Retrieve from tapes
-    extractFiles(matches)
+    failures = extractFiles(matches, keep_files)

     # Close database
     logging.debug('Closing index database')
     con.close()

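+    # Summarize anything that could not be extracted or verified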
+    if failures:
+        logging.error('Encountered an error for files:')
+        for fail in failures:
+            logging.error('{} in {}'.format(fail[1], fail[5]))

-def extractFiles(files):
+        broken_tars = sorted(set(f[5] for f in failures))
+        logging.error('The following tar archives had errors:')
+        for tar in broken_tars:
+            logging.error(tar)

+
+def extractFiles(files, keep_files):
+    """
+    Given a list of database rows, extract the files from the
+    tar archives to the current location on disk.
+
+    If keep_files is False, the files are not written to disk.
+    This is used when checking whether the files in an HPSS
+    repository are valid.
+    """
     failures = []
     tfname = None
     newtar = True
@@ -97,7 +118,8 @@ def extractFiles(files):
             tar = tarfile.open(tfname, "r")

         # Extract file
-        logging.info('Extracting %s' % (file[1]))
+        cmd = 'Extracting' if keep_files else 'Checking'
+        logging.info(cmd + ' %s' % (file[1]))
         try:

             # Seek file position
@@ -108,39 +130,48 @@ def extractFiles(files):

             if tarinfo.isfile():
                 # fileobj to extract
-                fin = tar.extractfile(tarinfo)
-                fname = tarinfo.name
-                path, name = os.path.split(fname)
-                if path != '':
-                    if not os.path.isdir(path):
-                        os.makedirs(path)
-                fout = open(fname, 'w')
-                hash_md5 = hashlib.md5()
-                while True:
-                    s = fin.read(BLOCK_SIZE)
-                    if len(s) > 0:
-                        fout.write(s)
-                        hash_md5.update(s)
-                    if len(s) < BLOCK_SIZE:
-                        break
-                fin.close()
-                fout.close()
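+                # Use try/finally so the file handles are closed even if extraction fails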
+                try:
+                    fin = tar.extractfile(tarinfo)
+                    fname = tarinfo.name
+                    path, name = os.path.split(fname)
+                    if path != '':
+                        if not os.path.isdir(path):
+                            os.makedirs(path)
+                    if keep_files:
+                        fout = open(fname, 'w')
+
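+                    # Stream the member in BLOCK_SIZE chunks, updating the checksum;
+                    # the data is written to disk only when keep_files is True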
+                    hash_md5 = hashlib.md5()
+                    while True:
+                        s = fin.read(BLOCK_SIZE)
+                        if len(s) > 0:
+                            hash_md5.update(s)
+                            if keep_files:
+                                fout.write(s)
+                        if len(s) < BLOCK_SIZE:
+                            break
+                finally:
+                    fin.close()
+                    if keep_files:
+                        fout.close()
+
                 md5 = hash_md5.hexdigest()
-                tar.chown(tarinfo, fname)
-                tar.chmod(tarinfo, fname)
-                tar.utime(tarinfo, fname)
-                # Verify size
-                if os.path.getsize(fname) != file[2]:
-                    logging.error('size mismatch for: %s' % (fname))
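+                # Only restore ownership/permissions/timestamps and check the
+                # on-disk size when the file was actually written out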
+                if keep_files:
+                    tar.chown(tarinfo, fname)
+                    tar.chmod(tarinfo, fname)
+                    tar.utime(tarinfo, fname)
+                    # Verify size
+                    if os.path.getsize(fname) != file[2]:
+                        logging.error('size mismatch for: %s' % (fname))
                 # Verify md5 checksum
                 if md5 != file[4]:
                     logging.error('md5 mismatch for: %s' % (fname))
                     logging.error('md5 of extracted file: %s' % (md5))
                     logging.error('md5 of original file: %s' % (file[4]))
+                    failures.append(file)
                 else:
                     logging.debug('Valid md5: %s %s' % (md5, fname))

-            else:
+            elif keep_files:
                 tar.extract(tarinfo)
                 # Note: tar.extract() will not restore time stamps of symbolic
                 # links. Could not find a Python-way to restore it either, so
@@ -155,6 +186,7 @@ def extractFiles(files):
         except:
             traceback.print_exc()
             logging.error('Retrieving %s' % (file[1]))
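+            # Record the file so it is included in the summary of failures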
+            failures.append(file)

         # Close current archive?
         if (i == nfiles - 1 or files[i][5] != files[i + 1][5]):
@@ -165,3 +197,5 @@ def extractFiles(files):

             # Open new archive next time
             newtar = True
+
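+    # Let the caller report which database rows could not be extracted or verified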
+    return failures