
Commit b4df3cc

adds a google storage blobstore

1 parent 6505241

File tree: 4 files changed, +111 −4 lines

provenance/_config.py (+13)

@@ -51,6 +51,19 @@ def __init__(self, *args, **kargs):
     BLOBSTORE_TYPES['sftp'] = SFTPStore


+try:
+    import provenance.google_storage as gs
+    BLOBSTORE_TYPES['gs'] = gs.GSStore
+
+except ImportError as e:
+    class GSStore(object):
+        _err = e
+        def __init__(self, *args, **kargs):
+            raise self._err
+
+    BLOBSTORE_TYPES['gs'] = GSStore
+
+
 blobstore_from_config = atomic_item_from_config(type_dict=BLOBSTORE_TYPES,
                                                 item_plural='Blobstores')
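
The try/except here registers a deferred-failure stub: importing provenance still succeeds when google-cloud is not installed, and the original ImportError is re-raised only when someone actually configures a 'gs' blobstore. A minimal sketch of the same pattern in isolation (REGISTRY, FancyStore, and fancy_lib are hypothetical names, not part of this commit):

    REGISTRY = {}

    try:
        import fancy_lib                      # optional dependency
        REGISTRY['fancy'] = fancy_lib.FancyStore
    except ImportError as e:
        class FancyStore(object):
            _err = e                          # bound while the except block is live

            def __init__(self, *args, **kargs):
                raise self._err               # surfaces only on instantiation

        REGISTRY['fancy'] = FancyStore

Stashing the exception in a class attribute matters: in Python 3 the name `e` is cleared when the except block exits, so the stub must capture a reference while it is still bound.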

provenance/blobstores.py (+4 −4)

@@ -223,19 +223,19 @@ class S3Store(RemoteStore):
     def __init__(self, cachedir, basepath, s3_config=None, s3fs=None,
                  read=True, write=True, read_through_write=True,
                  delete=False, on_duplicate_key='skip', cleanup_cachedir=False,
-                 always_check_s3=False):
+                 always_check_remote=False):
         """
         Parameters
         ----------
-        always_check_s3 : bool
+        always_check_remote : bool
            When True, S3 is checked on every __contains__ call; otherwise the
            call short-circuits if the blob is found in the cachedir. For
            performance reasons this should normally be False. The only reason
            to enable it is when an S3Store and a DiskStore are used together
            in a ChainedStore; since the S3Store already doubles as a DiskStore
            via its cachedir, chaining the two rarely makes sense anyway.
         """
-        super(S3Store, self).__init__(always_check_remote=always_check_s3,
+        super(S3Store, self).__init__(always_check_remote=always_check_remote,
                                       cachedir=cachedir,
                                       basepath=basepath,
                                       cleanup_cachedir=cleanup_cachedir,

@@ -262,7 +262,7 @@ def _download_file(self, remote_path, dest_filename):
         self.s3fs.get(remote_path, dest_filename)


-
+
 class ChainedStore(BaseBlobStore):
     def __init__(self, stores, read=True, write=True, read_through_write=True,
                  delete=True, on_duplicate_key='skip'):
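
The rename makes the S3Store constructor match the always_check_remote keyword that the RemoteStore base class already accepts, so the flag no longer needs translating at the super() call. A hypothetical construction under the new signature (cachedir and basepath are placeholders):

    store = S3Store(cachedir='/tmp/blob-cache',          # local cache directory
                    basepath='my-bucket/blobs',          # placeholder S3 location
                    always_check_remote=False)           # was: always_check_s3

Existing callers that passed always_check_s3 need the same one-word update.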

provenance/google_storage.py (new file, +93)

import os
import shutil

from boltons import funcutils as bfu
from google.cloud import storage as gs
from joblib.disk import mkdirp
from memoized_property import memoized_property

from . import blobstores as bs


# TODO: catch and retry w/ a new client on
#   BrokenPipeError: [Errno 32] Broken pipe
#   ConnectionResetError: [Errno 54] Connection reset by peer
#   more?

def retry(f, max_attempts=2):

    @bfu.wraps(f)
    def with_retry(store, *args, **kargs):
        actual_attempts = 0
        while True:
            try:
                return f(store, *args, **kargs)
            except (BrokenPipeError, ConnectionError) as e:
                actual_attempts += 1
                if actual_attempts >= max_attempts:
                    raise e
                else:
                    # rebuild the client/bucket and try again
                    store._setup_client()
    return with_retry


class GSStore(bs.RemoteStore):
    def __init__(self, cachedir, bucket, basepath='', project=None,
                 read=True, write=True, read_through_write=True,
                 delete=False, on_duplicate_key='skip', cleanup_cachedir=False,
                 always_check_remote=False):
        """
        Parameters
        ----------
        always_check_remote : bool
            When True, GS (Google Storage) is checked on every __contains__
            call; otherwise the call short-circuits if the blob is found in
            the cachedir. For performance reasons this should normally be
            False. The only reason to enable it is when a GSStore and a
            DiskStore are used together in a ChainedStore; since the GSStore
            already doubles as a DiskStore via its cachedir, chaining the two
            rarely makes sense anyway.
        """
        super(GSStore, self).__init__(always_check_remote=always_check_remote,
                                      cachedir=cachedir,
                                      basepath=basepath,
                                      cleanup_cachedir=cleanup_cachedir,
                                      read=read, write=write,
                                      read_through_write=read_through_write,
                                      delete=delete,
                                      on_duplicate_key=on_duplicate_key)

        self.bucket_name = bucket
        self.project = project

    def _setup_client(self):
        # drop the memoized client and bucket, then force re-memoization
        del self._client
        del self._bucket
        assert self.bucket is not None

    @memoized_property
    def client(self):
        return gs.Client(project=self.project)

    @memoized_property
    def bucket(self):
        return self.client.get_bucket(self.bucket_name)

    @retry
    def _exists(self, path):
        blobs = list(self.bucket.list_blobs(prefix=path))
        return len(blobs) == 1

    @retry
    def _delete_remote(self, path):
        self._blob(path).delete()

    def _blob(self, path):
        return self.bucket.blob(path)

    @retry
    def _upload_file(self, filename, path):
        self._blob(path).upload_from_filename(filename)

    @retry
    def _download_file(self, remote_path, dest_filename):
        self._blob(remote_path).download_to_filename(dest_filename)
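
Wiring it together: the @retry wrapper catches dropped connections, calls _setup_client() to delete the memoized _client/_bucket attributes (memoized_property caches under a leading underscore), and re-invokes the method with a fresh client. A hypothetical usage sketch (bucket, project, and paths are placeholders; configuring via BLOBSTORE_TYPES['gs'] works the same way):

    from provenance.google_storage import GSStore

    store = GSStore(cachedir='/tmp/gs-cache',            # local cache directory
                    bucket='my-provenance-blobs',        # placeholder bucket name
                    project='my-gcp-project',            # placeholder GCP project
                    always_check_remote=False)

    # The first access of store.bucket builds and memoizes the gs.Client;
    # a BrokenPipeError/ConnectionError inside a @retry method tears both
    # down and rebuilds them on the next attempt.

Note that _exists treats a prefix listing with exactly one match as a hit, so it implicitly assumes no blob path is a prefix of another.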

setup.py (+1)

@@ -7,6 +7,7 @@

 subpackages = {
     'sftp': ['paramiko'],
+    'google_storage': ['google-cloud'],
     'vis': ['graphviz', 'frozendict']
 }
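
Assuming the subpackages dict feeds setuptools' extras_require (as the existing 'sftp' and 'vis' entries suggest), the optional dependency would be installed via an extra:

    pip install provenance[google_storage]

Without the extra, the stub registered in _config.py keeps the 'gs' entry present but unusable until google-cloud is installed.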
