Skip to content

Commit 018e4dd

Browse files
committed
Move remote catalogue from CSV to gzipped SQLite:
* This will speed up s3sup status and push commands. * Migration is automatic and requires no intervention. * One caveat is that after running an s3sup push with this version or newer, it will no longer be possible to use _older_ versions of s3sup (<= 0.3.0) with the same project.
1 parent c804821 commit 018e4dd

12 files changed

Lines changed: 289 additions & 64 deletions

File tree

CHANGELOG.md

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,16 @@
11
# Changelog
2-
All notable changes to this project will be documented in this file.
3-
4-
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
2+
All notable changes to s3sup are documented in this file with each release
3+
version. The changelog format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
54
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
65

6+
## [0.4.0] - 2019-05-03
7+
### Changed
8+
- Remote catalogue is now a gzipped SQLite database rather than an uncompressed
9+
CSV file. This will speed up s3sup status and push commands. Migration is
10+
automatic and requires no intervention. One caveat is that after running an
11+
`s3sup push` with this version or newer, it will no longer be possible to
12+
use _older_ versions of s3sup (<= 0.3.0) with the same project.
13+
714
## [0.3.0] - 2019-04-09
815
### Added
916
- Prevent files being deleted on S3 even when they are removed locally.
@@ -22,12 +29,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
2229

2330
### Fixed
2431
- Make S3 attribute changes only once, previously they were being repeated
25-
twice, unnecessarily due to double listing in catalogue.change_list.
32+
twice, unnecessarily due to double listing in catalogue.change\_list.
33+
2634

2735
## [0.2.2] - 2019-03-19
2836
### Fixed
2937
- Handle `s3_project_root` being either not set or set as empty string.
3038

39+
3140
## [0.2.1] - 2019-03-11
3241
### Added
3342
- New command: `s3sup inspect` for reviewing attributes calculated from the
@@ -37,6 +46,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
3746
- Minor performance improvements from memoization.
3847
- Display file size along with uploads.
3948

49+
4050
## [0.1.1] - 2019-03-02
4151
### Added
4252
- Initial release

s3sup/catalogue.py

Lines changed: 58 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,9 @@
11
import csv
2+
import contextlib
3+
import gzip
4+
import sqlite3
5+
import shutil
6+
import tempfile
27
import enum
38
import collections
49
import copy
@@ -30,6 +35,31 @@ class ChangeReason(enum.Enum):
3035
ChangeReason['NO_CHANGE']: CR_STYLE('green', '^', 'unchanged', 'unchanged')
3136
}
3237

38+
# Highest catalogue schema version (stored in SQLite's ``PRAGMA user_version``)
# that this s3sup can read; it is also the version stamped on catalogues
# written by ``Catalogue.to_sqlite``.
MAX_DB_SCHEMA_VERSION = 2
39+
40+
41+
@contextlib.contextmanager
def load_gzipped_sqlite(path):
    """Open a gzip-compressed SQLite database for reading.

    The file at ``path`` is decompressed into a temporary file, which is then
    opened with ``sqlite3``. The temporary file is deleted when the context
    exits, so any changes made through the connection are discarded.

    Args:
        path: Location of the gzip-compressed SQLite database file.

    Yields:
        An open ``sqlite3.Connection`` whose ``row_factory`` is
        ``sqlite3.Row``, so columns can be accessed by name.
    """
    # NOTE(review): SQLite reopens the temporary file by name while it is
    # still open here; the tempfile documentation says this works on Unix but
    # not on Windows.
    with tempfile.NamedTemporaryFile() as out_f:
        with gzip.open(path, 'rb') as in_f:
            shutil.copyfileobj(in_f, out_f)
        # SQLite opens the file through its own handle, so everything still
        # sitting in Python's write buffer must reach the file first;
        # otherwise a small database can appear empty or truncated.
        out_f.flush()
        c = sqlite3.connect(out_f.name)
        try:
            c.row_factory = sqlite3.Row
            yield c
        finally:
            # Close the connection even when the caller's block raises.
            c.close()
50+
51+
52+
@contextlib.contextmanager
def write_gzipped_sqlite(path):
    """Build a SQLite database and save it gzip-compressed at ``path``.

    A fresh SQLite database is created in a temporary file and handed to the
    caller. When the caller's block finishes without error, the connection is
    committed and closed and the database file is gzip-compressed into
    ``path``. If the block raises, the connection is closed and nothing is
    written to ``path``.

    Args:
        path: Destination for the gzip-compressed SQLite database file.

    Yields:
        An open ``sqlite3.Connection`` whose ``row_factory`` is
        ``sqlite3.Row``.
    """
    # NOTE(review): SQLite opens the temporary file by name while it is still
    # open here; the tempfile documentation says this works on Unix but not
    # on Windows.
    with tempfile.NamedTemporaryFile() as in_f:
        c = sqlite3.connect(in_f.name)
        try:
            c.row_factory = sqlite3.Row
            yield c
            c.commit()
        finally:
            # Release the connection even when the caller's block raises.
            c.close()
        # Only reached on success: compress the finished database file.
        with gzip.open(path, 'wb') as out_f:
            shutil.copyfileobj(in_f, out_f)
62+
3363

3464
class Catalogue:
3565

@@ -51,12 +81,34 @@ def from_csv(self, path: str):
5181
for path, content_hash, attributes_hash in csv.reader(f):
5282
self.add_file(path, content_hash, attributes_hash)
5383

54-
def to_csv(self, path: str):
55-
with open(path, 'wt', newline='') as f:
56-
writer = csv.writer(f, quoting=csv.QUOTE_ALL)
57-
writer.writerow(['path', 'content_hash', 'attributes_hash'])
58-
for path, (content_hsh, attributes_hsh) in self.to_dict().items():
59-
writer.writerow([path, content_hsh, attributes_hsh])
84+
def from_sqlite(self, path: str):
    """Populate this catalogue from a gzipped SQLite catalogue file.

    Every row of the ``files`` table is added to the catalogue via
    ``add_file``.

    Args:
        path: Location of the gzip-compressed SQLite catalogue.

    Raises:
        click.ClickException: If the catalogue's schema version (SQLite
            ``PRAGMA user_version``) is higher than MAX_DB_SCHEMA_VERSION,
            i.e. it was written by a newer s3sup.
    """
    with load_gzipped_sqlite(path) as c:
        schema_version = c.execute('PRAGMA user_version').fetchone()[0]
        if schema_version > MAX_DB_SCHEMA_VERSION:
            # Adjacent literals are concatenated verbatim, so each fragment
            # must carry its own trailing space.
            raise click.ClickException((
                'Upgrade to latest s3sup to continue. The s3sup version '
                'last used to push this project to S3 was newer than the '
                'installed version. The newer remote catalogue format is '
                'not readable by older s3sup version. Catalogue schema is '
                'version {0}, this s3sup only supports catalogue schema '
                'up to version {1}.').format(
                    schema_version, MAX_DB_SCHEMA_VERSION))
        # Handle migrations here
        for row in c.execute('SELECT * FROM files'):
            self.add_file(
                row['path'], row['content_hash'], row['attributes_hash'])
100+
101+
def to_sqlite(self, path: str):
    """Write this catalogue to ``path`` as a gzipped SQLite database.

    The database is stamped with MAX_DB_SCHEMA_VERSION as its
    ``PRAGMA user_version`` and holds one ``files`` row per catalogue entry
    (path, content hash, attributes hash).

    Args:
        path: Destination for the gzip-compressed SQLite catalogue.
    """
    with write_gzipped_sqlite(path) as db:
        version_pragma = 'PRAGMA user_version = {v:d}'.format(
            v=MAX_DB_SCHEMA_VERSION)
        db.execute(version_pragma)

        create_files_table = (
            'CREATE TABLE files (\n'
            'path TEXT,\n'
            'content_hash TEXT,\n'
            'attributes_hash TEXT)')
        db.execute(create_files_table)

        rows = []
        for file_path, hashes in self.to_dict().items():
            content_hsh, attributes_hsh = hashes
            rows.append((file_path, content_hsh, attributes_hsh))
        db.executemany('INSERT INTO files VALUES (?, ?, ?)', rows)
60112

61113
def diff_dict(self, remote_catalogue):
62114
lcl = self.to_dict()

s3sup/project.py

Lines changed: 43 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -116,43 +116,75 @@ def local_catalogue(self):
116116

117117
@functools.lru_cache(maxsize=8)
118118
def get_remote_catalogue(self):
119-
_, b = self._boto_bucket()
120-
rmt_cat_fp = self.file_prepper_wrapped('.s3sup.catalogue.csv')
121-
f = b.Object(rmt_cat_fp.s3_path())
122119
remote_cat = s3sup.catalogue.Catalogue(
123120
preserve_deleted_files=self._preserve_deleted_files)
124121

122+
_, b = self._boto_bucket()
123+
old_cat_fp = self.file_prepper_wrapped('.s3sup.catalogue.csv')
124+
old_f = b.Object(old_cat_fp.s3_path())
125+
126+
new_cat_fp = self.file_prepper_wrapped('.s3sup.cat')
127+
new_f = b.Object(new_cat_fp.s3_path())
128+
125129
hndl, tmpp = tempfile.mkstemp()
126130
os.close(hndl)
127131
try:
128-
f.download_file(tmpp)
129-
remote_cat.from_csv(tmpp)
132+
new_f.download_file(tmpp)
133+
remote_cat.from_sqlite(tmpp)
130134
except botocore.exceptions.NoCredentialsError:
131135
raise click.UsageError(
132136
'Cannot find AWS credentials.\n -> Configure AWS credentials '
133-
' using any mthod that the underlying boto3 library supports:'
137+
' using any method that the underlying boto3 library supports:'
134138
'\n -> https://boto3.amazonaws.com/v1/documentation/'
135139
'api/latest/guide/configuration.html')
136140
except botocore.exceptions.ClientError:
137141
if self.verbose:
138142
click.echo(
139-
'Project not uploaded before (no {0} on S3).'.format(
140-
rmt_cat_fp.s3_path()))
143+
('Could not find SQLite based remote catalogue on S3 '
144+
'(expected at {0}).').format(new_cat_fp.s3_path()))
145+
try:
146+
old_f.download_file(tmpp)
147+
remote_cat.from_csv(tmpp)
148+
click.echo(click.style((
149+
'WARNING: After the next s3sup push, do not attempt to '
150+
'use older versions of s3sup (0.3.0 or below) with this '
151+
'project, as they will no longer be able to read the '
152+
'remote catalogue.'), fg='blue'))
153+
except botocore.exceptions.ClientError:
154+
if self.verbose:
155+
click.echo(
156+
('Could not find older CSV based remote catalogue on '
157+
'S3 either (expected at {0}). This indicates the '
158+
'project has never been pushed to S3 before.').format(
159+
old_cat_fp.s3_path()))
160+
pass
141161
pass
142162
os.remove(tmpp)
143163
return remote_cat
144164

145165
def write_remote_catalogue(self, catalogue):
146166
hndl, tmpp = tempfile.mkstemp()
147167
os.close(hndl)
148-
catalogue.to_csv(tmpp)
149-
rmt_cat_fp = self.file_prepper_wrapped('.s3sup.catalogue.csv')
168+
catalogue.to_sqlite(tmpp)
169+
rmt_cat_fp = self.file_prepper_wrapped('.s3sup.cat')
150170
_, b = self._boto_bucket()
151171
o = b.Object(rmt_cat_fp.s3_path())
152172
with open(tmpp, 'rb') as lf:
153173
o.put(Body=lf, ACL='private')
154174
os.remove(tmpp)
155175

176+
# Deliberately break older s3sup clients <= 0.3.0.
177+
# This file needs uploading even for projects that have never used
178+
# the old format, just in case an old version of s3sup is used on it
179+
# in the future (perhaps if part of a CI/CD system is used).
180+
old_rmt_cat_fp = self.file_prepper_wrapped('.s3sup.catalogue.csv')
181+
the_breaker = (
182+
b'\xF9\xF9This is a deliberately corrupt old version of the s3sup '
183+
b'catalogue format It is not used any more and this file is only '
184+
b'here to cause s3sup clients <= 0.3.0 to fail, rather than have '
185+
b'them try to upload everything again.')
186+
b.Object(old_rmt_cat_fp.s3_path()).put(Body=the_breaker, ACL='private')
187+
156188
def calculate_diff(self):
157189
local_cat = self.local_catalogue()
158190
remote_cat = self.get_remote_catalogue()
@@ -171,7 +203,7 @@ def sync(self):
171203

172204
if self.dryrun:
173205
click.echo(click.style(
174-
'Not making any changes as this is a dryrun.', fg='blue'))
206+
'Not making any changes as this is a dry run.', fg='blue'))
175207
return changes
176208

177209
_, b = self._boto_bucket()

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212
setup(
1313
name='s3sup',
14-
version='0.3.0',
14+
version='0.4.0',
1515
description='Static site uploader for Amazon S3',
1616
long_description=long_description,
1717
long_description_content_type='text/markdown',
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
This only appears in the 1.1 version.
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
<html>
2+
<head>
3+
<title>Hello world</title>
4+
</head>
5+
<body>
6+
<p>Hello world!</p>
7+
</body>
8+
</html>
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
[aws]
2+
region_name = 'eu-west-1'
3+
s3_bucket_name = 'www.example.com'
4+
5+
[[path_specific]]
6+
path = '.*'
7+
Cache-Control = 'private; max-age=400'
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
"path","content_hash","attributes_hash"
2+
"index.html","58a5a2dbce6698aa96666b90dbb3daf38813c4c74f6acb14abfc584f6ebb3a8c","2c8ecc19f27aab8c5b37d1825ba6f02a4d8051b9b00bc46a30e3f623d4f447da"
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
<html>
2+
<head>
3+
<title>Hello world</title>
4+
</head>
5+
<body>
6+
<p>Hello world!</p>
7+
</body>
8+
</html>
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
[aws]
2+
region_name = 'eu-west-1'
3+
s3_bucket_name = 'www.example.com'
4+
5+
[[path_specific]]
6+
path = '.*'
7+
Cache-Control = 'private; max-age=400'

0 commit comments

Comments
 (0)