diff --git a/README.md b/README.md
index 3889ad2..8e1c7fe 100644
--- a/README.md
+++ b/README.md
@@ -50,6 +50,19 @@ look like this:
     remote = your.remote-host.org:/share/fat-store
     sshuser = fat
 
+## S3 remote
+
+The S3 backend requires the boto library. Edit your .gitfat file and add
+the following:
+
+    [s3]
+    bucket={bucketname}
+    key={access_key_id}
+    secret={secret_access_key}
+
+If key or secret is omitted, git-fat falls back to the AWS_ACCESS_KEY_ID
+and AWS_SECRET_ACCESS_KEY environment variables.
+
 # A worked example
 
 Before we start, let's turn on verbose reporting so we can see what's
diff --git a/git-fat b/git-fat
index 7edb7ba..03df2ce 100755
--- a/git-fat
+++ b/git-fat
@@ -40,6 +40,70 @@ except ImportError:
             return output
         subprocess.check_output = backport_check_output
 
+try:
+    from boto.s3.connection import S3Connection
+    from boto.s3.key import Key
+    class S3Backend(object):
+
+        def __init__(self,bucket,key,secret,objdir):
+            self.verbose = verbose_stderr if os.environ.get('GIT_FAT_VERBOSE') else verbose_ignore
+            self.bucket = bucket
+            self.key = key
+            self.secret = secret
+            self.objdir = objdir
+
+        def get_bucket(self):
+            conn = S3Connection(self.key, self.secret)
+            bkt = conn.get_bucket(self.bucket)
+            return bkt
+
+        def pull(self,files):
+            bkt = self.get_bucket()
+            for file in files:
+                localfile = os.path.abspath(os.path.join(self.objdir,file))
+                if os.path.isfile(localfile):
+                    self.verbose('Object %s already exists, skipping.' % file)
+                else:
+                    self.verbose('Getting object %s from s3 bucket %s' % (file,self.bucket))
+                    k = Key(bkt)
+                    k.key = file
+                    try:
+                        k.get_contents_to_filename(localfile,
+                                                   cb=S3Counter(),
+                                                   num_cb=500)
+                    except KeyboardInterrupt:
+                        # If we cancel during download, make sure the partial
+                        # download is removed.
+                        os.remove(localfile)
+                        raise
+
+        def push(self,files):
+            bkt = self.get_bucket()
+            for file in files:
+                k = bkt.get_key(file)
+                if k is not None:
+                    self.verbose('Object %s already exists in bucket %s, skipping.' % (file,self.bucket))
+                else:
+                    k = Key(bkt)
+                    k.key = file
+                    localfile = os.path.abspath(os.path.join(self.objdir,file))
+                    self.verbose('Uploading object %s to s3 bucket %s' % (file,self.bucket))
+                    try:
+                        k.set_contents_from_filename(localfile,
+                                                     cb=S3Counter(),
+                                                     num_cb=500)
+                    except KeyboardInterrupt:
+                        # If we cancel during upload, delete the partially uploaded
+                        # remote object. Otherwise we'll have problems later.
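+                        # (An interrupted plain PUT never leaves a visible S3 object, so this is defensive.)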
+                        k.delete()
+                        raise
+except ImportError:
+    class S3Backend(object):
+
+        def __init__(self,bucket,key,secret,objdir):
+            raise RuntimeError("S3Backend requires boto.")
+
 BLOCK_SIZE = 4096
 
 def verbose_stderr(*args, **kwargs):
@@ -47,6 +111,17 @@ def verbose_stderr(*args, **kwargs):
 def verbose_ignore(*args, **kwargs):
     pass
 
+def ensure_binary_mode(stream):
+    try:  # Attempt the Python-3 way, also needed to handle unicode
+        return stream.detach()
+    except Exception:  # not a detachable Python-3 stream; fall through
+        pass
+    if sys.platform == "win32":
+        # Fall back to Python-2 way, only needed on Windows
+        import msvcrt
+        msvcrt.setmode(stream.fileno(), os.O_BINARY)
+    return stream
+
 def mkdir_p(path):
     import errno
     try:
@@ -63,10 +138,10 @@ def umask():
     return old
 
 def readblocks(stream):
-    bytes = 0
+    bytecount = 0
     while True:
         data = stream.read(BLOCK_SIZE)
-        bytes += len(data)
+        bytecount += len(data)
         if not data:
             break
         yield data
@@ -116,73 +191,137 @@ def gitconfig_set(name, value, file=None):
     args += [name, value]
     p = subprocess.check_call(args)
 
+try:
+    import fish
+    class S3Counter:
+        def __init__(self):
+            self.fish = None
+        def __call__(self, complete, total):
+            if self.fish is None:
+                self.fish = fish.ProgressFish(total=total/1024)
+            self.fish.animate(amount=complete/1024)
+except ImportError:
+    class S3Counter:
+        def __init__(self):
+            self.count = 0
+        def __call__(self, complete, total):
+            if complete * 10 / total > self.count:
+                self.count += 1
+                sys.stdout.write('.')
+                sys.stdout.flush()
+
+class RsyncBackend(object):
+
+    def __init__(self,remote,ssh_port,ssh_user,options,objdir):
+        self.verbose = verbose_stderr if os.environ.get('GIT_FAT_VERBOSE') else verbose_ignore
+        self.remote = remote
+        self.ssh_port = ssh_port
+        self.ssh_user = ssh_user
+        self.options = options
+        self.objdir = objdir
+
+    def get_rsync_command(self,push):
+        if push:
+            self.verbose('Pushing to %s' % (self.remote))
+        else:
+            self.verbose('Pulling from %s' % (self.remote))
+
+        cmd = [b'rsync', b'--progress', b'--ignore-existing', b'--from0', b'--files-from=-']
+        rshopts = b''
+        if self.ssh_user:
+            rshopts += b' -l ' + self.ssh_user
+        if self.ssh_port:
+            rshopts += b' -p ' + self.ssh_port
+        if rshopts:
+            cmd.append(b'--rsh=ssh' + rshopts)
+        if self.options:
+            cmd += self.options.split(b' ')
+        if push:
+            cmd += [self.objdir + b'/', self.remote + b'/']
+        else:
+            cmd += [self.remote + b'/', self.objdir + b'/']
+        return cmd
+
+    def pull(self,files):
+        cmd = self.get_rsync_command(push=False)
+        self.verbose('Executing: %s' % b' '.join(cmd))
+        p = subprocess.Popen(cmd, stdin=subprocess.PIPE)
+        p.communicate(input=b'\x00'.join(files))
+
+    def push(self,files):
+        cmd = self.get_rsync_command(push=True)
+        self.verbose('Executing: %s' % b' '.join(cmd))
+        p = subprocess.Popen(cmd, stdin=subprocess.PIPE)
+        p.communicate(input=b'\x00'.join(files))
+
 class GitFat(object):
     DecodeError = RuntimeError
     def __init__(self):
         self.verbose = verbose_stderr if os.environ.get('GIT_FAT_VERBOSE') else verbose_ignore
         self.gitroot = subprocess.check_output('git rev-parse --show-toplevel'.split()).strip()
         self.gitdir = subprocess.check_output('git rev-parse --git-dir'.split()).strip()
-        self.objdir = os.path.join(self.gitdir, 'fat', 'objects')
+        self.objdir = os.path.join(self.gitdir, b'fat', b'objects')
         if os.environ.get('GIT_FAT_VERSION') == '1':
             self.encode = self.encode_v1
         else:
             self.encode = self.encode_v2
         def magiclen(enc):
-            return len(enc(hashlib.sha1('dummy').hexdigest(), 5))
+            return len(enc(hashlib.sha1(b'dummy').hexdigest().encode('ASCII'), 5))
         self.magiclen = magiclen(self.encode) # Current version
         self.magiclens = [magiclen(enc) for enc in [self.encode_v1, self.encode_v2]] # All prior versions
+        self.backend = self.get_backend(self.objdir)
     def setup(self):
         mkdir_p(self.objdir)
-    def get_rsync(self):
-        cfgpath = os.path.join(self.gitroot,'.gitfat')
-        remote = gitconfig_get('rsync.remote', file=cfgpath)
-        ssh_port = gitconfig_get('rsync.sshport', file=cfgpath)
-        ssh_user = gitconfig_get('rsync.sshuser', file=cfgpath)
-        options = gitconfig_get('rsync.options', file=cfgpath)
-        if remote is None:
-            raise RuntimeError('No rsync.remote in %s' % cfgpath)
-        return remote, ssh_port, ssh_user, options
-    def get_rsync_command(self,push):
-        (remote, ssh_port, ssh_user, options) = self.get_rsync()
-        if push:
-            self.verbose('Pushing to %s' % (remote))
+    def get_backend(self,objdir):
+        """
+        Parse the .gitfat config file and pick the first supported backend
+        to use. Currently supports rsync and s3.
+        """
+        cfgpath = os.path.join(self.gitroot,b'.gitfat')
+        remote = gitconfig_get('rsync.remote', file=cfgpath)
+        bucket = gitconfig_get('s3.bucket', file=cfgpath)
+        if remote:
+            ssh_port = gitconfig_get('rsync.sshport', file=cfgpath)
+            ssh_user = gitconfig_get('rsync.sshuser', file=cfgpath)
+            options = gitconfig_get('rsync.options', file=cfgpath)
+            return RsyncBackend(remote,ssh_port,ssh_user,options,objdir)
+        elif bucket:
+            key = gitconfig_get('s3.key', file=cfgpath)
+            if key is None:
+                try:
+                    key = os.environ['AWS_ACCESS_KEY_ID']
+                except KeyError:
+                    raise RuntimeError('No s3.key in %s' % cfgpath)
+            secret = gitconfig_get('s3.secret', file=cfgpath)
+            if secret is None:
+                try:
+                    secret = os.environ['AWS_SECRET_ACCESS_KEY']
+                except KeyError:
+                    raise RuntimeError('No s3.secret in %s' % cfgpath)
+            return S3Backend(bucket,key,secret,objdir)
         else:
-            self.verbose('Pulling from %s' % (remote))
+            raise RuntimeError('No supported backends specified in %s' % cfgpath)
 
-        cmd = ['rsync', '--progress', '--ignore-existing', '--from0', '--files-from=-']
-        rshopts = ''
-        if ssh_user:
-            rshopts += ' -l ' + ssh_user
-        if ssh_port:
-            rshopts += ' -p ' + ssh_port
-        if rshopts:
-            cmd.append('--rsh=ssh' + rshopts)
-        if options:
-            cmd += options.split(' ')
-        if push:
-            cmd += [self.objdir + '/', remote + '/']
-        else:
-            cmd += [remote + '/', self.objdir + '/']
-        return cmd
     def revparse(self, revname):
         return subprocess.check_output(['git', 'rev-parse', revname]).strip()
-    def encode_v1(self, digest, bytes):
+    def encode_v1(self, digest, bytecount):
         'Produce legacy representation of file to be stored in repository.'
-        return '#$# git-fat %s\n' % (digest,)
-    def encode_v2(self, digest, bytes):
+        return (b'#$# git-fat ' + digest + b'\n')
+    def encode_v2(self, digest, bytecount):
         'Produce representation of file to be stored in repository. 20 characters can hold 64-bit integers.'
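+        # Layout: b'#$# git-fat ' + 40 hex digest bytes + b' ' + 20-digit size + b'\n'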
-        return '#$# git-fat %s %20d\n' % (digest, bytes)
-    def decode(self, string, noraise=False):
-        cookie = '#$# git-fat '
-        if string.startswith(cookie):
-            parts = string[len(cookie):].split()
+        return (b'#$# git-fat ' + digest + (' %20d\n' % (bytecount,)).encode('ASCII'))
+    def decode(self, bstring, noraise=False):
+        cookie = b'#$# git-fat '
+        if bstring.startswith(cookie):
+            parts = bstring[len(cookie):].split()
             digest = parts[0]
-            bytes = int(parts[1]) if len(parts) > 1 else None
-            return digest, bytes
+            bytecount = int(parts[1]) if len(parts) > 1 else None
+            return digest, bytecount
         elif noraise:
             return None, None
         else:
-            raise GitFat.DecodeError('Could not decode %s' % (string))
+            raise GitFat.DecodeError('Could not decode %s' % repr(bstring))
     def decode_stream(self, stream):
         'Return digest if git-fat cache, otherwise return iterator over entire file contents'
         preamble = stream.read(self.magiclen)
@@ -198,13 +337,13 @@
             return False, None
         # read file
         try:
-            digest, bytes = self.decode_stream(open(fname))
+            digest, bytecount = self.decode_stream(open(fname, 'rb'))
         except IOError:
             return False, None
-        if isinstance(digest, str):
-            return digest, bytes
+        if isinstance(digest, bytes):
+            return digest, bytecount
         else:
-            return None, bytes
+            return None, bytecount
     def decode_clean(self, body):
         '''
         Attempt to decode version in working tree. The tree version could be changed to have a more
@@ -212,16 +351,17 @@
         version decodes successfully, it indicates that the fat data is not currently available in this
         repository.
         '''
-        digest, bytes = self.decode(body, noraise=True)
+        digest, bytecount = self.decode(body, noraise=True)
         return digest
     def filter_clean(self, instream, outstreamclean):
        h = hashlib.new('sha1')
-        bytes = 0
-        fd, tmpname = tempfile.mkstemp(dir=self.objdir)
+        bytecount = 0
+        # mkstemp requires 'str' rather than native filesystem bytes
+        fd, tmpname = tempfile.mkstemp(dir=self.objdir.decode(sys.getfilesystemencoding()))
         try:
             ishanging = False
             cached = False # changes to True when file is cached
-            with os.fdopen(fd, 'w') as cache:
+            with os.fdopen(fd, 'wb') as cache:
                 outstream = cache
                 blockiter = readblocks(instream)
                 firstblock = True
@@ -232,10 +372,10 @@
                             outstream = outstreamclean
                         firstblock = False
                     h.update(block)
-                    bytes += len(block)
+                    bytecount += len(block)
                     outstream.write(block)
                 outstream.flush()
-            digest = h.hexdigest()
+            digest = h.hexdigest().encode('ASCII')
             objfile = os.path.join(self.objdir, digest)
             if not ishanging:
                 if os.path.exists(objfile):
@@ -247,7 +387,7 @@
                     os.rename(tmpname, objfile)
                     self.verbose('git-fat filter-clean: caching to %s' % objfile)
                 cached = True
-            outstreamclean.write(self.encode(digest, bytes))
+            outstreamclean.write(self.encode(digest, bytecount))
         finally:
             if not cached:
                 os.remove(tmpname)
@@ -258,20 +398,26 @@
         version of the file on stdin and produces the "clean" (repository) version on stdout.
         '''
         self.setup()
+        # Set stdin and stdout to binary mode
+        sys.stdin = ensure_binary_mode(sys.stdin)
+        sys.stdout = ensure_binary_mode(sys.stdout)
         self.filter_clean(sys.stdin, sys.stdout)
 
     def cmd_filter_smudge(self):
         self.setup()
-        result, bytes = self.decode_stream(sys.stdin)
-        if isinstance(result, str): # We got a digest
+        # Ensure streams are treated as binary
+        sys.stdin = ensure_binary_mode(sys.stdin)
+        sys.stdout = ensure_binary_mode(sys.stdout)
+        result, bytecount = self.decode_stream(sys.stdin)
+        if isinstance(result, bytes): # We got a digest
             objfile = os.path.join(self.objdir, result)
             try:
-                cat(open(objfile), sys.stdout)
                 self.verbose('git-fat filter-smudge: restoring from %s' % objfile)
-            except IOError: # file not found
+                cat(open(objfile, 'rb'), sys.stdout)
+            except IOError:  # file not found
                 self.verbose('git-fat filter-smudge: fat object missing %s' % objfile)
-                sys.stdout.write(self.encode(result, bytes)) # could leave a better notice about how to recover this file
-        else: # We have an iterable over the original input.
+                sys.stdout.write(self.encode(result, bytecount)) # could leave a better notice about how to recover this file
+        else:  # We have an iterable over the original input.
             self.verbose('git-fat filter-smudge: not a managed file')
             cat_iter(result, sys.stdout)
     def catalog_objects(self):
@@ -286,13 +432,13 @@
         p2 = subprocess.Popen(['git','cat-file','--batch-check'], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
         def cut_sha1hash(input, output):
             for line in input:
-                output.write(line.split()[0] + '\n')
+                output.write(line.split()[0] + b'\n')
             output.close()
         cut_thread = threading.Thread(target=cut_sha1hash, args=(p1.stdout, p2.stdin))
         cut_thread.start()
         for line in p2.stdout:
             objhash, objtype, size = line.split()
-            if objtype == 'blob' and int(size) in self.magiclens:
+            if objtype == b'blob' and int(size) in self.magiclens:
                 try:
                     fathash = self.decode(subprocess.check_output(['git', 'cat-file', '-p', objhash]))[0]
                     referenced.add(fathash)
@@ -339,10 +485,8 @@
         # (includes history). Finer-grained pushing would be useful.
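+        # Transfer is delegated to the backend chosen in get_backend().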
         pushall = '--all' in args
         files = self.referenced_objects(all=pushall) & self.catalog_objects()
-        cmd = self.get_rsync_command(push=True)
-        self.verbose('Executing: %s' % ' '.join(cmd))
-        p = subprocess.Popen(cmd, stdin=subprocess.PIPE)
-        p.communicate(input='\x00'.join(files))
+        self.backend.push(files)
     def checkout(self, show_orphans=False):
         'Update any stale files in the present working tree'
         for digest, fname in self.orphan_files():
@@ -373,10 +517,7 @@
         if rev:
             refargs['rev'] = rev
         files = self.filter_objects(refargs, self.parse_pull_patterns(args))
-        cmd = self.get_rsync_command(push=False)
-        self.verbose('Executing: %s' % ' '.join(cmd))
-        p = subprocess.Popen(cmd, stdin=subprocess.PIPE)
-        p.communicate(input='\x00'.join(files))
+        self.backend.pull(files)
         self.checkout()
 
     def parse_pull_patterns(self, args):
@@ -444,6 +585,7 @@
         time1 = time.time()
         self.verbose('%d of %d blobs are >= %d bytes [elapsed %.3fs]' % (numlarge, numblobs, threshsize, time1-time0))
     def cmd_find(self, args):
+        # FIXME: Need input validation here
         maxsize = int(args[0])
         blobsizes = dict(self.gen_large_blobs('--all', maxsize))
         time0 = time.time()
@@ -464,14 +606,15 @@
         revlist.wait()
         difftree.wait()
     def cmd_index_filter(self, args):
+        # FIXME: Need input validation here
         manage_gitattributes = '--manage-gitattributes' in args
-        filelist = set(f.strip() for f in open(args[0]).readlines())
+        filelist = set(f.strip() for f in open(args[0], 'rb').readlines())
         lsfiles = subprocess.Popen(['git', 'ls-files', '-s'], stdout=subprocess.PIPE)
         updateindex = subprocess.Popen(['git', 'update-index', '--index-info'], stdin=subprocess.PIPE)
         for line in lsfiles.stdout:
-            mode, sep, tail = line.partition(' ')
-            blobhash, sep, tail = tail.partition(' ')
-            stageno, sep, tail = tail.partition('\t')
+            mode, sep, tail = line.partition(b' ')
+            blobhash, sep, tail = tail.partition(b' ')
+            stageno, sep, tail = tail.partition(b'\t')
             filename = tail.strip()
             if filename not in filelist:
                 continue
@@ -479,7 +622,7 @@
                 # skip symbolic links
                 continue
             # This file will contain the hash of the cleaned object
-            hashfile = os.path.join(self.gitdir, 'fat', 'index-filter', blobhash)
+            hashfile = os.path.join(self.gitdir, b'fat', b'index-filter', blobhash)
             try:
                 cleanedobj = open(hashfile).read().rstrip()
             except IOError:
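
For reviewers who want to exercise the new S3 backend by hand, outside of git,
here is a minimal smoke test. This is a sketch only: the bucket name and the
gitfat import alias are hypothetical, it assumes boto is installed, and it
assumes AWS credentials are exported in the environment.

    # Sketch: drive S3Backend directly. Assumes the patched git-fat script
    # has been made importable, e.g. copied next to this file as gitfat.py.
    import hashlib
    import os
    import tempfile

    from gitfat import S3Backend

    objdir = tempfile.mkdtemp()                 # stands in for .git/fat/objects
    payload = b'some large binary payload'
    digest = hashlib.sha1(payload).hexdigest()  # git-fat names objects by SHA-1
    with open(os.path.join(objdir, digest), 'wb') as f:
        f.write(payload)

    backend = S3Backend('my-fat-store',         # hypothetical bucket name
                        os.environ['AWS_ACCESS_KEY_ID'],
                        os.environ['AWS_SECRET_ACCESS_KEY'],
                        objdir)
    backend.push([digest])                      # upload; skips keys already in the bucket
    os.remove(os.path.join(objdir, digest))
    backend.pull([digest])                      # fetch it back into objdir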