From 032539868a1062e64a552c513352de539824c680 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 17 Jan 2019 14:24:52 -0500 Subject: [PATCH 01/21] Dummy empty commit to facilitate long lived PR From 2fe21983c39dcb0d6f6232ff4d2f33755ec47832 Mon Sep 17 00:00:00 2001 From: Christian Haselgrove Date: Wed, 20 Feb 2019 16:41:27 +0000 Subject: [PATCH 02/21] WIP: added venv to diff --- reproman/distributions/venv.py | 1 + reproman/interface/diff.py | 14 +++++++--- reproman/interface/tests/files/diff_1.yaml | 30 ++++++++++++++++++++++ reproman/interface/tests/files/diff_2.yaml | 30 ++++++++++++++++++++++ reproman/interface/tests/test_diff.py | 12 +++++++++ 5 files changed, 84 insertions(+), 3 deletions(-) diff --git a/reproman/distributions/venv.py b/reproman/distributions/venv.py index f66b65e54..cb7f8856a 100644 --- a/reproman/distributions/venv.py +++ b/reproman/distributions/venv.py @@ -47,6 +47,7 @@ class VenvEnvironment(SpecObject): path = attrib() python_version = attrib() packages = TypedList(VenvPackage) + _diff_cmp_fields = ('path', 'python_version') @attr.s diff --git a/reproman/interface/diff.py b/reproman/interface/diff.py index f77be7f62..56c04b66d 100644 --- a/reproman/interface/diff.py +++ b/reproman/interface/diff.py @@ -20,6 +20,7 @@ from ..distributions.debian import DebianDistribution from ..distributions.conda import CondaDistribution from ..distributions.vcs import GitDistribution, SVNDistribution +from ..distributions.venv import VenvDistribution, VenvEnvironment __docformat__ = 'restructuredtext' @@ -88,7 +89,8 @@ def diff(env_1, env_2): DebianDistribution: 'Debian package', CondaDistribution: 'Conda package', GitDistribution: 'Git repository', - SVNDistribution: 'SVN repository' + SVNDistribution: 'SVN repository', + VenvDistribution: 'Venv environment', } env_1_dist_types = { d.__class__ for d in env_1.distributions } @@ -103,12 +105,18 @@ def diff(env_1, env_2): 'pkg_diffs': []} dist_1 = env_1.get_distribution(dist_type) if dist_1: - pkgs_1 = {p._diff_cmp_id: p for p in dist_1.packages} + if dist_type == VenvDistribution: + pkgs_1 = {p._diff_cmp_id: p for p in dist_1.environments} + else: + pkgs_1 = {p._diff_cmp_id: p for p in dist_1.packages} else: pkgs_1 = {} dist_2 = env_2.get_distribution(dist_type) if dist_2: - pkgs_2 = {p._diff_cmp_id: p for p in dist_2.packages} + if dist_type == VenvDistribution: + pkgs_2 = {p._diff_cmp_id: p for p in dist_2.environments} + else: + pkgs_2 = {p._diff_cmp_id: p for p in dist_2.packages} else: pkgs_2 = {} dist_res['pkgs_1'] = pkgs_1 diff --git a/reproman/interface/tests/files/diff_1.yaml b/reproman/interface/tests/files/diff_1.yaml index 08a098312..1873b86ca 100644 --- a/reproman/interface/tests/files/diff_1.yaml +++ b/reproman/interface/tests/files/diff_1.yaml @@ -58,4 +58,34 @@ distributions: - path: /path/1/to/different/svn/commit revision: 12 uuid: 95e4b738-84c7-154c-f082-34d40e21fdd4 +- name: venv + path: /usr/bin/virtualenv + venv_version: 15.1.0 + environments: + - path: /home/user/venvs/test/both + python_version: 2.7.15+ + packages: + - name: six + version: 1.12.0 + local: true + location: /home/user/venvs/test/lib/python2.7/site-packages + files: + - lib/python2.7/site-packages/six-1.12.0.dist-info/METADATA + - lib/python2.7/site-packages/six.pyc + - lib/python2.7/site-packages/six.py + - path: /home/user/venvs/test/1_only + python_version: 2.7.15+ + packages: + - name: funcsigs + version: 1.0.2 + local: true + location: /home/user/venvs/test/lib/python2.7/site-packages + files: + - lib/python2.7/site-packages/funcsigs-1.0.2.dist-info/METADATA + - name: rpaths + version: '0.13' + local: true + location: /home/user/venvs/test/lib/python2.7/site-packages + files: + - lib/python2.7/site-packages/rpaths-0.13.dist-info/METADATA files: [/etc/a, /etc/b] diff --git a/reproman/interface/tests/files/diff_2.yaml b/reproman/interface/tests/files/diff_2.yaml index bed05c111..21eb718a7 100644 --- a/reproman/interface/tests/files/diff_2.yaml +++ b/reproman/interface/tests/files/diff_2.yaml @@ -58,4 +58,34 @@ distributions: - path: /path/2/to/different/svn/commit revision: 14 uuid: 95e4b738-84c7-154c-f082-34d40e21fdd4 +- name: venv + path: /usr/bin/virtualenv + venv_version: 15.1.0 + environments: + - path: /home/user/venvs/test/both + python_version: 2.7.15+ + packages: + - name: six + version: 1.12.0 + local: true + location: /home/user/venvs/test/lib/python2.7/site-packages + files: + - lib/python2.7/site-packages/six-1.12.0.dist-info/METADATA + - lib/python2.7/site-packages/six.pyc + - lib/python2.7/site-packages/six.py + - path: /home/user/venvs/test/2_only + python_version: 2.7.15+ + packages: + - name: funcsigs + version: 1.0.2 + local: true + location: /home/user/venvs/test/lib/python2.7/site-packages + files: + - lib/python2.7/site-packages/funcsigs-1.0.2.dist-info/METADATA + - name: rpaths + version: '0.13' + local: true + location: /home/user/venvs/test/lib/python2.7/site-packages + files: + - lib/python2.7/site-packages/rpaths-0.13.dist-info/METADATA files: [/etc/c, /etc/b] diff --git a/reproman/interface/tests/test_diff.py b/reproman/interface/tests/test_diff.py index 77e0a0adb..0bc51b883 100644 --- a/reproman/interface/tests/test_diff.py +++ b/reproman/interface/tests/test_diff.py @@ -153,6 +153,18 @@ def test_diff_svn(): assert_not_in('(/path/2/to/common/svn/repo)', outputs.out) +def test_diff_venv(): + with swallow_outputs() as outputs: + args = ['diff', diff_1_yaml, diff_2_yaml] + rv = main(args) + assert_equal(rv, 3) + assert_equal(outputs.err, '') + assert_in('Venv environments:', outputs.out) + assert_in('< /home/user/venvs/test/1_only 2.7.15+', outputs.out) + assert_in('> /home/user/venvs/test/2_only 2.7.15+', outputs.out) + assert_not_in('/home/user/venvs/test/both', outputs.out) + + def test_diff_satisfies_unsupported_distribution(): # using subprocess.call() here because we're looking for a condition # that raises an exception in main(), so it doesn't return and we From 66d8b673590f233eb239ebcf1f77784db97a964d Mon Sep 17 00:00:00 2001 From: Christian Haselgrove Date: Wed, 20 Mar 2019 21:02:22 +0000 Subject: [PATCH 03/21] replacing SpecObject._comparison_fields with _diff_cmp_fields + _diff_fields --- reproman/distributions/base.py | 46 +++++++++++++++++++++----------- reproman/distributions/debian.py | 1 - reproman/distributions/redhat.py | 3 ++- 3 files changed, 33 insertions(+), 17 deletions(-) diff --git a/reproman/distributions/base.py b/reproman/distributions/base.py index 6f56083bb..ad6c11842 100644 --- a/reproman/distributions/base.py +++ b/reproman/distributions/base.py @@ -46,8 +46,6 @@ class SpecObject(object): _diff_cmp_fields = tuple() # Fields of the primary interest when showing diff _diff_fields = tuple() - # Fields used in determination of comparison (satisfied_by and identical_to) - _comparison_fields = tuple() @property def _diff_cmp_id(self): @@ -60,12 +58,19 @@ def _diff_cmp_id(self): @property def _cmp_id(self): - if not self._comparison_fields: + if not self._diff_cmp_fields: # Might need to be gone or some custom exception raise RuntimeError( - "Cannot establish identity of %r since _comaprison_fields " + "Cannot establish identity of %r since _diff_cmp_fields " "are not defined" % self) - return tuple(getattr(self, a) for a in self._comparison_fields) + if not self._diff_fields: + # Might need to be gone or some custom exception + raise RuntimeError( + "Cannot establish identity of %r since _diff_fields " + "are not defined" % self) + vals = [ getattr(self, a) for a in self._diff_cmp_fields ] + vals.extend([ getattr(self, a) for a in self._diff_fields ]) + return tuple(vals) @property def _diff_vals(self): @@ -94,8 +99,8 @@ def diff_subidentity_string(self): @property def identity_string(self): - """like diff_identity_string, but for _comparison_fields (used in - satisfied_by comparisons) + """like diff_identity_string, but for both _diff_cmp_fields and + _diff_fields (used in satisfied_by comparisons) """ return " ".join(str(el) for el in self._cmp_id if el is not None) @@ -137,15 +142,15 @@ def _satisfied_by(self, other): spec object. We require that the values of the attributes given by - _comparison_fields are the same. A specobject with a value of None - for one of these attributes is less specific than one with - a specific value; the former cannot satisfy the latter, - but the latter can satisfy the former. + _diff_cmp_fields and _diff_fields are the same. A specobject + with a value of None for one of these attributes is less specific + than one with a specific value; the former cannot satisfy the + latter, but the latter can satisfy the former. TODO: Ensure we don't encounter the case where self is completely unspecified (all values are None), in which case satisfied_by() returns True by default. Perhaps this is done by making - sure that at least one of the _comparison_fields cannot be None. + sure that at least one of the _diff_cmp_fields cannot be None. TODO: derive _collection_type directly from _collection. This isn't possible at the moment because DebianDistribution.packages is @@ -162,7 +167,14 @@ def _satisfied_by(self, other): raise TypeError('don''t know how to determine if a %s is satisfied by a %s' % (self.__class__, other_collection_type)) if not isinstance(other, self.__class__): raise TypeError('incompatible specobject types') - for attr_name in self._comparison_fields: + for attr_name in self._diff_cmp_fields: + self_value = getattr(self, attr_name) + other_value = getattr(other, attr_name) + if self_value is None: + continue + if self_value != other_value: + return False + for attr_name in self._diff_fields: self_value = getattr(self, attr_name) other_value = getattr(other, attr_name) if self_value is None: @@ -176,11 +188,15 @@ def _identical_to(self, other): """Determine if the other object is identical to the spec object. We require that the objects are of the same type and that the - values of the attributes given by _comparison_fields are the same. + values of the attributes given by _diff_cmp_fields and _diff_fields + are the same. """ if not isinstance(other, self.__class__): return False - for attr_name in self._comparison_fields: + for attr_name in self._diff_cmp_fields: + if getattr(self, attr_name) != getattr(other, attr_name): + return False + for attr_name in self._diff_fields: if getattr(self, attr_name) != getattr(other, attr_name): return False return True diff --git a/reproman/distributions/debian.py b/reproman/distributions/debian.py index 8154953dc..db0a37e40 100644 --- a/reproman/distributions/debian.py +++ b/reproman/distributions/debian.py @@ -91,7 +91,6 @@ class DEBPackage(Package): _diff_cmp_fields = ('name', 'architecture') _diff_fields = ('version',) - _comparison_fields = ('name', 'architecture', 'version') _register_with_representer(DEBPackage) diff --git a/reproman/distributions/redhat.py b/reproman/distributions/redhat.py index 8f7baf0c4..87a48fefe 100644 --- a/reproman/distributions/redhat.py +++ b/reproman/distributions/redhat.py @@ -65,7 +65,8 @@ class RPMPackage(Package): vendor = attrib() url = attrib() files = attrib(default=attr.Factory(list), hash=False) - _comparison_fields = ('name', 'version', 'architecture') + _diff_cmp_fields = ('name', 'architecture') + _diff_fields = ('version',) _register_with_representer(RPMPackage) From 3ab7a04e3c776245557771883b50f9556539ca0e Mon Sep 17 00:00:00 2001 From: Christian Haselgrove Date: Thu, 21 Mar 2019 13:44:23 +0000 Subject: [PATCH 04/21] refactored _satisfied_by() and _identical_to() in SpecObject --- reproman/distributions/base.py | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/reproman/distributions/base.py b/reproman/distributions/base.py index ad6c11842..57f7ff2a3 100644 --- a/reproman/distributions/base.py +++ b/reproman/distributions/base.py @@ -167,16 +167,7 @@ def _satisfied_by(self, other): raise TypeError('don''t know how to determine if a %s is satisfied by a %s' % (self.__class__, other_collection_type)) if not isinstance(other, self.__class__): raise TypeError('incompatible specobject types') - for attr_name in self._diff_cmp_fields: - self_value = getattr(self, attr_name) - other_value = getattr(other, attr_name) - if self_value is None: - continue - if self_value != other_value: - return False - for attr_name in self._diff_fields: - self_value = getattr(self, attr_name) - other_value = getattr(other, attr_name) + for (self_value, other_value) in zip(self._cmp_id, other._cmp_id): if self_value is None: continue if self_value != other_value: @@ -189,17 +180,11 @@ def _identical_to(self, other): We require that the objects are of the same type and that the values of the attributes given by _diff_cmp_fields and _diff_fields - are the same. + (dereferenced in _cmp_id) are the same. """ if not isinstance(other, self.__class__): return False - for attr_name in self._diff_cmp_fields: - if getattr(self, attr_name) != getattr(other, attr_name): - return False - for attr_name in self._diff_fields: - if getattr(self, attr_name) != getattr(other, attr_name): - return False - return True + return all(sv==ov for sv, ov in zip(self._cmp_id, other._cmp_id)) def _register_with_representer(cls): From 9599683742f99db8a0f6dbaadf510ecd2b4f63ec Mon Sep 17 00:00:00 2001 From: Christian Haselgrove Date: Thu, 21 Mar 2019 14:36:53 +0000 Subject: [PATCH 05/21] added class SpecDiff --- reproman/distributions/base.py | 57 ++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/reproman/distributions/base.py b/reproman/distributions/base.py index 57f7ff2a3..64d537360 100644 --- a/reproman/distributions/base.py +++ b/reproman/distributions/base.py @@ -424,3 +424,60 @@ def _create_package(self, **package_fields): provided by _get_packagefields_for_files """ return + + +class SpecDiff: + + """Difference object for SpecObjects. + + Instantiate with SpecDiff(a, b). a and b must be of the same type + or TypeError is raised. Either (but not both) may be None. + + Attributes: + + a, b: The two objects being compared. + + If _diff_cmp_fields is defined for the SpecObjects: + + diff_cmp_id: The _diff_cmp_id of the two objects. If + _diff_cmp_id are different for the two objects, + they cannot be compared and ValueError is raised. + + diff_vals_a, diff_vals_b: _diff_vals for a and b, respectively, + or None if a or b is None. + + For collection SpecObjects (e.g. DebianDistribution, containing + DEBPackages; these have _collection_attribute defined), we also + have: + + collection: a list of SpecDiff objects for the contained + SpecObjects. + """ + + def __init__(self, a, b): + if not isinstance(a, (SpecObject, type(None))) \ + or not isinstance(b, (SpecObject, type(None))): + raise TypeError('objects must be SpecObjects or None') + if not a and not b: + raise TypeError('objects cannot both be None') + if a and b and type(a) != type(b): + raise TypeError('objects must be of the same type') + self.cls = type(a) if a is not None else type(b) + self.a = a + self.b = b + if self.cls._diff_cmp_fields: + if a and b and a._diff_cmp_id != b._diff_cmp_id: + raise ValueError('objects\' _diff_cmp_id differ') + self.diff_vals_a = a._diff_vals if a else None + self.diff_vals_b = b._diff_vals if b else None + self.diff_cmp_id = a._diff_cmp_id if a else b._diff_cmp_id + if hasattr(a, '_collection_attribute'): + self.collection = [] + a_collection = { c._diff_cmp_id: c for c in a.collection } if a else {} + b_collection = { c._diff_cmp_id: c for c in b.collection } if b else {} + all_cmp_ids = set(a_collection).union(b_collection) + for cmp_id in all_cmp_ids: + ac = a_collection[cmp_id] if cmp_id in a_collection else None + bc = b_collection[cmp_id] if cmp_id in b_collection else None + self.collection.append(SpecDiff(ac, bc)) + return From eaf140d356c323dc33715da293858646cd5f2de6 Mon Sep 17 00:00:00 2001 From: Christian Haselgrove Date: Tue, 2 Apr 2019 20:11:41 +0000 Subject: [PATCH 06/21] added _collection_attribute to CondaDistribution, GitDistribution, SVNDistribution, and VenvDistribution --- reproman/distributions/conda.py | 1 + reproman/distributions/vcs.py | 4 ++++ reproman/distributions/venv.py | 1 + 3 files changed, 6 insertions(+) diff --git a/reproman/distributions/conda.py b/reproman/distributions/conda.py index 462d7dcf7..7c529e863 100644 --- a/reproman/distributions/conda.py +++ b/reproman/distributions/conda.py @@ -136,6 +136,7 @@ class CondaDistribution(Distribution): environments = TypedList(CondaEnvironment) _cmp_field = ('path',) + _collection_attribute = 'packages' def initiate(self, environment): """ diff --git a/reproman/distributions/vcs.py b/reproman/distributions/vcs.py index b118fd48c..200f8845e 100644 --- a/reproman/distributions/vcs.py +++ b/reproman/distributions/vcs.py @@ -125,6 +125,8 @@ class GitDistribution(VCSDistribution): _cmd = "git" packages = TypedList(GitRepo) + _collection_attribute = 'packages' + def initiate(self, session=None): pass @@ -296,6 +298,8 @@ class SVNDistribution(VCSDistribution): _cmd = "svn" packages = TypedList(SVNRepo) + _collection_attribute = 'packages' + def install_packages(self, session, use_version=True): raise NotImplementedError SVNRepo._distribution = SVNDistribution diff --git a/reproman/distributions/venv.py b/reproman/distributions/venv.py index 0efff7ec9..92a6c41dd 100644 --- a/reproman/distributions/venv.py +++ b/reproman/distributions/venv.py @@ -55,6 +55,7 @@ class VenvDistribution(Distribution): path = attrib() venv_version = attrib() environments = TypedList(VenvEnvironment) + _collection_attribute = 'environments' def initiate(self, _): return From 029dd3d5a38dc27cf0b96a411d7702a5749dad5d Mon Sep 17 00:00:00 2001 From: Christian Haselgrove Date: Tue, 2 Apr 2019 20:24:53 +0000 Subject: [PATCH 07/21] using SpecDiff for diff interface --- reproman/interface/diff.py | 88 +++++++++++++++++--------------------- 1 file changed, 39 insertions(+), 49 deletions(-) diff --git a/reproman/interface/diff.py b/reproman/interface/diff.py index 429f793e6..2aa40402b 100644 --- a/reproman/interface/diff.py +++ b/reproman/interface/diff.py @@ -21,6 +21,7 @@ from ..distributions.conda import CondaDistribution from ..distributions.vcs import GitDistribution, SVNDistribution from ..distributions.venv import VenvDistribution, VenvEnvironment +from ..distributions.base import SpecDiff __docformat__ = 'restructuredtext' @@ -82,8 +83,6 @@ def __call__(prov1, prov2, satisfies): @staticmethod def diff(env_1, env_2): - result = {'method': 'diff', 'distributions': []} - # distribution type -> package type string supported_distributions = { DebianDistribution: 'Debian package', @@ -97,47 +96,24 @@ def diff(env_1, env_2): env_2_dist_types = { d.__class__ for d in env_2.distributions } all_dist_types = env_1_dist_types.union(env_2_dist_types) + diffs = [] + for dist_type in all_dist_types: if dist_type not in supported_distributions: msg = 'diff doesn\'t know how to handle %s' % str(dist_type) raise ValueError(msg) - dist_res = {'pkg_type': supported_distributions[dist_type], - 'pkg_diffs': []} dist_1 = env_1.get_distribution(dist_type) - if dist_1: - if dist_type == VenvDistribution: - pkgs_1 = {p._diff_cmp_id: p for p in dist_1.environments} - else: - pkgs_1 = {p._diff_cmp_id: p for p in dist_1.packages} - else: - pkgs_1 = {} dist_2 = env_2.get_distribution(dist_type) - if dist_2: - if dist_type == VenvDistribution: - pkgs_2 = {p._diff_cmp_id: p for p in dist_2.environments} - else: - pkgs_2 = {p._diff_cmp_id: p for p in dist_2.packages} - else: - pkgs_2 = {} - dist_res['pkgs_1'] = pkgs_1 - dist_res['pkgs_2'] = pkgs_2 - pkgs_1_s = set(pkgs_1) - pkgs_2_s = set(pkgs_2) - dist_res['pkgs_only_1'] = pkgs_1_s - pkgs_2_s - dist_res['pkgs_only_2'] = pkgs_2_s - pkgs_1_s - for cmp_key in pkgs_1_s.intersection(pkgs_2_s): - package_1 = pkgs_1[cmp_key] - package_2 = pkgs_2[cmp_key] - if package_1._diff_vals != package_2._diff_vals: - dist_res['pkg_diffs'].append((package_1, package_2)) - result['distributions'].append(dist_res) + diffs.append({'diff': SpecDiff(dist_1, dist_2), + 'pkg_type': supported_distributions[dist_type]}) files1 = set(env_1.files) files2 = set(env_2.files) - result['files_1_only'] = files1 - files2 - result['files_2_only'] = files2 - files1 - return result + return {'method': 'diff', + 'diffs': diffs, + 'files_1_only': files1 - files2, + 'files_2_only': files2 - files1} @staticmethod def satisfies(env_1, env_2): @@ -188,30 +164,44 @@ def render_cmdline_diff(result): status = 0 - for dist_res in result['distributions']: + for diff_d in result['diffs']: - if dist_res['pkgs_only_1'] or dist_res['pkgs_only_2']: - print(_make_plural(dist_res['pkg_type']) + ':') + diff = diff_d['diff'] + pkg_type = diff_d['pkg_type'] - if dist_res['pkgs_only_1']: - for cmp_key in sorted(dist_res['pkgs_only_1']): - package = dist_res['pkgs_1'][cmp_key] + pkgs_only_1 = {} + pkgs_only_2 = {} + pkg_diffs = [] + + for pkg_diff in diff.collection: + if pkg_diff.a is None: + pkgs_only_2[pkg_diff.b._diff_cmp_id] = pkg_diff.b + elif pkg_diff.b is None: + pkgs_only_1[pkg_diff.a._diff_cmp_id] = pkg_diff.a + elif pkg_diff.diff_vals_a != pkg_diff.diff_vals_b: + pkg_diffs.append(pkg_diff) + if pkgs_only_1 or pkgs_only_2: + print(_make_plural(pkg_type) + ':') + + if pkgs_only_1: + for cmp_key in sorted(pkgs_only_1): + package = pkgs_only_1[cmp_key] print('< %s' % package.diff_identity_string) status = 3 - if dist_res['pkgs_only_1'] and dist_res['pkgs_only_2']: + if pkgs_only_1 and pkgs_only_2: print('---') - if dist_res['pkgs_only_2']: - for cmp_key in sorted(dist_res['pkgs_only_2']): - package = dist_res['pkgs_2'][cmp_key] + if pkgs_only_2: + for cmp_key in sorted(pkgs_only_2): + package = pkgs_only_2[cmp_key] print('> %s' % package.diff_identity_string) status = 3 - for (package_1, package_2) in dist_res['pkg_diffs']: - print('%s %s:' % (dist_res['pkg_type'], - " ".join(package_1._diff_cmp_id))) - print('< %s' % package_1.diff_subidentity_string) - print('---') - print('> %s' % package_2.diff_subidentity_string) + if pkg_diffs: + for pkg_diff in pkg_diffs: + print('%s %s:' % (pkg_type, ' '.join(pkg_diff.diff_cmp_id))) + print('< %s' % pkg_diff.a.diff_subidentity_string) + print('---') + print('> %s' % pkg_diff.b.diff_subidentity_string) status = 3 if result['files_1_only'] or result['files_2_only']: From 17525c752a34e7e6a338fb5d1b72ffed0dbfbadd Mon Sep 17 00:00:00 2001 From: Christian Haselgrove Date: Tue, 16 Apr 2019 17:16:40 +0000 Subject: [PATCH 08/21] supporting files lists in SpecDiff --- reproman/distributions/base.py | 49 +++++++++++++++++++++++----------- reproman/interface/diff.py | 17 +++++++++--- 2 files changed, 46 insertions(+), 20 deletions(-) diff --git a/reproman/distributions/base.py b/reproman/distributions/base.py index 64d537360..e5b15aba9 100644 --- a/reproman/distributions/base.py +++ b/reproman/distributions/base.py @@ -452,11 +452,15 @@ class SpecDiff: collection: a list of SpecDiff objects for the contained SpecObjects. + + If a and b are lists, they are treated as files specifications, and + self.collection is a list of (fname_a, fname_b) tuples. + TODO: give files and file collections their own specobjects """ def __init__(self, a, b): - if not isinstance(a, (SpecObject, type(None))) \ - or not isinstance(b, (SpecObject, type(None))): + if not isinstance(a, (SpecObject, list, type(None))) \ + or not isinstance(b, (SpecObject, list, type(None))): raise TypeError('objects must be SpecObjects or None') if not a and not b: raise TypeError('objects cannot both be None') @@ -465,19 +469,32 @@ def __init__(self, a, b): self.cls = type(a) if a is not None else type(b) self.a = a self.b = b - if self.cls._diff_cmp_fields: - if a and b and a._diff_cmp_id != b._diff_cmp_id: - raise ValueError('objects\' _diff_cmp_id differ') - self.diff_vals_a = a._diff_vals if a else None - self.diff_vals_b = b._diff_vals if b else None - self.diff_cmp_id = a._diff_cmp_id if a else b._diff_cmp_id - if hasattr(a, '_collection_attribute'): + if self.cls == list: + a_collection = set(a) + b_collection = set(b) self.collection = [] - a_collection = { c._diff_cmp_id: c for c in a.collection } if a else {} - b_collection = { c._diff_cmp_id: c for c in b.collection } if b else {} - all_cmp_ids = set(a_collection).union(b_collection) - for cmp_id in all_cmp_ids: - ac = a_collection[cmp_id] if cmp_id in a_collection else None - bc = b_collection[cmp_id] if cmp_id in b_collection else None - self.collection.append(SpecDiff(ac, bc)) + for fname in set(a_collection).union(b_collection): + if fname not in a_collection: + self.collection.append((None, fname)) + elif fname not in b_collection: + self.collection.append((fname, None)) + else: + self.collection.append((fname, fname)) + pass + else: + if self.cls._diff_cmp_fields: + if a and b and a._diff_cmp_id != b._diff_cmp_id: + raise ValueError('objects\' _diff_cmp_id differ') + self.diff_vals_a = a._diff_vals if a else None + self.diff_vals_b = b._diff_vals if b else None + self.diff_cmp_id = a._diff_cmp_id if a else b._diff_cmp_id + if hasattr(a, '_collection_attribute'): + self.collection = [] + a_collection = { c._diff_cmp_id: c for c in a.collection } if a else {} + b_collection = { c._diff_cmp_id: c for c in b.collection } if b else {} + all_cmp_ids = set(a_collection).union(b_collection) + for cmp_id in all_cmp_ids: + ac = a_collection[cmp_id] if cmp_id in a_collection else None + bc = b_collection[cmp_id] if cmp_id in b_collection else None + self.collection.append(SpecDiff(ac, bc)) return diff --git a/reproman/interface/diff.py b/reproman/interface/diff.py index 2aa40402b..1a6a76d34 100644 --- a/reproman/interface/diff.py +++ b/reproman/interface/diff.py @@ -107,6 +107,9 @@ def diff(env_1, env_2): diffs.append({'diff': SpecDiff(dist_1, dist_2), 'pkg_type': supported_distributions[dist_type]}) + diffs.append({'diff': SpecDiff(env_1.files, env_2.files), + 'pkg_type': 'files'}) + files1 = set(env_1.files) files2 = set(env_2.files) @@ -169,6 +172,10 @@ def render_cmdline_diff(result): diff = diff_d['diff'] pkg_type = diff_d['pkg_type'] + if pkg_type == 'files': + files_diff = diff_d['diff'] + continue + pkgs_only_1 = {} pkgs_only_2 = {} pkg_diffs = [] @@ -204,13 +211,15 @@ def render_cmdline_diff(result): print('> %s' % pkg_diff.b.diff_subidentity_string) status = 3 - if result['files_1_only'] or result['files_2_only']: + files_1_only = [ t[0] for t in files_diff.collection if t[1] is None ] + files_2_only = [ t[1] for t in files_diff.collection if t[0] is None ] + if files_1_only or files_2_only: print('Files:') - for fname in result['files_1_only']: + for fname in files_1_only: print('< %s' % fname) - if result['files_1_only'] and result['files_2_only']: + if files_1_only and files_2_only: print('---') - for fname in result['files_2_only']: + for fname in files_2_only: print('> %s' % fname) status = 3 From 138c2a6eef79fdc7686a8f5ba96e46c9b472c82b Mon Sep 17 00:00:00 2001 From: Christian Haselgrove Date: Tue, 16 Apr 2019 17:52:53 +0000 Subject: [PATCH 09/21] BF: in SpecDiff, checking for collection attribute in specobject class, not instance (which may be None) --- reproman/distributions/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reproman/distributions/base.py b/reproman/distributions/base.py index e5b15aba9..2cfbe2176 100644 --- a/reproman/distributions/base.py +++ b/reproman/distributions/base.py @@ -488,7 +488,7 @@ def __init__(self, a, b): self.diff_vals_a = a._diff_vals if a else None self.diff_vals_b = b._diff_vals if b else None self.diff_cmp_id = a._diff_cmp_id if a else b._diff_cmp_id - if hasattr(a, '_collection_attribute'): + if hasattr(self.cls, '_collection_attribute'): self.collection = [] a_collection = { c._diff_cmp_id: c for c in a.collection } if a else {} b_collection = { c._diff_cmp_id: c for c in b.collection } if b else {} From 39f5f4e3ea881de587672d918e5686e6bf707870 Mon Sep 17 00:00:00 2001 From: Christian Haselgrove Date: Tue, 16 Apr 2019 20:57:25 +0000 Subject: [PATCH 10/21] added SpecDiff attributes a_only, b_only, and diffs --- reproman/distributions/base.py | 26 +++++++++++++++++++++- reproman/interface/diff.py | 40 ++++++++++------------------------ 2 files changed, 36 insertions(+), 30 deletions(-) diff --git a/reproman/distributions/base.py b/reproman/distributions/base.py index 2cfbe2176..6b43b660d 100644 --- a/reproman/distributions/base.py +++ b/reproman/distributions/base.py @@ -453,6 +453,15 @@ class SpecDiff: collection: a list of SpecDiff objects for the contained SpecObjects. + a_only: SpecObjects from collection that only appear in the first + passed SpecObject + + b_only: SpecObjects from collection that only appear in the second + passed SpecObject + + diffs: SpecDiff objects in collection that appear in both + passed SpecObject + If a and b are lists, they are treated as files specifications, and self.collection is a list of (fname_a, fname_b) tuples. TODO: give files and file collections their own specobjects @@ -480,7 +489,6 @@ def __init__(self, a, b): self.collection.append((fname, None)) else: self.collection.append((fname, fname)) - pass else: if self.cls._diff_cmp_fields: if a and b and a._diff_cmp_id != b._diff_cmp_id: @@ -497,4 +505,20 @@ def __init__(self, a, b): ac = a_collection[cmp_id] if cmp_id in a_collection else None bc = b_collection[cmp_id] if cmp_id in b_collection else None self.collection.append(SpecDiff(ac, bc)) + if hasattr(self, 'collection'): + self.a_only = [] + self.b_only = [] + self.diffs = [] + for pkg_diff in self.collection: + if isinstance(pkg_diff, tuple): + (a, b) = pkg_diff + else: + a = pkg_diff.a + b = pkg_diff.b + if not a: + self.b_only.append(b) + elif not b: + self.a_only.append(a) + elif not isinstance(pkg_diff, tuple) and pkg_diff.diff_vals_a != pkg_diff.diff_vals_b: + self.diffs.append(pkg_diff) return diff --git a/reproman/interface/diff.py b/reproman/interface/diff.py index 1a6a76d34..22e857fd7 100644 --- a/reproman/interface/diff.py +++ b/reproman/interface/diff.py @@ -176,39 +176,21 @@ def render_cmdline_diff(result): files_diff = diff_d['diff'] continue - pkgs_only_1 = {} - pkgs_only_2 = {} - pkg_diffs = [] - - for pkg_diff in diff.collection: - if pkg_diff.a is None: - pkgs_only_2[pkg_diff.b._diff_cmp_id] = pkg_diff.b - elif pkg_diff.b is None: - pkgs_only_1[pkg_diff.a._diff_cmp_id] = pkg_diff.a - elif pkg_diff.diff_vals_a != pkg_diff.diff_vals_b: - pkg_diffs.append(pkg_diff) - if pkgs_only_1 or pkgs_only_2: + if diff.a_only or diff.b_only: print(_make_plural(pkg_type) + ':') - - if pkgs_only_1: - for cmp_key in sorted(pkgs_only_1): - package = pkgs_only_1[cmp_key] - print('< %s' % package.diff_identity_string) status = 3 - if pkgs_only_1 and pkgs_only_2: + for package in diff.a_only: + print('< %s' % package.diff_identity_string) + if diff.a_only and diff.b_only: print('---') - if pkgs_only_2: - for cmp_key in sorted(pkgs_only_2): - package = pkgs_only_2[cmp_key] - print('> %s' % package.diff_identity_string) - status = 3 + for package in diff.b_only: + print('> %s' % package.diff_identity_string) - if pkg_diffs: - for pkg_diff in pkg_diffs: - print('%s %s:' % (pkg_type, ' '.join(pkg_diff.diff_cmp_id))) - print('< %s' % pkg_diff.a.diff_subidentity_string) - print('---') - print('> %s' % pkg_diff.b.diff_subidentity_string) + for pkg_diff in diff.diffs: + print('%s %s:' % (pkg_type, ' '.join(pkg_diff.diff_cmp_id))) + print('< %s' % pkg_diff.a.diff_subidentity_string) + print('---') + print('> %s' % pkg_diff.b.diff_subidentity_string) status = 3 files_1_only = [ t[0] for t in files_diff.collection if t[1] is None ] From cf5db619c2176029aefc2980ec90c0735cf929fc Mon Sep 17 00:00:00 2001 From: Christian Haselgrove Date: Wed, 17 Apr 2019 15:53:04 +0000 Subject: [PATCH 11/21] in SpecDiff: changed diff to a_and_b collecting diff objects in a_only and b_only --- reproman/distributions/base.py | 22 +++++++++++----------- reproman/interface/diff.py | 23 ++++++++++++----------- 2 files changed, 23 insertions(+), 22 deletions(-) diff --git a/reproman/distributions/base.py b/reproman/distributions/base.py index 6b43b660d..2ac66a9c9 100644 --- a/reproman/distributions/base.py +++ b/reproman/distributions/base.py @@ -453,14 +453,14 @@ class SpecDiff: collection: a list of SpecDiff objects for the contained SpecObjects. - a_only: SpecObjects from collection that only appear in the first - passed SpecObject + a_only: SpecDiff objects from collection that only appear in the + first passed SpecObject - b_only: SpecObjects from collection that only appear in the second - passed SpecObject + b_only: SpecDiff objects from collection that only appear in the + second passed SpecObject - diffs: SpecDiff objects in collection that appear in both - passed SpecObject + a_and_b: SpecDiff objects in collection that appear in both + passed SpecObjects If a and b are lists, they are treated as files specifications, and self.collection is a list of (fname_a, fname_b) tuples. @@ -508,7 +508,7 @@ def __init__(self, a, b): if hasattr(self, 'collection'): self.a_only = [] self.b_only = [] - self.diffs = [] + self.a_and_b = [] for pkg_diff in self.collection: if isinstance(pkg_diff, tuple): (a, b) = pkg_diff @@ -516,9 +516,9 @@ def __init__(self, a, b): a = pkg_diff.a b = pkg_diff.b if not a: - self.b_only.append(b) + self.b_only.append(pkg_diff) elif not b: - self.a_only.append(a) - elif not isinstance(pkg_diff, tuple) and pkg_diff.diff_vals_a != pkg_diff.diff_vals_b: - self.diffs.append(pkg_diff) + self.a_only.append(pkg_diff) + else: + self.a_and_b.append(pkg_diff) return diff --git a/reproman/interface/diff.py b/reproman/interface/diff.py index 22e857fd7..06be43f04 100644 --- a/reproman/interface/diff.py +++ b/reproman/interface/diff.py @@ -179,19 +179,20 @@ def render_cmdline_diff(result): if diff.a_only or diff.b_only: print(_make_plural(pkg_type) + ':') status = 3 - for package in diff.a_only: - print('< %s' % package.diff_identity_string) + for pkg_diff in diff.a_only: + print('< %s' % pkg_diff.a.diff_identity_string) if diff.a_only and diff.b_only: print('---') - for package in diff.b_only: - print('> %s' % package.diff_identity_string) - - for pkg_diff in diff.diffs: - print('%s %s:' % (pkg_type, ' '.join(pkg_diff.diff_cmp_id))) - print('< %s' % pkg_diff.a.diff_subidentity_string) - print('---') - print('> %s' % pkg_diff.b.diff_subidentity_string) - status = 3 + for pkg_diff in diff.b_only: + print('> %s' % pkg_diff.b.diff_identity_string) + + for pkg_diff in diff.a_and_b: + if pkg_diff.diff_vals_a != pkg_diff.diff_vals_b: + print('%s %s:' % (pkg_type, ' '.join(pkg_diff.diff_cmp_id))) + print('< %s' % pkg_diff.a.diff_subidentity_string) + print('---') + print('> %s' % pkg_diff.b.diff_subidentity_string) + status = 3 files_1_only = [ t[0] for t in files_diff.collection if t[1] is None ] files_2_only = [ t[1] for t in files_diff.collection if t[0] is None ] From df6d69df1e3a4ab64a661525c1eb5461e0a8b9a6 Mon Sep 17 00:00:00 2001 From: Christian Haselgrove Date: Wed, 17 Apr 2019 16:00:06 +0000 Subject: [PATCH 12/21] added venv hierarchy to diff --- reproman/distributions/venv.py | 3 +++ reproman/interface/diff.py | 30 +++++++++++++++++----- reproman/interface/tests/files/diff_1.yaml | 19 ++++++++++++++ reproman/interface/tests/files/diff_2.yaml | 19 ++++++++++++++ 4 files changed, 65 insertions(+), 6 deletions(-) diff --git a/reproman/distributions/venv.py b/reproman/distributions/venv.py index 92a6c41dd..e017bdfcd 100644 --- a/reproman/distributions/venv.py +++ b/reproman/distributions/venv.py @@ -38,6 +38,8 @@ class VenvPackage(Package): location = attrib() editable = attrib(default=False) files = attrib(default=attr.Factory(list)) + _diff_cmp_fields = ('name', ) + _diff_fields = ('version', ) @attr.s @@ -46,6 +48,7 @@ class VenvEnvironment(SpecObject): python_version = attrib() packages = TypedList(VenvPackage) _diff_cmp_fields = ('path', 'python_version') + _collection_attribute = 'packages' @attr.s diff --git a/reproman/interface/diff.py b/reproman/interface/diff.py index 06be43f04..57a493cdd 100644 --- a/reproman/interface/diff.py +++ b/reproman/interface/diff.py @@ -187,12 +187,30 @@ def render_cmdline_diff(result): print('> %s' % pkg_diff.b.diff_identity_string) for pkg_diff in diff.a_and_b: - if pkg_diff.diff_vals_a != pkg_diff.diff_vals_b: - print('%s %s:' % (pkg_type, ' '.join(pkg_diff.diff_cmp_id))) - print('< %s' % pkg_diff.a.diff_subidentity_string) - print('---') - print('> %s' % pkg_diff.b.diff_subidentity_string) - status = 3 + if not hasattr(pkg_diff, 'collection'): + if pkg_diff.diff_vals_a != pkg_diff.diff_vals_b: + print('%s %s:' % (pkg_type, ' '.join(pkg_diff.diff_cmp_id))) + print('< %s' % pkg_diff.a.diff_subidentity_string) + print('---') + print('> %s' % pkg_diff.b.diff_subidentity_string) + status = 3 + else: + a_only = [ pd.a for pd in pkg_diff.a_only ] + b_only = [ pd.b for pd in pkg_diff.b_only ] + ab = [ pd for pd in pkg_diff.a_and_b + if pd.diff_vals_a != pd.diff_vals_b ] + if a_only or b_only or ab: + print('%s %s:' % (pkg_type, ' '.join(pkg_diff.diff_cmp_id))) + for pkg in a_only: + print('< %s' % pkg.diff_identity_string) + for pd in ab: + print('< %s %s' % (pd.a.diff_identity_string, pd.a.diff_subidentity_string)) + if (a_only and b_only) or ab: + print('---') + for pkg in b_only: + print('> %s' % pkg.diff_identity_string) + for pd in ab: + print('> %s %s' % (pd.b.diff_identity_string, pd.b.diff_subidentity_string)) files_1_only = [ t[0] for t in files_diff.collection if t[1] is None ] files_2_only = [ t[1] for t in files_diff.collection if t[0] is None ] diff --git a/reproman/interface/tests/files/diff_1.yaml b/reproman/interface/tests/files/diff_1.yaml index 1873b86ca..40c498f02 100644 --- a/reproman/interface/tests/files/diff_1.yaml +++ b/reproman/interface/tests/files/diff_1.yaml @@ -73,6 +73,25 @@ distributions: - lib/python2.7/site-packages/six-1.12.0.dist-info/METADATA - lib/python2.7/site-packages/six.pyc - lib/python2.7/site-packages/six.py + - path: /home/user/venvs/test/bth_2 + python_version: 2.7.15+ + packages: + - name: one_only + version: 1.12.0 + local: true + location: /home/user/venvs/test/lib/python2.7/site-packages + files: + - lib/python2.7/site-packages/six-1.12.0.dist-info/METADATA + - lib/python2.7/site-packages/six.pyc + - lib/python2.7/site-packages/six.py + - name: version_mismatch + version: 1.1.0 + local: true + location: /home/user/venvs/test/lib/python2.7/site-packages + files: + - lib/python2.7/site-packages/six-1.12.0.dist-info/METADATA + - lib/python2.7/site-packages/six.pyc + - lib/python2.7/site-packages/six.py - path: /home/user/venvs/test/1_only python_version: 2.7.15+ packages: diff --git a/reproman/interface/tests/files/diff_2.yaml b/reproman/interface/tests/files/diff_2.yaml index 21eb718a7..ff9939fc3 100644 --- a/reproman/interface/tests/files/diff_2.yaml +++ b/reproman/interface/tests/files/diff_2.yaml @@ -73,6 +73,25 @@ distributions: - lib/python2.7/site-packages/six-1.12.0.dist-info/METADATA - lib/python2.7/site-packages/six.pyc - lib/python2.7/site-packages/six.py + - path: /home/user/venvs/test/bth_2 + python_version: 2.7.15+ + packages: + - name: two_only + version: 1.12.0 + local: true + location: /home/user/venvs/test/lib/python2.7/site-packages + files: + - lib/python2.7/site-packages/six-1.12.0.dist-info/METADATA + - lib/python2.7/site-packages/six.pyc + - lib/python2.7/site-packages/six.py + - name: version_mismatch + version: 1.2.0 + local: true + location: /home/user/venvs/test/lib/python2.7/site-packages + files: + - lib/python2.7/site-packages/six-1.12.0.dist-info/METADATA + - lib/python2.7/site-packages/six.pyc + - lib/python2.7/site-packages/six.py - path: /home/user/venvs/test/2_only python_version: 2.7.15+ packages: From 6e9178f3652ec1d7ced44dbbed71c16ec050e1f7 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 16 May 2019 13:53:02 -0400 Subject: [PATCH 13/21] NF(WiP): SpecObject .find and .__iadd__ Should make it possible to "join" multiple specifications, and also avoid duplicates while retracing. Seems to run without crashing on a sample but I do not think it works correctly yet --- reproman/distributions/base.py | 117 ++++++++++++++++++++++++++++++- reproman/distributions/debian.py | 4 ++ reproman/interface/retrace.py | 28 +++++--- 3 files changed, 140 insertions(+), 9 deletions(-) diff --git a/reproman/distributions/base.py b/reproman/distributions/base.py index 2ac66a9c9..f51a3c5aa 100644 --- a/reproman/distributions/base.py +++ b/reproman/distributions/base.py @@ -42,7 +42,8 @@ class SpecObject(object): # needed (or make sure the trivial case is handled) # Fields used to establish the "identity" of the specobject for the - # purposes of diff + # purposes of finding the one and diff + # TODO: rename to e.g. _id_fields? _diff_cmp_fields = tuple() # Fields of the primary interest when showing diff _diff_fields = tuple() @@ -186,6 +187,113 @@ def _identical_to(self, other): return False return all(sv==ov for sv, ov in zip(self._cmp_id, other._cmp_id)) + def _find_attr(self, child): + """Find an attribute among TypedList attrs which might contain the child + + Checks are done based on the type. Returns None if no appropriate + attribute is found + """ + child_class = child.__class__ + assert issubclass(child_class, SpecObject) # only those are supported ATM + compatible_attrs = [] + + # For paranoids - do exhaustive search + for attr in self.__attrs_attrs__: + metadata_type = attr.metadata.get('type', None) + if metadata_type \ + and issubclass(child_class, metadata_type) \ + and issubclass(attr.default.factory, list): # TODO: fragile + # we got our hit for a possible container containing our + # classes + # TODO: cache mapping from child_class to known types + compatible_attrs.append(attr) + + if not compatible_attrs: + return + + # If multiple provide list of the same class -- blow for now? + if len(compatible_attrs) > 1: + raise ValueError( + "Multiple attributes seems to contain instances of %s: %s " + "ATM we are not supporting that." + % (child_class, compatible_attrs) + ) + return compatible_attrs[0] + + def _find_in_attr(self, child, attr): + """Given an attribute (should be containing an iterable) find a child + + No checks for attribute appropriateness (type check etc) for the child + is carried out + """ + + if not attr: + return # Found no attribute which might have contained it + + child_id = child._diff_cmp_id + values = getattr(self, attr.name) # TODO: isn't there a better way? + hits = [ + v for v in values + if v._diff_cmp_id == child_id + ] + + # checks + # TODO: parametrize etc to generate/allow for multiple hits? + if not hits: + return None + elif len(hits) > 1: + raise RuntimeError( + "Found multiple hits for %s in %s: %s . ATM we expect only a " + "single one", + child, attr.name, hits + ) + return hits[0] + + def find(self, child): + """Find an object among TypedList attrs which matches (identity vice) + """ + attr = self._find_attr(child) + if not attr: + return None + return self._find_in_attr(child, attr) + + def __iadd__(self, other): + """Add information from another object into this one""" + # Check the other one is of the same kind + assert isinstance(other, self.__class__) + # TEMP -- to decide what to do when subclass. For now take only + # exactly the same + assert other.__class__ is self.__class__ + # go through attrs, and if the other one defines a single attr which + # is not defined here -- blow + # For the lists - add those which aren't found, and found should get + # += 'ed if spec objects, appended otherwise + for a in self.__attrs_attrs__: + a_self = getattr(self, a.name) + a_other = getattr(other, a.name) + if a_self == a_other: + # All good and nothing for us to do here + continue + if isinstance(a.default, attr.Factory) and a.default.factory is list: + # we have a list of things... + if issubclass(a.metadata['type'], SpecObject): + # we might know what to do + for a_other_value in a_other: + a_self_value = self._find_in_attr(a_other_value, a) + if a_self_value is None: + # a new one! + a.append(a_other_value) + else: + # Delegate doing the right thing to the child's __iadd__ + a_self_value += a_other_value + else: + raise NotImplementedError( + "For now joining only lists of our own spec objects" + ) + # import pdb; pdb.set_trace() + pass + + def _register_with_representer(cls): # TODO: check if we could/should just inherit from yaml.YAMLObject @@ -211,6 +319,13 @@ class Distribution(SpecObject, metaclass=abc.ABCMeta): # name and looks awkward name = attrib(default=attr.NOTHING) + # Distributions are typically a singular entity on a system, although + # in some cases there could be multiple (e.g. conda installations) which + # managed to get used. + # So their identification by default will be done just based on the name + _diff_cmp_fields = ('name',) + + @staticmethod def factory(distribution_type, provenance=None): """ diff --git a/reproman/distributions/debian.py b/reproman/distributions/debian.py index 3d33d4a5b..0472e346c 100644 --- a/reproman/distributions/debian.py +++ b/reproman/distributions/debian.py @@ -66,6 +66,10 @@ class APTSource(SpecObject): site = attrib() archive_uri = attrib() date = attrib() + + # name contains a suffix so we probably should not use it + _diff_cmp_fields = ('origin', 'codename', 'component', 'architecture') + _register_with_representer(APTSource) diff --git a/reproman/interface/retrace.py b/reproman/interface/retrace.py index 1f4019b6b..55a9cc2d4 100644 --- a/reproman/interface/retrace.py +++ b/reproman/interface/retrace.py @@ -169,7 +169,11 @@ def identify_distributions(files, session=None, tracer_classes=None): # as they identify files beloning to them files_to_consider = set(files) - distibutions = [] + from reproman.distributions.base import EnvironmentSpec + environment_spec = EnvironmentSpec( + distributions=[] + ) + files_processed = set() files_to_trace = files_to_consider @@ -207,16 +211,23 @@ def identify_distributions(files, session=None, tracer_classes=None): # files_to_trace if files_to_trace: remaining_files_to_trace = files_to_trace - nenvs = 0 - for env, remaining_files_to_trace in tracer.identify_distributions( + nnewdists, nolddists = 0, 0 + for dist, remaining_files_to_trace in tracer.identify_distributions( files_to_trace): - distibutions.append(env) - nenvs += 1 + old_dist = environment_spec.find(dist) + if not old_dist: + nnewdists += 1 + environment_spec.distributions.append(dist) + else: + nolddists += 1 + old_dist += dist + nnewdists += 1 files_processed |= files_to_trace - remaining_files_to_trace files_to_trace = remaining_files_to_trace - lgr.info("%s: %d envs with %d other files remaining", + lgr.info("%s: %d new and %d old distributions with %d other files remaining", Tracer.__name__, - nenvs, + nnewdists, + nolddists, len(files_to_trace)) # Re-combine any files that were skipped @@ -230,7 +241,8 @@ def identify_distributions(files, session=None, tracer_classes=None): lgr.info("No more changes or files to track. Exiting the loop") break - return distibutions, files_to_consider + # TODO: -- return environment_spec itself? + return environment_spec.distributions, files_to_consider def get_tracer_classes(): From 85cae7ff8d6f073b03cfb351c3e9e7dfafe912c6 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 16 May 2019 14:41:08 -0400 Subject: [PATCH 14/21] temp comments --- reproman/distributions/base.py | 4 ++++ reproman/interface/retrace.py | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/reproman/distributions/base.py b/reproman/distributions/base.py index f51a3c5aa..c1922024f 100644 --- a/reproman/distributions/base.py +++ b/reproman/distributions/base.py @@ -264,6 +264,10 @@ def __iadd__(self, other): # TEMP -- to decide what to do when subclass. For now take only # exactly the same assert other.__class__ is self.__class__ + # TODO: consider being able to add objects this class knows about. + # e.g. Adding DebianPackage's to .packages, or DebianDistribution + # to .distributions. That might eliminate if/then/else logic in + # retrace # go through attrs, and if the other one defines a single attr which # is not defined here -- blow # For the lists - add those which aren't found, and found should get diff --git a/reproman/interface/retrace.py b/reproman/interface/retrace.py index 55a9cc2d4..6c4b2dbd8 100644 --- a/reproman/interface/retrace.py +++ b/reproman/interface/retrace.py @@ -212,6 +212,12 @@ def identify_distributions(files, session=None, tracer_classes=None): if files_to_trace: remaining_files_to_trace = files_to_trace nnewdists, nolddists = 0, 0 + # TODO: note that here we start with the full list, but then + # do not shrink it (use remaining_files_to_trace) while looping + # The issue https://github.com/ReproNim/reproman/issues/417 + # TODO: + # refactor to return a list of dists, and a single remaining_files_to_trace + # Closes #417 when done ;) for dist, remaining_files_to_trace in tracer.identify_distributions( files_to_trace): old_dist = environment_spec.find(dist) From c4eb4fb679649c7d1a5f7a5454385c77cf8951b2 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 23 May 2019 12:03:29 -0400 Subject: [PATCH 15/21] BF(TST): establish FakeDistribution to mock test our distributions discovery --- reproman/interface/tests/test_retrace.py | 37 +++++++++++++++++------- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/reproman/interface/tests/test_retrace.py b/reproman/interface/tests/test_retrace.py index f70057edc..64ce860aa 100644 --- a/reproman/interface/tests/test_retrace.py +++ b/reproman/interface/tests/test_retrace.py @@ -6,11 +6,14 @@ # # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## +import attr + from reproman.cmdline.main import main from reproman.formats import Provenance import logging +from reproman.distributions import Distribution from reproman.utils import swallow_logs, swallow_outputs, make_tempfile from reproman.tests.utils import assert_in from reproman.tests.skip import mark @@ -50,6 +53,20 @@ def test_retrace_normalize_paths(): assert "name: debian" in cm.out +@attr.s(cmp=False) +class FakeDistribution(Distribution): + # Allow to compare by name + def __eq__(self, other): + if isinstance(other, str): + return self.name == other + return super().__eq__(other) + + def initiate(self): + pass + def install_packages(self): + pass + + def get_tracer_session(protocols): class FakeSession(object): """A fake session attributes and methods of which should not @@ -95,12 +112,12 @@ def test_retrace_loop_over_tracers(): [ # Tracers [ # Tracer passes [ # what to yield - ("Env1", {"thefile"}) + (FakeDistribution("Dist1"), {"thefile"}) ] ] ], files=["thefile"], - tenvs=['Env1'], + tenvs=['Dist1'], tfiles={"thefile"}) # The 2nd tracer consumes everything @@ -108,17 +125,17 @@ def test_retrace_loop_over_tracers(): [ # Tracers [ # Tracer passes [ # what to yield - ("Env1", {"thefile"}) + (FakeDistribution("Dist1"), {"thefile"}) ], ], [ # Tracer passes [ # what to yield - ("Env2", set()) + (FakeDistribution("Dist2"), set()) ], ] ], files=["thefile"], - tenvs=['Env1', 'Env2'], + tenvs=['Dist1', 'Dist2'], tfiles=set()) # The fancy multi-yield and producing stuff @@ -126,17 +143,17 @@ def test_retrace_loop_over_tracers(): [ # Tracers [ # Tracer passes [ # what to yield - ("Env1", {"file2", "file3"}), + (FakeDistribution("Dist1"), {"file2", "file3"}), ], [ - ("Env3", {"file3"}) # consume file4 + (FakeDistribution("Dist3"), {"file3"}) # consume file4 ], [] # finale ], [ # Tracer passes [ # what to yield - ("Env2", {"file3", "file4", "file5"}), - ("Env2.1", {"file3", "file4"}) + (FakeDistribution("Dist2"), {"file3", "file4", "file5"}), + (FakeDistribution("Dist2.1"), {"file3", "file4"}) ], [ @@ -145,5 +162,5 @@ def test_retrace_loop_over_tracers(): ] ], files=["file1", "file2"], - tenvs=['Env1', 'Env2', 'Env2.1', 'Env3'], + tenvs=['Dist1', 'Dist2', 'Dist2.1', 'Dist3'], tfiles={'file3'}) \ No newline at end of file From 0d1209e61bbe471b9feceeb0e41263ba16b85b7f Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 30 May 2019 10:28:57 -0400 Subject: [PATCH 16/21] NF: cfg retrace.only_with_files to make possible including packages only with files This is a very preliminary crude implementation, needed to bring sanity to test tracing conda and venv. Even without tests ATM :-/ Ideally we should rewamp configuration management so all those options could be specified at command line and have some centralized definition/management and easy ways to mock/patch for testing --- reproman.cfg | 6 ++++++ reproman/formats/reproman.py | 37 +++++++++++++++++++++++++++++++++++- 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/reproman.cfg b/reproman.cfg index 6ac3c7eb7..b19c7b7fa 100644 --- a/reproman.cfg +++ b/reproman.cfg @@ -1,6 +1,12 @@ [general] inventory_file = ~/inventory.yml +[retrace] +# report only the "packages" which have associated files +# It is cheap to gather information on all packages in some environments +# (conda, virtualenv, etc) so by default we would report all +only_with_files = false + [aws] access_key_id = AWS_ACCESS_KEY secret_access_key = AWS_SECRET_ACCESS_KEY diff --git a/reproman/formats/reproman.py b/reproman/formats/reproman.py index 6042a35bd..c2d865d27 100644 --- a/reproman/formats/reproman.py +++ b/reproman/formats/reproman.py @@ -12,6 +12,7 @@ from __future__ import absolute_import import collections +import copy import datetime import logging from collections import OrderedDict @@ -25,7 +26,10 @@ from reproman.utils import instantiate_attr_object from .base import Provenance from .utils import write_config -from .. import utils +from .. import ( + cfg, + utils, +) from ..distributions import Distribution from ..dochelpers import exc_str @@ -283,11 +287,42 @@ def write(cls, output, spec): out = OrderedDict() out['version'] = __version__ + if cfg.getboolean('retrace', 'only_with_files', False): + spec = copy.deepcopy(spec) + filter_spec(spec, packages_only_with_files=True) out.update(spec_to_dict(spec)) write_config(output, out) return out +def filter_spec(spec, packages_only_with_files=False): + """Filter out spec + + Parameters + ---------- + packages_only_with_files: bool, optional + Will return a spec with packages which have no files associated removed + """ + if not hasattr(spec, '__attrs_attrs__'): + return # nothing for us to do with this one + spec_attrs = spec.__attrs_attrs__ # as is -- list of them + for attr in spec_attrs: + value_in = getattr(spec, attr.name, None) + if attr.name.lower() == 'packages' and value_in and packages_only_with_files: + # go through them + assert isinstance(value_in, list) + value_out = [ + p for p in value_in + if getattr(p, 'files', None) + ] + setattr(spec, attr.name, value_out) + # we might need to recurse if it is a list + if isinstance(value_in, (list, dict, tuple)): + # might be specs too, recurse + for v in (value_in.values() if isinstance(value_in, dict) else value_in): + filter_spec(v, packages_only_with_files=packages_only_with_files) + + # TODO: RF into SpecObject._as_dict() def spec_to_dict(spec): From 14065c7c7ec19d470e48698ae9c1f04fd312445c Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 30 May 2019 11:06:03 -0400 Subject: [PATCH 17/21] ENH: -t, --tracer option for retrace to be able to choose which ones to use TODOs: - possibly RF for better API (changed to return dict in get_tracer_classes) - we should know which tracers are available without importing the classes which should be delayed until use - might as well take care about # TODO: automate discovery of available tracers which should be possible if we unify naming for submodules/tracer classes - that list of tracers should appear in the --help for -t, --tracer - add basic tests for selection --- reproman/interface/retrace.py | 48 ++++++++++++++++++++---- reproman/interface/tests/test_retrace.py | 4 +- 2 files changed, 43 insertions(+), 9 deletions(-) diff --git a/reproman/interface/retrace.py b/reproman/interface/retrace.py index 6c4b2dbd8..cd37debdf 100644 --- a/reproman/interface/retrace.py +++ b/reproman/interface/retrace.py @@ -11,6 +11,8 @@ from __future__ import unicode_literals +# Since 3.7 shouldn't be needed +from collections import OrderedDict from os.path import normpath import sys import time @@ -75,13 +77,22 @@ class Retrace(Interface): instance can be passed as the value for `resref`. PY]""", constraints=EnsureStr() | EnsureNone()), resref_type=resref_type_opt, + tracers=Parameter( + args=("-t", "--tracer"), + dest="tracers", + doc="Tracer(s) to use. Could be specified multiple times. " + "They will be used in the order specified.", + metavar='TRACER', + action='append', + constraints=EnsureStr() | EnsureNone(), + ), ) # TODO: add a session/resource so we could trace within # arbitrary sessions @staticmethod def __call__(path=None, spec=None, output_file=None, - resref=None, resref_type="auto"): + resref=None, resref_type="auto", tracers=None): # heavy import -- should be delayed until actually used if not (spec or path): @@ -119,9 +130,24 @@ def __call__(path=None, spec=None, output_file=None, # Generalize # TODO: RF so that only the above portion is reprozip specific. # If we are to reuse their layout largely -- the rest should stay as is + if tracers: + # specific ones asked for + all_tracer_classes = get_tracer_classes() + + tracer_classes = OrderedDict() + for t in tracers: + if t not in all_tracer_classes: + raise ValueError( + "Do not know tracer type '%s'. Known are %s" + % (t, ', '.join(all_tracer_classes))) + tracer_classes[t] = all_tracer_classes[t] + else: + tracer_classes = None + (distributions, files) = identify_distributions( paths, - session=session + session=session, + tracer_classes=tracer_classes, ) from reproman.distributions.base import EnvironmentSpec spec = EnvironmentSpec( @@ -148,6 +174,8 @@ def identify_distributions(files, session=None, tracer_classes=None): ---------- files : iterable Files to consider + tracer_classes: dict, optional + a dict with tracers (name: class) Returns ------- @@ -190,7 +218,7 @@ def identify_distributions(files, session=None, tracer_classes=None): % max_niter) break - for Tracer in tracer_classes: + for Tracer in tracer_classes.values(): lgr.debug("Tracing using %s", Tracer.__name__) # TODO: memoize across all loops # Identify directories from the files_to_consider @@ -252,7 +280,7 @@ def identify_distributions(files, session=None, tracer_classes=None): def get_tracer_classes(): - """A helper which returns a list of all available Tracers + """A helper which returns a dict (name: class) of all available Tracers The order should not but does matter and ATM is magically provided """ @@ -264,6 +292,12 @@ def get_tracer_classes(): from reproman.distributions.vcs import VCSTracer from reproman.distributions.docker import DockerTracer from reproman.distributions.singularity import SingularityTracer - Tracers = [DebTracer, RPMTracer, CondaTracer, VenvTracer, VCSTracer, - DockerTracer, SingularityTracer] - return Tracers + tracers = OrderedDict() + tracers['deb'] = DebTracer + tracers['rpm'] = RPMTracer + tracers['conda'] = CondaTracer + tracers['venv'] = VenvTracer + tracers['vcs'] = VCSTracer + tracers['docker'] = DockerTracer + tracers['singularity'] = SingularityTracer + return tracers diff --git a/reproman/interface/tests/test_retrace.py b/reproman/interface/tests/test_retrace.py index 1e963eda6..ca00e1e9c 100644 --- a/reproman/interface/tests/test_retrace.py +++ b/reproman/interface/tests/test_retrace.py @@ -80,7 +80,7 @@ class FakeSession(object): def isdir(self, _): return False # TODO: make it parametric - tracer_classes = [] + tracer_classes = {} for itracer, protocol in enumerate(protocols): # Test the loop logic class FakeTracer(object): @@ -97,7 +97,7 @@ def identify_distributions(self, files): for item in self._current_protocol: yield item FakeTracer.__name__ = "FakeTracer%d" % itracer - tracer_classes.append(FakeTracer) + tracer_classes['fake'] = FakeTracer return tracer_classes, FakeSession() From d6bf28b3b4cd10bf30e9e8cce77db37bd414a0b4 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 12 Sep 2019 17:37:14 -0400 Subject: [PATCH 18/21] BF: tracer_classes is a dict now, go through .values --- reproman/interface/tests/test_retrace.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reproman/interface/tests/test_retrace.py b/reproman/interface/tests/test_retrace.py index ca00e1e9c..981cb3f57 100644 --- a/reproman/interface/tests/test_retrace.py +++ b/reproman/interface/tests/test_retrace.py @@ -105,7 +105,7 @@ def _check_loop_protocol(protocols, files, tenvs, tfiles): tracer_classes, session = get_tracer_session(protocols) dists, unknown_files = identify_distributions( files, session, tracer_classes=tracer_classes) - assert not any(t._protocol for t in tracer_classes), "we exhausted the protocol" + assert not any(t._protocol for t in tracer_classes.values()), "we exhausted the protocol" assert dists == tenvs assert unknown_files == tfiles From e64f317d8ea93a5b86f9891729a5a823f5467b8a Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 12 Sep 2019 18:14:14 -0400 Subject: [PATCH 19/21] BF: give a fake tracer a unique index Before RFing into dict it was a list to which we were appending. With switching to dict we started to override the same tracer. Now they all will exist and in the order they were added --- reproman/interface/tests/test_retrace.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/reproman/interface/tests/test_retrace.py b/reproman/interface/tests/test_retrace.py index 981cb3f57..6067f6be2 100644 --- a/reproman/interface/tests/test_retrace.py +++ b/reproman/interface/tests/test_retrace.py @@ -8,6 +8,8 @@ import attr +from collections import OrderedDict + from reproman.cmdline.main import main from reproman.formats import Provenance @@ -80,7 +82,7 @@ class FakeSession(object): def isdir(self, _): return False # TODO: make it parametric - tracer_classes = {} + tracer_classes = OrderedDict() for itracer, protocol in enumerate(protocols): # Test the loop logic class FakeTracer(object): @@ -97,7 +99,7 @@ def identify_distributions(self, files): for item in self._current_protocol: yield item FakeTracer.__name__ = "FakeTracer%d" % itracer - tracer_classes['fake'] = FakeTracer + tracer_classes['fake%d' % itracer] = FakeTracer return tracer_classes, FakeSession() From db188c2b49f4975bf3daf8ace39551066fde441c Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Thu, 12 Sep 2019 18:39:57 -0400 Subject: [PATCH 20/21] BF: if there is no value, we just assign it, not try to append --- reproman/distributions/base.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/reproman/distributions/base.py b/reproman/distributions/base.py index c1922024f..a5b9e6428 100644 --- a/reproman/distributions/base.py +++ b/reproman/distributions/base.py @@ -286,7 +286,7 @@ def __iadd__(self, other): a_self_value = self._find_in_attr(a_other_value, a) if a_self_value is None: # a new one! - a.append(a_other_value) + setattr(self, a.name, [a_other_value]) else: # Delegate doing the right thing to the child's __iadd__ a_self_value += a_other_value @@ -294,6 +294,8 @@ def __iadd__(self, other): raise NotImplementedError( "For now joining only lists of our own spec objects" ) + else: + raise NotImplementedError("I think") # import pdb; pdb.set_trace() pass From ba9cd2098b9053580f5295c29895b0d19606c444 Mon Sep 17 00:00:00 2001 From: Christian Haselgrove Date: Wed, 9 Oct 2019 13:11:28 +0000 Subject: [PATCH 21/21] renamed SpecObject._cmp_id to _diff_id (for consistency) returning actual attribute values, not strings, in SpecObject._diff_vals --- reproman/distributions/base.py | 26 +++++++------------------- 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/reproman/distributions/base.py b/reproman/distributions/base.py index a5b9e6428..9c57549a0 100644 --- a/reproman/distributions/base.py +++ b/reproman/distributions/base.py @@ -58,27 +58,15 @@ def _diff_cmp_id(self): return tuple(getattr(self, a) for a in self._diff_cmp_fields) @property - def _cmp_id(self): - if not self._diff_cmp_fields: - # Might need to be gone or some custom exception - raise RuntimeError( - "Cannot establish identity of %r since _diff_cmp_fields " - "are not defined" % self) - if not self._diff_fields: - # Might need to be gone or some custom exception - raise RuntimeError( - "Cannot establish identity of %r since _diff_fields " - "are not defined" % self) - vals = [ getattr(self, a) for a in self._diff_cmp_fields ] - vals.extend([ getattr(self, a) for a in self._diff_fields ]) - return tuple(vals) + def _diff_id(self): + return self._diff_cmp_id + self._diff_vals @property def _diff_vals(self): """gives the values of the attributes defined by _diff_fields (like _diff_cmp_id for _diff_cmp_fields) """ - return tuple(str(getattr(self, a)) for a in self._diff_fields) + return tuple(getattr(self, a) for a in self._diff_fields) @property def diff_identity_string(self): @@ -103,7 +91,7 @@ def identity_string(self): """like diff_identity_string, but for both _diff_cmp_fields and _diff_fields (used in satisfied_by comparisons) """ - return " ".join(str(el) for el in self._cmp_id if el is not None) + return " ".join(str(el) for el in self._diff_id if el is not None) # TODO: make it "lazy" or may be there is already a helper in attrs? @property @@ -168,7 +156,7 @@ def _satisfied_by(self, other): raise TypeError('don''t know how to determine if a %s is satisfied by a %s' % (self.__class__, other_collection_type)) if not isinstance(other, self.__class__): raise TypeError('incompatible specobject types') - for (self_value, other_value) in zip(self._cmp_id, other._cmp_id): + for (self_value, other_value) in zip(self._diff_id, other._diff_id): if self_value is None: continue if self_value != other_value: @@ -181,11 +169,11 @@ def _identical_to(self, other): We require that the objects are of the same type and that the values of the attributes given by _diff_cmp_fields and _diff_fields - (dereferenced in _cmp_id) are the same. + (dereferenced in _diff_id) are the same. """ if not isinstance(other, self.__class__): return False - return all(sv==ov for sv, ov in zip(self._cmp_id, other._cmp_id)) + return all(sv==ov for sv, ov in zip(self._diff_id, other._diff_id)) def _find_attr(self, child): """Find an attribute among TypedList attrs which might contain the child