diff --git a/reproman.cfg b/reproman.cfg index 6ac3c7eb7..b19c7b7fa 100644 --- a/reproman.cfg +++ b/reproman.cfg @@ -1,6 +1,12 @@ [general] inventory_file = ~/inventory.yml +[retrace] +# report only the "packages" which have associated files +# It is cheap to gather information on all packages in some environments +# (conda, virtualenv, etc) so by default we would report all +only_with_files = false + [aws] access_key_id = AWS_ACCESS_KEY secret_access_key = AWS_SECRET_ACCESS_KEY diff --git a/reproman/distributions/base.py b/reproman/distributions/base.py index 6f56083bb..9c57549a0 100644 --- a/reproman/distributions/base.py +++ b/reproman/distributions/base.py @@ -42,12 +42,11 @@ class SpecObject(object): # needed (or make sure the trivial case is handled) # Fields used to establish the "identity" of the specobject for the - # purposes of diff + # purposes of finding the one and diff + # TODO: rename to e.g. _id_fields? _diff_cmp_fields = tuple() # Fields of the primary interest when showing diff _diff_fields = tuple() - # Fields used in determination of comparison (satisfied_by and identical_to) - _comparison_fields = tuple() @property def _diff_cmp_id(self): @@ -59,20 +58,15 @@ def _diff_cmp_id(self): return tuple(getattr(self, a) for a in self._diff_cmp_fields) @property - def _cmp_id(self): - if not self._comparison_fields: - # Might need to be gone or some custom exception - raise RuntimeError( - "Cannot establish identity of %r since _comaprison_fields " - "are not defined" % self) - return tuple(getattr(self, a) for a in self._comparison_fields) + def _diff_id(self): + return self._diff_cmp_id + self._diff_vals @property def _diff_vals(self): """gives the values of the attributes defined by _diff_fields (like _diff_cmp_id for _diff_cmp_fields) """ - return tuple(str(getattr(self, a)) for a in self._diff_fields) + return tuple(getattr(self, a) for a in self._diff_fields) @property def diff_identity_string(self): @@ -94,10 +88,10 @@ def diff_subidentity_string(self): @property def identity_string(self): - """like diff_identity_string, but for _comparison_fields (used in - satisfied_by comparisons) + """like diff_identity_string, but for both _diff_cmp_fields and + _diff_fields (used in satisfied_by comparisons) """ - return " ".join(str(el) for el in self._cmp_id if el is not None) + return " ".join(str(el) for el in self._diff_id if el is not None) # TODO: make it "lazy" or may be there is already a helper in attrs? @property @@ -137,15 +131,15 @@ def _satisfied_by(self, other): spec object. We require that the values of the attributes given by - _comparison_fields are the same. A specobject with a value of None - for one of these attributes is less specific than one with - a specific value; the former cannot satisfy the latter, - but the latter can satisfy the former. + _diff_cmp_fields and _diff_fields are the same. A specobject + with a value of None for one of these attributes is less specific + than one with a specific value; the former cannot satisfy the + latter, but the latter can satisfy the former. TODO: Ensure we don't encounter the case where self is completely unspecified (all values are None), in which case satisfied_by() returns True by default. Perhaps this is done by making - sure that at least one of the _comparison_fields cannot be None. + sure that at least one of the _diff_cmp_fields cannot be None. TODO: derive _collection_type directly from _collection. This isn't possible at the moment because DebianDistribution.packages is @@ -162,9 +156,7 @@ def _satisfied_by(self, other): raise TypeError('don''t know how to determine if a %s is satisfied by a %s' % (self.__class__, other_collection_type)) if not isinstance(other, self.__class__): raise TypeError('incompatible specobject types') - for attr_name in self._comparison_fields: - self_value = getattr(self, attr_name) - other_value = getattr(other, attr_name) + for (self_value, other_value) in zip(self._diff_id, other._diff_id): if self_value is None: continue if self_value != other_value: @@ -176,14 +168,125 @@ def _identical_to(self, other): """Determine if the other object is identical to the spec object. We require that the objects are of the same type and that the - values of the attributes given by _comparison_fields are the same. + values of the attributes given by _diff_cmp_fields and _diff_fields + (dereferenced in _diff_id) are the same. """ if not isinstance(other, self.__class__): return False - for attr_name in self._comparison_fields: - if getattr(self, attr_name) != getattr(other, attr_name): - return False - return True + return all(sv==ov for sv, ov in zip(self._diff_id, other._diff_id)) + + def _find_attr(self, child): + """Find an attribute among TypedList attrs which might contain the child + + Checks are done based on the type. Returns None if no appropriate + attribute is found + """ + child_class = child.__class__ + assert issubclass(child_class, SpecObject) # only those are supported ATM + compatible_attrs = [] + + # For paranoids - do exhaustive search + for attr in self.__attrs_attrs__: + metadata_type = attr.metadata.get('type', None) + if metadata_type \ + and issubclass(child_class, metadata_type) \ + and issubclass(attr.default.factory, list): # TODO: fragile + # we got our hit for a possible container containing our + # classes + # TODO: cache mapping from child_class to known types + compatible_attrs.append(attr) + + if not compatible_attrs: + return + + # If multiple provide list of the same class -- blow for now? + if len(compatible_attrs) > 1: + raise ValueError( + "Multiple attributes seems to contain instances of %s: %s " + "ATM we are not supporting that." + % (child_class, compatible_attrs) + ) + return compatible_attrs[0] + + def _find_in_attr(self, child, attr): + """Given an attribute (should be containing an iterable) find a child + + No checks for attribute appropriateness (type check etc) for the child + is carried out + """ + + if not attr: + return # Found no attribute which might have contained it + + child_id = child._diff_cmp_id + values = getattr(self, attr.name) # TODO: isn't there a better way? + hits = [ + v for v in values + if v._diff_cmp_id == child_id + ] + + # checks + # TODO: parametrize etc to generate/allow for multiple hits? + if not hits: + return None + elif len(hits) > 1: + raise RuntimeError( + "Found multiple hits for %s in %s: %s . ATM we expect only a " + "single one", + child, attr.name, hits + ) + return hits[0] + + def find(self, child): + """Find an object among TypedList attrs which matches (identity vice) + """ + attr = self._find_attr(child) + if not attr: + return None + return self._find_in_attr(child, attr) + + def __iadd__(self, other): + """Add information from another object into this one""" + # Check the other one is of the same kind + assert isinstance(other, self.__class__) + # TEMP -- to decide what to do when subclass. For now take only + # exactly the same + assert other.__class__ is self.__class__ + # TODO: consider being able to add objects this class knows about. + # e.g. Adding DebianPackage's to .packages, or DebianDistribution + # to .distributions. That might eliminate if/then/else logic in + # retrace + # go through attrs, and if the other one defines a single attr which + # is not defined here -- blow + # For the lists - add those which aren't found, and found should get + # += 'ed if spec objects, appended otherwise + for a in self.__attrs_attrs__: + a_self = getattr(self, a.name) + a_other = getattr(other, a.name) + if a_self == a_other: + # All good and nothing for us to do here + continue + if isinstance(a.default, attr.Factory) and a.default.factory is list: + # we have a list of things... + if issubclass(a.metadata['type'], SpecObject): + # we might know what to do + for a_other_value in a_other: + a_self_value = self._find_in_attr(a_other_value, a) + if a_self_value is None: + # a new one! + setattr(self, a.name, [a_other_value]) + else: + # Delegate doing the right thing to the child's __iadd__ + a_self_value += a_other_value + else: + raise NotImplementedError( + "For now joining only lists of our own spec objects" + ) + else: + raise NotImplementedError("I think") + # import pdb; pdb.set_trace() + pass + def _register_with_representer(cls): @@ -210,6 +313,13 @@ class Distribution(SpecObject, metaclass=abc.ABCMeta): # name and looks awkward name = attrib(default=attr.NOTHING) + # Distributions are typically a singular entity on a system, although + # in some cases there could be multiple (e.g. conda installations) which + # managed to get used. + # So their identification by default will be done just based on the name + _diff_cmp_fields = ('name',) + + @staticmethod def factory(distribution_type, provenance=None): """ @@ -423,3 +533,101 @@ def _create_package(self, **package_fields): provided by _get_packagefields_for_files """ return + + +class SpecDiff: + + """Difference object for SpecObjects. + + Instantiate with SpecDiff(a, b). a and b must be of the same type + or TypeError is raised. Either (but not both) may be None. + + Attributes: + + a, b: The two objects being compared. + + If _diff_cmp_fields is defined for the SpecObjects: + + diff_cmp_id: The _diff_cmp_id of the two objects. If + _diff_cmp_id are different for the two objects, + they cannot be compared and ValueError is raised. + + diff_vals_a, diff_vals_b: _diff_vals for a and b, respectively, + or None if a or b is None. + + For collection SpecObjects (e.g. DebianDistribution, containing + DEBPackages; these have _collection_attribute defined), we also + have: + + collection: a list of SpecDiff objects for the contained + SpecObjects. + + a_only: SpecDiff objects from collection that only appear in the + first passed SpecObject + + b_only: SpecDiff objects from collection that only appear in the + second passed SpecObject + + a_and_b: SpecDiff objects in collection that appear in both + passed SpecObjects + + If a and b are lists, they are treated as files specifications, and + self.collection is a list of (fname_a, fname_b) tuples. + TODO: give files and file collections their own specobjects + """ + + def __init__(self, a, b): + if not isinstance(a, (SpecObject, list, type(None))) \ + or not isinstance(b, (SpecObject, list, type(None))): + raise TypeError('objects must be SpecObjects or None') + if not a and not b: + raise TypeError('objects cannot both be None') + if a and b and type(a) != type(b): + raise TypeError('objects must be of the same type') + self.cls = type(a) if a is not None else type(b) + self.a = a + self.b = b + if self.cls == list: + a_collection = set(a) + b_collection = set(b) + self.collection = [] + for fname in set(a_collection).union(b_collection): + if fname not in a_collection: + self.collection.append((None, fname)) + elif fname not in b_collection: + self.collection.append((fname, None)) + else: + self.collection.append((fname, fname)) + else: + if self.cls._diff_cmp_fields: + if a and b and a._diff_cmp_id != b._diff_cmp_id: + raise ValueError('objects\' _diff_cmp_id differ') + self.diff_vals_a = a._diff_vals if a else None + self.diff_vals_b = b._diff_vals if b else None + self.diff_cmp_id = a._diff_cmp_id if a else b._diff_cmp_id + if hasattr(self.cls, '_collection_attribute'): + self.collection = [] + a_collection = { c._diff_cmp_id: c for c in a.collection } if a else {} + b_collection = { c._diff_cmp_id: c for c in b.collection } if b else {} + all_cmp_ids = set(a_collection).union(b_collection) + for cmp_id in all_cmp_ids: + ac = a_collection[cmp_id] if cmp_id in a_collection else None + bc = b_collection[cmp_id] if cmp_id in b_collection else None + self.collection.append(SpecDiff(ac, bc)) + if hasattr(self, 'collection'): + self.a_only = [] + self.b_only = [] + self.a_and_b = [] + for pkg_diff in self.collection: + if isinstance(pkg_diff, tuple): + (a, b) = pkg_diff + else: + a = pkg_diff.a + b = pkg_diff.b + if not a: + self.b_only.append(pkg_diff) + elif not b: + self.a_only.append(pkg_diff) + else: + self.a_and_b.append(pkg_diff) + return diff --git a/reproman/distributions/conda.py b/reproman/distributions/conda.py index e20428645..46d4dc714 100644 --- a/reproman/distributions/conda.py +++ b/reproman/distributions/conda.py @@ -138,6 +138,7 @@ class CondaDistribution(Distribution): environments = TypedList(CondaEnvironment) _cmp_field = ('path',) + _collection_attribute = 'packages' def initiate(self, environment): """ diff --git a/reproman/distributions/debian.py b/reproman/distributions/debian.py index 776aae9c2..8f02bb8e5 100644 --- a/reproman/distributions/debian.py +++ b/reproman/distributions/debian.py @@ -66,6 +66,10 @@ class APTSource(SpecObject): site = attrib() archive_uri = attrib() date = attrib() + + # name contains a suffix so we probably should not use it + _diff_cmp_fields = ('origin', 'codename', 'component', 'architecture') + _register_with_representer(APTSource) @@ -89,7 +93,6 @@ class DEBPackage(Package): _diff_cmp_fields = ('name', 'architecture') _diff_fields = ('version',) - _comparison_fields = ('name', 'architecture', 'version') _register_with_representer(DEBPackage) diff --git a/reproman/distributions/redhat.py b/reproman/distributions/redhat.py index cec4b61c4..9671217db 100644 --- a/reproman/distributions/redhat.py +++ b/reproman/distributions/redhat.py @@ -65,7 +65,8 @@ class RPMPackage(Package): vendor = attrib() url = attrib() files = attrib(default=attr.Factory(list), hash=False) - _comparison_fields = ('name', 'version', 'architecture') + _diff_cmp_fields = ('name', 'architecture') + _diff_fields = ('version',) _register_with_representer(RPMPackage) diff --git a/reproman/distributions/vcs.py b/reproman/distributions/vcs.py index 78c77423f..a52d19d6a 100644 --- a/reproman/distributions/vcs.py +++ b/reproman/distributions/vcs.py @@ -123,6 +123,8 @@ class GitDistribution(VCSDistribution): _cmd = "git" packages = TypedList(GitRepo) + _collection_attribute = 'packages' + def initiate(self, session=None): pass @@ -294,6 +296,8 @@ class SVNDistribution(VCSDistribution): _cmd = "svn" packages = TypedList(SVNRepo) + _collection_attribute = 'packages' + def install_packages(self, session, use_version=True): raise NotImplementedError SVNRepo._distribution = SVNDistribution diff --git a/reproman/distributions/venv.py b/reproman/distributions/venv.py index c3f71eaf2..526e549fc 100644 --- a/reproman/distributions/venv.py +++ b/reproman/distributions/venv.py @@ -38,6 +38,8 @@ class VenvPackage(Package): location = attrib() editable = attrib(default=False) files = attrib(default=attr.Factory(list)) + _diff_cmp_fields = ('name', ) + _diff_fields = ('version', ) @attr.s @@ -48,6 +50,8 @@ class VenvEnvironment(SpecObject): default=False, doc="Are any non-local packages present in the environment?") packages = TypedList(VenvPackage) + _diff_cmp_fields = ('path', 'python_version') + _collection_attribute = 'packages' @attr.s @@ -57,6 +61,7 @@ class VenvDistribution(Distribution): path = attrib() venv_version = attrib() environments = TypedList(VenvEnvironment) + _collection_attribute = 'environments' def initiate(self, _): return diff --git a/reproman/formats/reproman.py b/reproman/formats/reproman.py index 6042a35bd..c2d865d27 100644 --- a/reproman/formats/reproman.py +++ b/reproman/formats/reproman.py @@ -12,6 +12,7 @@ from __future__ import absolute_import import collections +import copy import datetime import logging from collections import OrderedDict @@ -25,7 +26,10 @@ from reproman.utils import instantiate_attr_object from .base import Provenance from .utils import write_config -from .. import utils +from .. import ( + cfg, + utils, +) from ..distributions import Distribution from ..dochelpers import exc_str @@ -283,11 +287,42 @@ def write(cls, output, spec): out = OrderedDict() out['version'] = __version__ + if cfg.getboolean('retrace', 'only_with_files', False): + spec = copy.deepcopy(spec) + filter_spec(spec, packages_only_with_files=True) out.update(spec_to_dict(spec)) write_config(output, out) return out +def filter_spec(spec, packages_only_with_files=False): + """Filter out spec + + Parameters + ---------- + packages_only_with_files: bool, optional + Will return a spec with packages which have no files associated removed + """ + if not hasattr(spec, '__attrs_attrs__'): + return # nothing for us to do with this one + spec_attrs = spec.__attrs_attrs__ # as is -- list of them + for attr in spec_attrs: + value_in = getattr(spec, attr.name, None) + if attr.name.lower() == 'packages' and value_in and packages_only_with_files: + # go through them + assert isinstance(value_in, list) + value_out = [ + p for p in value_in + if getattr(p, 'files', None) + ] + setattr(spec, attr.name, value_out) + # we might need to recurse if it is a list + if isinstance(value_in, (list, dict, tuple)): + # might be specs too, recurse + for v in (value_in.values() if isinstance(value_in, dict) else value_in): + filter_spec(v, packages_only_with_files=packages_only_with_files) + + # TODO: RF into SpecObject._as_dict() def spec_to_dict(spec): diff --git a/reproman/interface/diff.py b/reproman/interface/diff.py index 5ab4e2704..57a493cdd 100644 --- a/reproman/interface/diff.py +++ b/reproman/interface/diff.py @@ -20,6 +20,8 @@ from ..distributions.debian import DebianDistribution from ..distributions.conda import CondaDistribution from ..distributions.vcs import GitDistribution, SVNDistribution +from ..distributions.venv import VenvDistribution, VenvEnvironment +from ..distributions.base import SpecDiff __docformat__ = 'restructuredtext' @@ -81,55 +83,40 @@ def __call__(prov1, prov2, satisfies): @staticmethod def diff(env_1, env_2): - result = {'method': 'diff', 'distributions': []} - # distribution type -> package type string supported_distributions = { DebianDistribution: 'Debian package', CondaDistribution: 'Conda package', GitDistribution: 'Git repository', - SVNDistribution: 'SVN repository' + SVNDistribution: 'SVN repository', + VenvDistribution: 'Venv environment', } env_1_dist_types = { d.__class__ for d in env_1.distributions } env_2_dist_types = { d.__class__ for d in env_2.distributions } all_dist_types = env_1_dist_types.union(env_2_dist_types) + diffs = [] + for dist_type in all_dist_types: if dist_type not in supported_distributions: msg = 'diff doesn\'t know how to handle %s' % str(dist_type) raise ValueError(msg) - dist_res = {'pkg_type': supported_distributions[dist_type], - 'pkg_diffs': []} dist_1 = env_1.get_distribution(dist_type) - if dist_1: - pkgs_1 = {p._diff_cmp_id: p for p in dist_1.packages} - else: - pkgs_1 = {} dist_2 = env_2.get_distribution(dist_type) - if dist_2: - pkgs_2 = {p._diff_cmp_id: p for p in dist_2.packages} - else: - pkgs_2 = {} - dist_res['pkgs_1'] = pkgs_1 - dist_res['pkgs_2'] = pkgs_2 - pkgs_1_s = set(pkgs_1) - pkgs_2_s = set(pkgs_2) - dist_res['pkgs_only_1'] = pkgs_1_s - pkgs_2_s - dist_res['pkgs_only_2'] = pkgs_2_s - pkgs_1_s - for cmp_key in pkgs_1_s.intersection(pkgs_2_s): - package_1 = pkgs_1[cmp_key] - package_2 = pkgs_2[cmp_key] - if package_1._diff_vals != package_2._diff_vals: - dist_res['pkg_diffs'].append((package_1, package_2)) - result['distributions'].append(dist_res) + diffs.append({'diff': SpecDiff(dist_1, dist_2), + 'pkg_type': supported_distributions[dist_type]}) + + diffs.append({'diff': SpecDiff(env_1.files, env_2.files), + 'pkg_type': 'files'}) files1 = set(env_1.files) files2 = set(env_2.files) - result['files_1_only'] = files1 - files2 - result['files_2_only'] = files2 - files1 - return result + return {'method': 'diff', + 'diffs': diffs, + 'files_1_only': files1 - files2, + 'files_2_only': files2 - files1} @staticmethod def satisfies(env_1, env_2): @@ -180,39 +167,60 @@ def render_cmdline_diff(result): status = 0 - for dist_res in result['distributions']: + for diff_d in result['diffs']: - if dist_res['pkgs_only_1'] or dist_res['pkgs_only_2']: - print(_make_plural(dist_res['pkg_type']) + ':') + diff = diff_d['diff'] + pkg_type = diff_d['pkg_type'] - if dist_res['pkgs_only_1']: - for cmp_key in sorted(dist_res['pkgs_only_1']): - package = dist_res['pkgs_1'][cmp_key] - print('< %s' % package.diff_identity_string) - status = 3 - if dist_res['pkgs_only_1'] and dist_res['pkgs_only_2']: - print('---') - if dist_res['pkgs_only_2']: - for cmp_key in sorted(dist_res['pkgs_only_2']): - package = dist_res['pkgs_2'][cmp_key] - print('> %s' % package.diff_identity_string) - status = 3 + if pkg_type == 'files': + files_diff = diff_d['diff'] + continue - for (package_1, package_2) in dist_res['pkg_diffs']: - print('%s %s:' % (dist_res['pkg_type'], - " ".join(package_1._diff_cmp_id))) - print('< %s' % package_1.diff_subidentity_string) - print('---') - print('> %s' % package_2.diff_subidentity_string) + if diff.a_only or diff.b_only: + print(_make_plural(pkg_type) + ':') status = 3 - - if result['files_1_only'] or result['files_2_only']: + for pkg_diff in diff.a_only: + print('< %s' % pkg_diff.a.diff_identity_string) + if diff.a_only and diff.b_only: + print('---') + for pkg_diff in diff.b_only: + print('> %s' % pkg_diff.b.diff_identity_string) + + for pkg_diff in diff.a_and_b: + if not hasattr(pkg_diff, 'collection'): + if pkg_diff.diff_vals_a != pkg_diff.diff_vals_b: + print('%s %s:' % (pkg_type, ' '.join(pkg_diff.diff_cmp_id))) + print('< %s' % pkg_diff.a.diff_subidentity_string) + print('---') + print('> %s' % pkg_diff.b.diff_subidentity_string) + status = 3 + else: + a_only = [ pd.a for pd in pkg_diff.a_only ] + b_only = [ pd.b for pd in pkg_diff.b_only ] + ab = [ pd for pd in pkg_diff.a_and_b + if pd.diff_vals_a != pd.diff_vals_b ] + if a_only or b_only or ab: + print('%s %s:' % (pkg_type, ' '.join(pkg_diff.diff_cmp_id))) + for pkg in a_only: + print('< %s' % pkg.diff_identity_string) + for pd in ab: + print('< %s %s' % (pd.a.diff_identity_string, pd.a.diff_subidentity_string)) + if (a_only and b_only) or ab: + print('---') + for pkg in b_only: + print('> %s' % pkg.diff_identity_string) + for pd in ab: + print('> %s %s' % (pd.b.diff_identity_string, pd.b.diff_subidentity_string)) + + files_1_only = [ t[0] for t in files_diff.collection if t[1] is None ] + files_2_only = [ t[1] for t in files_diff.collection if t[0] is None ] + if files_1_only or files_2_only: print('Files:') - for fname in result['files_1_only']: + for fname in files_1_only: print('< %s' % fname) - if result['files_1_only'] and result['files_2_only']: + if files_1_only and files_2_only: print('---') - for fname in result['files_2_only']: + for fname in files_2_only: print('> %s' % fname) status = 3 diff --git a/reproman/interface/retrace.py b/reproman/interface/retrace.py index 62f1be60d..27a814057 100644 --- a/reproman/interface/retrace.py +++ b/reproman/interface/retrace.py @@ -9,6 +9,8 @@ """Analyze existing spec or session file system to gather more detailed information """ +# Since 3.7 shouldn't be needed +from collections import OrderedDict from os.path import normpath import sys import time @@ -73,13 +75,22 @@ class Retrace(Interface): instance can be passed as the value for `resref`. PY]""", constraints=EnsureStr() | EnsureNone()), resref_type=resref_type_opt, + tracers=Parameter( + args=("-t", "--tracer"), + dest="tracers", + doc="Tracer(s) to use. Could be specified multiple times. " + "They will be used in the order specified.", + metavar='TRACER', + action='append', + constraints=EnsureStr() | EnsureNone(), + ), ) # TODO: add a session/resource so we could trace within # arbitrary sessions @staticmethod def __call__(path=None, spec=None, output_file=None, - resref=None, resref_type="auto"): + resref=None, resref_type="auto", tracers=None): # heavy import -- should be delayed until actually used if not (spec or path): @@ -117,9 +128,24 @@ def __call__(path=None, spec=None, output_file=None, # Generalize # TODO: RF so that only the above portion is reprozip specific. # If we are to reuse their layout largely -- the rest should stay as is + if tracers: + # specific ones asked for + all_tracer_classes = get_tracer_classes() + + tracer_classes = OrderedDict() + for t in tracers: + if t not in all_tracer_classes: + raise ValueError( + "Do not know tracer type '%s'. Known are %s" + % (t, ', '.join(all_tracer_classes))) + tracer_classes[t] = all_tracer_classes[t] + else: + tracer_classes = None + (distributions, files) = identify_distributions( paths, - session=session + session=session, + tracer_classes=tracer_classes, ) from reproman.distributions.base import EnvironmentSpec spec = EnvironmentSpec( @@ -146,6 +172,8 @@ def identify_distributions(files, session=None, tracer_classes=None): ---------- files : iterable Files to consider + tracer_classes: dict, optional + a dict with tracers (name: class) Returns ------- @@ -167,7 +195,11 @@ def identify_distributions(files, session=None, tracer_classes=None): # as they identify files beloning to them files_to_consider = set(files) - distibutions = [] + from reproman.distributions.base import EnvironmentSpec + environment_spec = EnvironmentSpec( + distributions=[] + ) + files_processed = set() files_to_trace = files_to_consider @@ -184,7 +216,7 @@ def identify_distributions(files, session=None, tracer_classes=None): % max_niter) break - for Tracer in tracer_classes: + for Tracer in tracer_classes.values(): lgr.debug("Tracing using %s", Tracer.__name__) # TODO: memoize across all loops # Identify directories from the files_to_consider @@ -205,16 +237,29 @@ def identify_distributions(files, session=None, tracer_classes=None): # files_to_trace if files_to_trace: remaining_files_to_trace = files_to_trace - nenvs = 0 - for env, remaining_files_to_trace in tracer.identify_distributions( + nnewdists, nolddists = 0, 0 + # TODO: note that here we start with the full list, but then + # do not shrink it (use remaining_files_to_trace) while looping + # The issue https://github.com/ReproNim/reproman/issues/417 + # TODO: + # refactor to return a list of dists, and a single remaining_files_to_trace + # Closes #417 when done ;) + for dist, remaining_files_to_trace in tracer.identify_distributions( files_to_trace): - distibutions.append(env) - nenvs += 1 + old_dist = environment_spec.find(dist) + if not old_dist: + nnewdists += 1 + environment_spec.distributions.append(dist) + else: + nolddists += 1 + old_dist += dist + nnewdists += 1 files_processed |= files_to_trace - remaining_files_to_trace files_to_trace = remaining_files_to_trace - lgr.info("%s: %d envs with %d other files remaining", + lgr.info("%s: %d new and %d old distributions with %d other files remaining", Tracer.__name__, - nenvs, + nnewdists, + nolddists, len(files_to_trace)) # Re-combine any files that were skipped @@ -228,11 +273,12 @@ def identify_distributions(files, session=None, tracer_classes=None): lgr.info("No more changes or files to track. Exiting the loop") break - return distibutions, files_to_consider + # TODO: -- return environment_spec itself? + return environment_spec.distributions, files_to_consider def get_tracer_classes(): - """A helper which returns a list of all available Tracers + """A helper which returns a dict (name: class) of all available Tracers The order should not but does matter and ATM is magically provided """ @@ -244,6 +290,12 @@ def get_tracer_classes(): from reproman.distributions.vcs import VCSTracer from reproman.distributions.docker import DockerTracer from reproman.distributions.singularity import SingularityTracer - Tracers = [DebTracer, RPMTracer, CondaTracer, VenvTracer, VCSTracer, - DockerTracer, SingularityTracer] - return Tracers + tracers = OrderedDict() + tracers['deb'] = DebTracer + tracers['rpm'] = RPMTracer + tracers['conda'] = CondaTracer + tracers['venv'] = VenvTracer + tracers['vcs'] = VCSTracer + tracers['docker'] = DockerTracer + tracers['singularity'] = SingularityTracer + return tracers diff --git a/reproman/interface/tests/files/diff_1.yaml b/reproman/interface/tests/files/diff_1.yaml index 08a098312..40c498f02 100644 --- a/reproman/interface/tests/files/diff_1.yaml +++ b/reproman/interface/tests/files/diff_1.yaml @@ -58,4 +58,53 @@ distributions: - path: /path/1/to/different/svn/commit revision: 12 uuid: 95e4b738-84c7-154c-f082-34d40e21fdd4 +- name: venv + path: /usr/bin/virtualenv + venv_version: 15.1.0 + environments: + - path: /home/user/venvs/test/both + python_version: 2.7.15+ + packages: + - name: six + version: 1.12.0 + local: true + location: /home/user/venvs/test/lib/python2.7/site-packages + files: + - lib/python2.7/site-packages/six-1.12.0.dist-info/METADATA + - lib/python2.7/site-packages/six.pyc + - lib/python2.7/site-packages/six.py + - path: /home/user/venvs/test/bth_2 + python_version: 2.7.15+ + packages: + - name: one_only + version: 1.12.0 + local: true + location: /home/user/venvs/test/lib/python2.7/site-packages + files: + - lib/python2.7/site-packages/six-1.12.0.dist-info/METADATA + - lib/python2.7/site-packages/six.pyc + - lib/python2.7/site-packages/six.py + - name: version_mismatch + version: 1.1.0 + local: true + location: /home/user/venvs/test/lib/python2.7/site-packages + files: + - lib/python2.7/site-packages/six-1.12.0.dist-info/METADATA + - lib/python2.7/site-packages/six.pyc + - lib/python2.7/site-packages/six.py + - path: /home/user/venvs/test/1_only + python_version: 2.7.15+ + packages: + - name: funcsigs + version: 1.0.2 + local: true + location: /home/user/venvs/test/lib/python2.7/site-packages + files: + - lib/python2.7/site-packages/funcsigs-1.0.2.dist-info/METADATA + - name: rpaths + version: '0.13' + local: true + location: /home/user/venvs/test/lib/python2.7/site-packages + files: + - lib/python2.7/site-packages/rpaths-0.13.dist-info/METADATA files: [/etc/a, /etc/b] diff --git a/reproman/interface/tests/files/diff_2.yaml b/reproman/interface/tests/files/diff_2.yaml index bed05c111..ff9939fc3 100644 --- a/reproman/interface/tests/files/diff_2.yaml +++ b/reproman/interface/tests/files/diff_2.yaml @@ -58,4 +58,53 @@ distributions: - path: /path/2/to/different/svn/commit revision: 14 uuid: 95e4b738-84c7-154c-f082-34d40e21fdd4 +- name: venv + path: /usr/bin/virtualenv + venv_version: 15.1.0 + environments: + - path: /home/user/venvs/test/both + python_version: 2.7.15+ + packages: + - name: six + version: 1.12.0 + local: true + location: /home/user/venvs/test/lib/python2.7/site-packages + files: + - lib/python2.7/site-packages/six-1.12.0.dist-info/METADATA + - lib/python2.7/site-packages/six.pyc + - lib/python2.7/site-packages/six.py + - path: /home/user/venvs/test/bth_2 + python_version: 2.7.15+ + packages: + - name: two_only + version: 1.12.0 + local: true + location: /home/user/venvs/test/lib/python2.7/site-packages + files: + - lib/python2.7/site-packages/six-1.12.0.dist-info/METADATA + - lib/python2.7/site-packages/six.pyc + - lib/python2.7/site-packages/six.py + - name: version_mismatch + version: 1.2.0 + local: true + location: /home/user/venvs/test/lib/python2.7/site-packages + files: + - lib/python2.7/site-packages/six-1.12.0.dist-info/METADATA + - lib/python2.7/site-packages/six.pyc + - lib/python2.7/site-packages/six.py + - path: /home/user/venvs/test/2_only + python_version: 2.7.15+ + packages: + - name: funcsigs + version: 1.0.2 + local: true + location: /home/user/venvs/test/lib/python2.7/site-packages + files: + - lib/python2.7/site-packages/funcsigs-1.0.2.dist-info/METADATA + - name: rpaths + version: '0.13' + local: true + location: /home/user/venvs/test/lib/python2.7/site-packages + files: + - lib/python2.7/site-packages/rpaths-0.13.dist-info/METADATA files: [/etc/c, /etc/b] diff --git a/reproman/interface/tests/test_diff.py b/reproman/interface/tests/test_diff.py index 6248031aa..1668f6d05 100644 --- a/reproman/interface/tests/test_diff.py +++ b/reproman/interface/tests/test_diff.py @@ -152,6 +152,18 @@ def test_diff_svn(): assert_not_in('(/path/2/to/common/svn/repo)', outputs.out) +def test_diff_venv(): + with swallow_outputs() as outputs: + args = ['diff', diff_1_yaml, diff_2_yaml] + rv = main(args) + assert_equal(rv, 3) + assert_equal(outputs.err, '') + assert_in('Venv environments:', outputs.out) + assert_in('< /home/user/venvs/test/1_only 2.7.15+', outputs.out) + assert_in('> /home/user/venvs/test/2_only 2.7.15+', outputs.out) + assert_not_in('/home/user/venvs/test/both', outputs.out) + + def test_diff_satisfies_unsupported_distribution(): # using subprocess.call() here because we're looking for a condition # that raises an exception in main(), so it doesn't return and we diff --git a/reproman/interface/tests/test_retrace.py b/reproman/interface/tests/test_retrace.py index 62314a620..6067f6be2 100644 --- a/reproman/interface/tests/test_retrace.py +++ b/reproman/interface/tests/test_retrace.py @@ -6,11 +6,16 @@ # # ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## +import attr + +from collections import OrderedDict + from reproman.cmdline.main import main from reproman.formats import Provenance import logging +from reproman.distributions import Distribution from reproman.utils import swallow_logs, swallow_outputs, make_tempfile from reproman.tests.utils import ( assert_in, @@ -53,6 +58,20 @@ def test_retrace_normalize_paths(): assert "name: debian" in cm.out +@attr.s(cmp=False) +class FakeDistribution(Distribution): + # Allow to compare by name + def __eq__(self, other): + if isinstance(other, str): + return self.name == other + return super().__eq__(other) + + def initiate(self): + pass + def install_packages(self): + pass + + def get_tracer_session(protocols): class FakeSession(object): """A fake session attributes and methods of which should not @@ -63,7 +82,7 @@ class FakeSession(object): def isdir(self, _): return False # TODO: make it parametric - tracer_classes = [] + tracer_classes = OrderedDict() for itracer, protocol in enumerate(protocols): # Test the loop logic class FakeTracer(object): @@ -80,7 +99,7 @@ def identify_distributions(self, files): for item in self._current_protocol: yield item FakeTracer.__name__ = "FakeTracer%d" % itracer - tracer_classes.append(FakeTracer) + tracer_classes['fake%d' % itracer] = FakeTracer return tracer_classes, FakeSession() @@ -88,7 +107,7 @@ def _check_loop_protocol(protocols, files, tenvs, tfiles): tracer_classes, session = get_tracer_session(protocols) dists, unknown_files = identify_distributions( files, session, tracer_classes=tracer_classes) - assert not any(t._protocol for t in tracer_classes), "we exhausted the protocol" + assert not any(t._protocol for t in tracer_classes.values()), "we exhausted the protocol" assert dists == tenvs assert unknown_files == tfiles @@ -98,12 +117,12 @@ def test_retrace_loop_over_tracers(): [ # Tracers [ # Tracer passes [ # what to yield - ("Env1", {"thefile"}) + (FakeDistribution("Dist1"), {"thefile"}) ] ] ], files=["thefile"], - tenvs=['Env1'], + tenvs=['Dist1'], tfiles={"thefile"}) # The 2nd tracer consumes everything @@ -111,17 +130,17 @@ def test_retrace_loop_over_tracers(): [ # Tracers [ # Tracer passes [ # what to yield - ("Env1", {"thefile"}) + (FakeDistribution("Dist1"), {"thefile"}) ], ], [ # Tracer passes [ # what to yield - ("Env2", set()) + (FakeDistribution("Dist2"), set()) ], ] ], files=["thefile"], - tenvs=['Env1', 'Env2'], + tenvs=['Dist1', 'Dist2'], tfiles=set()) # The fancy multi-yield and producing stuff @@ -129,17 +148,17 @@ def test_retrace_loop_over_tracers(): [ # Tracers [ # Tracer passes [ # what to yield - ("Env1", {"file2", "file3"}), + (FakeDistribution("Dist1"), {"file2", "file3"}), ], [ - ("Env3", {"file3"}) # consume file4 + (FakeDistribution("Dist3"), {"file3"}) # consume file4 ], [] # finale ], [ # Tracer passes [ # what to yield - ("Env2", {"file3", "file4", "file5"}), - ("Env2.1", {"file3", "file4"}) + (FakeDistribution("Dist2"), {"file3", "file4", "file5"}), + (FakeDistribution("Dist2.1"), {"file3", "file4"}) ], [ @@ -148,5 +167,5 @@ def test_retrace_loop_over_tracers(): ] ], files=["file1", "file2"], - tenvs=['Env1', 'Env2', 'Env2.1', 'Env3'], + tenvs=['Dist1', 'Dist2', 'Dist2.1', 'Dist3'], tfiles={'file3'}) \ No newline at end of file