diff --git a/automatminer/featurization/.ipynb_checkpoints/core-checkpoint.py b/automatminer/featurization/.ipynb_checkpoints/core-checkpoint.py new file mode 100644 index 00000000..768ed23c --- /dev/null +++ b/automatminer/featurization/.ipynb_checkpoints/core-checkpoint.py @@ -0,0 +1,703 @@ +""" +Classes for automatic featurization and core featurizer functionality. +""" + +import os +import math +import logging + +from pymatgen.core import Composition +from matminer.featurizers.conversions import ( + StrToComposition, + DictToObject, + StructureToComposition, + StructureToOxidStructure, + CompositionToOxidComposition, +) +from matminer.featurizers.function import FunctionFeaturizer +from matminer.utils.io import store_dataframe_as_json, load_dataframe_from_json + +from automatminer.utils.log import ( + log_progress, + AMM_LOG_FIT_STR, + AMM_LOG_TRANSFORM_STR, +) +from automatminer.utils.pkg import check_fitted, set_fitted +from automatminer.base import DFTransformer +from automatminer.featurization.sets import ( + CompositionFeaturizers, + StructureFeaturizers, + BSFeaturizers, + DOSFeaturizers, +) +from automatminer.utils.pkg import AutomatminerError +from automatminer.utils.ml import regression_or_classification + +__author__ = [ + "Alex Dunn ", + "Alireza Faghaninia ", + "Qi Wang ", +] + +_COMMON_COL_ERR_STR = "composition_col/structure_col/bandstructure_col/dos_col" + +logger = logging.getLogger(__name__) + + +class AutoFeaturizer(DFTransformer): + """ + Automatically featurize a dataframe. + + Use this object first by calling fit, then by calling transform. + + AutoFeaturizer requires you to specify the column names for each type of + featurization, or just use the defaults: + + "composition": To use composition features + "structure": To use structure features + "bandstructure": To use bandstructure features + "dos": To use density of states features + + The featurizers corresponding to each featurizer type cannot be used if + the correct column name is not present. + + Args: + cache_src (str): An absolute path to a json file holding feature + information. If file exists, will read features (loc indexwise) + from this file instead of featurizing. If this file does not exist, + AutoFeaturizer will featurize normally, then save the features to a + new file. Only features (not featurizer input objects) will be saved + preset (str): "express" or "heavy" or "debug" or "all. Determines by + preset the featurizers that should be applied. See the Featurizer + sets for specifics of each. Default is "express". Incompatible with + the featurizers arg. + featurizers (dict): Use this option if you want to manually specify + the featurizers to use. Keys are the featurizer types you want + applied (e.g., "structure", "composition"). The corresponding values + are lists of featurizer objects you want for each featurizer type. + + Example: + {"composition": [ElementProperty.from_preset("matminer"), + EwaldEnergy()] + "structure": [BagofBonds(), GlobalSymmetryFeatures()]} + + exclude ([str]): Class names of featurizers to exclude. Only used if + you use a preset. + ignore_cols ([str]): Column names to be ignored/removed from any + dataframe undergoing fitting or transformation. If columns are + not ignored, they may be used later on for learning. + ignore_errors (bool): If True, each featurizer will ignore all errors + during featurization. + drop_inputs (bool): Drop the columns containing input objects for + featurization after they are featurized. + guess_oxistates (bool): If True, try to decorate sites with oxidation + state. + multiiindex (bool): If True, returns a multiindexed dataframe. Not + recommended for use in MatPipe. + do_precheck (bool): Execute a precheck on each featurizer before + featurizing with it. See matminer prechecking for more info. + n_jobs (int): The number of parallel jobs to use during featurization + for each featurizer. Default is n_cores + composition_col="composition", + composition_col (str): Name of the column containing structures to be + featurized. + structure_col (str)L Name of the column containing structures to be + featurized + bandstructure (str): Name of the column containing bandstructures to + be featurized. + dos_col (str): Name of the column containing density of states obejcts + to be featurized. + + Attributes: + These attributes are set during fitting + + featurizers (dict): Same format as input dictionary in Args. Values + contain the actual objects being used for featurization. Featurizers + can be removed if check_validity=True and the featurizer is not + valid for more than self.min_precheck_frac fraction of the fitting + dataset. + features (dict): The features generated from the application of all + featurizers. + auto_featurizer (bool): whether the featurizers are set automatically, + or passed by the users. + fitted_input_df (pd.DataFrame): The dataframe which was fitted on + converted_input_df (pd.DataFrame): The converted dataframe which + was fitted on (i.e., strings converted to compositions). + removed_featurizers ([BaseFeaturizer]): A list of featurizers removed + by prechecking methods, if applicable + + Attributes not set during fitting and not specified by arguments: + + min_precheck_frac (float): The minimum fraction of a featuriser's input + that can be valid (via featurizer.precheck(data). + """ + + def __init__( + self, + cache_src=None, + preset=None, + featurizers=None, + exclude=None, + functionalize=False, + ignore_cols=None, + ignore_errors=True, + drop_inputs=True, + guess_oxistates=True, + multiindex=False, + do_precheck=True, + n_jobs=None, + composition_col="composition", + structure_col="structure", + bandstructure_col="bandstructure", + dos_col="dos", + ): + + if featurizers and preset: + raise AutomatminerError( + "Featurizers and preset were both set. " + "Please either use a preset ('express', " + "'all', 'debug', 'heavy') or set " + "featurizers manually." + ) + if not featurizers and not preset: + raise AutomatminerError( + "Please specify set(s) of featurizers to " + "use either through the featurizers" + "argument or through the preset argument." + ) + + self.cache_src = cache_src + self.preset = "express" if preset is None else preset + self.featurizers = featurizers + self.exclude = exclude if exclude else [] + self.functionalize = functionalize + self.ignore_cols = ignore_cols or [] + self.fitted_input_df = None + self.converted_input_df = None + self.ignore_errors = ignore_errors + self.drop_inputs = drop_inputs + self.multiindex = multiindex + self.do_precheck = do_precheck + self.n_jobs = n_jobs + self.guess_oxistates = guess_oxistates + self.features = [] + self.auto_featurizer = True if self.featurizers is None else False + self.removed_featurizers = None + self.composition_col = composition_col + self.structure_col = structure_col + self.bandstruct_col = bandstructure_col + self.dos_col = dos_col + super(AutoFeaturizer, self).__init__() + + _supported_featurizers = { + composition_col: CompositionFeaturizers, + structure_col: StructureFeaturizers, + bandstructure_col: BSFeaturizers, + dos_col: DOSFeaturizers, + } + + # user-set featurizers + if self.featurizers: + if not isinstance(self.featurizers, dict): + raise TypeError( + "Featurizers must be a dictionary with keys" + "matching your {}".format(_COMMON_COL_ERR_STR) + ) + + invalid_ftypes = [ + f + for f in self.featurizers.keys() + if f not in _supported_featurizers.keys() + ] + if invalid_ftypes: + raise KeyError( + "The following keys were specified as featurizer" + " types but were not set in {}" + "".format(_COMMON_COL_ERR_STR) + ) + + for ftype, fset in self.featurizers.items(): + _allowed = [ + f.__class__.__name__ for f in _supported_featurizers[ftype]().all + ] + for f in fset: + if f.__class__.__name__ not in _allowed: + raise ValueError( + "The {} featurizer {} is not supported by " + "AutoFeaturizer. Try updating your version of " + "automatminer and matminer.".format(ftype, f) + ) + + # auto-set featurizers + else: + featurizers = dict() + for featurizer_type in _supported_featurizers.keys(): + featurizer_set = _supported_featurizers[featurizer_type] + featurizers[featurizer_type] = getattr( + featurizer_set(exclude=self.exclude), self.preset + ) + self.featurizers = featurizers + + # Check if any featurizers need fitting (useful for MatPipe) + needs_fit = False + fittable_fs = StructureFeaturizers().need_fit + self.fittable_fcls = set([f.__class__.__name__ for f in fittable_fs]) + + # Currently structure featurizers are the only featurizer types which + # can be fittable + for f in self.featurizers[self.structure_col]: + if f.__class__.__name__ in self.fittable_fcls: + needs_fit = True + break + self.needs_fit = needs_fit + + if self.needs_fit and self.cache_src: + logger.critical( + self._log_prefix + "Using cached features on fittable featurizers! " + "Please make sure you are not benchmarking with " + "these options enabled; it is likely you will be" + "leaking data (i.e., features) from the testing" + "sets into the training." + ) + + if self.cache_src and "json" not in self.cache_src.lower(): + raise ValueError( + "The cache_src filename does not contain json." + "JSON is the required file type for featurizer" + "caching." + ) + + self.min_precheck_frac = 0.9 + + @log_progress(logger, AMM_LOG_FIT_STR) + @set_fitted + def fit(self, df, target): + """ + Fit all featurizers to the df and remove featurizers which are out of + scope or otherwise cannot be robustly applied to the dataset. + + WARNING: for fitting to work, the dataframe must contain the + corresponding *col keys set in __init__. So for composition + featurization to work, you must have the composition_col in the + dataframe. + + Composition features: composition_col - pymatgen Composition objs + Structure features: structure_col - pymatgen Structure objs + Bandstructure features: bandstructure_col - pymatgen Bandstructure + objs + DOS features: dos_col - pymatgen DOS objs + + Args: + df (pandas.DataFrame): A dataframe containing at least one of the + keys listed above. The column defined by that key should have + the corresponding types of objects in it. + target (str): The ML target key in the dataframe. + + Returns: + (AutoFeaturizer): self + """ + if self.cache_src and os.path.exists(self.cache_src): + logger.info( + self._log_prefix + "Cache {} found. Fit aborted." + "".format(self.cache_src) + ) + return self + + self.fitted_input_df = df + df = self._prescreen_df(df, inplace=True) + df = self._add_composition_from_structure(df) + + for featurizer_type, featurizers in self.featurizers.items(): + if not featurizers: + logger.info( + self._log_prefix + + "No {} featurizers being used.".format(featurizer_type) + ) + continue + + if featurizer_type in df.columns: + df = self._tidy_column(df, featurizer_type) + + # Remove invalid featurizers by looking at valid_fraction + if self.do_precheck: + logger.debug(self._log_prefix + "Prechecking featurizers.") + invalid_featurizers = [] + for f in featurizers: + try: + frac = f.precheck_dataframe( + df, featurizer_type, return_frac=True + ) + if frac < self.min_precheck_frac: + invalid_featurizers.append(f) + msg = ( + "Will remove {} because it's fraction " + "passing the precheck for this " + "dataset ({}) was less than the minimum" + " ({})" + "".format( + f.__class__.__name__, + frac, + self.min_precheck_frac, + ) + ) + logger.info(self._log_prefix + msg) + except (AttributeError, ValueError, KeyError) as E: + logger.warning( + self._log_prefix + + "{} precheck failed with {}. Ignoring...." + "".format(f, E) + ) + self.featurizers[featurizer_type] = [ + f for f in featurizers if f not in invalid_featurizers + ] + self.removed_featurizers = invalid_featurizers + featurizers = self.featurizers[featurizer_type] + + # Fit the featurizers + for f in featurizers: + log_fit = False + if featurizer_type == self.structure_col: + if f.__class__.__name__ in self.fittable_fcls: + log_fit = True + if log_fit: + logger.info( + self._log_prefix + + "Fitting {}.".format(f.__class__.__name__) + ) + + f.fit(df[featurizer_type].tolist()) + f.set_n_jobs(self.n_jobs) + self.features += f.feature_labels() + + if log_fit: + logger.info( + self._log_prefix + "Fit {} to {} samples in dataframe." + "".format(f.__class__.__name__, df.shape[0]) + ) + else: + logger.info( + self._log_prefix + + "Featurizer type {} not in the dataframe to be" + " fitted. Skipping...".format(featurizer_type) + ) + self.converted_input_df = df + return self + + @log_progress(logger, AMM_LOG_TRANSFORM_STR) + @check_fitted + def transform(self, df, target, prevent_cache_overwrite=False): + """ + Decorate a dataframe containing composition, structure, bandstructure, + and/or DOS objects with descriptors. + + Args: + df (pandas.DataFrame): The dataframe not containing features. + target (str): The ML-target property contained in the df. + prevent_cache_overwrite (bool): If True, does not try to write any + new features to the cache. + + Returns: + df (pandas.DataFrame): Transformed dataframe containing features. + """ + if self.cache_src and os.path.exists(self.cache_src): + logger.debug( + self._log_prefix + "Reading cache_src {}".format(self.cache_src) + ) + cached_df = load_dataframe_from_json(self.cache_src) + if not all([loc in cached_df.index for loc in df.index]): + raise AutomatminerError( + "Feature cache does not contain all " + "entries (by DataFrame index) needed " + "to transform the input df." + ) + else: + cached_subdf = cached_df.loc[df.index] + if target in cached_subdf.columns: + if target not in df.columns: + logger.warn( + self._log_prefix + + "Target not present in both cached df and input df." + " Cannot perform comparison to ensure index match." + ) + else: + cached_targets = cached_subdf[target] + input_targets = df[target] + cached_type = regression_or_classification(cached_targets) + input_type = regression_or_classification(input_targets) + if cached_type != input_type: + raise AutomatminerError( + "Cached targets appear to be '{}' type, while " + "input targets appear to be '{}'." + "".format(cached_type, input_type) + ) + + problems = {} + for ix in input_targets.index: + iv = input_targets[ix] + cv = cached_targets[ix] + if iv != cv: + try: + if not math.isclose(iv, cv): + problems[ix] = [iv, cv] + except TypeError: + pass + if problems: + logger.warning( + self._log_prefix + + "Mismatch between cached targets and input " + "targets: \n{}".format(problems) + ) + + logger.info( + self._log_prefix + "Restored {} features on {} samples from " + "cache {}".format( + len(cached_subdf.columns), len(df.index), self.cache_src + ) + ) + return cached_subdf + else: + transforming_on_fitted = df is self.fitted_input_df + df = self._prescreen_df(df, inplace=True) + + if transforming_on_fitted: + df = self.converted_input_df + else: + df = self._add_composition_from_structure(df) + + for featurizer_type, featurizers in self.featurizers.items(): + if featurizer_type in df.columns: + if not transforming_on_fitted: + df = self._tidy_column(df, featurizer_type) + + for f in featurizers: + logger.info( + self._log_prefix + "Featurizing with {}." + "".format(f.__class__.__name__) + ) + df = f.featurize_dataframe( + df, + featurizer_type, + ignore_errors=self.ignore_errors, + multiindex=self.multiindex, + inplace=False, + ) + if self.drop_inputs: + df = df.drop(columns=[featurizer_type]) + else: + logger.info( + self._log_prefix + + "Featurizer type {} not in the dataframe. " + "Skipping...".format(featurizer_type) + ) + if self.functionalize: + ff = FunctionFeaturizer() + ff.set_n_jobs(self.n_jobs) + cols = df.columns.tolist() + for ft in self.featurizers.keys(): + if ft in cols: + cols.pop(ft) + df = ff.fit_featurize_dataframe( + df, + cols, + ignore_errors=self.ignore_errors, + multiindex=self.multiindex, + inplace=False, + ) + if ( + self.cache_src + and not os.path.exists(self.cache_src) + and not prevent_cache_overwrite + ): + store_dataframe_as_json(df, self.cache_src) + return df + + def _prescreen_df(self, df, inplace=True): + """ + Pre-screen a dataframe. + + Args: + df (pandas.DataFrame): The dataframe to be screened. + inplace (bool): Manipulate the df in-place in memory + + Returns: + df (pandas.DataFrame) The screened dataframe. + + """ + if not inplace: + df = df.copy(deep=True) + if self.ignore_errors is not None: + for col in self.ignore_cols: + if col in df: + df = df.drop([col], axis=1) + return df + + def _tidy_column(self, df, featurizer_type): + """ + Various conversions to homogenize columns for featurization input. + For example, take a column of compositions and ensure they are decorated + with oxidation states, are not strings, etc. + + Args: + df (pandas.DataFrame) + featurizer_type: The key defining the featurizer input. For example, + composition featurizers should have featurizer_type of + "composition". + + Returns: + df (pandas.DataFrame): DataFrame with featurizer_type column + ready for featurization. + """ + # todo: Make the following conversions more robust (no [0] type checking) + type_tester = df[featurizer_type].iloc[0] + + if featurizer_type == self.composition_col: + # Convert formulas to composition objects + if isinstance(type_tester, str): + logger.info( + self._log_prefix + + "Compositions detected as strings. Attempting " + "conversion to Composition objects..." + ) + stc = StrToComposition( + overwrite_data=True, target_col_id=featurizer_type + ) + stc.set_n_jobs(self.n_jobs) + df = stc.featurize_dataframe( + df, + featurizer_type, + multiindex=self.multiindex, + ignore_errors=True, + inplace=False, + ) + + elif isinstance(type_tester, dict): + logger.info( + self._log_prefix + "Compositions detected as dicts. Attempting " + "conversion to Composition objects..." + ) + df[featurizer_type] = [ + Composition.from_dict(d) for d in df[featurizer_type] + ] + + # Convert non-oxidstate containing comps to oxidstate comps + if self.guess_oxistates: + logger.info( + self._log_prefix + + "Guessing oxidation states of compositions, as" + " they were not present in input." + ) + cto = CompositionToOxidComposition( + target_col_id=featurizer_type, + overwrite_data=True, + return_original_on_error=True, + max_sites=-50, + ) + cto.set_n_jobs(self.n_jobs) + try: + df = cto.featurize_dataframe( + df, + featurizer_type, + multiindex=self.multiindex, + inplace=False, + ) + except Exception as e: + logger.warning( + self._log_prefix + "Could not decorate oxidation states due " + "to {}. Some featurizers based on composition oxidstates" + "may not work".format(e) + ) + + else: + # Convert structure/bs/dos dicts to objects (robust already) + if isinstance(type_tester, (dict, str)): + logger.info( + self._log_prefix.capitalize() + + "{} detected as string or dict. Attempting " + "conversion to {} objects..." + "".format(featurizer_type, featurizer_type) + ) + if isinstance(type_tester, str): + raise ValueError( + "{} column is type {}. Cannot convert." + "".format(featurizer_type, type(type_tester)) + ) + dto = DictToObject( + overwrite_data=True, target_col_id=featurizer_type + ) + dto.set_n_jobs(self.n_jobs) + df = dto.featurize_dataframe(df, featurizer_type, inplace=False) + + # Decorate with oxidstates + if featurizer_type == self.structure_col and self.guess_oxistates: + logger.info( + self._log_prefix + + "Guessing oxidation states of structures if they were " + "not present in input." + ) + sto = StructureToOxidStructure( + target_col_id=featurizer_type, + overwrite_data=True, + return_original_on_error=True, + max_sites=30, + ) + sto.set_n_jobs(self.n_jobs) + try: + df = sto.featurize_dataframe( + df, + featurizer_type, + multiindex=self.multiindex, + inplace=False, + ) + except Exception as e: + logger.info( + self._log_prefix + + "Could not decorate oxidation states on structures " + "due to {}.".format(e) + ) + return df + + def _add_composition_from_structure(self, df, overwrite=True): + """ + Automatically deduce compositions from structures if: + + 1. structures are available + 2. compositions are not available (unless overwrite) + 3. composition features are actually desired. (deduced from whether + composition featurizers are present in self.featurizers). + + Args: + df (pandas.DataFrame): May or may not contain composition column. + overwrite (bool): Whether to overwrite the composition column if it + already exists. + + Returns: + df (pandas.DataFrame): Contains composition column if desired + """ + if ( + self.structure_col in df.columns + and (self.composition_col not in df.columns or overwrite) + and self.composition_col in self.featurizers + ): + + if self.composition_col in df.columns: + logger.info( + self._log_prefix + "composition column already exists, " + "overwriting with composition from structure." + ) + else: + logger.info( + self._log_prefix + "Adding compositions from structures." + ) + + df = self._tidy_column(df, self.structure_col) + + # above tidy column will add oxidation states, these oxidation + # states will then be transferred to composition. + struct2comp = StructureToComposition( + reduce=True, + target_col_id=self.composition_col, + overwrite_data=overwrite, + ) + struct2comp.set_n_jobs(self.n_jobs) + df = struct2comp.featurize_dataframe(df, self.structure_col) + return df diff --git a/automatminer/featurization/core.py b/automatminer/featurization/core.py index 1ba6d1f0..768ed23c 100644 --- a/automatminer/featurization/core.py +++ b/automatminer/featurization/core.py @@ -6,7 +6,7 @@ import math import logging -from pymatgen import Composition +from pymatgen.core import Composition from matminer.featurizers.conversions import ( StrToComposition, DictToObject,