Skip to content

Commit fed83b4

Browse files
committed
Merge branch 'develop' of github.com:microsoft/interpret into develop
2 parents 6cb82eb + 30066a1 commit fed83b4

28 files changed

+1358
-1132
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ enc_temp_folder/
1818
*.egg-info/
1919
*.ipynb_checkpoints/
2020
venv/
21+
.pytest_cache/
2122

2223
# General
2324
*.bak

R/DESCRIPTION

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
Package: interpret
22
Title: Fit Interpretable Models and Explain Blackbox Machine Learning
3-
Version: 0.1.21
4-
Date: 2019-10-07
3+
Version: 0.1.22
4+
Date: 2019-10-10
55
Description: Machine Learning package for training interpretable models and explaining blackbox systems. Historically, the most intelligible models were not very accurate, and the most accurate models were not intelligible. Microsoft Research has developed an algorithm called the Explainable Boosting Machine (EBM) which has both high accuracy and intelligibility. EBM uses machine learning techniques like bagging and boosting to breathe new life into traditional GAMs (Generalized Additive Models). This makes them as accurate as random forests and gradient boosted trees, and also enhances their intelligibility and editability. Details on the EBM algorithm can be found in the paper by Rich Caruana, Yin Lou, Johannes Gehrke, Paul Koch, Marc Sturm, and Noemie Elhadad (2015, <doi:10.1145/2783258.2788613>).
66
URL: https://github.com/microsoft/interpret
77
BugReports: https://github.com/microsoft/interpret/issues

R/src/Makevars

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
COREDIR=../../core
66

77
CXX_STD = CXX11
8-
PKG_CPPFLAGS= -I$(COREDIR) -I$(COREDIR)/inc -DEBMCORE_R
8+
PKG_CPPFLAGS= -I$(COREDIR) -I$(COREDIR)/inc -DEBMCORE_R -DEBMCORE_EXPORTS
99
PKG_CXXFLAGS=$(CXX_VISIBILITY)
1010

1111
OBJECTS = interpret_R.o $(COREDIR)/DataSetByFeature.o $(COREDIR)/DataSetByFeatureCombination.o $(COREDIR)/InteractionDetection.o $(COREDIR)/Logging.o $(COREDIR)/SamplingWithReplacement.o $(COREDIR)/Training.o

R/src/interpret_R.cpp

Lines changed: 64 additions & 67 deletions
Large diffs are not rendered by default.

azure-pipelines.yml

Lines changed: 53 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -4,28 +4,29 @@ jobs:
44
matrix:
55
Linux:
66
image.name: 'ubuntu-16.04'
7-
Windows:
8-
image.name: 'windows-2019'
97
Mac:
108
image.name: 'macOS-10.13'
9+
Windows:
10+
image.name: 'windows-2019'
1111
maxParallel: 3
1212
pool:
1313
vmImage: '$(image.name)'
1414
steps:
1515
- script: |
16+
sudo apt-get -y update
1617
sudo apt-get -y install g++-multilib
17-
displayName: 'Install prereq for linux.'
18-
condition: in(variables['image.name'], 'ubuntu-16.04')
18+
/bin/sh ./build.sh -32bit
19+
condition: startsWith(variables['image.name'], 'ubuntu')
20+
displayName: 'Building native code (Linux)'
1921
- script: |
20-
chmod +x build.sh
21-
./build.sh -32bit
22-
condition: in(variables['image.name'], 'ubuntu-16.04', 'macOS-10.13')
23-
displayName: 'Building/moving native code (linux/mac).'
22+
/bin/sh ./build.sh -32bit
23+
condition: startsWith(variables['image.name'], 'macOS')
24+
displayName: 'Building native code (Mac)'
2425
- script: |
25-
set PATH=C:\Windows\system32\;C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\MSBuild\Current\Bin\
26-
build.bat -32bit
27-
condition: in(variables['image.name'], 'windows-2019')
28-
displayName: 'Building/moving native code. (win)'
26+
set PATH=%PATH%;C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\MSBuild\Current\Bin\
27+
.\build.bat -32bit
28+
condition: startsWith(variables['image.name'], 'windows')
29+
displayName: 'Building native code (Windows)'
2930
- task: CopyFiles@2
3031
condition: succeeded()
3132
inputs:
@@ -34,6 +35,7 @@ jobs:
3435
targetFolder: '$(Build.ArtifactStagingDirectory)'
3536
displayName: 'Move binary to staging'
3637
- task: PublishBuildArtifacts@1
38+
condition: succeeded()
3739
inputs:
3840
pathtoPublish: $(Build.ArtifactStagingDirectory)
3941
artifactName: native-$(image.name)
@@ -45,10 +47,10 @@ jobs:
4547
matrix:
4648
Linux:
4749
image.name: 'ubuntu-16.04'
48-
Windows:
49-
image.name: 'windows-2019'
5050
Mac:
5151
image.name: 'macOS-10.13'
52+
Windows:
53+
image.name: 'windows-2019'
5254
maxParallel: 3
5355
pool:
5456
vmImage: '$(image.name)'
@@ -59,25 +61,28 @@ jobs:
5961
downloadType: 'specific'
6062
displayName: 'Download build artifacts'
6163
- task: CopyFiles@2
64+
condition: succeeded()
6265
inputs:
6366
sourceFolder: '$(System.ArtifactsDirectory)'
6467
contents: '**/lib_ebmcore_*'
6568
targetFolder: 'staging'
6669
flattenFolders: true
6770
displayName: 'Move binary to staging directory'
6871
- script: |
72+
sudo apt-get -y update
6973
sudo apt-get -y install g++-multilib
70-
displayName: 'Install prereq for linux.'
71-
condition: in(variables['image.name'], 'ubuntu-16.04')
74+
/bin/sh ./tests/core/test_core_api.sh -nobuildcore
75+
condition: startsWith(variables['image.name'], 'ubuntu')
76+
displayName: 'Testing native code (Linux)'
7277
- script: |
7378
/bin/sh ./tests/core/test_core_api.sh -nobuildcore
74-
condition: in(variables['image.name'], 'ubuntu-16.04', 'macOS-10.13')
75-
displayName: 'Testing native code (linux/mac).'
79+
condition: startsWith(variables['image.name'], 'macOS')
80+
displayName: 'Testing native code (Mac)'
7681
- script: |
77-
set PATH=C:\Windows\system32\;C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\MSBuild\Current\Bin\
82+
set PATH=%PATH%;C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\MSBuild\Current\Bin\
7883
.\tests\core\test_core_api.bat -nobuildcore
79-
condition: in(variables['image.name'], 'windows-2019')
80-
displayName: 'Testing native code. (win)'
84+
condition: startsWith(variables['image.name'], 'windows')
85+
displayName: 'Testing native code (Windows)'
8186
8287
- job: 'Build_JS_Inline'
8388
pool:
@@ -98,10 +103,11 @@ jobs:
98103
targetFolder: '$(Build.ArtifactStagingDirectory)'
99104
displayName: 'Move assets to staging'
100105
- task: PublishBuildArtifacts@1
106+
condition: succeeded()
101107
inputs:
102108
pathtoPublish: $(Build.ArtifactStagingDirectory)
103109
artifactName: interpret-inline-bundle
104-
displayName: 'Publish interpret inline library'
110+
displayName: 'Publish interpret-inline.js library'
105111

106112
- job: 'Build_Package'
107113
dependsOn: ['Build_Native', 'Build_JS_Inline']
@@ -114,25 +120,29 @@ jobs:
114120
downloadType: 'specific'
115121
displayName: 'Download build artifacts'
116122
- task: UsePythonVersion@0
123+
condition: succeeded()
117124
inputs:
118125
versionSpec: '3.6'
119126
architecture: 'x64'
120127
- script: python -m pip install --upgrade pip setuptools wheel
128+
condition: succeeded()
121129
displayName: 'Install tools'
122130
- task: CopyFiles@2
131+
condition: succeeded()
123132
inputs:
124133
sourceFolder: '$(System.ArtifactsDirectory)'
125-
contents: '**/lib_ebmcore_*'
134+
contents: '**/lib_ebmcore_*_x64*'
126135
targetFolder: 'python/interpret-core/interpret/lib'
127136
flattenFolders: true
128137
displayName: 'Move binary to Python layer'
129138
- task: CopyFiles@2
139+
condition: succeeded()
130140
inputs:
131141
sourceFolder: '$(System.ArtifactsDirectory)'
132142
contents: '**/interpret-inline.js'
133143
targetFolder: 'python/interpret-core/interpret/lib'
134144
flattenFolders: true
135-
displayName: 'Move Interpret inline to Python layer'
145+
displayName: 'Move interpret-inline.js to Python layer'
136146
- script: python setup.py bdist_wheel -d ../../staging
137147
condition: succeeded()
138148
workingDirectory: 'python/interpret-core'
@@ -149,6 +159,7 @@ jobs:
149159
targetFolder: '$(Build.ArtifactStagingDirectory)/wheel'
150160
displayName: 'Move wheel for Build Artifact'
151161
- task: PublishBuildArtifacts@1
162+
condition: succeeded()
152163
inputs:
153164
pathtoPublish: '$(Build.ArtifactStagingDirectory)/wheel'
154165
artifactName: 'wheel'
@@ -176,9 +187,9 @@ jobs:
176187
WindowsPython37:
177188
python.version: '3.7'
178189
image.name: 'windows-2019'
179-
# MacPython35:
180-
# python.version: '3.5'
181-
# image.name: 'macOS-10.13'
190+
MacPython35:
191+
python.version: '3.5'
192+
image.name: 'macOS-10.13'
182193
MacPython36:
183194
python.version: '3.6'
184195
image.name: 'macOS-10.13'
@@ -195,57 +206,53 @@ jobs:
195206
downloadType: 'specific'
196207
displayName: 'Download build artifacts'
197208
- task: CopyFiles@2
209+
condition: succeeded()
198210
inputs:
199211
sourceFolder: '$(System.ArtifactsDirectory)'
200212
contents: '**/lib_ebmcore_*'
201213
targetFolder: 'python/interpret-core/interpret/lib'
202214
flattenFolders: true
203215
displayName: 'Move binary to Python layer'
204216
- task: CopyFiles@2
217+
condition: succeeded()
205218
inputs:
206219
sourceFolder: '$(System.ArtifactsDirectory)'
207220
contents: '**/interpret-inline.js'
208221
targetFolder: 'python/interpret-core/interpret/lib'
209222
flattenFolders: true
210223
displayName: 'Move Interpret inline to Python layer'
211224
- task: UsePythonVersion@0
225+
condition: succeeded()
212226
inputs:
213227
versionSpec: '$(python.version)'
214228
architecture: 'x64'
215229
- script: python -m pip install --upgrade pip setuptools wheel
230+
condition: succeeded()
216231
displayName: 'Install tools'
217-
- script: |
218-
python -m pip install numpy scipy
232+
- script: python -m pip install numpy scipy
233+
condition: succeeded()
219234
workingDirectory: python/interpret-core
220235
displayName: 'Install numpy/scipy first for mis-specified pip packages.'
221236
- script: |
222237
mkdir -p ~/.matplotlib
223238
echo "backend: TkAgg" >> ~/.matplotlib/matplotlibrc
224-
condition: in(variables['image.name'], 'macOS-10.13')
239+
condition: startsWith(variables['image.name'], 'macOS')
225240
displayName: 'Matplotlib patch for mac.'
226-
- script: |
227-
python -m pip install -r dev-requirements.txt
228-
workingDirectory: python/interpret-core
229-
displayName: 'Install requirements (Windows)'
230-
condition: in(variables['image.name'], 'windows-2019')
231-
- script: |
232-
export LIBRARY_PATH=$LIBRARY_PATH:/opt/hostedtoolcache/Python/3.6.9/x64/lib
233-
export LIBRARY_PATH=$LIBRARY_PATH:/opt/hostedtoolcache/Python/3.7.4/x64/lib
234-
python -m pip install -r dev-requirements.txt
241+
- script: python -m pip install -r dev-requirements.txt
242+
condition: succeeded()
235243
workingDirectory: python/interpret-core
236-
displayName: 'Install requirements (Non-Windows)'
237-
condition: in(variables['image.name'], 'ubuntu-16.04', 'macOS-10.13')
244+
displayName: 'Install requirements (Linux/Mac/Windows)'
238245
- script: |
239246
set PATH=%PATH%;%GeckoWebDriver%
240247
python -m pytest -vv -n auto --runslow --runselenium --doctest-modules --junitxml=junit/test-results.xml --cov=com --cov-report=xml --cov-report=html
241248
workingDirectory: python/interpret-core
242-
condition: in(variables['image.name'], 'windows-2019')
249+
condition: startsWith(variables['image.name'], 'windows')
243250
displayName: 'Run pytest (Windows)'
244251
- script: |
245252
python -m pytest -vv -n auto --runslow --doctest-modules --junitxml=junit/test-results.xml --cov=com --cov-report=xml --cov-report=html
246253
workingDirectory: python/interpret-core
247-
condition: in(variables['image.name'], 'ubuntu-16.04', 'macOS-10.13')
248-
displayName: 'Run pytest (Non-Windows)'
254+
condition: or(startsWith(variables['image.name'], 'ubuntu'), startsWith(variables['image.name'], 'macOS'))
255+
displayName: 'Run pytest (Linux/Mac)'
249256
- task: PublishTestResults@2
250257
condition: succeededOrFailed()
251258
inputs:
@@ -257,13 +264,11 @@ jobs:
257264
codeCoverageTool: Cobertura
258265
summaryFileLocation: '$(System.DefaultWorkingDirectory)/**/coverage.xml'
259266
reportDirectory: '$(System.DefaultWorkingDirectory)/**/htmlcov'
260-
condition: in(variables['image.name'], 'windows-2019')
267+
condition: startsWith(variables['image.name'], 'windows')
261268
displayName: 'Publish test coverage results'
262269

263270
- job: 'Publish_Package'
264-
dependsOn:
265-
- 'Test'
266-
- 'Test_Native'
271+
dependsOn: ['Test_Native', 'Test']
267272
pool:
268273
vmImage: 'ubuntu-16.04'
269274
steps:

core/CachedThreadResources.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#ifndef CACHED_THREAD_RESOURCES_H
66
#define CACHED_THREAD_RESOURCES_H
77

8+
#include <vector>
89
#include <queue>
910
#include <stdlib.h> // malloc, realloc, free
1011
#include <stddef.h> // size_t, ptrdiff_t
@@ -19,7 +20,7 @@ class CompareTreeNodeSplittingGain final {
1920
public:
2021
// TODO : check how efficient this is. Is there a faster way to do this
2122
constexpr bool operator() (const TreeNode<bClassification> * const & lhs, const TreeNode<bClassification> * const & rhs) const {
22-
return rhs->m_UNION.afterExaminationForPossibleSplitting.splitGain < lhs->m_UNION.afterExaminationForPossibleSplitting.splitGain;
23+
return rhs->m_UNION.m_afterExaminationForPossibleSplitting.m_splitGain < lhs->m_UNION.m_afterExaminationForPossibleSplitting.m_splitGain;
2324
}
2425
};
2526

@@ -103,15 +104,15 @@ class CachedTrainingThreadResources {
103104
return m_aThreadByteBuffer1;
104105
}
105106

106-
// TODO : we can probably avoid redoing any tree growing IF realloc doesn't move the memory since all the internal pointers would still be valid in that case
107107
EBM_INLINE bool GrowThreadByteBuffer2(const size_t cByteBoundaries) {
108108
// by adding cByteBoundaries and shifting our existing size, we do 2 things:
109109
// 1) we ensure that if we have zero size, we'll get a non-zero size after the shift
110110
// 2) we'll always get back an odd number of items, which is good because we always have an odd number of TreeNodeChildren
111111
EBM_ASSERT(0 == m_cThreadByteBufferCapacity2 % cByteBoundaries);
112112
m_cThreadByteBufferCapacity2 = cByteBoundaries + (m_cThreadByteBufferCapacity2 << 1);
113113
LOG_N(TraceLevelInfo, "Growing CachedTrainingThreadResources::ThreadByteBuffer2 to %zu", m_cThreadByteBufferCapacity2);
114-
// TODO : can we use malloc here? We only need realloc if we need to keep the existing data
114+
// TODO : use malloc here. our tree objects have internal pointers, so we're going to dispose of our work anyways
115+
// There is no way to check if the array was re-allocated or not without invoking undefined behavior, so we don't get a benefit if the array can be resized with realloc
115116
void * const aNewThreadByteBuffer = realloc(m_aThreadByteBuffer2, m_cThreadByteBufferCapacity2);
116117
if(UNLIKELY(nullptr == aNewThreadByteBuffer)) {
117118
// according to the realloc spec, if realloc fails to allocate the new memory, it returns nullptr BUT the old memory is valid.

core/DataSetByFeature.h

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@
22
// Licensed under the MIT license.
33
// Author: Paul Koch <[email protected]>
44

5-
#ifndef DATA_SET_INTERNAL_H
6-
#define DATA_SET_INTERNAL_H
5+
#ifndef DATA_SET_BY_FEATURE_H
6+
#define DATA_SET_BY_FEATURE_H
77

88
#include <stddef.h> // size_t, ptrdiff_t
99

@@ -12,7 +12,6 @@
1212
#include "Logging.h" // EBM_ASSERT & LOG
1313
#include "FeatureCore.h"
1414

15-
// TODO: rename this to DataSetByFeature
1615
class DataSetByFeature final {
1716
const FractionalDataType * const m_aResidualErrors;
1817
const StorageDataTypeCore * const * const m_aaInputData;
@@ -32,9 +31,8 @@ class DataSetByFeature final {
3231
EBM_ASSERT(nullptr != m_aResidualErrors);
3332
return m_aResidualErrors;
3433
}
35-
// TODO: we can change this to take the m_iInputData value directly, which we get from the user! (this also applies to the other dataset)
36-
// TODO: rename this to GetInputDataPointer
37-
EBM_INLINE const StorageDataTypeCore * GetDataPointer(const FeatureCore * const pFeature) const {
34+
// TODO: we can change this to take the m_iFeatureData value directly, which we get from a loop index
35+
EBM_INLINE const StorageDataTypeCore * GetInputDataPointer(const FeatureCore * const pFeature) const {
3836
EBM_ASSERT(nullptr != pFeature);
3937
EBM_ASSERT(pFeature->m_iFeatureData < m_cFeatures);
4038
EBM_ASSERT(nullptr != m_aaInputData);
@@ -48,4 +46,4 @@ class DataSetByFeature final {
4846
}
4947
};
5048

51-
#endif // DATA_SET_INTERNAL_H
49+
#endif // DATA_SET_BY_FEATURE_H

core/DataSetByFeatureCombination.cpp

Lines changed: 12 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,7 @@ EBM_INLINE static const StorageDataTypeCore * const * ConstructInputData(const s
183183

184184
EBM_ASSERT(nullptr != aInputDataFrom);
185185

186-
const FeatureCombinationCore::FeatureCombinationEntry * pFeatureCombinationEntry = &pFeatureCombination->m_FeatureCombinationEntry[0];
186+
const FeatureCombinationCore::FeatureCombinationEntry * pFeatureCombinationEntry = ARRAY_TO_POINTER_CONST(pFeatureCombination->m_FeatureCombinationEntry);
187187
InputDataPointerAndCountBins dimensionInfo[k_cDimensionsMax];
188188
InputDataPointerAndCountBins * pDimensionInfo = &dimensionInfo[0];
189189
EBM_ASSERT(0 < cFeatures);
@@ -263,28 +263,25 @@ EBM_INLINE static const StorageDataTypeCore * const * ConstructInputData(const s
263263
}
264264

265265
DataSetByFeatureCombination::DataSetByFeatureCombination(const bool bAllocateResidualErrors, const bool bAllocatePredictorScores, const bool bAllocateTargetData, const size_t cFeatureCombinations, const FeatureCombinationCore * const * const apFeatureCombination, const size_t cInstances, const IntegerDataType * const aInputDataFrom, const void * const aTargets, const FractionalDataType * const aPredictorScoresFrom, const size_t cVectorLength)
266-
: m_aResidualErrors(bAllocateResidualErrors ? ConstructResidualErrors(cInstances, cVectorLength) : static_cast<FractionalDataType *>(INVALID_POINTER))
267-
, m_aPredictorScores(bAllocatePredictorScores ? ConstructPredictorScores(cInstances, cVectorLength, aPredictorScoresFrom) : static_cast<FractionalDataType *>(INVALID_POINTER))
268-
, m_aTargetData(bAllocateTargetData ? ConstructTargetData(cInstances, static_cast<const IntegerDataType *>(aTargets)) : static_cast<const StorageDataTypeCore *>(INVALID_POINTER))
266+
: m_aResidualErrors(bAllocateResidualErrors ? ConstructResidualErrors(cInstances, cVectorLength) : static_cast<FractionalDataType *>(nullptr))
267+
, m_aPredictorScores(bAllocatePredictorScores ? ConstructPredictorScores(cInstances, cVectorLength, aPredictorScoresFrom) : static_cast<FractionalDataType *>(nullptr))
268+
, m_aTargetData(bAllocateTargetData ? ConstructTargetData(cInstances, static_cast<const IntegerDataType *>(aTargets)) : static_cast<const StorageDataTypeCore *>(nullptr))
269269
, m_aaInputData(0 == cFeatureCombinations ? nullptr : ConstructInputData(cFeatureCombinations, apFeatureCombination, cInstances, aInputDataFrom))
270270
, m_cInstances(cInstances)
271-
, m_cFeatureCombinations(cFeatureCombinations) {
272-
271+
, m_cFeatureCombinations(cFeatureCombinations)
272+
, m_bAllocateResidualErrors(bAllocateResidualErrors)
273+
, m_bAllocatePredictorScores(bAllocatePredictorScores)
274+
, m_bAllocateTargetData(bAllocateTargetData) {
273275
EBM_ASSERT(0 < cInstances);
274276
}
275277

276278
DataSetByFeatureCombination::~DataSetByFeatureCombination() {
277279
LOG_0(TraceLevelInfo, "Entered ~DataSetByFeatureCombination");
278280

279-
if(INVALID_POINTER != m_aResidualErrors) {
280-
free(m_aResidualErrors);
281-
}
282-
if(INVALID_POINTER != m_aPredictorScores) {
283-
free(m_aPredictorScores);
284-
}
285-
if(INVALID_POINTER != m_aTargetData) {
286-
free(const_cast<StorageDataTypeCore *>(m_aTargetData));
287-
}
281+
free(m_aResidualErrors);
282+
free(m_aPredictorScores);
283+
free(const_cast<StorageDataTypeCore *>(m_aTargetData));
284+
288285
if(nullptr != m_aaInputData) {
289286
EBM_ASSERT(0 < m_cFeatureCombinations);
290287
const StorageDataTypeCore * const * paInputData = m_aaInputData;

0 commit comments

Comments
 (0)