
Commit 377e4e0 (parent 4953c18)

chore: prepare v0.1.3 release

- docs: update README, wiki content, changelog
- feat: add PyPI publish workflow via Trusted Publisher (OIDC)
- fix: seaborn y= for utils; clarify Mahalanobis & transformations
- chore: bump version to 0.1.3; downgrade classifier to Beta

File tree: 11 files changed, +264 -88 lines

.github/workflows/publish.yml

Lines changed: 48 additions & 0 deletions
@@ -1,3 +1,51 @@
+name: Publish to PyPI
+
+on:
+  push:
+    tags:
+      - 'v*'
+
+permissions:
+  contents: read
+  id-token: write
+
+jobs:
+  test-build-publish:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check out code
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+          pip install pytest build
+
+      - name: Run tests (headless)
+        env:
+          MPLBACKEND: Agg
+        run: pytest -q
+
+      - name: Verify tag matches version
+        run: |
+          VERSION=$(python -c "import re; print(re.search(r\"__version__\s*=\s*'([^']+)'\", open('statclean/__init__.py').read()).group(1))")
+          TAG=${GITHUB_REF_NAME#v}
+          echo "Version: $VERSION | Tag: $TAG"
+          if [ "$VERSION" != "$TAG" ]; then echo "Tag ($TAG) does not match version ($VERSION)" && exit 1; fi
+
+      - name: Build package
+        run: python -m build
+
+      - name: Publish to PyPI (Trusted Publisher)
+        uses: pypa/gh-action-pypi-publish@release/v1
+        with:
+          skip-existing: true
 name: Publish to PyPI and GitHub Release
 
 on:
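For reference, the tag/version guard in the workflow above can be reproduced locally before pushing a release tag. A minimal sketch (assumptions: run from the repository root, with the tag name passed by hand instead of coming from `GITHUB_REF_NAME`):

```python
# Local sketch of the "Verify tag matches version" step above.
# Assumption: executed from the repo root, e.g. `python check_tag.py v0.1.3`.
import re
import sys

def check_tag(tag: str) -> None:
    source = open('statclean/__init__.py').read()
    version = re.search(r"__version__\s*=\s*'([^']+)'", source).group(1)
    expected = tag[1:] if tag.startswith('v') else tag  # mirrors ${GITHUB_REF_NAME#v}
    print(f"Version: {version} | Tag: {expected}")
    if version != expected:
        sys.exit(f"Tag ({expected}) does not match version ({version})")

if __name__ == '__main__':
    check_tag(sys.argv[1] if len(sys.argv) > 1 else 'v0.1.3')
```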

CHANGELOG.md

Lines changed: 21 additions & 0 deletions
@@ -5,6 +5,27 @@ All notable changes to StatClean will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.1.3] - 2025-08-08
+
+### Changed
+- Align docs/examples with actual API: remover methods return `self`; retrieve cleaned data via `cleaner.clean_df`.
+- Grubbs/Dixon docs updated to use keys `statistic` and `is_outlier`.
+- Clarified Mahalanobis `chi2_threshold` semantics; now accepts percentile in (0,1] or absolute chi-square statistic.
+- Seaborn plotting updated to explicit `y=`/`x=` to improve compatibility.
+- Transformations (`Box-Cox`, `log`, `sqrt`) preserve NaN positions; Box-Cox now computes on non-NA values only.
+- `analyze_distribution` is NaN-safe and limits Shapiro sample size.
+- Improved Mahalanobis stability: condition checks, pseudoinverse fallback, optional shrinkage covariance (`use_shrinkage`).
+- Replaced prints with `warnings.warn` where appropriate.
+
+### Added
+- GitHub Actions workflow for building and publishing to PyPI on release tags.
+
+### Fixed
+- Example unpacking errors and empty DataFrame init expectation in examples/tests.
+- Modified Z-score visualization now computes and labels bounds correctly.
+
+---
+
 ## [0.1.0] - 2025-08-06
 
 ### 🎉 Initial Release of StatClean
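The first `### Changed` bullet is the main behavioural change in this release. A minimal sketch of the v0.1.3 calling pattern it describes (the DataFrame values here are illustrative, not taken from the package's docs):

```python
import pandas as pd
from statclean import StatClean

# Illustrative data: one obvious income outlier.
df = pd.DataFrame({'income': [30000, 32000, 31000, 500000],
                   'age': [25, 30, 35, 40]})
cleaner = StatClean(df)

# Remover methods now return the cleaner itself rather than a (df, info) tuple...
cleaner.remove_outliers_iqr('income')

# ...so cleaned data and per-column details are read back from attributes.
print(cleaner.clean_df.shape)
print(cleaner.outlier_info['income'])
```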

README.md

Lines changed: 52 additions & 10 deletions
@@ -55,6 +55,11 @@ df = pd.DataFrame({
     'age': [25, 30, 35, 40, 35, 45, 50]
 })
 
+"""
+Note: As of v0.1.3, remover methods return the cleaner instance for method chaining.
+Access cleaned data via `cleaner.clean_df` and details via `cleaner.outlier_info`.
+"""
+
 # Initialize StatClean
 cleaner = StatClean(df)
 
@@ -73,21 +78,23 @@ print(f"Outliers removed: {info['income']['outliers_removed']}")
 ```python
 # Grubbs' test for outliers with statistical significance
 result = cleaner.grubbs_test('income', alpha=0.05)
-print(f"Test statistic: {result['test_statistic']:.3f}")
+print(f"Test statistic: {result['statistic']:.3f}")
 print(f"P-value: {result['p_value']:.6f}")
-print(f"Outlier detected: {result['outlier_detected']}")
+print(f"Outlier detected: {result['is_outlier']}")
 
 # Dixon's Q-test for small samples
 result = cleaner.dixon_q_test('age', alpha=0.05)
-print(f"Q statistic: {result['q_statistic']:.3f}")
+print(f"Q statistic: {result['statistic']:.3f}")
 print(f"Critical value: {result['critical_value']:.3f}")
 ```
 
 ### Multivariate Outlier Detection
 
 ```python
 # Mahalanobis distance for multivariate outliers
-outliers = cleaner.detect_outliers_mahalanobis(['income', 'age'], chi2_threshold=0.95)
+# chi2_threshold can be a percentile (0<val<=1) or absolute chi-square statistic
+# use_shrinkage=True uses Ledoit–Wolf shrinkage covariance if scikit-learn is installed
+outliers = cleaner.detect_outliers_mahalanobis(['income', 'age'], chi2_threshold=0.95, use_shrinkage=True)
 print(f"Multivariate outliers detected: {outliers.sum()}")
 
 # Remove multivariate outliers
@@ -99,12 +106,12 @@ cleaned_df = cleaner.remove_outliers_mahalanobis(['income', 'age'])
 ```python
 # Automatic transformation recommendation
 recommendation = cleaner.recommend_transformation('income')
-print(f"Recommended transformation: {recommendation['best_transformation']}")
-print(f"Improvement in skewness: {recommendation['skewness_improvement']:.3f}")
+print(f"Recommended transformation: {recommendation['recommended_method']}")
+print(f"Improvement in skewness: {recommendation['expected_improvement']:.3f}")
 
 # Apply Box-Cox transformation
-transformed_df = cleaner.transform_boxcox('income')
-print(f"Optimal lambda: {transformed_df['lambda']:.3f}")
+_, info = cleaner.transform_boxcox('income')
+print(f"Optimal lambda: {info['lambda']:.3f}")
 
 # Method chaining for complex workflows
 result = (cleaner
@@ -263,10 +270,19 @@ for feature in features:
 - **seaborn**: ≥0.11.0
 - **scipy**: ≥1.6.0 (for statistical tests)
 - **tqdm**: ≥4.60.0 (for progress bars)
-- **scikit-learn**: ≥0.24.0 (optional, for examples)
+- **scikit-learn**: ≥0.24.0 (optional, for shrinkage covariance in Mahalanobis)
 
 ## Changelog
 
+### Version 0.1.3 (2025-08-08)
+
+- Align docs/examples with actual API: remover methods return `self`; use `cleaner.clean_df` and `cleaner.outlier_info`.
+- Grubbs/Dixon result keys clarified: `statistic`, `is_outlier`.
+- Mahalanobis `chi2_threshold` accepts percentile (0<val<=1) or absolute chi-square statistic; added `use_shrinkage` option.
+- Transformations preserve NaNs; Box-Cox computed on non-NA values only.
+- Seaborn plotting calls updated for compatibility; analysis functions made NaN-safe.
+- Added GitHub Actions workflow to publish to PyPI on releases.
+
 ### Version 0.1.0 (2025-08-06)
 
 **🎉 Initial Release of StatClean**
@@ -319,4 +335,30 @@ MIT License
 
 ---
 
-*StatClean: Where statistical rigor meets practical data science.*
+*StatClean: Where statistical rigor meets practical data science.*
+
+## Development: Run Tests in Headless Mode and Capture Logs
+
+```bash
+# Ensure a headless matplotlib backend and run tests quietly
+export MPLBACKEND=Agg
+pytest -q
+
+# Save a timestamped test log (example)
+LOG=cursor_logs/test_log.md
+mkdir -p cursor_logs
+echo "==== $(date) ====\n" >> "$LOG"
+MPLBACKEND=Agg pytest -q 2>&1 | tee -a "$LOG"
+```
+
+## Continuous Delivery: Publish to PyPI (Trusted Publisher)
+
+This repository includes a GitHub Actions workflow using PyPI Trusted Publisher (OIDC).
+
+Setup (one-time on PyPI):
+- Add this GitHub repo as a Trusted Publisher in the PyPI project settings.
+
+Release steps:
+1. Bump version in `statclean/__init__.py` and `setup.py` (already `0.1.3`).
+2. Push a tag matching the version, e.g., `git tag v0.1.3 && git push origin v0.1.3`.
+3. Workflow will run tests, build, and publish to PyPI without storing credentials.
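The method-chaining snippet in the README above is cut off at `result = (cleaner`. A short sketch of what such a chain can look like now that removers return the cleaner (the column choices are illustrative):

```python
# Illustrative chain: each remover returns the cleaner, so calls compose left to right.
result = (cleaner
          .remove_outliers_zscore('age')
          .remove_outliers_iqr('income'))

cleaned_df = result.clean_df      # cleaned data lives on the instance
details = result.outlier_info     # per-column removal details
```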

docs/api-reference.md

Lines changed: 5 additions & 4 deletions
@@ -45,12 +45,13 @@ Detect outliers using Modified Z-score (MAD-based) method.
 **Returns:**
 - `pandas.Series`: Boolean mask indicating outliers
 
-#### `detect_outliers_mahalanobis(columns, chi2_threshold=0.95)`
+#### `detect_outliers_mahalanobis(columns, chi2_threshold=None, use_shrinkage=False)`
 Detect multivariate outliers using Mahalanobis distance.
 
 **Parameters:**
 - `columns` (list): List of column names for multivariate analysis
-- `chi2_threshold` (float): Chi-square threshold percentile
+- `chi2_threshold` (float): If `None`, defaults to 97.5th percentile; if `0 < value <= 1`, treated as percentile; otherwise treated as absolute chi-square threshold
+- `use_shrinkage` (bool): Use Ledoit–Wolf shrinkage covariance estimator when available (requires scikit-learn); falls back to sample covariance otherwise
 
 **Returns:**
 - `pandas.Series`: Boolean mask indicating outliers
@@ -86,7 +87,7 @@ Perform Grubbs' test for outliers with statistical significance.
 - `two_sided` (bool): Whether to perform two-sided test
 
 **Returns:**
-- `dict`: Test results including p-value, test statistic, critical value
+- `dict`: Test results including `statistic`, `p_value`, `critical_value`, `is_outlier`, `outlier_value`, `outlier_index`
 
 #### `dixon_q_test(column, alpha=0.05)`
 Perform Dixon's Q-test for small samples (n < 30).
@@ -96,7 +97,7 @@ Perform Dixon's Q-test for small samples (n < 30).
 - `alpha` (float): Significance level
 
 **Returns:**
-- `dict`: Test results including Q-statistic, critical value, p-value
+- `dict`: Test results including `statistic`, `critical_value`, `p_value`, `is_outlier`
 
 ### Data Transformations
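A short sketch of the two `chi2_threshold` forms documented above. Hedged: it assumes the percentile form is evaluated against a chi-square distribution with degrees of freedom equal to the number of columns (the usual convention for squared Mahalanobis distances), and that `cleaner` is a `StatClean` instance as in the other examples:

```python
from scipy.stats import chi2

# Percentile form: a value in (0, 1] is read as a chi-square percentile.
outliers_pct = cleaner.detect_outliers_mahalanobis(['income', 'age'],
                                                   chi2_threshold=0.975)

# Absolute form: a value > 1 is read as the chi-square cutoff itself.
# With 2 columns, chi2.ppf(0.975, df=2) ≈ 7.38, so this call is intended to
# flag the same rows as the percentile form above.
outliers_abs = cleaner.detect_outliers_mahalanobis(['income', 'age'],
                                                   chi2_threshold=chi2.ppf(0.975, df=2))
```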

docs/examples.md

Lines changed: 7 additions & 8 deletions
@@ -29,11 +29,11 @@ print(f"Cleaned shape: {cleaned_df.shape}")
 # Formal statistical testing
 grubbs_result = cleaner.grubbs_test('income', alpha=0.05)
 print(f"P-value: {grubbs_result['p_value']:.6f}")
-print(f"Outlier detected: {grubbs_result['outlier_detected']}")
+print(f"Outlier detected: {grubbs_result['is_outlier']}")
 
 # Dixon's Q-test for small samples
 dixon_result = cleaner.dixon_q_test('age', alpha=0.05)
-print(f"Q-statistic: {dixon_result['q_statistic']:.3f}")
+print(f"Statistic: {dixon_result['statistic']:.3f}")
 ```
 
 ## Multivariate Analysis Example
@@ -52,11 +52,11 @@ cleaner.remove_outliers_mahalanobis(['income', 'age'])
 ```python
 # Automatic transformation recommendation
 recommendation = cleaner.recommend_transformation('income')
-print(f"Best transformation: {recommendation['best_transformation']}")
+print(f"Best transformation: {recommendation['recommended_method']}")
 
 # Apply Box-Cox transformation
-transformed = cleaner.transform_boxcox('income')
-print(f"Optimal lambda: {transformed['lambda']:.3f}")
+_, info = cleaner.transform_boxcox('income')
+print(f"Optimal lambda: {info['lambda']:.3f}")
 ```
 
 ## Method Chaining Example
@@ -80,9 +80,8 @@ print(f"Recommended method: {analysis['recommended_method']}")
 
 # Compare detection methods
 comparison = cleaner.compare_methods(['income'])
-print("Method Agreement:")
-for method, stats in comparison['income']['method_stats'].items():
-    print(f"  {method}: {stats['outliers_detected']} outliers")
+print("Method Comparison Summary:")
+print(comparison['income']['summary'])
 ```
 
 ## Visualization Example

examples/comprehensive_demo.py

Lines changed: 13 additions & 7 deletions
@@ -89,10 +89,13 @@ def test_initialization(results: TestResults, df: pd.DataFrame):
         cleaner_preserve = StatClean(df, preserve_index=True)
         results.add_pass("Initialization with preserve_index=True")
 
-        # Test with empty DataFrame
-        empty_df = pd.DataFrame()
-        cleaner_empty = StatClean(empty_df)
-        results.add_pass("Initialization with empty DataFrame")
+        # Test with empty DataFrame should raise ValueError
+        try:
+            empty_df = pd.DataFrame()
+            StatClean(empty_df)
+            results.add_fail("Initialization with empty DataFrame", "Expected ValueError not raised")
+        except ValueError:
+            results.add_pass("Initialization with empty DataFrame", "Correctly raised ValueError")
 
     except Exception as e:
         results.add_fail("Initialization", str(e))
@@ -147,23 +150,26 @@ def test_outlier_detection_methods(results: TestResults, df: pd.DataFrame):
 
     # Test IQR method
     try:
-        cleaned_df, info = cleaner.remove_outliers_iqr(test_column)
+        cleaner.remove_outliers_iqr(test_column)
+        info = cleaner.outlier_info[test_column]
         results.add_pass("remove_outliers_iqr",
                          f"Removed {info['num_outliers']} outliers, {info['percent_removed']:.1f}%")
     except Exception as e:
         results.add_fail("remove_outliers_iqr", str(e))
 
     # Test Z-score method
    try:
-        cleaned_df, info = cleaner.remove_outliers_zscore(test_column)
+        cleaner.remove_outliers_zscore(test_column)
+        info = cleaner.outlier_info[test_column]
         results.add_pass("remove_outliers_zscore",
                          f"Removed {info['num_outliers']} outliers, {info['percent_removed']:.1f}%")
     except Exception as e:
         results.add_fail("remove_outliers_zscore", str(e))
 
     # Test Modified Z-score method
     try:
-        cleaned_df, info = cleaner.remove_outliers_modified_zscore(test_column)
+        cleaner.remove_outliers_modified_zscore(test_column)
+        info = cleaner.outlier_info[test_column]
         results.add_pass("remove_outliers_modified_zscore",
                          f"Removed {info['num_outliers']} outliers, {info['percent_removed']:.1f}%")
     except Exception as e:

setup.py

Lines changed: 2 additions & 2 deletions
@@ -4,7 +4,7 @@
 
 setup(
     name="statclean",
-    version="0.1.2",
+    version="0.1.3",
     author="Subashanan Nair",
     author_email="[email protected]",
     description="A comprehensive statistical data preprocessing and outlier detection library with formal statistical testing and publication-quality reporting",
@@ -21,7 +21,7 @@
     },
     packages=find_packages(),
     classifiers=[
-        "Development Status :: 5 - Production/Stable",
+        "Development Status :: 4 - Beta",
         "Intended Audience :: Science/Research",
         "License :: OSI Approved :: MIT License",
         "Operating System :: OS Independent",

statclean/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@
 Designed for academic research, data science, and statistical analysis.
 """
 
-__version__ = '0.1.0'
+__version__ = '0.1.3'
 __author__ = 'Subashanan Nair'
 
 from .cleaner import StatClean
