Merge pull request #7 from remydubois/fix/readme-and-action

remydubois · web-flow · commit 07055a98d880 · 2025-12-23T15:37:50.000+01:00
Fixed reference_group, enhanced readme
diff --git a/.github/workflows/publish-package.yaml b/.github/workflows/publish-package.yaml
@@ -26,4 +26,4 @@ jobs:
     - name: Build package
       run: python -m poetry build
     - name: Publish package
-      run: python -m poetry publish -u __token__ -p ${{ secrets.PYPI_TOKEN }}
+      run: python -m poetry publish -u __token__ -p ${{ secrets.PYPI_TOKEN }}
diff --git a/.github/workflows/python-package.yaml b/.github/workflows/python-package.yaml
@@ -4,8 +4,8 @@
 name: Python package
 
 on:
-  push:
-    branches: [ "main" ]
+  # push:
+  #   branches: [ "main" ]
   pull_request:
     branches: [ "main" ]
 
diff --git a/LICENSE b/LICENSE
@@ -10,4 +10,4 @@ Copyright 2025 Rémy Dubois
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
-   limitations under the License.
+   limitations under the License.
diff --git a/README.md b/README.md
@@ -26,7 +26,7 @@ Approximate speed benchmarks ran on k562-essential can be found below. All the c
 4. This package is not intended for running out-of-core single cell data analyses like `rapids-singlecell`.
 
 ## Installation
-`illico` can be installed via pip, compatible with Python 3.12 and onward:
+`illico` can be installed via pip, compatible with Python 3.11 and onward:
 ```bash
 pip install illico -U
 ```
@@ -75,10 +75,18 @@ scanpy_port_asymptotic_wilcoxon(adata, group_keys="perturbation", reference="non
 `illico` relies on a few optimization tricks to be faster than other existing tools. It is very possible that, for some reason, the specific layout of your dataset (very small control population, very low sparsity, very small number of distinct values) results in those tricks being ineffective, or less effective than observed on the datasets used to develop & benchmark `illico`. It is also very possible that, because of this, other solutions end up faster than `illico`! If this is your case, please open an issue describing your situation.
 
 ### `illico`'s results (p-values or fold-change) do not match `pdex` or `scanpy`.
+#### Test results (p-values)
 Please open an issue, but before that: make sure that you are running **asymptotic** wilcoxon rank-sum tests as this is the only test exposed by `illico`.
 - `pdex` relies on `scipy.stats.mannwhitneyu`, whose default `method="auto"` runs the exact (non-asymptotic) test only when one of the two groups has 8 or fewer values and there are no ties.
 - `scanpy` offers the possibility to run non-tie-corrected wilcoxon rank-sum tests; make sure tie correction is enabled by passing `tie_correct=True`.
-- Also, `illico` uses continuity correction which is the best practice.
+- Also, `illico` uses continuity correction by default, which is the best practice.
+
+The test suite implemented in the CI and used to develop `illico` targets a precision of 1.e-12 compared to `scipy`, not `scanpy`. Consequently, there **will be** slight disagreement between `scanpy`'s p-values and `illico`'s p-values.
+
+#### Fold-change
+The fold-change computed by illico is the most naive form of the fold-change:
+$$\text{fold-change} = \frac{E[X_{\text{perturbed}}]}{E[X_{\text{control}}]}$$
+If your data underwent a log1p transform, `np.expm1` is applied **before** computing the expectations (means). I know that many other definitions of the fold-change exist, and adding more control over this should not be complicated. If you need a different definition, please open an issue.
 
 ### What about normalization and log1p
 1. `illico` does not care whether your data is normalized or not: it is up to you to apply the preprocessing of your choice before running the tests. `illico` is expected to be slower by a factor of ~2 when run on total-count normalized data. This is because, when applied to data that is not total-count normalized, sorting relies on radix sort, which is faster than the usual quicksort (used when testing total-count normalized data).
diff --git a/illico/asymptotic_wilcoxon.py b/illico/asymptotic_wilcoxon.py
@@ -25,7 +25,7 @@ def asymptotic_wilcoxon(
     adata: ad.AnnData,
     is_log1p: bool,
     group_keys: str,
-    reference_group: str | None = None,
+    reference: str | None = None,
     n_threads: int = 1,
     batch_size: int = 256,
     alternative: str = "two-sided",
@@ -50,7 +50,7 @@ def asymptotic_wilcoxon(
         Whether the data is log1p transformed.
     group_keys
         Key in `adata.obs` specifying the group variable.
-    reference_group
+    reference
         Name of the reference group for OVO tests. If `None`, OVR tests are performed.
     n_threads
         Number of threads to use for parallel computation.
@@ -92,13 +92,13 @@ def asymptotic_wilcoxon(
             )
 
     if precompile:
-        _precompile(X, reference_group)
+        _precompile(X, reference)
 
     # Process the groups information
     raw_groups = adata.obs[group_keys].tolist()
-    unique_raw_groups, group_container = encode_and_count_groups(groups=raw_groups, ref_group=reference_group)
+    unique_raw_groups, group_container = encode_and_count_groups(groups=raw_groups, ref_group=reference)
     logger.info(
-        f"Found {group_container.counts.size} unique groups (min size: {group_container.counts.min()} cells; max size: {group_container.counts.max()} cells), with reference group: {reference_group}"
+        f"Found {group_container.counts.size} unique groups (min size: {group_container.counts.min()} cells; max size: {group_container.counts.max()} cells), with reference group: {reference}"
     )
     _, n_genes = X.shape
 
@@ -121,7 +121,7 @@ def asymptotic_wilcoxon(
     logger.trace(f"Performing a total of {n_tests:,d} tests.")
     with Parallel(n_threads, prefer="threads", return_as="generator_unordered") as pool:
         with tqdm(total=n_tests, smoothing=0.0, unit="it", unit_scale=True, unit_divisor=1000) as pbar:
-            if reference_group is None:  # ovr use case
+            if reference is None:  # ovr use case
                 pbar.set_description("Running one-versus-all MannWhitney-U tests")
                 op = delayed(lambda *args: (ovr_mwu_over_col_contiguous_chunk(*args), args))
             else:  # ovo use case
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "illico"
-version = "0.1.0"
+version = "0.1.1"
 description = "Fast asymptotic mannwhitney-u test"
 authors = [
     {name = "remydubois",email = "remydubois14@gmail.com"}
diff --git a/tests/test_asymptotic_wilcoxon.py b/tests/test_asymptotic_wilcoxon.py
@@ -118,7 +118,7 @@ def test_asymptotic_wilcoxon(rand_adata, test, use_continuity, alternative):
         adata=rand_adata,
         is_log1p=False,
         group_keys="pert",
-        reference_group=reference,
+        reference=reference,
         use_continuity=use_continuity,
         n_threads=1,
         batch_size=16,
@@ -177,7 +177,7 @@ def test_unsorted_indices_error(rand_adata):
             adata=rand_adata,
             is_log1p=False,
             group_keys="pert",
-            reference_group="non-targeting",
+            reference="non-targeting",
             n_threads=1,
             batch_size=16,
         )
@@ -207,7 +207,7 @@ def run():
                     data,
                     is_log1p=False,
                     group_keys="gene",
-                    reference_group=reference,
+                    reference=reference,
                     n_threads=num_threads,
                     batch_size=256,
                 )
@@ -242,7 +242,7 @@ def test_speed_benchmark(adata, method, test, num_threads, benchmark, request):
 
     # Compile
     if method == "illico":
-        _precompile(adata.X, reference_group="non-targeting" if test == "ovo" else None)
+        _precompile(adata.X, reference="non-targeting" if test == "ovo" else None)
 
     params = re.match(".*\[(.*)\]", request.node.name).group(1).split("-")
     group_params = [p for i, p in enumerate(params) if i in [0, 1, 4]]
@@ -263,7 +263,7 @@ def test_memory_benchmark(adata, method, test, num_threads, request):
 
     # Compile outside of the tracker context
     if method == "illico":
-        _precompile(adata.X, reference_group="non-targeting" if test == "ovo" else None)
+        _precompile(adata.X, reference="non-targeting" if test == "ovo" else None)
 
     test_params_string = re.match(".*\[(.*)\]", request.node.name).group(1)
     outdir = Path(os.environ.get("MEMRAY_RESULTS_DIR") or Path(__file__).parents[1])