mancusolab · zeyunlu · Feb 4, 2026 · Feb 3, 2026 · Feb 3, 2026 · Feb 3, 2026
diff --git a/.gitignore b/.gitignore
@@ -5,6 +5,7 @@
 __pycache__/
 
 .vscode/
+.claude/
 
 *.metadata2.mmm
 test_result*

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -2,7 +2,7 @@ exclude: '^(docs/conf.py|tests/testdata/.*)'
 
 repos:
 - repo: https://github.com/pre-commit/pre-commit-hooks
-  rev: v4.0.1
+  rev: v4.5.0
   hooks:
   - id: trailing-whitespace
   - id: check-added-large-files
@@ -18,25 +18,24 @@ repos:
     args: ['--fix=auto']  # replace 'auto' with 'lf' to enforce Linux/Mac line endings or 'crlf' for Windows
 
 - repo: https://github.com/pycqa/isort
-  rev: 5.10.1
+  rev: 5.13.2
   hooks:
   - id: isort
 
 - repo: https://github.com/psf/black
-  rev: 21.11b1
+  rev: 24.3.0
   hooks:
   - id: black
     language_version: python3
-    additional_dependencies: ['click==8.0.4']
 
 - repo: https://github.com/PyCQA/flake8
-  rev: 4.0.1
+  rev: 7.0.0
   hooks:
   - id: flake8
   ## You can add flake8 plugins via `additional_dependencies`:
   #  additional_dependencies: [flake8-bugbear]
 
 - repo: https://github.com/pre-commit/mirrors-mypy
-  rev: 'v0.971'  # Use the sha / tag you want to point at
+  rev: 'v1.8.0'
   hooks:
   - id: mypy
diff --git a/README.md b/README.md
diff --git a/docs/conf.py b/docs/conf.py
@@ -108,11 +108,11 @@
 }
 
 python_apigen_default_groups = [
-    (r".*:sushie.cli.*", "CLI Public-members"),
-    (r".*:sushie.utils.*", "Utils Public-members"),
-    (r".*:sushie.infer.*", "Infer Public-members"),
+    (r".*:sushie.cli.*", "CLI Public-Members"),
+    (r".*:sushie.utils.*", "Utils Public-Members"),
+    (r".*:sushie.infer.*", "Infer Public-Members"),
     (r"class:sushie.infer.*", "Infer Classes"),
-    (r".*:sushie.io.*", "IO Public-members"),
+    (r".*:sushie.io.*", "IO Public-Members"),
     (r"class:sushie.io.*", "IO Classes"),
 ]
 

diff --git a/docs/contributing.rst b/docs/contributing.rst
@@ -72,6 +72,28 @@ and use Python's built-in web server for a preview in your web browser
 
     python3 -m http.server --directory 'docs/_build/html'
 
+.. important::
+
+   **Building API documentation locally:** The API reference documentation
+   requires importing the ``sushie`` package and all its dependencies. If you
+   see warnings like ``Failed to import module sushie.infer``, you need to
+   install the package first::
+
+       # Create and activate a conda environment with dependencies
+       conda create -n sushie-docs python=3.10
+       conda activate sushie-docs
+
+       # Install sushie and all dependencies
+       pip install -r requirements.txt -r requirements_dev.txt
+       pip install -e .
+
+       # Now build the docs
+       make -C docs html
+
+   Alternatively, use tox with the ``-r`` flag to recreate the environment::
+
+       tox -r -e docs
+
 
 Code Contributions
 ==================
@@ -215,7 +237,7 @@ package:
     tox -r -e docs
 
 #. Make sure to have a reliable |tox|_ installation that uses the correct
-   Python version (e.g., 3.7+). When in doubt you can run::
+   Python version (e.g., 3.8+). When in doubt you can run::
 
     tox --version
     # OR

diff --git a/docs/faq.rst b/docs/faq.rst
@@ -0,0 +1,160 @@
+===
+FAQ
+===
+
+Frequently asked questions about SuShiE.
+
+General Questions
+=================
+
+What is SuShiE?
+---------------
+SuShiE (Sum of Shared Single Effect) is a Bayesian fine-mapping method designed for multi-ancestry genetic studies. It identifies causal variants while accounting for effect size correlations across ancestries.
+
+When should I use SuShiE vs single-ancestry SuSiE?
+--------------------------------------------------
+Use SuShiE when you have genetic data from multiple ancestries and want to:
+
+- Leverage shared genetic architecture across populations
+- Improve fine-mapping resolution through diverse LD patterns
+- Estimate effect size correlations across ancestries
+
+Use single-ancestry SuSiE (via ``--meta`` or ``--mega`` flags) when:
+
+- You only have data from one ancestry
+- You want to compare results with multi-ancestry SuShiE
+
+What data formats does SuShiE support?
+--------------------------------------
+**Genotype data:**
+
+- PLINK (.bed/.bim/.fam)
+- VCF (.vcf, .vcf.gz)
+- BGEN (.bgen)
+
+**Phenotype/Covariate data:**
+
+- Tab-separated text files
+
+**Summary statistics:**
+
+- Tab-separated GWAS files with Z-scores or effect sizes
+
+Parameters
+==========
+
+How do I choose the number of causal variants (L)?
+--------------------------------------------------
+The ``-L`` parameter specifies the maximum number of causal variants. Recommendations:
+
+- Start with ``-L 10`` (default) for most fine-mapping analyses
+- Increase if you expect more causal variants in the region
+- The algorithm will only identify credible sets for true signals
+
+What does the ``--rho`` parameter do?
+-------------------------------------
+The ``--rho`` parameter sets the prior correlation of effect sizes across ancestries:
+
+- ``--rho 0.0``: Assumes independent effects (equivalent to ``--indep``)
+- ``--rho 1.0``: Assumes perfectly correlated effects
+- Default: Learns correlation from data (recommended)
+
+When should I use ``--no-update``?
+----------------------------------
+Use ``--no-update`` to disable prior updates during inference:
+
+- When you have strong prior beliefs about effect distributions
+- For faster runtime on large datasets
+- When empirical Bayes updates cause convergence issues
+
+Output Files
+============
+
+What is a credible set?
+-----------------------
+A credible set is a set of SNPs that contains the causal variant with high probability (default 95%). SuShiE outputs:
+
+- SNPs in each credible set
+- Posterior inclusion probabilities (PIPs)
+- Purity scores (minimum LD among SNPs in the set)
+
+How do I interpret the PIP?
+---------------------------
+The Posterior Inclusion Probability (PIP) is the posterior probability that a SNP is causal, given the data and the fitted model:
+
+- PIP > 0.95: Strong evidence for causality
+- PIP 0.5-0.95: Moderate evidence
+- PIP < 0.5: Weak evidence
+
+Higher PIPs in credible sets indicate better fine-mapping resolution.
+
+What does purity mean?
+----------------------
+Purity is the minimum absolute correlation (:math:`|r|`) between any pair of SNPs in a credible set:
+
+- High purity (>0.5): SNPs are in high LD, harder to distinguish
+- Low purity: SNPs are more independent, but set may contain false positives
+
+The ``--purity`` threshold (default 0.5) filters out low-quality credible sets.
+
+Performance
+===========
+
+How can I speed up SuShiE?
+--------------------------
+Several options to improve performance:
+
+1. **Reduce genomic region size** so that fewer SNPs enter the analysis.
+
+2. **Reduce iterations:**
+
+   .. code-block:: bash
+
+      sushie finemap --max-iter 100 ...
+
+3. **Limit SNPs** for purity calculation:
+
+   .. code-block:: bash
+
+      sushie finemap --max-select 500 ...
+
+How much memory does SuShiE need?
+---------------------------------
+Memory usage depends on:
+
+- Number of samples (N)
+- Number of SNPs (P)
+- Number of ancestries (K)
+
+Rough estimate: ``N × P × K × 8 bytes`` for genotype storage.
+
+For large datasets, consider:
+
+- Analyzing smaller genomic regions
+- Running on a high-memory compute node
+
+Errors
+======
+
+"LD matrix is not positive semi-definite"
+-----------------------------------------
+This error occurs with summary-level data when the LD matrix has numerical issues:
+
+- Ensure LD was computed from a sufficiently large reference panel
+- Check for missing or mismatched SNPs
+- Try adding a small ridge regularization term
+
+"No credible sets found"
+------------------------
+This may indicate:
+
+- No significant signal in the region
+- Too stringent purity threshold (try ``--purity 0.1``)
+- Convergence issues (check ELBO in output)
+
+"Sample sizes don't match"
+--------------------------
+For summary-level data, ensure:
+
+- The number of values passed to ``--sample-size`` matches the number of ancestries
+- Sample sizes correspond to the correct GWAS files (in order)
diff --git a/docs/files.rst b/docs/files.rst
@@ -299,19 +299,19 @@ It contains two rounds of heritability estimation:
      - 1, 2
      - The ancestry index.
    * - genetic_var
-     - Flat
+     - Float
      - 1.32
      - The variance of genetic components contributing to the complex traits. ``s_genetic_var``, which is estimated only from the SNPs in the credible sets, will be appended if credible sets are not empty after pruning for purity.
    * - h2g
-     - Flat
+     - Float
      - 0.23
      - The narrow-sense cis-heritability of the traits based on `limix <https://github.com/limix/limix>`_ definition. This includes the variance of the fixed effects.
    * - lrt_stats
-     - Flat
+     - Float
      - -123.23
      - The likelihood ratio test statistics compared the linear mixed effects model to the fixed effects model (no genetic variance). ``s_lrt_stats``, which is estimated only from the SNPs in the credible sets, will be appended if credible sets are not empty after pruning for purity.
    * - p_value
-     - Flat
+     - Float
      - -123.23
      - The :math:`p` value for the likelihood ratio test statistics based on chi-square distribution with 1 dof. ``s_p_value``, which is estimated only from the SNPs in the credible sets, will be appended if credible sets are not empty after pruning for purity.
    * - trait
@@ -338,11 +338,11 @@ By specifying ``--cv``, SuShiE outputs a ``*.cv.tsv`` file that contains the res
      - 1, 2
      - The ancestry index.
    * - rsq
-     - Flat
+     - Float
      - 0.9
      - :math:`r^2` between predicted and measured expressions from cross-validations.
    * - p_value
-     - Flat
+     - Float
      - 0.23
      - The :math:`p` value for the :math:`r^2`.
    * - N

diff --git a/docs/index.rst b/docs/index.rst
@@ -24,6 +24,7 @@ Contents
    Model <model>
    Users Manual <manual>
    Files <files>
+   FAQ <faq>
 
 
 .. toctree::
@@ -51,7 +52,7 @@ Other Software
 Feel free to use other software developed by `Mancuso
 Lab <https://www.mancusolab.com/>`_:
 
-* `jaxQTL` <https://github.com/mancusolab/jaxqtl>: jaxQTL is a single-cell eQTL mapping tool using highly efficient count-based model (i.e., negative binomial or Poisson).
+* `jaxQTL <https://github.com/mancusolab/jaxqtl>`_: jaxQTL is a single-cell eQTL mapping tool using highly efficient count-based model (i.e., negative binomial or Poisson).
 * `MA-FOCUS <https://github.com/mancusolab/ma-focus>`_: a Bayesian fine-mapping framework using statistics across multiple ancestries to identify the causal genes for complex traits.
 * `SuSiE-PCA <https://github.com/mancusolab/susiepca>`_: a scalable Bayesian variable selection technique for sparse principal component analysis
 * `twas_sim <https://github.com/mancusolab/twas_sim>`_: a Python software to simulate statistics.

diff --git a/docs/manual.rst b/docs/manual.rst
@@ -7,9 +7,20 @@ Users Manual
 Initialize Environment
 ======================
 
-SuShiE is a command-line software written in Python. Before installation, we recommend to create a new environment using `conda <https://docs.conda.io/en/latest/>`_ so that it will not affect the software versions of users' other projects.
+SuShiE is a command-line software written in Python. Before installation, we recommend creating a new environment using `conda <https://docs.conda.io/en/latest/>`_ so that it will not affect the software versions of users' other projects:
 
-SuShiE uses `JAX <https://github.com/google/jax>`_ with `Just In Time  <https://jax.readthedocs.io/en/latest/jax-101/02-jitting.html>`_ compilation to achieve high-speed computation. However, there are some `issues <https://github.com/google/jax/issues/5501>`_ for JAX with Mac M1 chip. To solve this, users need to initiate conda using `miniforge <https://github.com/conda-forge/miniforge>`_, and then install SuShiE using ``pip`` in the desired environment.
+.. code:: bash
+
+    conda create -n env-sushie python=3.8
+    conda activate env-sushie
+
+SuShiE uses `JAX <https://github.com/google/jax>`_ with `Just In Time <https://jax.readthedocs.io/en/latest/jax-101/02-jitting.html>`_ compilation to achieve high-speed computation. However, there are some `issues <https://github.com/google/jax/issues/5501>`_ for JAX on Macs with the M1 chip. To solve this, users need to install conda using `miniforge <https://github.com/conda-forge/miniforge>`_, and install ``cbgen`` from conda-forge first:
+
+.. code:: bash
+
+    conda install -c conda-forge cbgen
+
+Then install SuShiE using ``pip`` in the desired environment.
 
 Installation
 ============
@@ -31,7 +42,7 @@ Users can download the latest repository and then use ``pip``:
     cd sushie
     pip install .
 
-*We currently only support Python3.8+.*
+*We currently support Python 3.8, 3.9, 3.10, and 3.11 (stable versions).*
 
 Data Preparation
 ================
@@ -50,8 +61,8 @@ Although we highly recommend users to perform high-quality QC on their own genot
 #. Only keep SNPs that are available in all the ancestries.
 #. Adjust genotype data across ancestries based on the same reference alleles. Drop non-biallelic SNPs.
 #. Remove SNPs that have minor allele frequency (MAF) less than 1% within each ancestry (users can change 1% with ``--maf``).
-#. Users also have an option to keep ambiguous SNPs (i.e., A/T, T/A, C/G, or GC) by specifying ``--keep-ambiguous`` (Default is NOT to keep them).
-#. For single ancestry SuSiE or Mega-SuSiE, users have the option to perform rank inverse normalization transformation on the phenotype data.
+#. Users also have an option to keep ambiguous SNPs (i.e., A/T, T/A, C/G, or G/C) by specifying ``--keep-ambiguous`` (Default is NOT to keep them).
+#. Users have the option to perform rank inverse normalization transformation on the phenotype data.
 
 See :func:`sushie.cli.process_raw` for these QCs' source codes.
 
@@ -66,7 +77,7 @@ Although we highly recommend users to perform high-quality QC on their own summa
 #. Only keep SNPs that are available in all the ancestries.
 #. Adjust GWAS and genotype data across ancestries based on the same reference alleles. Drop non-biallelic SNPs.
 #. Remove SNPs (for LD computation) that have minor allele frequency (MAF) less than 1% within each ancestry (users can change 1% with ``--maf``).
-#. Users also have an option to keep ambiguous SNPs (i.e., A/T, T/A, C/G, or GC) by specifying ``--keep-ambiguous`` (Default is NOT to keep them).
+#. Users also have an option to keep ambiguous SNPs (i.e., A/T, T/A, C/G, or G/C) by specifying ``--keep-ambiguous`` (Default is NOT to keep them).
 
 
 Testing Data
@@ -456,12 +467,12 @@ Parameters
      - Float
      - 1e-3
      - ``--effect-var 5.21 0.99 ``
-     - Specify the prior for the causal effect size variance (:math:`\sigma^2_{i,b}` in :ref:`Model`) for ancestries. Values have to be positive. Use ``space`` to separate ancestries if more than two. If ``--no-update`` is specified and ``--rho`` is not, specifying this parameter will only fix ``effect-var`` as prior through optimizations and update ``rho``. If ``--effect-covar``, ``--rho``, and ``--no-update`` all three are specified, both ``--effect-covar`` and ``--rho`` will be fixed as prior through optimizations. If ``--no-update`` is specified, but neither ``--effect-covar`` nor ``--rho``, both ``--effect-covar`` and ``--rho`` will be fixed as default prior value through optimizations.
+     - Specify the prior for the causal effect size variance (:math:`\sigma^2_{i,b}` in :ref:`Model`) for ancestries. Values have to be positive. Use ``space`` to separate ancestries if more than two. If ``--no-update`` is specified and ``--rho`` is not, specifying this parameter will only fix ``effect-var`` as prior through optimizations and update ``rho``. If ``--effect-var``, ``--rho``, and ``--no-update`` all three are specified, both ``--effect-var`` and ``--rho`` will be fixed as prior through optimizations. If ``--no-update`` is specified, but neither ``--effect-var`` nor ``--rho``, both ``--effect-var`` and ``--rho`` will be fixed as default prior value through optimizations.
    * - ``--rho``
      - Float
      - 0.1
      - ``--rho 0.05``
-     - Specify the prior for the effect correlation (:math:`\rho` in :ref:`Model`) for ancestries. Default is 0.1 for each pair of ancestries. Use space to separate ancestries if more than two. Each rho has to be a float number between -1 and 1. If there are ``N > 2`` ancestries, ``X = choose(N, 2)`` is required. The rho order has to be ``rho(1,2)``, ..., ``rho(1, N)``, ``rho(2,3)``, ..., ``rho(N-1. N)``. If ``--no-update`` is specified and ``--effect-covar`` is not, specifying this parameter will only fix ``rho`` as prior through optimizations and update ``effect-covar``. If ``--effect-covar``, ``--rho``, and ``--no-update`` all three are specified, both ``--effect-covar`` and ``--rho`` will be fixed as prior through optimizations. If ``--no-update`` is specified, but neither ``--effect-covar`` nor ``--rho``, both ``--effect-covar`` and ``--rho`` will be fixed as default prior value through optimizations.
+     - Specify the prior for the effect correlation (:math:`\rho` in :ref:`Model`) for ancestries. Default is 0.1 for each pair of ancestries. Use space to separate ancestries if more than two. Each rho has to be a float number between -1 and 1. If there are ``N > 2`` ancestries, ``X = choose(N, 2)`` values are required. The rho order has to be ``rho(1,2)``, ..., ``rho(1, N)``, ``rho(2,3)``, ..., ``rho(N-1, N)``. If ``--no-update`` is specified and ``--effect-var`` is not, specifying this parameter will only fix ``rho`` as prior through optimizations and update ``effect-var``. If ``--effect-var``, ``--rho``, and ``--no-update`` all three are specified, both ``--effect-var`` and ``--rho`` will be fixed as prior through optimizations. If ``--no-update`` is specified, but neither ``--effect-var`` nor ``--rho``, both ``--effect-var`` and ``--rho`` will be fixed as default prior value through optimizations.
    * - ``--no-scale``
      - Boolean
      - False
@@ -556,7 +567,7 @@ Parameters
      - Boolean
      - False
      - ``--cv 0.5 # will store as True``
-     - Indicator to perform cross validation (CV) and output CV results (adjusted r-squared and its p-value) for future `FUSION <http://gusevlab.org/projects/fusion/>`_ pipline. Specify ``--cv`` will store ``True`` value and increase running time.
+     - Indicator to perform cross validation (CV) and output CV results (adjusted r-squared and its p-value) for future `FUSION <http://gusevlab.org/projects/fusion/>`_ pipeline. Specifying ``--cv`` will store ``True`` value and increase running time.
    * - ``--cv-num``
      - Integer
      - 5