Add CLI and Hypothesis tests

DiogoRibeiro7 · DiogoRibeiro7 · commit 0f2c560a4851 · 2025-06-18T21:22:22.000+01:00
diff --git a/CITATION.cff b/CITATION.cff
@@ -0,0 +1,18 @@
+cff-version: 1.2.0
+message: "If you use this software, please cite it using the metadata below."
+
+# Citation metadata lives at the top level: CFF 1.2.0 requires `title` and
+# `authors` as root keys, so they cannot appear only under `preferred-citation`.
+type: software
+title: "gen_surv"
+version: "1.0.1"
+url: "https://github.com/DiogoRibeiro7/genSurvPy"
+authors:
+  - family-names: Ribeiro
+    given-names: Diogo
+    orcid: "https://orcid.org/0009-0001-2022-7072"
+    affiliation: "ESMAD - Instituto Politécnico do Porto"
+    email: "dfr@esmad.ipp.pt"
+license: "MIT"
+date-released: "2024-01-01"
+
diff --git a/TODO.md b/TODO.md
@@ -4,6 +4,17 @@ This document outlines future enhancements, features, and ideas for improving th
 
 ---
 
+## ✨ Priority Items
+
+- [✅] Add property-based tests using Hypothesis to cover edge cases
+- [✅] Build a CLI for generating datasets from the terminal
+- [ ] Expand documentation with multilingual support and more usage examples
+- [ ] Implement Weibull and log-logistic AFT models and add visualization utilities
+- [✅] Provide CITATION metadata for proper referencing
+- [ ] Ensure all functions include Google-style docstrings with inline comments
+
+---
+
 ## 📦 1. Interface and UX
 
 - [✅] Create a `generate(..., return_type="df" | "dict")` interface
diff --git a/gen_surv/__main__.py b/gen_surv/__main__.py
@@ -1,30 +1,4 @@
-import argparse
-import pandas as pd
-from gen_surv.cphm import gen_cphm
-from gen_surv.cmm import gen_cmm
-from gen_surv.tdcm import gen_tdcm
-from gen_surv.thmm import gen_thmm
-
-def run_example(model: str):
-    if model == "cphm":
-        df = gen_cphm(n=10, model_cens="uniform", cens_par=1.0, beta=0.5, covar=2.0)
-    elif model == "cmm":
-        df = gen_cmm(n=10, model_cens="exponential", cens_par=1.0,
-                     beta=[0.5, 0.2, -0.1], covar=2.0, rate=[0.1, 1.0, 0.2, 1.0, 0.3, 1.0])
-    elif model == "tdcm":
-        df = gen_tdcm(n=10, dist="weibull", corr=0.5, dist_par=[1, 2, 1, 2],
-                      model_cens="uniform", cens_par=0.5, beta=[0.1, 0.2, 0.3], lam=1.0)
-    elif model == "thmm":
-        df = gen_thmm(n=10, model_cens="uniform", cens_par=0.5,
-                      beta=[0.1, 0.2, 0.3], covar=1.0, rate=[0.5, 0.6, 0.7])
-    else:
-        raise ValueError(f"Unknown model: {model}")
-    
-    print(df)
+from gen_surv.cli import app
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Run gen_surv model example.")
-    parser.add_argument("model", choices=["cphm", "cmm", "tdcm", "thmm"],
-                        help="Model to run (cphm, cmm, tdcm, thmm)")
-    args = parser.parse_args()
-    run_example(args.model)
+    app()
diff --git a/gen_surv/cli.py b/gen_surv/cli.py
@@ -0,0 +1,23 @@
+import csv
+from typing import Optional
+import typer
+from gen_surv.interface import generate
+
+app = typer.Typer(help="Generate synthetic survival datasets.")
+
+@app.command()
+def dataset(
+    model: str = typer.Argument(..., help="Model to simulate [cphm, cmm, tdcm, thmm, aft_ln]"),
+    n: int = typer.Option(100, help="Number of samples"),
+    output: Optional[str] = typer.Option(None, "-o", help="Output CSV file. Prints to stdout if omitted."),
+):
+    """Generate survival data and optionally save to CSV."""
+    df = generate(model=model, n=n)
+    if output:
+        df.to_csv(output, index=False)
+        typer.echo(f"Saved dataset to {output}")
+    else:
+        typer.echo(df.to_csv(index=False))
+
+if __name__ == "__main__":
+    app()
diff --git a/tests/test_aft_property.py b/tests/test_aft_property.py
@@ -0,0 +1,22 @@
+from hypothesis import given, strategies as st
+from gen_surv.aft import gen_aft_log_normal
+
+@given(
+    n=st.integers(min_value=1, max_value=20),
+    sigma=st.floats(min_value=0.1, max_value=2.0, allow_nan=False, allow_infinity=False),
+    cens_par=st.floats(min_value=0.1, max_value=10.0, allow_nan=False, allow_infinity=False),
+    seed=st.integers(min_value=0, max_value=1000)
+)
+def test_gen_aft_log_normal_properties(n, sigma, cens_par, seed):
+    df = gen_aft_log_normal(
+        n=n,
+        beta=[0.5, -0.2],
+        sigma=sigma,
+        model_cens="uniform",
+        cens_par=cens_par,
+        seed=seed
+    )
+    assert df.shape[0] == n
+    assert set(df["status"].unique()).issubset({0, 1})
+    assert (df["time"] >= 0).all()
+    assert df.filter(regex="^X[0-9]+$").shape[1] == 2