dataforgoodfr
diff --git a/‎.github/workflows/ci.yaml‎
Lines changed: 25 additions & 0 deletions b/‎.github/workflows/ci.yaml‎
Lines changed: 25 additions & 0 deletions
diff --git a/‎.github/workflows/pre-commit.yaml‎
Lines changed: 0 additions & 20 deletions b/‎.github/workflows/pre-commit.yaml‎
Lines changed: 0 additions & 20 deletions
diff --git a/‎biolit/observations.py‎
Lines changed: 38 additions & 0 deletions b/‎biolit/observations.py‎
Lines changed: 38 additions & 0 deletions
diff --git a/‎biolit/visualisation/species_distribution.py‎
Lines changed: 1 addition & 1 deletion b/‎biolit/visualisation/species_distribution.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎pyproject.toml‎
Lines changed: 5 additions & 1 deletion b/‎pyproject.toml‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎tests/__init__.py‎ b/‎tests/__init__.py‎
diff --git a/‎tests/test_observations.py‎
Lines changed: 47 additions & 0 deletions b/‎tests/test_observations.py‎
Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,25 @@
+# Continuous-integration workflow: runs the pre-commit hooks and then the unit tests.
+# NOTE(review): the workflow and job are still named "pre-commit" although
+# they also run the test suite -- confirm whether a rename is wanted.
+name: pre-commit
+
+# Triggered by every pull request and by pushes to the main branch.
+on:
+  pull_request:
+  push:
+    branches: [main]
+
+jobs:
+  pre-commit:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v4
+    # Install uv; "latest" means the uv version is not pinned.
+    - uses: astral-sh/setup-uv@v4
+      with:
+        version: "latest"
+    # Reuse the pre-commit cache directory; the cache key changes with the
+    # runner OS and with the content of .pre-commit-config.yaml.
+    - uses: actions/cache@v4
+      with:
+        path: ~/.cache/pre-commit
+        key: pre-commit|${{ runner.os }}|${{ hashFiles('.pre-commit-config.yaml') }}
+    - name: Install dependencies
+      run: uv sync
+    - name: Run pre-commit hooks
+      run: uv run pre-commit run --all-files
+    # "--with pytest" asks uv to make pytest available for this command.
+    - name: Run unit tests
+      run: uv run --with pytest pytest -vv
@@ -1,3 +1,5 @@
+import itertools
+
 import polars as pl
 import structlog
 from polars import col
@@ -108,3 +110,39 @@ def _check_validated_non_identifiable(frame: pl.DataFrame) -> pl.DataFrame:
     with pl.Config(fmt_str_lengths=50):
         print(errors)
     return frame.filter(~filt)
+
+
+def learnable_taxonomy(
+    frame: pl.DataFrame, current_taxon: str, levels: list[str], n_learnable: int
+) -> list[str]:
+    """
+    List the lowest taxonomic levels that have enough observations.
+
+    The children of ``current_taxon`` are read from the first column named in
+    ``levels`` (or from ``nom_scientifique`` once ``levels`` is empty). A child
+    whose summed ``n_obs`` reaches ``n_learnable`` is explored one level
+    deeper; the children below the threshold are merged into a single
+    remainder label built from ``current_taxon``.
+
+    Args:
+        frame: Observation counts. Must contain an ``n_obs`` column, a
+            ``nom_scientifique`` column and every column named in ``levels``.
+        current_taxon: Name of the taxon whose children are examined; only
+            used to build the remainder label.
+        levels: Column names still to be walked, in traversal order.
+        n_learnable: Minimum summed ``n_obs`` for a taxon to be kept.
+
+    Returns:
+        The retained labels. With a non-empty ``levels`` the list is
+        de-duplicated and sorted. With an empty ``levels`` it holds the
+        retained ``nom_scientifique`` values in the order produced by
+        ``group_by``, followed by the remainder label when there is one.
+        The remainder label is ``"AUTRE -- <current_taxon>"`` when the
+        below-threshold children together reach ``n_learnable`` and
+        ``"NO_STATS -- <current_taxon>"`` otherwise.
+    """
+    next_level = levels[0] if levels else "nom_scientifique"
+    level_agg = frame.group_by(next_level).agg(col("n_obs").sum())
+    learnables = level_agg.filter(col("n_obs") >= n_learnable)[next_level].to_list()
+
+    # Children below the threshold collapse into one label for the parent.
+    unlearnable = level_agg.filter(col("n_obs") < n_learnable)
+    remaining_taxon = []
+    if not unlearnable.is_empty():
+        autre_keyword = (
+            "AUTRE -- " if unlearnable["n_obs"].sum() >= n_learnable else "NO_STATS -- "
+        )
+        remaining_taxon.append(autre_keyword + current_taxon)
+
+    # Deepest level reached: nothing left to recurse into.
+    if not levels:
+        return learnables + remaining_taxon
+
+    next_frame = frame.group_by(levels + ["nom_scientifique"]).agg(col("n_obs").sum())
+
+    # Recurse into each retained child; fall back to the child's own name
+    # when the recursion yields nothing.
+    learnable_sublevels = [
+        learnable_taxonomy(
+            next_frame.filter(col(next_level) == taxon),
+            taxon,
+            levels[1:],
+            n_learnable=n_learnable,
+        )
+        or [taxon]
+        for taxon in learnables
+    ] + [remaining_taxon]
+    return sorted(set(itertools.chain.from_iterable(learnable_sublevels)))
@@ -83,7 +83,7 @@ def _baseline_edges(species_counts: pl.DataFrame) -> pl.DataFrame:
             .rename({_source: "source", _target: "target", "id": "value"})
         )
         _edges.append(tmp)
-    return pl.concat(_edges)
+    return pl.concat(_edges).filter(col("source") != col("target"))
 
 
 def nodes_from_edges(edges: pl.DataFrame) -> pl.DataFrame:
 
@@ -14,5 +14,9 @@ dependencies = [
     "jupysql>=0.10.17",
     "matplotlib>=3.10.0",
     "pandas>=2.2.3",
+    "polars>=1.36.1",
+    "pre-commit>=4.5.1",
+    "pytest>=9.0.2",
     "requests>=2.32.3",
-]
+    "structlog>=25.5.0",
+]
@@ -0,0 +1,47 @@
+import polars as pl
+
+from biolit.observations import learnable_taxonomy
+
+
+class TestLearnableTaxonomy:
+    # Every scenario walks the "genre" column, then "classe", starting from
+    # the root taxon "vivant".
+
+    def test_valid_deepest_taxon(self):
+        # Two rows of one observation each for the same species reach the
+        # threshold of 2 once summed.
+        observations = pl.DataFrame(
+            {
+                "nom_scientifique": ["herbe", "herbe"],
+                "genre": ["plante", "plante"],
+                "classe": ["chlorophyle", "chlorophyle"],
+                "n_obs": 1,
+            }
+        )
+
+        result = learnable_taxonomy(
+            observations, "vivant", ["genre", "classe"], n_learnable=2
+        )
+        assert result == ["herbe"]
+
+    def test_autre_taxons(self):
+        # The two species below the threshold add up to 10, which reaches it.
+        observations = pl.DataFrame(
+            {
+                "genre": "plante",
+                "classe": "chlorophyle",
+                "nom_scientifique": ["herbe", "mousse", "fleur"],
+                "n_obs": [10, 5, 5],
+            }
+        )
+
+        result = learnable_taxonomy(
+            observations, "vivant", ["genre", "classe"], n_learnable=10
+        )
+        assert result == ["AUTRE -- chlorophyle", "herbe"]
+
+    def test_not_enough_autre_taxons(self):
+        # The two species below the threshold add up to 2, which stays under it.
+        observations = pl.DataFrame(
+            {
+                "genre": "plante",
+                "classe": "chlorophyle",
+                "nom_scientifique": ["herbe", "mousse", "fleur"],
+                "n_obs": [10, 1, 1],
+            }
+        )
+
+        result = learnable_taxonomy(
+            observations, "vivant", ["genre", "classe"], n_learnable=10
+        )
+        assert result == ["NO_STATS -- chlorophyle", "herbe"]
Original file line number	Diff line number	Diff line change
`@@ -83,7 +83,7 @@ def _baseline_edges(species_counts: pl.DataFrame) -> pl.DataFrame:`
`83`	`83`	`.rename({_source: "source", _target: "target", "id": "value"})`
`84`	`84`	`)`
`85`	`85`	`_edges.append(tmp)`
`86`		`- return pl.concat(_edges)`
	`86`	`+ return pl.concat(_edges).filter(col("source") != col("target"))`
`87`	`87`
`88`	`88`
`89`	`89`	`def nodes_from_edges(edges: pl.DataFrame) -> pl.DataFrame:`