Skip to content

Commit 5a3a646

Browse files
authored
Merge pull request #3 from dataforgoodfr/learnable_taxons
Identifie les niveaux de taxons avec suffisamment de données pour l'entraînement
2 parents b021551 + 81f8831 commit 5a3a646

File tree

8 files changed

+268
-22
lines changed

8 files changed

+268
-22
lines changed

.github/workflows/ci.yaml

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# CI workflow: runs the pre-commit hooks, then the unit tests.
name: pre-commit
2+
3+
# Triggered by every pull request and by pushes to the main branch.
on:
4+
pull_request:
5+
push:
6+
branches: [main]
7+
8+
jobs:
9+
pre-commit:
10+
runs-on: ubuntu-latest
11+
steps:
12+
- uses: actions/checkout@v4
13+
# Installs uv, which the steps below use to sync dependencies and run tools.
- uses: astral-sh/setup-uv@v4
14+
with:
15+
version: "latest"
16+
# Caches ~/.cache/pre-commit; the key changes with the runner OS and with
# the content of .pre-commit-config.yaml.
- uses: actions/cache@v4
17+
with:
18+
path: ~/.cache/pre-commit
19+
key: pre-commit|${{ runner.os }}|${{ hashFiles('.pre-commit-config.yaml') }}
20+
- name: Install dependencies
21+
run: uv sync
22+
# Runs the hooks against every file, not only the changed ones.
- name: Run pre-commit hooks
23+
run: uv run pre-commit run --all-files
24+
- name: Run unit tests
25+
run: uv run --with pytest pytest -vv

.github/workflows/pre-commit.yaml

Lines changed: 0 additions & 20 deletions
This file was deleted.

biolit/observations.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import itertools
2+
13
import polars as pl
24
import structlog
35
from polars import col
@@ -108,3 +110,39 @@ def _check_validated_non_identifiable(frame: pl.DataFrame) -> pl.DataFrame:
108110
with pl.Config(fmt_str_lengths=50):
109111
print(errors)
110112
return frame.filter(~filt)
113+
114+
115+
def learnable_taxonomy(
    frame: pl.DataFrame, current_taxon: str, levels: list[str], n_learnable: int
) -> list[str]:
    """List the lowest taxonomic levels that have enough data to be learned.

    The taxonomy is walked one level at a time, following ``levels`` in order
    and ending with the ``nom_scientifique`` column. At each level ``n_obs`` is
    summed per taxon:

    * a taxon whose total reaches ``n_learnable`` is explored one level deeper
      and replaced by the labels found underneath it (or kept as-is if nothing
      is found underneath);
    * all taxa below the threshold are merged into a single catch-all label
      built from ``current_taxon``: ``"AUTRE -- <current_taxon>"`` when their
      combined total reaches ``n_learnable``, ``"NO_STATS -- <current_taxon>"``
      otherwise.

    Args:
        frame: Observation counts. Must contain an ``n_obs`` column, a
            ``nom_scientifique`` column and one column per entry of ``levels``.
        current_taxon: Name of the parent taxon that ``frame`` is restricted
            to; only used to build the catch-all label.
        levels: Taxonomic columns still to descend through, in order. When
            empty, only ``nom_scientifique`` is examined.
        n_learnable: Minimum number of observations for a taxon to be
            considered learnable.

    Returns:
        The sorted list of unique labels (taxon names and catch-all labels).
    """
    next_level = levels[0] if levels else "nom_scientifique"
    level_agg = frame.group_by(next_level).agg(col("n_obs").sum())
    learnables = level_agg.filter(col("n_obs") >= n_learnable)[next_level].to_list()

    # Everything under the threshold is folded into one label for the parent.
    unlearnable = level_agg.filter(col("n_obs") < n_learnable)
    remaining_taxon = []
    if not unlearnable.is_empty():
        autre_keyword = (
            "AUTRE -- " if unlearnable["n_obs"].sum() >= n_learnable else "NO_STATS -- "
        )
        remaining_taxon.append(autre_keyword + current_taxon)

    if not levels:
        # group_by does not guarantee row order, so sort here as well: the
        # result is then deterministic and consistent with the recursive case.
        return sorted(learnables + remaining_taxon)

    next_frame = frame.group_by(levels + ["nom_scientifique"]).agg(col("n_obs").sum())

    # One list of labels per learnable taxon; fall back to the taxon itself
    # when the deeper levels yield nothing.
    sublevel_labels = (
        learnable_taxonomy(
            next_frame.filter(col(next_level) == taxon),
            taxon,
            levels[1:],
            n_learnable=n_learnable,
        )
        or [taxon]
        for taxon in learnables
    )
    all_labels = itertools.chain(
        remaining_taxon, itertools.chain.from_iterable(sublevel_labels)
    )
    return sorted(set(all_labels))

biolit/visualisation/species_distribution.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ def _baseline_edges(species_counts: pl.DataFrame) -> pl.DataFrame:
8383
.rename({_source: "source", _target: "target", "id": "value"})
8484
)
8585
_edges.append(tmp)
86-
return pl.concat(_edges)
86+
return pl.concat(_edges).filter(col("source") != col("target"))
8787

8888

8989
def nodes_from_edges(edges: pl.DataFrame) -> pl.DataFrame:

pyproject.toml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,5 +14,9 @@ dependencies = [
1414
"jupysql>=0.10.17",
1515
"matplotlib>=3.10.0",
1616
"pandas>=2.2.3",
17+
"polars>=1.36.1",
18+
"pre-commit>=4.5.1",
19+
"pytest>=9.0.2",
1720
"requests>=2.32.3",
18-
]
21+
"structlog>=25.5.0",
22+
]

tests/__init__.py

Whitespace-only changes.

tests/test_observations.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
import polars as pl
2+
3+
from biolit.observations import learnable_taxonomy
4+
5+
6+
class TestLearnableTaxonomy:
    """Checks ``learnable_taxonomy`` on small hand-built observation frames."""

    # Taxonomic columns handed to the function in every scenario below.
    LEVELS = ["genre", "classe"]

    def test_valid_deepest_taxon(self):
        """A species whose summed count reaches the threshold is returned as-is."""
        observations = pl.DataFrame(
            {
                "nom_scientifique": ["herbe", "herbe"],
                "genre": ["plante", "plante"],
                "classe": ["chlorophyle", "chlorophyle"],
                "n_obs": 1,
            }
        )

        result = learnable_taxonomy(observations, "vivant", self.LEVELS, 2)

        assert result == ["herbe"]

    def test_autre_taxons(self):
        """Rare species that together reach the threshold become an AUTRE label."""
        observations = pl.DataFrame(
            {
                "genre": "plante",
                "classe": "chlorophyle",
                "nom_scientifique": ["herbe", "mousse", "fleur"],
                "n_obs": [10, 5, 5],
            }
        )

        result = learnable_taxonomy(observations, "vivant", self.LEVELS, 10)

        assert result == ["AUTRE -- chlorophyle", "herbe"]

    def test_not_enough_autre_taxons(self):
        """Rare species that stay under the threshold become a NO_STATS label."""
        observations = pl.DataFrame(
            {
                "genre": "plante",
                "classe": "chlorophyle",
                "nom_scientifique": ["herbe", "mousse", "fleur"],
                "n_obs": [10, 1, 1],
            }
        )

        result = learnable_taxonomy(observations, "vivant", self.LEVELS, 10)

        assert result == ["NO_STATS -- chlorophyle", "herbe"]

0 commit comments

Comments
 (0)