
Commit b021551

Merge pull request #2 from dataforgoodfr/species_distribution
Create the species distribution map
2 parents 555ecbe + 1366d67 commit b021551

File tree

7 files changed: +334 -1 lines changed


.gitignore

Lines changed: 5 additions & 1 deletion
@@ -160,4 +160,8 @@ dmypy.json
 cython_debug/
 
 # Precommit hooks: ruff cache
-.ruff_cache
+.ruff_cache
+
+nohup.out
+data/
+notebooks/

biolit/__init__.py

Lines changed: 10 additions & 0 deletions
from pathlib import Path

import matplotlib as mpl

ROOTDIR = Path(__file__).parent / ".."
DATADIR = ROOTDIR / "data"


mpl.rcParams["axes.spines.right"] = False
mpl.rcParams["axes.spines.top"] = False

biolit/observations.py

Lines changed: 110 additions & 0 deletions
import polars as pl
import structlog
from polars import col

from biolit import DATADIR
from biolit.taxref import TAXREF_HIERARCHY

LOGGER = structlog.get_logger()


def format_observations():
    fn = DATADIR / "export_biolit.csv"
    taxref = pl.read_parquet(DATADIR / "taxref.parquet")
    biolit = (
        pl.read_csv(fn)
        .rename(lambda c: c.replace(" - observation", "").lower().replace(" ", "_"))
        .with_columns(
            col("nom_scientifique").str.to_lowercase(),
            col("espece_identifiable_?").fill_null("Identifiable"),
        )
        .filter(
            col(
                "validee"
            )  # & ~col("espece_identifiable_?").is_in(["non-identifiable"])
        )
        .join(taxref, how="left", left_on="nom_scientifique", right_on="lb_nom")
        .pipe(full_upper_hierarchy)
        .pipe(_observation_quality)
    )

    LOGGER.info(
        "valid_observations",
        size=len(biolit),
        species=biolit["nom_scientifique"].n_unique(),
    )
    biolit.write_parquet(DATADIR / "biolit_valid_observations.parquet")


def full_upper_hierarchy(frame: pl.DataFrame) -> pl.DataFrame:
    """
    Fill all levels of the hierarchy with the complete name of the upper levels.

    horse -> animal | vertebrate | horse
    """
    for i, name in enumerate(TAXREF_HIERARCHY):
        prefix = (
            pl.lit("")
            if not i
            else (col(TAXREF_HIERARCHY[i - 1]).fill_null("NA") + pl.lit(" | "))
        )
        frame = frame.with_columns((prefix + col(name).fill_null("NA")).alias(name))
    return frame


def _observation_quality(frame: pl.DataFrame) -> pl.DataFrame:
    return (
        frame.pipe(_check_missing_nom)
        .pipe(_check_missing_taxref)
        .pipe(_check_validated_non_identifiable)
    )


def _check_missing_taxref(frame: pl.DataFrame) -> pl.DataFrame:
    missing_taxref_filter = (
        col("cd_nom").is_null() & col("nom_scientifique").is_not_null()
    )
    missing_taxref = (
        frame.filter(missing_taxref_filter)
        .group_by("nom_scientifique")
        .agg(col("id").count().alias("n_observations"))
        .sort("n_observations", descending=True)
    )
    missing_taxref.write_csv(DATADIR / "observations_missing_taxref.csv")

    LOGGER.warning(
        "observation_quality_missing_taxref",
        missing_taxref_species=len(missing_taxref),
        missing_taxref_observations=missing_taxref["n_observations"].sum(),
    )
    with pl.Config(tbl_rows=100):
        print(missing_taxref)
    return frame.filter(~missing_taxref_filter)


def _check_missing_nom(frame: pl.DataFrame) -> pl.DataFrame:
    missing_filter = col("nom_scientifique").is_null()
    missing_nom = frame.filter(missing_filter).select(
        "validee", "espece_identifiable_?", "lien"
    )

    missing_nom.write_csv(DATADIR / "biolit_observation_missing_nom.csv")
    LOGGER.warning(
        "observation_quality_missing_nom",
        missing_nom=len(missing_nom),
    )
    with pl.Config(fmt_str_lengths=500):
        print(missing_nom)
    return frame.filter(~missing_filter)


def _check_validated_non_identifiable(frame: pl.DataFrame) -> pl.DataFrame:
    filt = col("espece_identifiable_?") != "Identifiable"
    errors = frame.filter(filt).select(
        "lien", "espece_identifiable_?", "nom_scientifique"
    )
    errors.write_csv(DATADIR / "biolit_observation_validated_non_identifiable.csv")
    LOGGER.warning("observation_quality_validated_non_identifiable", n_obs=len(errors))
    with pl.Config(fmt_str_lengths=50):
        print(errors)
    return frame.filter(~filt)
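
To make the hierarchy flattening concrete, here is a minimal, self-contained sketch of what full_upper_hierarchy produces (toy rows with invented taxa; only the column names match TAXREF_HIERARCHY):

import polars as pl

from biolit.observations import full_upper_hierarchy

toy = pl.DataFrame(
    {
        "regne": ["Animalia", "Animalia"],
        "phylum": ["Mollusca", "Mollusca"],
        "classe": ["Gastropoda", "Gastropoda"],
        "ordre": [None, "Littorinimorpha"],
        "famille": ["Patellidae", "Littorinidae"],
        "sous_famille": [None, "Littorininae"],
    }
)
print(full_upper_hierarchy(toy)["sous_famille"].to_list())
# ['Animalia | Mollusca | Gastropoda | NA | Patellidae | NA',
#  'Animalia | Mollusca | Gastropoda | Littorinimorpha | Littorinidae | Littorininae']

Each column now carries the full path from the kingdom down to that level, with missing levels replaced by "NA".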

biolit/taxref.py

Lines changed: 45 additions & 0 deletions
import polars as pl
import structlog
from polars import col

from biolit import DATADIR

TAXREF_HIERARCHY = ["regne", "phylum", "classe", "ordre", "famille", "sous_famille"]
LOGGER = structlog.get_logger()


def format_taxref():
    fn = DATADIR / "TAXREF_v18_2025" / "TAXREFv18.txt"
    taxref = (
        pl.read_csv(fn, separator="\t")
        .rename(str.lower)
        .with_columns(
            col("lb_nom").str.to_lowercase(),
            (
                col("sous_famille").is_not_null()
                + col("famille").is_not_null() * 10
                + col("ordre").is_not_null() * 100
                + col("classe").is_not_null() * 1000
            ).alias("priority"),
        )
        .select(["cd_nom", "lb_nom", "priority"] + TAXREF_HIERARCHY)
    )
    _check_duplicates(taxref)
    taxref = (
        taxref.sort(["lb_nom", "priority"], descending=[False, True])
        # keep the best-documented row per name (highest priority comes first)
        .unique("lb_nom", keep="first", maintain_order=True)
        .drop("priority")
    )
    taxref.write_parquet(DATADIR / "taxref.parquet")


def _check_duplicates(frame: pl.DataFrame):
    frame = frame.sort("lb_nom").filter(col("lb_nom").is_duplicated())
    if frame.is_empty():
        return
    frame.write_csv(DATADIR / "taxref_duplicate_species.csv")
    LOGGER.warning(
        "taxref_duplicate_species",
        n_species=len(frame),
        n_names=frame["lb_nom"].n_unique(),
    )
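
The priority column deserves a word: each is_not_null() flag is weighted so that rows with more of the upper hierarchy filled in score higher, and the sort-then-unique pass keeps only the best-documented row per lb_nom. A standalone toy illustration of the same scoring expression (the rows are invented):

import polars as pl
from polars import col

rows = pl.DataFrame(
    {
        "lb_nom": ["patella vulgata", "patella vulgata"],
        "classe": [None, "Gastropoda"],
        "ordre": [None, "Patellogastropoda"],
        "famille": ["Patellidae", "Patellidae"],
        "sous_famille": [None, None],
    }
)
scored = rows.with_columns(
    (
        col("sous_famille").is_not_null()
        + col("famille").is_not_null() * 10
        + col("ordre").is_not_null() * 100
        + col("classe").is_not_null() * 1000
    ).alias("priority")
)
print(scored["priority"].to_list())  # [10, 1110]
best = scored.sort("priority", descending=True).unique(
    "lb_nom", keep="first", maintain_order=True
)  # keeps the fully classified row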

biolit/visualisation/species_distribution.py

Lines changed: 139 additions & 0 deletions
from pathlib import Path

import matplotlib as mpl
import plotly.graph_objects as go
import polars as pl
from polars import col

from biolit import DATADIR
from biolit.taxref import TAXREF_HIERARCHY

COLOR_MATCHING = {
    i: f"rgb({', '.join(str(int(x * 255)) for x in mpl.colormaps['tab10'](i)[:3])})"
    for i in range(20)
}


def _species_colors(frame: pl.DataFrame) -> pl.DataFrame:
    return (
        frame["regne"]
        .unique()
        .sort()
        .to_frame()
        .with_row_index("color")
        .with_columns(col("color").replace_strict(COLOR_MATCHING))
    )


def plot_species_distribution(frame: pl.DataFrame, fn: Path):
    colors = _species_colors(frame)
    species_counts = (
        frame.filter(col("cd_nom").is_not_null())
        .group_by(["nom_scientifique", "cd_nom"] + TAXREF_HIERARCHY)
        .agg(col("id").count())
        .join(colors, on="regne")
    )

    edges = _baseline_edges(species_counts)
    nodes = nodes_from_edges(edges)
    edges = enrich_edges(edges, nodes)
    edges.write_parquet(DATADIR / "species_edges.parquet")
    nodes.write_parquet(DATADIR / "species_node.parquet")
    save_sankey_plot(edges, nodes, fn)


def save_sankey_plot(edges: pl.DataFrame, nodes: pl.DataFrame, fn: Path) -> Path:
    _data = go.Sankey(
        link=edges.to_dict(as_series=False),
        node=nodes.select("label", "color", "customdata").to_dict(as_series=False)
        | {
            "line": dict(color="lightgrey", width=0.1),
            "hovertemplate": "<b>%{customdata.name}</b><br>"
            "node_id: %{customdata.node_id}<br>"
            "# images: %{value}<br>"
            "# sub level: %{customdata.n_incoming}<br>"
            "# species: %{customdata.n_species}<br>"
            "<extra></extra>",
        },
    )

    _fig = go.Figure(_data)
    _fig.update_layout(
        autosize=False,
        width=1000,
        height=1500,
        title_text="Répartition des images Biolit selon les différentes strates de la hiérarchie",
        font_size=10,
    )
    _fig.write_html(fn)
    return fn


def _baseline_edges(species_counts: pl.DataFrame) -> pl.DataFrame:
    _edges = []

    _steps = ["nom_scientifique"] + TAXREF_HIERARCHY[:-1][::-1]
    for _source, _target in zip(_steps, _steps[1:]):
        tmp = (
            species_counts.group_by(_source, _target)
            .agg(
                col("id").sum(),
                col("id").count().alias("n_species"),
                col("color").first(),
            )
            .rename({_source: "source", _target: "target", "id": "value"})
        )
        _edges.append(tmp)
    return pl.concat(_edges)


def nodes_from_edges(edges: pl.DataFrame) -> pl.DataFrame:
    has_labels = _node_has_labels(edges)
    return (
        pl.concat([edges["source"], edges["target"]])
        .unique()
        .sort()
        .to_frame()
        .with_row_index("id")
        .with_columns(col("id") - 1)
        .join(has_labels, left_on="source", right_on="target")
        .with_columns(
            pl.when(col("has_label")).then(col("source")).alias("label"),
            pl.when(col("has_label"))
            .then(pl.lit("blue"))
            .otherwise(pl.lit("lightgrey"))
            .alias("color"),
            pl.struct(
                name=col("source"),
                n_incoming=col("n_incoming"),
                n_species=col("n_species"),
                node_id=col("id"),
            ).alias("customdata"),
        )
    )


def _node_has_labels(edges: pl.DataFrame) -> pl.DataFrame:
    return (
        edges.group_by("target")
        .agg(
            col("value").sum(),
            col("source").count().alias("n_incoming"),
            col("n_species").sum(),
        )
        .with_columns(
            (col("value") > 300).alias("has_label"),
            col("target").str.count_matches("|", literal=True).alias("n_levels"),
        )
    )


def enrich_edges(edges: pl.DataFrame, nodes: pl.DataFrame) -> pl.DataFrame:
    _sub_nodes = nodes.select("id", "source")
    return (
        edges.select("source", "target", "value", "color")
        .join(_sub_nodes, left_on="source", right_on="source")
        .join(_sub_nodes, left_on="target", right_on="source")
        .drop("target", "source")
        .rename({"id": "source", "id_right": "target"})
        .sort("source", "target")
    )
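
For readers who have not used Plotly Sankey diagrams: the reason enrich_edges converts string labels into integer node ids is that go.Sankey expects link.source and link.target to be indices into the node arrays. A minimal standalone sketch of that input format (toy labels, unrelated to the Biolit data):

import plotly.graph_objects as go

fig = go.Figure(
    go.Sankey(
        node=dict(label=["patella vulgata", "littorina littorea", "mollusca"]),
        link=dict(source=[0, 1], target=[2, 2], value=[12, 7]),
    )
)
fig.write_html("toy_sankey.html")  # two species-level nodes flowing into one upper-level node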

cmd/export_inpn.py

Lines changed: 25 additions & 0 deletions
import sys
from pathlib import Path

import polars as pl

_base_dir = str(Path(__file__).parent.parent)
if _base_dir not in sys.path:
    sys.path.insert(0, _base_dir)

if True:  # these imports depend on the sys.path tweak above
    from biolit import DATADIR
    from biolit.observations import format_observations
    from biolit.taxref import format_taxref
    from biolit.visualisation.species_distribution import plot_species_distribution


def main():
    format_taxref()
    format_observations()
    biolit_df = pl.read_parquet(DATADIR / "biolit_valid_observations.parquet")
    plot_species_distribution(biolit_df, fn=DATADIR / "distribution_images.html")


if __name__ == "__main__":
    main()
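
Assuming the raw export (data/export_biolit.csv) and the TAXREF dump (data/TAXREF_v18_2025/TAXREFv18.txt) are in place, running "python cmd/export_inpn.py" from the repository root should regenerate data/taxref.parquet, data/biolit_valid_observations.parquet, the quality-check CSVs, and the data/distribution_images.html Sankey; the sys.path insertion at the top is what lets the script import the biolit package without installing it.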

data/.gitkeep

Whitespace-only changes.
