Skip to content

Commit fd2e074

Browse files
committed
Add manifest-driven model cards
1 parent 103af7f commit fd2e074

5 files changed

Lines changed: 250 additions & 8 deletions

File tree

openmed/core/hf_publish.py

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
from pathlib import Path
1313
from typing import Any
1414

15+
from openmed.core.model_card import DEFAULT_ARXIV, write_model_card
16+
1517

1618
DEFAULT_ORG = "OpenMed"
1719
DEFAULT_TOKEN_ENV = "HF_WRITE_TOKEN"
@@ -105,7 +107,7 @@ def build_manifest_row(
105107
"formats": [_manifest_format_name(format_name)],
106108
"canonical_labels": labels,
107109
"benchmark": {"dataset": None, "micro_f1": None, "recall": None},
108-
"arxiv": None,
110+
"arxiv": DEFAULT_ARXIV,
109111
"license": "apache-2.0",
110112
"reproducibility_hash": reproducibility_hash,
111113
"released": released,
@@ -171,6 +173,14 @@ def publish_artifact(
171173
version=version,
172174
)
173175
api = api or _load_hf_api()
176+
row = build_manifest_row(
177+
repo_id=repo_id,
178+
source_model_id=source_model_id,
179+
artifact_dir=artifact_dir,
180+
format_name=format_name,
181+
released=released,
182+
)
183+
write_model_card(artifact_dir / "README.md", row)
174184

175185
skipped = False
176186
if skip_existing and _repo_exists(api, repo_id=repo_id, token=token):
@@ -191,13 +201,6 @@ def publish_artifact(
191201
commit_message=f"Publish {format_name} artifact",
192202
)
193203

194-
row = build_manifest_row(
195-
repo_id=repo_id,
196-
source_model_id=source_model_id,
197-
artifact_dir=artifact_dir,
198-
format_name=format_name,
199-
released=released,
200-
)
201204
if manifest_path is not None:
202205
append_manifest_row(manifest_path, row)
203206

@@ -222,6 +225,8 @@ def artifact_sha256(path: str | Path) -> str:
222225

223226
for file_path in paths:
224227
relative = file_path.relative_to(root).as_posix()
228+
if relative == "README.md":
229+
continue
225230
digest.update(relative.encode("utf-8"))
226231
digest.update(b"\0")
227232
with file_path.open("rb") as handle:

openmed/core/model_card.py

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
"""Render OpenMed model cards from manifest rows."""
2+
3+
from __future__ import annotations
4+
5+
from pathlib import Path
6+
from typing import Any
7+
8+
9+
DEFAULT_ARXIV = "2508.01630"
10+
11+
12+
def render_model_card(row: dict[str, Any]) -> str:
    """Return a README.md model card for one manifest row.

    The card is a YAML front-matter block followed by three Markdown
    sections: a manifest summary table, a benchmark table, and the canonical
    label list. Fields that are missing or blank in *row* are shown with
    placeholder text in the tables.

    The ``license`` and ``pipeline_tag`` front-matter keys are written only
    when the row actually provides a value. The table placeholders
    ("Not specified", "unknown") are display text, not license identifiers or
    task names, so they are kept out of the machine-readable header.

    Args:
        row: Manifest row; every key is optional.

    Returns:
        The full card text, ending with a single newline.
    """

    repo_id = _string(row.get("repo_id"), "OpenMed/model")
    # The card title is the repository name without its organisation prefix.
    title = repo_id.rsplit("/", 1)[-1]
    raw_benchmark = row.get("benchmark")
    benchmark = raw_benchmark if isinstance(raw_benchmark, dict) else {}
    formats = _list(row.get("formats"))
    languages = _list(row.get("languages"))
    labels = _list(row.get("canonical_labels"))
    arxiv = _string(row.get("arxiv"), DEFAULT_ARXIV)

    # An empty string means "not provided"; the table falls back to display
    # placeholders while the front matter simply omits the key.
    license_id = _string(row.get("license"), "")
    task_id = _string(row.get("task"), "")
    license_name = license_id or "Not specified"
    task = task_id or "unknown"

    front_matter = ["---"]
    if license_id:
        front_matter.append(f"license: {license_id}")
    if task_id:
        front_matter.append(f"pipeline_tag: {task_id}")
    front_matter.extend(
        [
            "library_name: openmed",
            "tags:",
            "- openmed",
            "- medical-nlp",
            "---",
        ]
    )

    lines = [
        *front_matter,
        "",
        f"# {title}",
        "",
        "This model card is rendered from the OpenMed model manifest. Update `models.jsonl` and rerun the publish step instead of editing this file directly.",
        "",
        "## Manifest Summary",
        "",
        "| Field | Value |",
        "|---|---|",
        f"| Repository | `{repo_id}` |",
        f"| Family | {_string(row.get('family'), 'Not specified')} |",
        f"| Task | {task} |",
        f"| Languages | {_comma_or_unspecified(languages)} |",
        f"| Tier | {_string(row.get('tier'), 'Not specified')} |",
        f"| Parameters | {_format_param_count(row.get('param_count'))} |",
        f"| Architecture | {_string(row.get('architecture'), 'Not specified')} |",
        f"| Base model | {_string(row.get('base_model'), 'Not specified')} |",
        f"| Formats | {_comma_or_unspecified(formats)} |",
        f"| License | {license_name} |",
        f"| arXiv | {_arxiv_link(arxiv)} |",
        f"| Reproducibility hash | `{_string(row.get('reproducibility_hash'), 'Not specified')}` |",
        f"| Released | {_string(row.get('released'), 'Not specified')} |",
        "",
        "## Benchmark",
        "",
        "| Dataset | Micro F1 | Recall |",
        "|---|---:|---:|",
        f"| {_string(benchmark.get('dataset'), 'Not reported')} | {_metric(benchmark.get('micro_f1'))} | {_metric(benchmark.get('recall'))} |",
        "",
        "## Canonical Labels",
        "",
        _labels_block(labels),
    ]
    return "\n".join(lines) + "\n"
68+
69+
70+
def write_model_card(path: str | Path, row: dict[str, Any]) -> Path:
    """Render *row* as a model card, save it at *path*, and return the path.

    Missing parent directories are created first. The file is written as
    UTF-8 text, replacing any existing file at that location.
    """

    destination = Path(path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    card_text = render_model_card(row)
    destination.write_text(card_text, encoding="utf-8")
    return destination
77+
78+
79+
def _string(value: Any, default: str) -> str:
80+
if value is None:
81+
return default
82+
text = str(value).strip()
83+
return text or default
84+
85+
86+
def _list(value: Any) -> list[str]:
87+
if not isinstance(value, list):
88+
return []
89+
return [str(item) for item in value if str(item)]
90+
91+
92+
def _comma_or_unspecified(values: list[str]) -> str:
93+
return ", ".join(values) if values else "Not specified"
94+
95+
96+
def _format_param_count(value: Any) -> str:
97+
if not isinstance(value, int) or value <= 0:
98+
return "Not specified"
99+
if value >= 1_000_000_000:
100+
compact = f"{value / 1_000_000_000:g}B"
101+
elif value >= 1_000_000:
102+
compact = f"{value / 1_000_000:g}M"
103+
elif value >= 1_000:
104+
compact = f"{value / 1_000:g}K"
105+
else:
106+
compact = str(value)
107+
return f"{compact} ({value:,})"
108+
109+
110+
def _metric(value: Any) -> str:
111+
if isinstance(value, (int, float)):
112+
return f"{float(value):.4f}"
113+
return "Not reported"
114+
115+
116+
def _arxiv_link(value: str) -> str:
117+
if value == "Not specified":
118+
return value
119+
return f"[arXiv:{value}](https://arxiv.org/abs/{value})"
120+
121+
122+
def _labels_block(labels: list[str]) -> str:
123+
if not labels:
124+
return "Not specified."
125+
return ", ".join(f"`{label}`" for label in labels)
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
---
2+
license: apache-2.0
3+
pipeline_tag: token-classification
4+
library_name: openmed
5+
tags:
6+
- openmed
7+
- medical-nlp
8+
---
9+
10+
# OpenMed-PII-Turkish-SuperClinical-Small-44M-v1-mlx
11+
12+
This model card is rendered from the OpenMed model manifest. Update `models.jsonl` and rerun the publish step instead of editing this file directly.
13+
14+
## Manifest Summary
15+
16+
| Field | Value |
17+
|---|---|
18+
| Repository | `OpenMed/OpenMed-PII-Turkish-SuperClinical-Small-44M-v1-mlx` |
19+
| Family | PII |
20+
| Task | token-classification |
21+
| Languages | tr |
22+
| Tier | Small |
23+
| Parameters | 44M (44,000,000) |
24+
| Architecture | deberta-v2 |
25+
| Base model | OpenMed/OpenMed-PII-Turkish-SuperClinical-Small-44M-v1 |
26+
| Formats | mlx-fp, mlx-8bit |
27+
| License | apache-2.0 |
28+
| arXiv | [arXiv:2508.01630](https://arxiv.org/abs/2508.01630) |
29+
| Reproducibility hash | `sha256:1111111111111111111111111111111111111111111111111111111111111111` |
30+
| Released | 2026-06-14 |
31+
32+
## Benchmark
33+
34+
| Dataset | Micro F1 | Recall |
35+
|---|---:|---:|
36+
| openmed-golden-pii | 0.9823 | 0.9910 |
37+
38+
## Canonical Labels
39+
40+
`PERSON`, `DATE`, `ID_NUM`

tests/unit/core/test_model_card.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
"""Tests for manifest-rendered model cards."""
2+
3+
from __future__ import annotations
4+
5+
from pathlib import Path
6+
7+
from openmed.core.model_card import render_model_card, write_model_card
8+
9+
10+
ROOT = Path(__file__).resolve().parents[3]
11+
GOLDEN = ROOT / "tests" / "fixtures" / "model_card_expected.md"
12+
13+
14+
def _fixture_row():
15+
return {
16+
"repo_id": "OpenMed/OpenMed-PII-Turkish-SuperClinical-Small-44M-v1-mlx",
17+
"family": "PII",
18+
"task": "token-classification",
19+
"languages": ["tr"],
20+
"tier": "Small",
21+
"param_count": 44_000_000,
22+
"architecture": "deberta-v2",
23+
"base_model": "OpenMed/OpenMed-PII-Turkish-SuperClinical-Small-44M-v1",
24+
"formats": ["mlx-fp", "mlx-8bit"],
25+
"canonical_labels": ["PERSON", "DATE", "ID_NUM"],
26+
"benchmark": {
27+
"dataset": "openmed-golden-pii",
28+
"micro_f1": 0.9823,
29+
"recall": 0.991,
30+
},
31+
"arxiv": "2508.01630",
32+
"license": "apache-2.0",
33+
"reproducibility_hash": (
34+
"sha256:1111111111111111111111111111111111111111111111111111111111111111"
35+
),
36+
"released": "2026-06-14",
37+
}
38+
39+
40+
def test_render_model_card_matches_golden_fixture():
    """The card rendered from the fixture row equals the golden file exactly."""

    rendered = render_model_card(_fixture_row())
    expected = GOLDEN.read_text(encoding="utf-8")
    assert rendered == expected
42+
43+
44+
def test_render_model_card_contains_manifest_release_fields():
    """The rendered card contains each release-relevant manifest field."""

    card = render_model_card(_fixture_row())

    expected_fragments = (
        "| Tier | Small |",
        "| Parameters | 44M (44,000,000) |",
        "| Formats | mlx-fp, mlx-8bit |",
        "| openmed-golden-pii | 0.9823 | 0.9910 |",
        "[arXiv:2508.01630]",
        "| License | apache-2.0 |",
        _fixture_row()["reproducibility_hash"],
    )
    for fragment in expected_fragments:
        assert fragment in card
54+
55+
56+
def test_write_model_card_writes_readme_from_row(tmp_path):
    """Writing a card returns the target path and produces the golden text."""

    readme = tmp_path / "README.md"

    returned = write_model_card(readme, _fixture_row())

    assert returned == readme
    written = readme.read_text(encoding="utf-8")
    assert written == GOLDEN.read_text(encoding="utf-8")

tests/unit/mlx/test_hf_publish.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import inspect
66
import json
77
import re
8+
from pathlib import Path
89

910
import pytest
1011

@@ -25,6 +26,7 @@ def __init__(self, *, exists: bool = False):
2526
self.exists = exists
2627
self.created = []
2728
self.uploaded = []
29+
self.uploaded_cards = []
2830
self.info_calls = []
2931

3032
def repo_info(self, **kwargs):
@@ -39,6 +41,9 @@ def create_repo(self, **kwargs):
3941

4042
def upload_folder(self, **kwargs):
4143
self.uploaded.append(kwargs)
44+
self.uploaded_cards.append(
45+
(Path(kwargs["folder_path"]) / "README.md").read_text(encoding="utf-8")
46+
)
4247

4348

4449
def _write_mlx_artifact(tmp_path):
@@ -96,11 +101,15 @@ def test_publish_artifact_creates_repo_uploads_folder_and_writes_manifest(tmp_pa
96101
assert fake_api.created[0]["token"] == "secret-token"
97102
assert fake_api.uploaded[0]["folder_path"] == str(artifact)
98103
assert fake_api.uploaded[0]["token"] == "secret-token"
104+
assert (artifact / "README.md").exists()
105+
assert "OpenMed/test-model-v1-mlx" in fake_api.uploaded_cards[0]
106+
assert "sha256:" in fake_api.uploaded_cards[0]
99107

100108
rows = [json.loads(line) for line in manifest.read_text(encoding="utf-8").splitlines()]
101109
assert rows == [result.manifest_row]
102110
assert rows[0]["repo_id"] == "OpenMed/test-model-v1-mlx"
103111
assert rows[0]["formats"] == ["mlx-fp"]
112+
assert rows[0]["arxiv"] == "2508.01630"
104113
assert rows[0]["canonical_labels"] == ["PERSON", "DATE"]
105114
assert re.fullmatch(r"sha256:[0-9a-f]{64}", rows[0]["reproducibility_hash"])
106115
assert "secret-token" not in json.dumps(rows[0])
@@ -121,6 +130,7 @@ def test_publish_artifact_skips_existing_repo_without_upload(tmp_path, monkeypat
121130
assert result.skipped is True
122131
assert fake_api.created == []
123132
assert fake_api.uploaded == []
133+
assert (artifact / "README.md").exists()
124134
assert result.repo_id == "OpenMed/test-model-v1-coreml"
125135

126136

0 commit comments

Comments
 (0)