Skip to content
This repository was archived by the owner on Apr 30, 2026. It is now read-only.

Commit 889c9da

Browse files
authored
Merge pull request #242 from danmcp/fulltax
Add taxonomy_base='empty' option to support using the full taxonomy repo contents
2 parents b814dfe + 680cbfd commit 889c9da

2 files changed

Lines changed: 71 additions & 26 deletions

File tree

src/instructlab/sdg/utils/taxonomy.py

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,17 @@ def _get_taxonomy_diff(repo="taxonomy", base="origin/main"):
9797
return updated_taxonomy_files
9898

9999

100+
def _get_taxonomy(repo="taxonomy"):
101+
repo = Path(repo)
102+
taxonomy_file_paths = []
103+
for root, _, files in os.walk(repo):
104+
for file in files:
105+
file_path = Path(root).joinpath(file).relative_to(repo)
106+
if _istaxonomyfile(file_path):
107+
taxonomy_file_paths.append(str(file_path))
108+
return taxonomy_file_paths
109+
110+
100111
def _get_documents(
101112
source: Dict[str, Union[str, List[str]]],
102113
skip_checkout: bool = False,
@@ -400,15 +411,19 @@ def read_taxonomy(taxonomy, taxonomy_base, yaml_rules):
400411
if errors:
401412
raise SystemExit(yaml.YAMLError("Taxonomy file with errors! Exiting."))
402413
else: # taxonomy is dir
403-
# Gather the new or changed YAMLs using git diff
404-
updated_taxonomy_files = _get_taxonomy_diff(taxonomy, taxonomy_base)
414+
if taxonomy_base == "empty":
415+
# Gather all the yamls - equivalent to a diff against "the null tree"
416+
taxonomy_files = _get_taxonomy(taxonomy)
417+
else:
418+
# Gather the new or changed YAMLs using git diff, including untracked files
419+
taxonomy_files = _get_taxonomy_diff(taxonomy, taxonomy_base)
405420
total_errors = 0
406421
total_warnings = 0
407-
if updated_taxonomy_files:
408-
logger.debug("Found new taxonomy files:")
409-
for e in updated_taxonomy_files:
422+
if taxonomy_files:
423+
logger.debug("Found taxonomy files:")
424+
for e in taxonomy_files:
410425
logger.debug(f"* {e}")
411-
for f in updated_taxonomy_files:
426+
for f in taxonomy_files:
412427
file_path = os.path.join(taxonomy, f)
413428
data, warnings, errors = _read_taxonomy_file(file_path, yaml_rules)
414429
total_warnings += warnings

tests/test_taxonomy.py

Lines changed: 50 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,6 @@
3333

3434
TEST_SEED_EXAMPLE = "Can you help me debug this failing unit test?"
3535

36-
TEST_TAXONOMY_BASE = "main"
37-
3836
TEST_CUSTOM_YAML_RULES = b"""extends: relaxed
3937
4038
rules:
@@ -50,26 +48,58 @@ class TestTaxonomy:
5048
def _init_taxonomy(self, taxonomy_dir):
5149
self.taxonomy = taxonomy_dir
5250

53-
def test_read_taxonomy_leaf_nodes(self):
51+
@pytest.mark.parametrize(
52+
"taxonomy_base, create_tracked_file, create_untracked_file, check_leaf_node_keys",
53+
[
54+
("main", True, True, ["compositional_skills->new"]),
55+
("main", False, True, ["compositional_skills->new"]),
56+
("main", True, False, []),
57+
("main", False, False, []),
58+
("main^", True, False, ["compositional_skills->tracked"]),
59+
(
60+
"main^",
61+
True,
62+
True,
63+
["compositional_skills->new", "compositional_skills->tracked"],
64+
),
65+
("empty", True, False, ["compositional_skills->tracked"]),
66+
(
67+
"empty",
68+
True,
69+
True,
70+
["compositional_skills->new", "compositional_skills->tracked"],
71+
),
72+
],
73+
)
74+
def test_read_taxonomy_leaf_nodes(
75+
self,
76+
taxonomy_base,
77+
create_tracked_file,
78+
create_untracked_file,
79+
check_leaf_node_keys,
80+
):
5481
tracked_file = "compositional_skills/tracked/qna.yaml"
5582
untracked_file = "compositional_skills/new/qna.yaml"
56-
self.taxonomy.add_tracked(tracked_file, TEST_VALID_COMPOSITIONAL_SKILL_YAML)
57-
self.taxonomy.create_untracked(
58-
untracked_file, TEST_VALID_COMPOSITIONAL_SKILL_YAML
59-
)
83+
if create_tracked_file:
84+
self.taxonomy.add_tracked(tracked_file, TEST_VALID_COMPOSITIONAL_SKILL_YAML)
85+
if create_untracked_file:
86+
self.taxonomy.create_untracked(
87+
untracked_file, TEST_VALID_COMPOSITIONAL_SKILL_YAML
88+
)
6089

61-
leaf_node = taxonomy.read_taxonomy_leaf_nodes(
62-
self.taxonomy.root, TEST_TAXONOMY_BASE, TEST_CUSTOM_YAML_RULES
63-
)
64-
leaf_node_key = str(pathlib.Path(untracked_file).parent).replace(
65-
os.path.sep, "->"
90+
leaf_nodes = taxonomy.read_taxonomy_leaf_nodes(
91+
self.taxonomy.root, taxonomy_base, TEST_CUSTOM_YAML_RULES
6692
)
67-
assert leaf_node_key in leaf_node
6893

69-
leaf_node_entries = leaf_node.get(leaf_node_key)
70-
seed_example_exists = False
71-
if any(
72-
entry["instruction"] == TEST_SEED_EXAMPLE for entry in leaf_node_entries
73-
):
74-
seed_example_exists = True
75-
assert seed_example_exists is True
94+
assert len(leaf_nodes) == len(check_leaf_node_keys)
95+
96+
for leaf_node_key in check_leaf_node_keys:
97+
assert leaf_node_key in leaf_nodes
98+
99+
leaf_node_entries = leaf_nodes.get(leaf_node_key)
100+
seed_example_exists = False
101+
if any(
102+
entry["instruction"] == TEST_SEED_EXAMPLE for entry in leaf_node_entries
103+
):
104+
seed_example_exists = True
105+
assert seed_example_exists is True

0 commit comments

Comments
 (0)