Skip to content

Commit 8a2160e

Browse files
committed
feat: add get_content_type_info() public API method (fixes #826)
Root cause: the content type knowledge base (mime_type, group, description, extensions, is_text) was only accessible via the private _get_ct_info() method and the internal _cts_infos dict, with no supported public interface for external clients. Fix: expose a new public method Magika.get_content_type_info(label) that returns the ContentTypeInfo for any label in the knowledge base, raising MagikaError (rather than a bare KeyError) for unknown labels. This completes the content-type introspection API alongside the existing get_output_content_types() and get_model_content_types() methods.
1 parent 7056c1f commit 8a2160e

File tree

2 files changed

+55
-0
lines changed

2 files changed

+55
-0
lines changed

python/src/magika/magika.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,30 @@ def get_model_content_types(self) -> List[ContentTypeLabel]:
256256
model_content_types.update(self._model_config.target_labels_space)
257257
return sorted(model_content_types)
258258

259+
def get_content_type_info(self, label: ContentTypeLabel) -> ContentTypeInfo:
    """Look up the knowledge-base entry for a content type label.

    This is the public accessor for the per-label metadata held by this
    instance: mime type, group, human-readable description, common file
    extensions, and whether the type is text-based.

    Args:
        label: The ContentTypeLabel whose metadata is requested.

    Returns:
        The ContentTypeInfo stored for ``label``.

    Raises:
        MagikaError: If ``label`` has no entry in the content type
            knowledge base.
    """
    known_infos = self._cts_infos
    # Success path first; a miss is reported as MagikaError so callers
    # never see a bare KeyError from the underlying mapping.
    if label in known_infos:
        return known_infos[label]
    raise MagikaError(
        f"Content type '{label}' is not in the content type knowledge base."
    )
282+
259283
@staticmethod
260284
def _get_default_model_name() -> str:
261285
"""Returns the default model name.

python/tests/test_magika_python_module.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -761,6 +761,37 @@ def test_get_model_and_output_content_types() -> None:
761761
}.issubset(model_content_types_set)
762762

763763

764+
def test_get_content_type_info() -> None:
    """Exercise Magika.get_content_type_info() (issue #826).

    Checks that every label returned by get_output_content_types() can be
    looked up through the public API and yields a well-formed
    ContentTypeInfo, then spot-checks the PDF and Python entries.
    """
    magika = Magika()

    # Each output content type must resolve to structurally valid metadata.
    for output_label in magika.get_output_content_types():
        ct_info = magika.get_content_type_info(output_label)
        assert isinstance(ct_info, ContentTypeInfo)
        assert ct_info.label == output_label

        # The string fields must be present and non-empty.
        assert isinstance(ct_info.mime_type, str)
        assert ct_info.mime_type != ""
        assert isinstance(ct_info.group, str)
        assert ct_info.group != ""
        assert isinstance(ct_info.description, str)
        assert ct_info.description != ""

        assert isinstance(ct_info.extensions, list)
        assert isinstance(ct_info.is_text, bool)

    # A well-known binary document type.
    pdf = magika.get_content_type_info(ContentTypeLabel.PDF)
    assert pdf.mime_type == "application/pdf"
    assert pdf.group == "document"
    assert pdf.is_text is False

    # A source-code type must be flagged as text.
    python_src = magika.get_content_type_info(ContentTypeLabel.PYTHON)
    assert python_src.is_text is True
793+
794+
764795
def test_magika_imports():
765796
imported_modules = utils.get_imported_objects_after_wildcard()
766797

0 commit comments

Comments
 (0)