enhancement: get model name and initialization params externally (#291)

qued · web-flow · commit 6ed8ac2e9197 · 2023-11-20T11:19:30.000-05:00
Added a method to externally inject a Super-Gradients ONNX model. By
setting the environment variable `UNSTRUCTURED_DEFAULT_MODEL_NAME`, one
can override the default model. By setting the environment variable
`UNSTRUCTURED_DEFAULT_MODEL_INITIALIZE_PARAMS_JSON_PATH`, one can
specify the path to a JSON file containing the model initialization
parameters, which are then used instead of the built-in ones.

#### Testing:

```python
# Manual check: the default model can be overridden through the environment.
import os
from unstructured_inference.models.base import get_model
# The variable is only consulted when get_model() is called without a model name.
os.environ["UNSTRUCTURED_DEFAULT_MODEL_NAME"] = "detectron2_onnx"
model = get_model()
print(type(model))

```
Output should be `UnstructuredDetectronONNXModel` as opposed to
`UnstructuredYoloXModel`.

```python
# Manual check: initialization params can be injected from a JSON file whose path is given
# by an environment variable.
from unittest import mock
import os
import json
from unstructured_inference.models.base import get_model
from huggingface_hub import hf_hub_download

label_map = {0: "Blue", 1: "Red"}
model_path = hf_hub_download("unstructuredio/yolo_x_layout", "yolox_tiny.onnx")
json_dict = {"model_path": model_path, "label_map": label_map}

os.environ["UNSTRUCTURED_DEFAULT_MODEL_NAME"] = "yolox"
# The path does not have to exist: `open` is patched below to serve the JSON payload.
os.environ["UNSTRUCTURED_DEFAULT_MODEL_INITIALIZE_PARAMS_JSON_PATH"] = "some/fake/path.json"
# NOTE(review): json.dumps writes the integer label_map keys as strings, so the params
# loaded inside get_model() carry "0"/"1" keys -- confirm the model tolerates that.
with mock.patch("builtins.open", mock.mock_open(read_data=json.dumps(json_dict))):
    model = get_model()

print(model.layout_classes)

```
Output should be the injected label map, `{"0": "Blue", "1": "Red"}` (the
JSON round trip turns the integer keys into strings), as opposed to the
normal YoloX labels.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,7 @@
+## 0.7.15
+
+* enhancement: Enable env variables for model definition
+
 ## 0.7.14
 
 * enhancement: Remove Super-Gradients Dependency and Allow General Onnx Models Instead
diff --git a/test_unstructured_inference/inference/test_layout.py b/test_unstructured_inference/inference/test_layout.py
@@ -145,14 +145,12 @@ def test_read_pdf(monkeypatch, mock_initial_layout, mock_final_layout, mock_imag
 
         layouts = [mock_initial_layout, mock_initial_layout]
 
-        monkeypatch.setattr(
-            models,
-            "UnstructuredDetectronModel",
-            partial(MockLayoutModel, layout=mock_final_layout),
-        )
         monkeypatch.setattr(detectron2, "is_detectron2_available", lambda *args: True)
 
-        with patch.object(layout, "load_pdf", return_value=(layouts, image_paths)):
+        with patch.object(layout, "load_pdf", return_value=(layouts, image_paths)), patch.dict(
+            models.model_class_map,
+            {"detectron2_lp": partial(MockLayoutModel, layout=mock_final_layout)},
+        ):
             model = layout.get_model("detectron2_lp")
             doc = layout.DocumentLayout.from_file("fake-file.pdf", detection_model=model)
 
@@ -266,15 +264,26 @@ def __init__(
 
 @pytest.mark.parametrize(
     ("text", "expected"),
-    [("base", 0.0), ("", 0.0), ("(cid:2)", 1.0), ("(cid:1)a", 0.5), ("c(cid:1)ab", 0.25)],
+    [
+        ("base", 0.0),
+        ("", 0.0),
+        ("(cid:2)", 1.0),
+        ("(cid:1)a", 0.5),
+        ("c(cid:1)ab", 0.25),
+    ],
 )
 def test_cid_ratio(text, expected):
     assert elements.cid_ratio(text) == expected
 
 
 @pytest.mark.parametrize(
     ("text", "expected"),
-    [("base", False), ("(cid:2)", True), ("(cid:1234567890)", True), ("jkl;(cid:12)asdf", True)],
+    [
+        ("base", False),
+        ("(cid:2)", True),
+        ("(cid:1234567890)", True),
+        ("jkl;(cid:12)asdf", True),
+    ],
 )
 def test_is_cid_present(text, expected):
     assert elements.is_cid_present(text) == expected
@@ -389,7 +398,11 @@ def test_page_numbers_in_page_objects():
 @pytest.mark.parametrize(
     ("fixed_layouts", "called_method", "not_called_method"),
     [
-        ([MockLayout()], "get_elements_from_layout", "get_elements_with_detection_model"),
+        (
+            [MockLayout()],
+            "get_elements_from_layout",
+            "get_elements_with_detection_model",
+        ),
         (None, "get_elements_with_detection_model", "get_elements_from_layout"),
     ],
 )
@@ -470,7 +483,11 @@ def test_load_pdf_raises_with_path_only_no_output_folder():
 def test_load_pdf_with_multicolumn_layout(filename="sample-docs/design-thinking.pdf"):
     layouts, images = layout.load_pdf(filename)
     doc = layout.process_file_with_model(filename=filename, model_name=None)
-    test_snippets = ["Key to design thinking", "Design thinking also", "But in recent years"]
+    test_snippets = [
+        "Key to design thinking",
+        "Design thinking also",
+        "But in recent years",
+    ]
 
     test_elements = []
     for element in doc.pages[0].elements:
@@ -590,7 +607,9 @@ def test_get_elements_using_image_extraction(mock_image, inplace, expected):
     assert page.get_elements_using_image_extraction(inplace=inplace) == expected
 
 
-def test_get_elements_using_image_extraction_raises_with_no_extraction_model(mock_image):
+def test_get_elements_using_image_extraction_raises_with_no_extraction_model(
+    mock_image,
+):
     page = layout.PageLayout(1, mock_image, None, element_extraction_model=None)
     with pytest.raises(ValueError):
         page.get_elements_using_image_extraction()
@@ -707,7 +726,10 @@ def test_exposed_pdf_image_dpi(pdf_image_dpi, expected, monkeypatch):
 
 @pytest.mark.parametrize(
     ("filename", "img_num", "should_complete"),
-    [("sample-docs/empty-document.pdf", 0, True), ("sample-docs/empty-document.pdf", 10, False)],
+    [
+        ("sample-docs/empty-document.pdf", 0, True),
+        ("sample-docs/empty-document.pdf", 10, False),
+    ],
 )
 def test_get_image(filename, img_num, should_complete):
     doc = layout.DocumentLayout.from_file(filename)
diff --git a/test_unstructured_inference/models/test_model.py b/test_unstructured_inference/models/test_model.py
@@ -27,12 +27,8 @@ def predict(self, x: Any) -> Any:
 
 def test_get_model(monkeypatch):
     monkeypatch.setattr(models, "models", {})
-    monkeypatch.setattr(
-        models,
-        "UnstructuredDetectronModel",
-        MockModel,
-    )
-    assert isinstance(models.get_model("checkbox"), MockModel)
+    with mock.patch.dict(models.model_class_map, {"checkbox": MockModel}):
+        assert isinstance(models.get_model("checkbox"), MockModel)
 
 
 def test_raises_invalid_model():
@@ -48,20 +44,15 @@ def test_raises_uninitialized():
 def test_model_initializes_once():
     from unstructured_inference.inference import layout
 
-    with mock.patch.object(models, "UnstructuredYoloXModel", MockModel), mock.patch.object(
-        models,
-        "models",
-        {},
+    with mock.patch.dict(models.model_class_map, {"yolox": MockModel}), mock.patch.object(
+        models, "models", {}
     ):
         doc = layout.DocumentLayout.from_file("sample-docs/loremipsum.pdf")
         doc.pages[0].detection_model.initializer.assert_called_once()
-        assert hasattr(
-            doc.pages[0].elements[0],
-            "prob",
-        )  # NOTE(pravin) New Assertion to Make Sure Elements have probability attribute
-        assert (
-            doc.pages[0].elements[0].prob is None
-        )  # NOTE(pravin) New Assertion to Make Sure Uncategorized Text has None Probability
+        # NOTE(pravin) New Assertion to Make Sure Elements have probability attribute
+        assert hasattr(doc.pages[0].elements[0], "prob")
+        # NOTE(pravin) New Assertion to Make Sure Uncategorized Text has None Probability
+        assert doc.pages[0].elements[0].prob is None
 
 
 def test_deduplicate_detected_elements():
@@ -107,7 +98,12 @@ def test_enhance_regions():
     model = get_model("yolox_tiny")
     elements = model.enhance_regions(elements, 0.5)
     assert len(elements) == 1
-    assert (elements[0].bbox.x1, elements[0].bbox.y1, elements[0].bbox.x2, elements[0].bbox.x2) == (
+    assert (
+        elements[0].bbox.x1,
+        elements[0].bbox.y1,
+        elements[0].bbox.x2,
+        elements[0].bbox.x2,
+    ) == (
         0,
         0,
         1.10,
@@ -138,9 +134,36 @@ def test_clean_type():
     model = get_model("yolox_tiny")
     elements = model.clean_type(elements, type_to_clean="Table")
     assert len(elements) == 1
-    assert (elements[0].bbox.x1, elements[0].bbox.y1, elements[0].bbox.x2, elements[0].bbox.x2) == (
-        0,
-        0,
-        1,
-        1,
-    )
+    assert (
+        elements[0].bbox.x1,
+        elements[0].bbox.y1,
+        elements[0].bbox.x2,
+        elements[0].bbox.x2,
+    ) == (0, 0, 1, 1)
+
+
+def test_env_variables_override_default_model(monkeypatch):
+    # When an environment variable specifies a different default model and we call get_model with no
+    # args, we should get back the model the env var calls for
+    monkeypatch.setattr(models, "models", {})
+    with mock.patch.dict(
+        models.os.environ, {"UNSTRUCTURED_DEFAULT_MODEL_NAME": "checkbox"}
+    ), mock.patch.dict(models.model_class_map, {"checkbox": MockModel}):
+        model = models.get_model()
+    assert isinstance(model, MockModel)
+
+
+def test_env_variables_override_intialization_params(monkeypatch):
+    # When initialization params are specified in an environment variable, and we call get_model, we
+    # should see that the model was initialized with those params
+    monkeypatch.setattr(models, "models", {})
+    with mock.patch.dict(
+        models.os.environ,
+        {"UNSTRUCTURED_DEFAULT_MODEL_INITIALIZE_PARAMS_JSON_PATH": "fake_json.json"},
+    ), mock.patch.object(models, "DEFAULT_MODEL", "fake"), mock.patch.dict(
+        models.model_class_map, {"fake": mock.MagicMock()}
+    ), mock.patch(
+        "builtins.open", mock.mock_open(read_data='{"date": "3/26/81"}')
+    ):
+        model = models.get_model()
+    model.initialize.assert_called_once_with(date="3/26/81")
diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py
@@ -1 +1 @@
-__version__ = "0.7.14"  # pragma: no cover
+__version__ = "0.7.15"  # pragma: no cover
diff --git a/unstructured_inference/models/base.py b/unstructured_inference/models/base.py
@@ -1,4 +1,6 @@
-from typing import Dict, Optional
+import json
+import os
+from typing import Dict, Optional, Type
 
 from unstructured_inference.models.chipper import MODEL_TYPES as CHIPPER_MODEL_TYPES
 from unstructured_inference.models.chipper import UnstructuredChipperModel
@@ -29,46 +31,47 @@
 
 models: Dict[str, UnstructuredModel] = {}
 
+model_class_map: Dict[str, Type[UnstructuredModel]] = {
+    **{name: UnstructuredDetectronModel for name in DETECTRON2_MODEL_TYPES},
+    **{name: UnstructuredDetectronONNXModel for name in DETECTRON2_ONNX_MODEL_TYPES},
+    **{name: UnstructuredYoloXModel for name in YOLOX_MODEL_TYPES},
+    **{name: UnstructuredChipperModel for name in CHIPPER_MODEL_TYPES},
+    "super_gradients": UnstructuredSuperGradients,
+}
 
-def get_model(
-    model_name: Optional[str] = None,
-    model_path: Optional[str] = None,
-    label_map: Optional[dict] = None,
-    input_shape: Optional[tuple] = None,
-) -> UnstructuredModel:
+
+def get_model(model_name: Optional[str] = None) -> UnstructuredModel:
     """Gets the model object by model name."""
     # TODO(alan): These cases are similar enough that we can probably do them all together with
     # importlib
 
     global models
 
     if model_name is None:
-        model_name = DEFAULT_MODEL
+        default_name_from_env = os.environ.get("UNSTRUCTURED_DEFAULT_MODEL_NAME")
+        model_name = default_name_from_env if default_name_from_env is not None else DEFAULT_MODEL
 
     if model_name in models:
         return models[model_name]
 
-    if model_name in DETECTRON2_MODEL_TYPES:
-        model: UnstructuredModel = UnstructuredDetectronModel()
-        initialize_params = {**DETECTRON2_MODEL_TYPES[model_name]}
-    elif model_name in DETECTRON2_ONNX_MODEL_TYPES:
-        model = UnstructuredDetectronONNXModel()
-        initialize_params = {**DETECTRON2_ONNX_MODEL_TYPES[model_name]}
-    elif model_name in YOLOX_MODEL_TYPES:
-        model = UnstructuredYoloXModel()
-        initialize_params = {**YOLOX_MODEL_TYPES[model_name]}
-    elif model_name in CHIPPER_MODEL_TYPES:
-        model = UnstructuredChipperModel()
-        initialize_params = {**CHIPPER_MODEL_TYPES[model_name]}
-    elif model_name == "super_gradients":
-        model = UnstructuredSuperGradients()
-        initialize_params = {
-            "model_path": model_path,
-            "label_map": label_map,
-            "input_shape": input_shape,
-        }
+    initialize_param_json = os.environ.get("UNSTRUCTURED_DEFAULT_MODEL_INITIALIZE_PARAMS_JSON_PATH")
+    if initialize_param_json is not None:
+        with open(initialize_param_json) as fp:
+            initialize_params = json.load(fp)
     else:
-        raise UnknownModelException(f"Unknown model type: {model_name}")
+        if model_name in DETECTRON2_MODEL_TYPES:
+            initialize_params = DETECTRON2_MODEL_TYPES[model_name]
+        elif model_name in DETECTRON2_ONNX_MODEL_TYPES:
+            initialize_params = DETECTRON2_ONNX_MODEL_TYPES[model_name]
+        elif model_name in YOLOX_MODEL_TYPES:
+            initialize_params = YOLOX_MODEL_TYPES[model_name]
+        elif model_name in CHIPPER_MODEL_TYPES:
+            initialize_params = CHIPPER_MODEL_TYPES[model_name]
+        else:
+            raise UnknownModelException(f"Unknown model type: {model_name}")
+
+    model: UnstructuredModel = model_class_map[model_name]()
+
     model.initialize(**initialize_params)
     models[model_name] = model
     return model
diff --git a/unstructured_inference/models/chipper.py b/unstructured_inference/models/chipper.py
@@ -23,7 +23,7 @@
 )
 from unstructured_inference.utils import LazyDict, strip_tags
 
-MODEL_TYPES: Dict[Optional[str], Union[LazyDict, dict]] = {
+MODEL_TYPES: Dict[str, Union[LazyDict, dict]] = {
     "chipperv1": {
         "pre_trained_model_repo": "unstructuredio/ved-fine-tuning",
         "swap_head": False,
diff --git a/unstructured_inference/models/detectron2onnx.py b/unstructured_inference/models/detectron2onnx.py
@@ -31,7 +31,7 @@
 
 # NOTE(alan): Entries are implemented as LazyDicts so that models aren't downloaded until they are
 # needed.
-MODEL_TYPES: Dict[Optional[str], Union[LazyDict, dict]] = {
+MODEL_TYPES: Dict[str, Union[LazyDict, dict]] = {
     "detectron2_onnx": LazyDict(
         model_path=LazyEvaluateInfo(
             hf_hub_download,
@@ -124,7 +124,8 @@ def initialize(
 
     def preprocess(self, image: Image.Image) -> Dict[str, np.ndarray]:
         """Process input image into required format for ingestion into the Detectron2 ONNX binary.
-        This involves resizing to a fixed shape and converting to a specific numpy format."""
+        This involves resizing to a fixed shape and converting to a specific numpy format.
+        """
         # TODO (benjamin): check other shapes for inference
         img = np.array(image)
         # TODO (benjamin): We should use models.get_model() but currenly returns Detectron model

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.7.14" # pragma: no cover`
	`1`	`+__version__ = "0.7.15" # pragma: no cover`
Original file line number	Diff line number	Diff line change
`@@ -23,7 +23,7 @@`
`23`	`23`	`)`
`24`	`24`	`from unstructured_inference.utils import LazyDict, strip_tags`
`25`	`25`
`26`		`-MODEL_TYPES: Dict[Optional[str], Union[LazyDict, dict]] = {`
	`26`	`+MODEL_TYPES: Dict[str, Union[LazyDict, dict]] = {`
`27`	`27`	`"chipperv1": {`
`28`	`28`	`"pre_trained_model_repo": "unstructuredio/ved-fine-tuning",`
`29`	`29`	`"swap_head": False,`