Merge branch 'master' into 502

C-K-Loan · web-flow · commit fdbc5f8503b4 · 2023-10-08T14:52:50.000+02:00
diff --git a/docs/_includes/footer.html b/docs/_includes/footer.html
@@ -25,8 +25,8 @@
       <div>
         {%- include snippets/get-locale-string.html key='COPYRIGHT_DATES' -%}
         {%- assign _locale_copyright_dates = __return -%}
-        © <span id="years"></span> John Snow Labs Inc.
-        <a href="http://www.johnsnowlabs.com/terms-of-service">Terms of Service</a> | <a href="http://www.johnsnowlabs.com/privacy-policy/">Privacy Policy</a>
+        &copy; <span id="years"></span> John Snow Labs Inc.
+        <a href="https://www.johnsnowlabs.com/terms-of-service">Terms of Service</a> | <a href="https://www.johnsnowlabs.com/privacy-policy/">Privacy Policy</a>
       </div>
     </div>
   </div>
diff --git a/docs/en/examples.md b/docs/en/examples.md
@@ -441,7 +441,7 @@ nlp.load('lang').predict(['NLU is an open-source text processing library for adv
 ## E2E Classifier
 [E2E Classifier example](https://colab.research.google.com/drive/1OSkiXGEpKlm9HWDoVb42uLNQQgb7nqNZ?usp=sharing)     
 
-This is a multi class classifier trained on the E2E [dataset for Natural language generation](http://www.macs.hw.ac.uk/InteractionLab/E2E/#)
+This is a multi class classifier trained on the E2E [dataset for Natural language generation](https://www.macs.hw.ac.uk/InteractionLab/E2E/#)
 
 ```python
 nlp.load('e2e').predict('E2E is a dataset for training generative models')
diff --git a/docs/en/nlu_for_healthcare.md b/docs/en/nlu_for_healthcare.md
diff --git a/docs/en/release_notes.md b/docs/en/release_notes.md
diff --git a/docs/en/start_sparkseession.md b/docs/en/start_sparkseession.md
@@ -31,7 +31,7 @@ Review detailed [docs here](https://nlu.johnsnowlabs.com/docs/en/install#authori
 |-------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------|---------|
 | `None`                  | Load license automatically via one of the **Auto-Detection Mechanisms**                                                                                                                                              | `jsl.start()`                                    | `False` |
 | `browser_login`         | Browser based authorization, Button to click on Notebooks and Browser Pop-Up otherwise.                                                                                                                              | `jsl.start(browser_login=True)`                  | `False` |
-| `access_token`          | Vist [my.johnsnowlabs.com](https://my.johnsnowlabs.com/) to extract a token which you can provide to enable license access. See [Access Token Example](http://nlu.johnsnowlabs.com/docs/en/install#via-access-token) | `jsl.start(access_token='myToken')`              | `None`  |
+| `access_token`          | Visit [my.johnsnowlabs.com](https://my.johnsnowlabs.com/) to extract a token which you can provide to enable license access. See [Access Token Example](https://nlu.johnsnowlabs.com/docs/en/install#via-access-token) | `jsl.start(access_token='myToken')`              | `None`  |
 | `secrets_file`          | Define JSON license file with keys  defined by [License Variable Overview](https://nlu.johnsnowlabs.com/docs/en/install#license-variables-names-for-json-and-os-variables) and provide file path                     | `jsl.start(secrets_file='path/to/license.json')` | `None`  |
 | `store_in_jsl_home`     | Disable caching of new licenses to `~./jsl_home`                                                                                                                                                                     | `jsl.start(store_in_jsl_home=False)`             | `True`  |
 | `local_license_number`  | Specify which license to use, if you have access to multiple locally cached                                                                                                                                          | `jsl.start(license_number=5)`                    | `0`     |
diff --git a/docs/footer.html b/docs/footer.html
@@ -1,4 +1,4 @@
 <div class="container">
-        <p>&copy; <span id="year_id"></span> . John Snow Labs Inc. <a href="http://www.johnsnowlabs.com/terms-of-service">Terms of Service</a> | <a href="http://www.johnsnowlabs.com/privacy-policy/">Privacy Policy</a></p>
+        <p>&copy; <span id="year_id"></span> . John Snow Labs Inc. <a href="https://www.johnsnowlabs.com/terms-of-service">Terms of Service</a> | <a href="https://www.johnsnowlabs.com/privacy-policy/">Privacy Policy</a></p>
 </div>
 <script>document.getElementById("year_id").innerHTML = new Date().getFullYear();</script>
diff --git a/nlu/__init__.py b/nlu/__init__.py
@@ -1,5 +1,6 @@
 __version__ = '5.0.2'
 
+
 import nlu.utils.environment.env_utils as env_utils
 
 if not env_utils.try_import_pyspark_in_streamlit():
@@ -208,7 +209,6 @@ def load(request: str = 'from_disk', path: Optional[str] = None, verbose: bool =
         if path is not None:
             logger.info(f'Trying to load nlu pipeline from local hard drive, located at {path}')
             pipe = load_nlu_pipe_from_hdd(path, request)
-            pipe.nlu_ref = request
             return pipe
     except Exception as err:
         if verbose:
@@ -274,6 +274,10 @@ def auth(HEALTHCARE_LICENSE_OR_JSON_PATH='/content/spark_nlp_for_healthcare.json
     return nlu
 
 
+def is_nlu_uid(uid: str):
+    return 'is_nlu_pipe' in uid
+
+
 def load_nlu_pipe_from_hdd_in_databricks(pipe_path, request) -> NLUPipeline:
     """Either there is a pipeline of models in the path or just one singular model_anno_obj.
     If it is a component_list,  load the component_list and return it.
@@ -298,7 +302,7 @@ def load_nlu_pipe_from_hdd_in_databricks(pipe_path, request) -> NLUPipeline:
     # if dbfs_path_exist(pipe_path):
     # Resource in path is a pipeline
     if is_pipe(pipe_path):
-        pipe_components = get_trained_component_list_for_nlp_pipe_ref('en', nlu_ref, nlu_ref, pipe_path, False)
+        pipe_components, uid = get_trained_component_list_for_nlp_pipe_ref('en', nlu_ref, nlu_ref, pipe_path, False)
     # Resource in path is a single model_anno_obj
     elif is_model(pipe_path):
         c = offline_utils.verify_and_create_model(pipe_path)
@@ -307,13 +311,12 @@ def load_nlu_pipe_from_hdd_in_databricks(pipe_path, request) -> NLUPipeline:
         return PipelineCompleter.check_and_fix_nlu_pipeline(pipe)
 
     else:
-        #fallback pipe
-        pipe_components = get_trained_component_list_for_nlp_pipe_ref('en', nlu_ref, nlu_ref, pipe_path, False)
+        # fallback pipe
+        pipe_components, uid = get_trained_component_list_for_nlp_pipe_ref('en', nlu_ref, nlu_ref, pipe_path, False)
     for c in pipe_components: pipe.add(c, nlu_ref, pretrained_pipe_component=True)
     return pipe
 
 
-
 def load_nlu_pipe_from_hdd(pipe_path, request) -> NLUPipeline:
     """Either there is a pipeline of models in the path or just one singular model_anno_obj.
     If it is a component_list,  load the component_list and return it.
@@ -322,14 +325,15 @@ def load_nlu_pipe_from_hdd(pipe_path, request) -> NLUPipeline:
     if is_running_in_databricks():
         return load_nlu_pipe_from_hdd_in_databricks(pipe_path, request)
     pipe = NLUPipeline()
+    pipe.nlu_ref = request
     nlu_ref = request  # pipe_path
     if os.path.exists(pipe_path):
 
         # Resource in path is a pipeline
         if offline_utils.is_pipe(pipe_path):
             # language, nlp_ref, nlu_ref,path=None, is_licensed=False
             # todo deduct lang and if Licensed or not
-            pipe_components = get_trained_component_list_for_nlp_pipe_ref('en', nlu_ref, nlu_ref, pipe_path, False)
+            pipe_components, uid = get_trained_component_list_for_nlp_pipe_ref('en', nlu_ref, nlu_ref, pipe_path, False)
         # Resource in path is a single model_anno_obj
         elif offline_utils.is_model(pipe_path):
             c = offline_utils.verify_and_create_model(pipe_path)
@@ -340,7 +344,16 @@ def load_nlu_pipe_from_hdd(pipe_path, request) -> NLUPipeline:
             print(
                 f"Could not load model_anno_obj in path {pipe_path}. Make sure the jsl_folder contains either a stages subfolder or a metadata subfolder.")
             raise ValueError
-        for c in pipe_components: pipe.add(c, nlu_ref, pretrained_pipe_component=True)
+        for c in pipe_components:
+            pipe.add(c, nlu_ref, pretrained_pipe_component=True)
+        if is_nlu_uid(uid):
+            data = json.loads(uid)
+            print(data)
+            pipe.nlu_ref = data['0']['nlu_ref']
+            for i, c in enumerate(pipe.components):
+                c.nlu_ref = data[str(i + 1)]['nlu_ref']
+                c.nlp_ref = data[str(i + 1)]['nlp_ref']
+                c.loaded_from_pretrained_pipe = data[str(i + 1)]['loaded_from_pretrained_pipe']
         return pipe
 
     else:
diff --git a/nlu/pipe/component_resolution.py b/nlu/pipe/component_resolution.py
@@ -2,7 +2,7 @@
 Contains methods used to resolve a NLU reference to a NLU component_to_resolve.
 Handler for getting default components, etc.
 '''
-from typing import Dict, List, Union, Optional, Callable
+from typing import Dict, List, Union, Optional, Callable, Tuple
 
 from pyspark.ml import PipelineModel, Pipeline
 from sparknlp.pretrained import PretrainedPipeline, LightPipeline
@@ -155,7 +155,7 @@ def nlu_ref_to_component(nlu_ref, detect_lang=False, authenticated=False) -> Uni
     lang, nlu_ref, nlp_ref, license_type, is_pipe, model_params = nlu_ref_to_nlp_metadata(nlu_ref)
 
     if is_pipe:
-        resolved_component = get_trained_component_list_for_nlp_pipe_ref(lang, nlp_ref, nlu_ref,
+        resolved_component, _ = get_trained_component_list_for_nlp_pipe_ref(lang, nlp_ref, nlu_ref,
                                                                          license_type=license_type)
     else:
         resolved_component = get_trained_component_for_nlp_model_ref(lang, nlu_ref, nlp_ref, license_type, model_params)
@@ -179,7 +179,7 @@ def get_trainable_component_for_nlu_ref(nlu_ref) -> NluComponent:
 
 def get_trained_component_list_for_nlp_pipe_ref(language, nlp_ref, nlu_ref, path=None,
                                                       license_type: LicenseType = Licenses.open_source,
-                                                      ) -> List[NluComponent]:
+                                                      ) -> Tuple[List[NluComponent],str]:
     """
     creates a list of components from a Spark NLP Pipeline reference
     1. download pipeline
@@ -190,7 +190,9 @@ def get_trained_component_list_for_nlp_pipe_ref(language, nlp_ref, nlu_ref, path
     :param language: language of the pipeline
     :param nlp_ref: Reference to a spark nlp pretrained pipeline
     :param path: Load component_list from HDD
-    :return: Each element of the Spark NLP pipeline wrapped as a NLU component_to_resolve inside a list
+    :return: Tuple of two elements:
+                first element: list with each stage of the Spark NLP pipeline wrapped as a NLU component
+                second element: uid string of the loaded pipeline model
     """
     logger.info(f'Building pretrained pipe for nlu_ref={nlu_ref} nlp_ref={nlp_ref}')
     if 'language' in nlp_ref:
@@ -199,16 +201,20 @@ def get_trained_component_list_for_nlp_pipe_ref(language, nlp_ref, nlu_ref, path
     if path is None:
         if license_type != Licenses.open_source:
             pipe = PretrainedPipeline(nlp_ref, lang=language, remote_loc='clinical/models')
+            uid = pipe.model.uid
+
         else:
             pipe = PretrainedPipeline(nlp_ref, lang=language)
+            uid = pipe.model.uid
         iterable_stages = pipe.light_model.pipeline_model.stages
     else:
         pipe = LightPipeline(PipelineModel.load(path=path))
+        uid = pipe.pipeline_model.uid
         iterable_stages = pipe.pipeline_model.stages
     constructed_components = get_component_list_for_iterable_stages(iterable_stages, language, nlp_ref, nlu_ref,
                                                                     license_type)
     return ComponentUtils.set_storage_ref_attribute_of_embedding_converters(
-        PipeUtils.set_column_values_on_components_from_pretrained_pipe(constructed_components))
+        PipeUtils.set_column_values_on_components_from_pretrained_pipe(constructed_components)), uid
 
 
 
diff --git a/nlu/pipe/extractors/extractor_methods/base_extractor_methods.py b/nlu/pipe/extractors/extractor_methods/base_extractor_methods.py
@@ -206,7 +206,6 @@ def extract_base_sparknlp_features(row: pd.Series, configs: SparkNLPExtractorCon
 
     return {**beginnings, **endings, **results, **annotator_types, **embeddings, **origins}  # Merge dicts NLP output
 
-
 def extract_sparknlp_metadata(row: pd.Series, configs: SparkNLPExtractorConfig) -> dict:
     """
     Extract base features common in all saprk NLP annotators
diff --git a/nlu/pipe/nlu_component.py b/nlu/pipe/nlu_component.py
@@ -72,6 +72,9 @@ def __init__(self,
                  trained_mirror_anno: Optional[JslAnnoId] = None,
                  applicable_file_types: List[str] = None,  # Used for OCR annotators to deduct applicable file types
                  is_trained: bool = True,  # Set to true for trainable annotators
+                 requires_binary_format: bool = False,  # Set to true for OCR annotators that require binary image format
+                 requires_image_format: bool = False,  # Set to true for OCR annotators that require image format
+                 is_visual_annotator: bool = False,  # Set to true for visual annotators so the pipeline is flagged as containing OCR components
                  ):
         self.name = name
         self.type = type
@@ -110,6 +113,9 @@ def __init__(self,
         self.trained_mirror_anno = trained_mirror_anno
         self.applicable_file_types = applicable_file_types
         self.is_trained = is_trained
+        self.requires_binary_format = requires_binary_format
+        self.requires_image_format = requires_image_format
+        self.is_visual_annotator = is_visual_annotator
 
     def set_metadata(self, jsl_anno_object: Union[AnnotatorApproach, AnnotatorModel],
                      nlu_ref: str,
diff --git a/nlu/pipe/pipeline.py b/nlu/pipe/pipeline.py
@@ -1,3 +1,4 @@
+import json
 import logging
 from typing import Union
 
@@ -57,7 +58,8 @@ def __init__(self):
         self.has_span_classifiers = False
         self.prefer_light = False
         self.has_table_qa_models = False
-
+        self.requires_image_format = False
+        self.requires_binary_format = False
     def add(self, component: NluComponent, nlu_reference=None, pretrained_pipe_component=False,
             name_to_add='', idx=None):
         '''
@@ -418,11 +420,26 @@ def drop_irrelevant_cols(self, cols, keep_origin_index=False):
         if keep_origin_index == False and 'origin_index' in cols: cols.remove('origin_index')
         return cols
 
-    def save(self, path, component='entire_pipeline', overwrite=False):
+    def save(self, path, component='entire_pipeline', overwrite=True):
+        # serialize data
+        data = {}
+        data[0] = {'nlu_ref': self.nlu_ref}
+        data['is_nlu_pipe'] = True
+        for i, c in enumerate(self.components):
+            data[i + 1] = {'nlu_ref': c.nlu_ref, 'nlp_ref': c.nlp_ref,
+                           'loaded_from_pretrained_pipe': c.loaded_from_pretrained_pipe}
+
+        data = json.dumps(data)
         if not self.is_fitted or not hasattr(self, 'vanilla_transformer_pipe'):
             self.fit()
             self.is_fitted = True
-
+        # self.vanilla_transformer_pipe.extractParamMap()
+        if hasattr(self, 'nlu_ref'):
+            """ ATTRS TO SAVE FOR EACH COMPONENT / PIPELINE: 
+            - nlp ref/nlu ref
+            - is loaded_form_pipe
+            """
+            self.vanilla_transformer_pipe._resetUid(data)
         if component == 'entire_pipeline':
             if overwrite:
                 self.vanilla_transformer_pipe.write().overwrite().save(path)
diff --git a/nlu/pipe/utils/pipe_utils.py b/nlu/pipe/utils/pipe_utils.py
@@ -667,8 +667,13 @@ def add_metadata_to_pipe(pipe: NLUPipeline):
 
         for c in pipe.components:
             # Check for OCR componments
-            if c.jsl_anno_py_class in py_class_to_anno_id.keys():
+            if c.jsl_anno_py_class in py_class_to_anno_id.keys() or c.is_visual_annotator:
                 pipe.contains_ocr_components = True
+                if c.requires_image_format:
+                    pipe.requires_image_format = True
+                if c.requires_binary_format:
+                    pipe.requires_binary_format = True
+
             # Check for licensed components
             if c.license in [Licenses.ocr, Licenses.hc]:
                 pipe.has_licensed_components = True
diff --git a/nlu/pipe/utils/predict_helper.py b/nlu/pipe/utils/predict_helper.py
diff --git a/nlu/universe/component_universes.py b/nlu/universe/component_universes.py