Skip to content

Commit fdbc5f8

Browse files
authored
Merge branch 'master' into 502
2 parents 3d58c7d + 55d7d18 commit fdbc5f8

14 files changed

Lines changed: 2450 additions & 2299 deletions

docs/_includes/footer.html

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,8 @@
2525
<div>
2626
{%- include snippets/get-locale-string.html key='COPYRIGHT_DATES' -%}
2727
{%- assign _locale_copyright_dates = __return -%}
28-
© <span id="years"></span> John Snow Labs Inc.
29-
<a href="http://www.johnsnowlabs.com/terms-of-service">Terms of Service</a> | <a href="http://www.johnsnowlabs.com/privacy-policy/">Privacy Policy</a>
28+
&copy; <span id="years"></span> John Snow Labs Inc.
29+
<a href="https://www.johnsnowlabs.com/terms-of-service">Terms of Service</a> | <a href="https://www.johnsnowlabs.com/privacy-policy/">Privacy Policy</a>
3030
</div>
3131
</div>
3232
</div>

docs/en/examples.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -441,7 +441,7 @@ nlp.load('lang').predict(['NLU is an open-source text processing library for adv
441441
## E2E Classifier
442442
[E2E Classifier example](https://colab.research.google.com/drive/1OSkiXGEpKlm9HWDoVb42uLNQQgb7nqNZ?usp=sharing)
443443

444-
This is a multi class classifier trained on the E2E [dataset for Natural language generation](http://www.macs.hw.ac.uk/InteractionLab/E2E/#)
444+
This is a multi class classifier trained on the E2E [dataset for Natural language generation](https://www.macs.hw.ac.uk/InteractionLab/E2E/#)
445445

446446
```python
447447
nlp.load('e2e').predict('E2E is a dataset for training generative models')

docs/en/nlu_for_healthcare.md

Lines changed: 5 additions & 5 deletions
Large diffs are not rendered by default.

docs/en/release_notes.md

Lines changed: 2152 additions & 2152 deletions
Large diffs are not rendered by default.

docs/en/start_sparkseession.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ Review detailed [docs here](https://nlu.johnsnowlabs.com/docs/en/install#authori
3131
|-------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------|---------|
3232
| `None` | Load license automatically via one of the **Auto-Detection Mechanisms** | `jsl.start()` | `False` |
3333
| `browser_login` | Browser based authorization, Button to click on Notebooks and Browser Pop-Up otherwise. | `jsl.start(browser_login=True)` | `False` |
34-
| `access_token` | Vist [my.johnsnowlabs.com](https://my.johnsnowlabs.com/) to extract a token which you can provide to enable license access. See [Access Token Example](http://nlu.johnsnowlabs.com/docs/en/install#via-access-token) | `jsl.start(access_token='myToken')` | `None` |
34+
| `access_token` | Visit [my.johnsnowlabs.com](https://my.johnsnowlabs.com/) to extract a token which you can provide to enable license access. See [Access Token Example](https://nlu.johnsnowlabs.com/docs/en/install#via-access-token) | `jsl.start(access_token='myToken')` | `None` |
3535
| `secrets_file` | Define JSON license file with keys defined by [License Variable Overview](https://nlu.johnsnowlabs.com/docs/en/install#license-variables-names-for-json-and-os-variables) and provide file path | `jsl.start(secrets_file='path/to/license.json')` | `None` |
3636
| `store_in_jsl_home` | Disable caching of new licenses to `~./jsl_home` | `jsl.start(store_in_jsl_home=False)` | `True` |
3737
| `local_license_number` | Specify which license to use, if you have access to multiple locally cached | `jsl.start(license_number=5)` | `0` |

docs/footer.html

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
<div class="container">
2-
<p>&copy; <span id="year_id"></span> . John Snow Labs Inc. <a href="http://www.johnsnowlabs.com/terms-of-service">Terms of Service</a> | <a href="http://www.johnsnowlabs.com/privacy-policy/">Privacy Policy</a></p>
2+
<p>&copy; <span id="year_id"></span> . John Snow Labs Inc. <a href="https://www.johnsnowlabs.com/terms-of-service">Terms of Service</a> | <a href="https://www.johnsnowlabs.com/privacy-policy/">Privacy Policy</a></p>
33
</div>
44
<script>document.getElementById("year_id").innerHTML = new Date().getFullYear();</script>

nlu/__init__.py

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
__version__ = '5.0.2'
22

3+
34
import nlu.utils.environment.env_utils as env_utils
45

56
if not env_utils.try_import_pyspark_in_streamlit():
@@ -208,7 +209,6 @@ def load(request: str = 'from_disk', path: Optional[str] = None, verbose: bool =
208209
if path is not None:
209210
logger.info(f'Trying to load nlu pipeline from local hard drive, located at {path}')
210211
pipe = load_nlu_pipe_from_hdd(path, request)
211-
pipe.nlu_ref = request
212212
return pipe
213213
except Exception as err:
214214
if verbose:
@@ -274,6 +274,10 @@ def auth(HEALTHCARE_LICENSE_OR_JSON_PATH='/content/spark_nlp_for_healthcare.json
274274
return nlu
275275

276276

277+
def is_nlu_uid(uid: str):
278+
return 'is_nlu_pipe' in uid
279+
280+
277281
def load_nlu_pipe_from_hdd_in_databricks(pipe_path, request) -> NLUPipeline:
278282
"""Either there is a pipeline of models in the path or just one singular model_anno_obj.
279283
If it is a component_list, load the component_list and return it.
@@ -298,7 +302,7 @@ def load_nlu_pipe_from_hdd_in_databricks(pipe_path, request) -> NLUPipeline:
298302
# if dbfs_path_exist(pipe_path):
299303
# Resource in path is a pipeline
300304
if is_pipe(pipe_path):
301-
pipe_components = get_trained_component_list_for_nlp_pipe_ref('en', nlu_ref, nlu_ref, pipe_path, False)
305+
pipe_components, uid = get_trained_component_list_for_nlp_pipe_ref('en', nlu_ref, nlu_ref, pipe_path, False)
302306
# Resource in path is a single model_anno_obj
303307
elif is_model(pipe_path):
304308
c = offline_utils.verify_and_create_model(pipe_path)
@@ -307,13 +311,12 @@ def load_nlu_pipe_from_hdd_in_databricks(pipe_path, request) -> NLUPipeline:
307311
return PipelineCompleter.check_and_fix_nlu_pipeline(pipe)
308312

309313
else:
310-
#fallback pipe
311-
pipe_components = get_trained_component_list_for_nlp_pipe_ref('en', nlu_ref, nlu_ref, pipe_path, False)
314+
# fallback pipe
315+
pipe_components, uid = get_trained_component_list_for_nlp_pipe_ref('en', nlu_ref, nlu_ref, pipe_path, False)
312316
for c in pipe_components: pipe.add(c, nlu_ref, pretrained_pipe_component=True)
313317
return pipe
314318

315319

316-
317320
def load_nlu_pipe_from_hdd(pipe_path, request) -> NLUPipeline:
318321
"""Either there is a pipeline of models in the path or just one singular model_anno_obj.
319322
If it is a component_list, load the component_list and return it.
@@ -322,14 +325,15 @@ def load_nlu_pipe_from_hdd(pipe_path, request) -> NLUPipeline:
322325
if is_running_in_databricks():
323326
return load_nlu_pipe_from_hdd_in_databricks(pipe_path, request)
324327
pipe = NLUPipeline()
328+
pipe.nlu_ref = request
325329
nlu_ref = request # pipe_path
326330
if os.path.exists(pipe_path):
327331

328332
# Resource in path is a pipeline
329333
if offline_utils.is_pipe(pipe_path):
330334
# language, nlp_ref, nlu_ref,path=None, is_licensed=False
331335
# todo deduct lang and if Licensed or not
332-
pipe_components = get_trained_component_list_for_nlp_pipe_ref('en', nlu_ref, nlu_ref, pipe_path, False)
336+
pipe_components, uid = get_trained_component_list_for_nlp_pipe_ref('en', nlu_ref, nlu_ref, pipe_path, False)
333337
# Resource in path is a single model_anno_obj
334338
elif offline_utils.is_model(pipe_path):
335339
c = offline_utils.verify_and_create_model(pipe_path)
@@ -340,7 +344,16 @@ def load_nlu_pipe_from_hdd(pipe_path, request) -> NLUPipeline:
340344
print(
341345
f"Could not load model_anno_obj in path {pipe_path}. Make sure the jsl_folder contains either a stages subfolder or a metadata subfolder.")
342346
raise ValueError
343-
for c in pipe_components: pipe.add(c, nlu_ref, pretrained_pipe_component=True)
347+
for c in pipe_components:
348+
pipe.add(c, nlu_ref, pretrained_pipe_component=True)
349+
if is_nlu_uid(uid):
350+
data = json.loads(uid)
351+
print(data)
352+
pipe.nlu_ref = data['0']['nlu_ref']
353+
for i, c in enumerate(pipe.components):
354+
c.nlu_ref = data[str(i + 1)]['nlu_ref']
355+
c.nlp_ref = data[str(i + 1)]['nlp_ref']
356+
c.loaded_from_pretrained_pipe = data[str(i + 1)]['loaded_from_pretrained_pipe']
344357
return pipe
345358

346359
else:

nlu/pipe/component_resolution.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
Contains methods used to resolve a NLU reference to a NLU component_to_resolve.
33
Handler for getting default components, etc.
44
'''
5-
from typing import Dict, List, Union, Optional, Callable
5+
from typing import Dict, List, Union, Optional, Callable, Tuple
66

77
from pyspark.ml import PipelineModel, Pipeline
88
from sparknlp.pretrained import PretrainedPipeline, LightPipeline
@@ -155,7 +155,7 @@ def nlu_ref_to_component(nlu_ref, detect_lang=False, authenticated=False) -> Uni
155155
lang, nlu_ref, nlp_ref, license_type, is_pipe, model_params = nlu_ref_to_nlp_metadata(nlu_ref)
156156

157157
if is_pipe:
158-
resolved_component = get_trained_component_list_for_nlp_pipe_ref(lang, nlp_ref, nlu_ref,
158+
resolved_component, _ = get_trained_component_list_for_nlp_pipe_ref(lang, nlp_ref, nlu_ref,
159159
license_type=license_type)
160160
else:
161161
resolved_component = get_trained_component_for_nlp_model_ref(lang, nlu_ref, nlp_ref, license_type, model_params)
@@ -179,7 +179,7 @@ def get_trainable_component_for_nlu_ref(nlu_ref) -> NluComponent:
179179

180180
def get_trained_component_list_for_nlp_pipe_ref(language, nlp_ref, nlu_ref, path=None,
181181
license_type: LicenseType = Licenses.open_source,
182-
) -> List[NluComponent]:
182+
) -> Tuple[List[NluComponent],str]:
183183
"""
184184
creates a list of components from a Spark NLP Pipeline reference
185185
1. download pipeline
@@ -190,7 +190,9 @@ def get_trained_component_list_for_nlp_pipe_ref(language, nlp_ref, nlu_ref, path
190190
:param language: language of the pipeline
191191
:param nlp_ref: Reference to a spark nlp pretrained pipeline
192192
:param path: Load component_list from HDD
193-
:return: Each element of the Spark NLP pipeline wrapped as a NLU component_to_resolve inside a list
193+
:return: Tuple,
194+
first element List of each element of the Spark NLP pipeline wrapped as a NLU component_to_resolve inside a list
195+
second element: the uid of the loaded pipeline model
194196
"""
195197
logger.info(f'Building pretrained pipe for nlu_ref={nlu_ref} nlp_ref={nlp_ref}')
196198
if 'language' in nlp_ref:
@@ -199,16 +201,20 @@ def get_trained_component_list_for_nlp_pipe_ref(language, nlp_ref, nlu_ref, path
199201
if path is None:
200202
if license_type != Licenses.open_source:
201203
pipe = PretrainedPipeline(nlp_ref, lang=language, remote_loc='clinical/models')
204+
uid = pipe.model.uid
205+
202206
else:
203207
pipe = PretrainedPipeline(nlp_ref, lang=language)
208+
uid = pipe.model.uid
204209
iterable_stages = pipe.light_model.pipeline_model.stages
205210
else:
206211
pipe = LightPipeline(PipelineModel.load(path=path))
212+
uid = pipe.pipeline_model.uid
207213
iterable_stages = pipe.pipeline_model.stages
208214
constructed_components = get_component_list_for_iterable_stages(iterable_stages, language, nlp_ref, nlu_ref,
209215
license_type)
210216
return ComponentUtils.set_storage_ref_attribute_of_embedding_converters(
211-
PipeUtils.set_column_values_on_components_from_pretrained_pipe(constructed_components))
217+
PipeUtils.set_column_values_on_components_from_pretrained_pipe(constructed_components)), uid
212218

213219

214220

nlu/pipe/extractors/extractor_methods/base_extractor_methods.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -206,7 +206,6 @@ def extract_base_sparknlp_features(row: pd.Series, configs: SparkNLPExtractorCon
206206

207207
return {**beginnings, **endings, **results, **annotator_types, **embeddings, **origins} # Merge dicts NLP output
208208

209-
210209
def extract_sparknlp_metadata(row: pd.Series, configs: SparkNLPExtractorConfig) -> dict:
211210
"""
212211
Extract base features common to all Spark NLP annotators

nlu/pipe/nlu_component.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,9 @@ def __init__(self,
7272
trained_mirror_anno: Optional[JslAnnoId] = None,
7373
applicable_file_types: List[str] = None, # Used for OCR annotators to deduct applicable file types
7474
is_trained: bool = True, # Set to true for trainable annotators
75+
requires_binary_format: bool = False, # Set to true for OCR annotators that require binary image format
76+
requires_image_format: bool = False, # Set to true for OCR annotators that require image format
77+
is_visual_annotator: bool = False, # Set to true for OCR annotators that require image format
7578
):
7679
self.name = name
7780
self.type = type
@@ -110,6 +113,9 @@ def __init__(self,
110113
self.trained_mirror_anno = trained_mirror_anno
111114
self.applicable_file_types = applicable_file_types
112115
self.is_trained = is_trained
116+
self.requires_binary_format = requires_binary_format
117+
self.requires_image_format = requires_image_format
118+
self.is_visual_annotator = is_visual_annotator
113119

114120
def set_metadata(self, jsl_anno_object: Union[AnnotatorApproach, AnnotatorModel],
115121
nlu_ref: str,

0 commit comments

Comments
 (0)