diff --git a/notebooks/enterprise-search/elastic-crawler-to-open-crawler-migration.ipynb b/notebooks/enterprise-search/elastic-crawler-to-open-crawler-migration.ipynb
index 448ea18d..fed6a925 100644
--- a/notebooks/enterprise-search/elastic-crawler-to-open-crawler-migration.ipynb
+++ b/notebooks/enterprise-search/elastic-crawler-to-open-crawler-migration.ipynb
@@ -269,49 +269,50 @@
     "    config_oid = source[\"configuration_oid\"]\n",
     "    domain_oid = source[\"domain_oid\"]\n",
     "\n",
-    "    all_rules = source[\"rules\"]\n",
-    "    all_url_filters = source[\"url_filters\"]\n",
+    "    # ensure the domain oid actually exists in our in-memory data structure\n",
+    "    if domain_oid in inflight_configuration_data[config_oid][\"domains_temp\"]:\n",
+    "        all_rules = source[\"rules\"]\n",
+    "        all_url_filters = source[\"url_filters\"]\n",
     "\n",
-    "    # extract url filters\n",
-    "    url_filters = []\n",
-    "    if all_url_filters:\n",
-    "        url_filters = [\n",
-    "            {\n",
-    "                \"type\": all_url_filters[0][\"filter\"],\n",
-    "                \"pattern\": all_url_filters[0][\"pattern\"],\n",
-    "            }\n",
-    "        ]\n",
+    "        # extract url filters\n",
+    "        url_filters = []\n",
+    "        if all_url_filters:\n",
+    "            url_filters = [\n",
+    "                {\n",
+    "                    \"type\": all_url_filters[0][\"filter\"],\n",
+    "                    \"pattern\": all_url_filters[0][\"pattern\"],\n",
+    "                }\n",
+    "            ]\n",
     "\n",
-    "    # extract rulesets\n",
-    "    action_translation_map = {\n",
-    "        \"fixed\": \"set\",\n",
-    "        \"extracted\": \"extract\",\n",
-    "    }\n",
+    "        # extract rulesets\n",
+    "        action_translation_map = {\n",
+    "            \"fixed\": \"set\",\n",
+    "            \"extracted\": \"extract\",\n",
+    "        }\n",
     "\n",
-    "    ruleset = {}\n",
-    "    if all_rules:\n",
-    "        ruleset = [\n",
+    "        ruleset = {}\n",
+    "        if all_rules:\n",
+    "            ruleset = [\n",
+    "                {\n",
+    "                    \"action\": action_translation_map[\n",
+    "                        all_rules[0][\"content_from\"][\"value_type\"]\n",
+    "                    ],\n",
+    "                    \"field_name\": all_rules[0][\"field_name\"],\n",
+    "                    \"selector\": all_rules[0][\"selector\"],\n",
+    "                    \"join_as\": all_rules[0][\"multiple_objects_handling\"],\n",
+    "                    \"value\": all_rules[0][\"content_from\"][\"value\"],\n",
+    "                    \"source\": all_rules[0][\"source_type\"],\n",
+    "                }\n",
+    "            ]\n",
+    "\n",
+    "        # populate the in-memory data structure\n",
+    "        temp_extraction_rulesets = [\n",
     "            {\n",
-    "                \"action\": action_translation_map[\n",
-    "                    all_rules[0][\"content_from\"][\"value_type\"]\n",
-    "                ],\n",
-    "                \"field_name\": all_rules[0][\"field_name\"],\n",
-    "                \"selector\": all_rules[0][\"selector\"],\n",
-    "                \"join_as\": all_rules[0][\"multiple_objects_handling\"],\n",
-    "                \"value\": all_rules[0][\"content_from\"][\"value\"],\n",
-    "                \"source\": all_rules[0][\"source_type\"],\n",
+    "                \"url_filters\": url_filters,\n",
+    "                \"rules\": ruleset,\n",
     "            }\n",
     "        ]\n",
     "\n",
-    "    # populate the in-memory data structure\n",
-    "    temp_extraction_rulesets = [\n",
-    "        {\n",
-    "            \"url_filters\": url_filters,\n",
-    "            \"rules\": ruleset,\n",
-    "        }\n",
-    "    ]\n",
-    "\n",
-    "    if domain_oid in inflight_configuration_data[config_oid][\"domains_temp\"]:\n",
     "        print(\n",
     "            f\"{extr_count}.) Crawler {config_oid} has extraction rules {temp_extraction_rulesets}\\n\"\n",
     "        )\n",
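
The substance of the change: before the patch, the membership check on `inflight_configuration_data[config_oid]["domains_temp"]` guarded only the final `print`, so the URL filters and extraction rules were translated even for documents whose `domain_oid` was never registered in the in-memory structure. The patch moves the guard to the top of the block, so the entire translation is skipped for unknown domains. A minimal runnable sketch of that effect is below; the OIDs and the shape of `inflight_configuration_data` are hypothetical stand-ins, not data from a real migration, and the loop condenses the notebook's translation logic down to a print.

```python
# Illustrative sketch of the guard's effect. The OIDs and the shape of
# inflight_configuration_data are hypothetical, not from a real migration.
inflight_configuration_data = {
    "cfg-1": {"domains_temp": {"dom-1": {"url": "https://example.com"}}},
}

# Two extraction-rule docs as they might come out of the hits loop: one
# for a migrated domain, one for a domain that was never registered.
extraction_rule_docs = [
    {"configuration_oid": "cfg-1", "domain_oid": "dom-1", "rules": [], "url_filters": []},
    {"configuration_oid": "cfg-1", "domain_oid": "dom-2", "rules": [], "url_filters": []},
]

for extr_count, source in enumerate(extraction_rule_docs, start=1):
    config_oid = source["configuration_oid"]
    domain_oid = source["domain_oid"]

    # Post-patch behavior: skip the whole translation when the domain is
    # absent, instead of translating first and only gating the log line.
    if domain_oid in inflight_configuration_data[config_oid]["domains_temp"]:
        print(f"{extr_count}.) translating extraction rules for {domain_oid}")
    else:
        print(f"{extr_count}.) skipping {domain_oid}: not in domains_temp")
```

Running this prints `1.) translating extraction rules for dom-1` and `2.) skipping dom-2: not in domains_temp`. Guarding early also avoids building `temp_extraction_rulesets` for domains the generated Open Crawler configuration will never reference, which appears to be the intent of the patch.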