From ccedd540d5403f6207865f6e003e0ab807ffd2ce Mon Sep 17 00:00:00 2001 From: mattnowzari Date: Tue, 25 Feb 2025 14:40:03 -0500 Subject: [PATCH 1/4] Minor tweak to the extraction rule cell to handle an edge case --- ...astic-crawler-to-open-crawler-migration.ipynb | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/notebooks/enterprise-search/elastic-crawler-to-open-crawler-migration.ipynb b/notebooks/enterprise-search/elastic-crawler-to-open-crawler-migration.ipynb index 0c1d48ed..448ea18d 100644 --- a/notebooks/enterprise-search/elastic-crawler-to-open-crawler-migration.ipynb +++ b/notebooks/enterprise-search/elastic-crawler-to-open-crawler-migration.ipynb @@ -311,13 +311,15 @@ " }\n", " ]\n", "\n", - " print(\n", - " f\"{extr_count}.) Crawler {config_oid} has extraction rules {temp_extraction_rulesets}\\n\"\n", - " )\n", - " extr_count += 1\n", - " inflight_configuration_data[config_oid][\"domains_temp\"][domain_oid][\n", - " \"extraction_rulesets\"\n", - " ] = temp_extraction_rulesets" + " if domain_oid in inflight_configuration_data[config_oid][\"domains_temp\"]:\n", + " print(\n", + " f\"{extr_count}.) 
Crawler {config_oid} has extraction rules {temp_extraction_rulesets}\\n\"\n", + " )\n", + " extr_count += 1\n", + "\n", + " inflight_configuration_data[config_oid][\"domains_temp\"][domain_oid][\n", + " \"extraction_rulesets\"\n", + " ] = temp_extraction_rulesets" ] }, { From a3bd8feae230137671c652b8300a75fa637ab2d2 Mon Sep 17 00:00:00 2001 From: mattnowzari Date: Wed, 26 Feb 2025 08:39:09 -0500 Subject: [PATCH 2/4] Moved domain_oid check to prevent an unnecessary extraction rule from being processed --- ...ic-crawler-to-open-crawler-migration.ipynb | 73 ++++++++++--------- 1 file changed, 37 insertions(+), 36 deletions(-) diff --git a/notebooks/enterprise-search/elastic-crawler-to-open-crawler-migration.ipynb b/notebooks/enterprise-search/elastic-crawler-to-open-crawler-migration.ipynb index 448ea18d..fed6a925 100644 --- a/notebooks/enterprise-search/elastic-crawler-to-open-crawler-migration.ipynb +++ b/notebooks/enterprise-search/elastic-crawler-to-open-crawler-migration.ipynb @@ -269,49 +269,50 @@ " config_oid = source[\"configuration_oid\"]\n", " domain_oid = source[\"domain_oid\"]\n", "\n", - " all_rules = source[\"rules\"]\n", - " all_url_filters = source[\"url_filters\"]\n", + " # ensure the domain oid actually exists in our in-memory data structure\n", + " if domain_oid in inflight_configuration_data[config_oid][\"domains_temp\"]:\n", + " all_rules = source[\"rules\"]\n", + " all_url_filters = source[\"url_filters\"]\n", "\n", - " # extract url filters\n", - " url_filters = []\n", - " if all_url_filters:\n", - " url_filters = [\n", - " {\n", - " \"type\": all_url_filters[0][\"filter\"],\n", - " \"pattern\": all_url_filters[0][\"pattern\"],\n", - " }\n", - " ]\n", + " # extract url filters\n", + " url_filters = []\n", + " if all_url_filters:\n", + " url_filters = [\n", + " {\n", + " \"type\": all_url_filters[0][\"filter\"],\n", + " \"pattern\": all_url_filters[0][\"pattern\"],\n", + " }\n", + " ]\n", "\n", - " # extract rulesets\n", - " 
action_translation_map = {\n", - " \"fixed\": \"set\",\n", - " \"extracted\": \"extract\",\n", - " }\n", + " # extract rulesets\n", + " action_translation_map = {\n", + " \"fixed\": \"set\",\n", + " \"extracted\": \"extract\",\n", + " }\n", "\n", - " ruleset = {}\n", - " if all_rules:\n", - " ruleset = [\n", + " ruleset = {}\n", + " if all_rules:\n", + " ruleset = [\n", + " {\n", + " \"action\": action_translation_map[\n", + " all_rules[0][\"content_from\"][\"value_type\"]\n", + " ],\n", + " \"field_name\": all_rules[0][\"field_name\"],\n", + " \"selector\": all_rules[0][\"selector\"],\n", + " \"join_as\": all_rules[0][\"multiple_objects_handling\"],\n", + " \"value\": all_rules[0][\"content_from\"][\"value\"],\n", + " \"source\": all_rules[0][\"source_type\"],\n", + " }\n", + " ]\n", + "\n", + " # populate the in-memory data structure\n", + " temp_extraction_rulesets = [\n", " {\n", - " \"action\": action_translation_map[\n", - " all_rules[0][\"content_from\"][\"value_type\"]\n", - " ],\n", - " \"field_name\": all_rules[0][\"field_name\"],\n", - " \"selector\": all_rules[0][\"selector\"],\n", - " \"join_as\": all_rules[0][\"multiple_objects_handling\"],\n", - " \"value\": all_rules[0][\"content_from\"][\"value\"],\n", - " \"source\": all_rules[0][\"source_type\"],\n", + " \"url_filters\": url_filters,\n", + " \"rules\": ruleset,\n", " }\n", " ]\n", "\n", - " # populate the in-memory data structure\n", - " temp_extraction_rulesets = [\n", - " {\n", - " \"url_filters\": url_filters,\n", - " \"rules\": ruleset,\n", - " }\n", - " ]\n", - "\n", - " if domain_oid in inflight_configuration_data[config_oid][\"domains_temp\"]:\n", " print(\n", " f\"{extr_count}.) 
Crawler {config_oid} has extraction rules {temp_extraction_rulesets}\\n\"\n", " )\n", From 5ca678dd592e4a934f00fd39460d4caccde3f0f6 Mon Sep 17 00:00:00 2001 From: mattnowzari Date: Wed, 26 Feb 2025 09:17:15 -0500 Subject: [PATCH 3/4] Revised the if statement to check config_oids too --- .../elastic-crawler-to-open-crawler-migration.ipynb | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/notebooks/enterprise-search/elastic-crawler-to-open-crawler-migration.ipynb b/notebooks/enterprise-search/elastic-crawler-to-open-crawler-migration.ipynb index fed6a925..f19ddf99 100644 --- a/notebooks/enterprise-search/elastic-crawler-to-open-crawler-migration.ipynb +++ b/notebooks/enterprise-search/elastic-crawler-to-open-crawler-migration.ipynb @@ -269,8 +269,12 @@ " config_oid = source[\"configuration_oid\"]\n", " domain_oid = source[\"domain_oid\"]\n", "\n", - " # ensure the domain oid actually exists in our in-memory data structure\n", - " if domain_oid in inflight_configuration_data[config_oid][\"domains_temp\"]:\n", + " # ensure the config and domain oids actually exist in our in-memory data structure\n", + " if (\n", + " config_oid in inflight_configuration_data\n", + " and domain_oid in inflight_configuration_data[config_oid][\"domains_temp\"]\n", + " ):\n", + "\n", " all_rules = source[\"rules\"]\n", " all_url_filters = source[\"url_filters\"]\n", "\n", From 8cf33a755e788e07220cce2ab70198ae43a110a7 Mon Sep 17 00:00:00 2001 From: mattnowzari Date: Wed, 26 Feb 2025 09:47:54 -0500 Subject: [PATCH 4/4] Fixed related bug where a crawler with multiple extraction rules was only receiving the last-known extraction rule and not all extraction rules --- ...ic-crawler-to-open-crawler-migration.ipynb | 24 ++++++++++++------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/notebooks/enterprise-search/elastic-crawler-to-open-crawler-migration.ipynb b/notebooks/enterprise-search/elastic-crawler-to-open-crawler-migration.ipynb index 
f19ddf99..9c47bc86 100644 --- a/notebooks/enterprise-search/elastic-crawler-to-open-crawler-migration.ipynb +++ b/notebooks/enterprise-search/elastic-crawler-to-open-crawler-migration.ipynb @@ -275,6 +275,15 @@ " and domain_oid in inflight_configuration_data[config_oid][\"domains_temp\"]\n", " ):\n", "\n", + " # initialize extraction rulesets as an empty array if it doesn't exist yet\n", + " if (\n", + " not \"extraction_rulesets\"\n", + " in inflight_configuration_data[config_oid][\"domains_temp\"][domain_oid]\n", + " ):\n", + " inflight_configuration_data[config_oid][\"domains_temp\"][domain_oid][\n", + " \"extraction_rulesets\"\n", + " ] = []\n", + "\n", " all_rules = source[\"rules\"]\n", " all_url_filters = source[\"url_filters\"]\n", "\n", @@ -294,7 +303,7 @@ " \"extracted\": \"extract\",\n", " }\n", "\n", - " ruleset = {}\n", + " ruleset = []\n", " if all_rules:\n", " ruleset = [\n", " {\n", @@ -309,13 +318,10 @@ " }\n", " ]\n", "\n", - " # populate the in-memory data structure\n", - " temp_extraction_rulesets = [\n", - " {\n", - " \"url_filters\": url_filters,\n", - " \"rules\": ruleset,\n", - " }\n", - " ]\n", + " temp_extraction_rulesets = {\n", + " \"url_filters\": url_filters,\n", + " \"rules\": ruleset,\n", + " }\n", "\n", " print(\n", " f\"{extr_count}.) Crawler {config_oid} has extraction rules {temp_extraction_rulesets}\\n\"\n", @@ -324,7 +330,7 @@ "\n", " inflight_configuration_data[config_oid][\"domains_temp\"][domain_oid][\n", " \"extraction_rulesets\"\n", - " ] = temp_extraction_rulesets" + " ].append(temp_extraction_rulesets)" ] }, {