From 184e40bfa1ac838f9751be5f226554f9473466c7 Mon Sep 17 00:00:00 2001 From: Matt Nowzari Date: Wed, 26 Feb 2025 13:32:07 -0500 Subject: [PATCH] Bugfix in Crawler Migration Notebook extraction rule cell (#410) --- ...ic-crawler-to-open-crawler-migration.ipynb | 99 +++++++++++-------- 1 file changed, 56 insertions(+), 43 deletions(-) diff --git a/notebooks/enterprise-search/elastic-crawler-to-open-crawler-migration.ipynb b/notebooks/enterprise-search/elastic-crawler-to-open-crawler-migration.ipynb index 0c1d48ed..9c47bc86 100644 --- a/notebooks/enterprise-search/elastic-crawler-to-open-crawler-migration.ipynb +++ b/notebooks/enterprise-search/elastic-crawler-to-open-crawler-migration.ipynb @@ -269,55 +269,68 @@ " config_oid = source[\"configuration_oid\"]\n", " domain_oid = source[\"domain_oid\"]\n", "\n", - " all_rules = source[\"rules\"]\n", - " all_url_filters = source[\"url_filters\"]\n", - "\n", - " # extract url filters\n", - " url_filters = []\n", - " if all_url_filters:\n", - " url_filters = [\n", - " {\n", - " \"type\": all_url_filters[0][\"filter\"],\n", - " \"pattern\": all_url_filters[0][\"pattern\"],\n", - " }\n", - " ]\n", - "\n", - " # extract rulesets\n", - " action_translation_map = {\n", - " \"fixed\": \"set\",\n", - " \"extracted\": \"extract\",\n", - " }\n", + " # ensure the config and domain oids actually exist in our in-memory data structure\n", + " if (\n", + " config_oid in inflight_configuration_data\n", + " and domain_oid in inflight_configuration_data[config_oid][\"domains_temp\"]\n", + " ):\n", + "\n", + " # initialize extraction rulesets as an empty array if it doesn't exist yet\n", + " if (\n", + " not \"extraction_rulesets\"\n", + " in inflight_configuration_data[config_oid][\"domains_temp\"][domain_oid]\n", + " ):\n", + " inflight_configuration_data[config_oid][\"domains_temp\"][domain_oid][\n", + " \"extraction_rulesets\"\n", + " ] = []\n", + "\n", + " all_rules = source[\"rules\"]\n", + " all_url_filters = source[\"url_filters\"]\n", +
"\n", + " # extract url filters\n", + " url_filters = []\n", + " if all_url_filters:\n", + " url_filters = [\n", + " {\n", + " \"type\": all_url_filters[0][\"filter\"],\n", + " \"pattern\": all_url_filters[0][\"pattern\"],\n", + " }\n", + " ]\n", "\n", - " ruleset = {}\n", - " if all_rules:\n", - " ruleset = [\n", - " {\n", - " \"action\": action_translation_map[\n", - " all_rules[0][\"content_from\"][\"value_type\"]\n", - " ],\n", - " \"field_name\": all_rules[0][\"field_name\"],\n", - " \"selector\": all_rules[0][\"selector\"],\n", - " \"join_as\": all_rules[0][\"multiple_objects_handling\"],\n", - " \"value\": all_rules[0][\"content_from\"][\"value\"],\n", - " \"source\": all_rules[0][\"source_type\"],\n", - " }\n", - " ]\n", + " # extract rulesets\n", + " action_translation_map = {\n", + " \"fixed\": \"set\",\n", + " \"extracted\": \"extract\",\n", + " }\n", "\n", - " # populate the in-memory data structure\n", - " temp_extraction_rulesets = [\n", - " {\n", + " ruleset = []\n", + " if all_rules:\n", + " ruleset = [\n", + " {\n", + " \"action\": action_translation_map[\n", + " all_rules[0][\"content_from\"][\"value_type\"]\n", + " ],\n", + " \"field_name\": all_rules[0][\"field_name\"],\n", + " \"selector\": all_rules[0][\"selector\"],\n", + " \"join_as\": all_rules[0][\"multiple_objects_handling\"],\n", + " \"value\": all_rules[0][\"content_from\"][\"value\"],\n", + " \"source\": all_rules[0][\"source_type\"],\n", + " }\n", + " ]\n", + "\n", + " temp_extraction_rulesets = {\n", " \"url_filters\": url_filters,\n", " \"rules\": ruleset,\n", " }\n", - " ]\n", "\n", - " print(\n", - " f\"{extr_count}.) Crawler {config_oid} has extraction rules {temp_extraction_rulesets}\\n\"\n", - " )\n", - " extr_count += 1\n", - " inflight_configuration_data[config_oid][\"domains_temp\"][domain_oid][\n", - " \"extraction_rulesets\"\n", - " ] = temp_extraction_rulesets" + " print(\n", + " f\"{extr_count}.) 
Crawler {config_oid} has extraction rules {temp_extraction_rulesets}\\n\"\n", + " )\n", + " extr_count += 1\n", + "\n", + " inflight_configuration_data[config_oid][\"domains_temp\"][domain_oid][\n", + " \"extraction_rulesets\"\n", + " ].append(temp_extraction_rulesets)" ] }, {