Skip to content

Commit

Permalink
Bugfix in Crawler Migration Notebook extraction rule cell (#410)
Browse files Browse the repository at this point in the history
  • Loading branch information
mattnowzari authored Feb 26, 2025
1 parent d0605d2 commit 184e40b
Showing 1 changed file with 56 additions and 43 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -269,55 +269,68 @@
" config_oid = source[\"configuration_oid\"]\n",
" domain_oid = source[\"domain_oid\"]\n",
"\n",
" all_rules = source[\"rules\"]\n",
" all_url_filters = source[\"url_filters\"]\n",
"\n",
" # extract url filters\n",
" url_filters = []\n",
" if all_url_filters:\n",
" url_filters = [\n",
" {\n",
" \"type\": all_url_filters[0][\"filter\"],\n",
" \"pattern\": all_url_filters[0][\"pattern\"],\n",
" }\n",
" ]\n",
"\n",
" # extract rulesets\n",
" action_translation_map = {\n",
" \"fixed\": \"set\",\n",
" \"extracted\": \"extract\",\n",
" }\n",
" # ensure the config and domain oids actually exist in our in-memory data structure\n",
" if (\n",
" config_oid in inflight_configuration_data\n",
" and domain_oid in inflight_configuration_data[config_oid][\"domains_temp\"]\n",
" ):\n",
"\n",
" # initialize extraction rulesets as an empty array if it doesn't exist yet\n",
" if (\n",
" not \"extraction_rulesets\"\n",
" in inflight_configuration_data[config_oid][\"domains_temp\"][domain_oid]\n",
" ):\n",
" inflight_configuration_data[config_oid][\"domains_temp\"][domain_oid][\n",
" \"extraction_rulesets\"\n",
" ] = []\n",
"\n",
" all_rules = source[\"rules\"]\n",
" all_url_filters = source[\"url_filters\"]\n",
"\n",
" # extract url filters\n",
" url_filters = []\n",
" if all_url_filters:\n",
" url_filters = [\n",
" {\n",
" \"type\": all_url_filters[0][\"filter\"],\n",
" \"pattern\": all_url_filters[0][\"pattern\"],\n",
" }\n",
" ]\n",
"\n",
" ruleset = {}\n",
" if all_rules:\n",
" ruleset = [\n",
" {\n",
" \"action\": action_translation_map[\n",
" all_rules[0][\"content_from\"][\"value_type\"]\n",
" ],\n",
" \"field_name\": all_rules[0][\"field_name\"],\n",
" \"selector\": all_rules[0][\"selector\"],\n",
" \"join_as\": all_rules[0][\"multiple_objects_handling\"],\n",
" \"value\": all_rules[0][\"content_from\"][\"value\"],\n",
" \"source\": all_rules[0][\"source_type\"],\n",
" }\n",
" ]\n",
" # extract rulesets\n",
" action_translation_map = {\n",
" \"fixed\": \"set\",\n",
" \"extracted\": \"extract\",\n",
" }\n",
"\n",
" # populate the in-memory data structure\n",
" temp_extraction_rulesets = [\n",
" {\n",
" ruleset = []\n",
" if all_rules:\n",
" ruleset = [\n",
" {\n",
" \"action\": action_translation_map[\n",
" all_rules[0][\"content_from\"][\"value_type\"]\n",
" ],\n",
" \"field_name\": all_rules[0][\"field_name\"],\n",
" \"selector\": all_rules[0][\"selector\"],\n",
" \"join_as\": all_rules[0][\"multiple_objects_handling\"],\n",
" \"value\": all_rules[0][\"content_from\"][\"value\"],\n",
" \"source\": all_rules[0][\"source_type\"],\n",
" }\n",
" ]\n",
"\n",
" temp_extraction_rulesets = {\n",
" \"url_filters\": url_filters,\n",
" \"rules\": ruleset,\n",
" }\n",
" ]\n",
"\n",
" print(\n",
" f\"{extr_count}.) Crawler {config_oid} has extraction rules {temp_extraction_rulesets}\\n\"\n",
" )\n",
" extr_count += 1\n",
" inflight_configuration_data[config_oid][\"domains_temp\"][domain_oid][\n",
" \"extraction_rulesets\"\n",
" ] = temp_extraction_rulesets"
" print(\n",
" f\"{extr_count}.) Crawler {config_oid} has extraction rules {temp_extraction_rulesets}\\n\"\n",
" )\n",
" extr_count += 1\n",
"\n",
" inflight_configuration_data[config_oid][\"domains_temp\"][domain_oid][\n",
" \"extraction_rulesets\"\n",
" ].append(temp_extraction_rulesets)"
]
},
{
Expand Down

0 comments on commit 184e40b

Please sign in to comment.