Skip to content

Commit

Permalink
Fixed related bug where a crawler with multiple extraction rules was …
Browse files Browse the repository at this point in the history
…only receiving the last-known extraction rule and not all extraction rules
  • Loading branch information
mattnowzari committed Feb 26, 2025
1 parent 5ca678d commit 8cf33a7
Showing 1 changed file with 15 additions and 9 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,15 @@
" and domain_oid in inflight_configuration_data[config_oid][\"domains_temp\"]\n",
" ):\n",
"\n",
" # initialize extraction rulesets an empty array if it doesn't exist yet\n",
" if (\n",
" not \"extraction_rulesets\"\n",
" in inflight_configuration_data[config_oid][\"domains_temp\"][domain_oid]\n",
" ):\n",
" inflight_configuration_data[config_oid][\"domains_temp\"][domain_oid][\n",
" \"extraction_rulesets\"\n",
" ] = []\n",
"\n",
" all_rules = source[\"rules\"]\n",
" all_url_filters = source[\"url_filters\"]\n",
"\n",
Expand All @@ -294,7 +303,7 @@
" \"extracted\": \"extract\",\n",
" }\n",
"\n",
" ruleset = {}\n",
" ruleset = []\n",
" if all_rules:\n",
" ruleset = [\n",
" {\n",
Expand All @@ -309,13 +318,10 @@
" }\n",
" ]\n",
"\n",
" # populate the in-memory data structure\n",
" temp_extraction_rulesets = [\n",
" {\n",
" \"url_filters\": url_filters,\n",
" \"rules\": ruleset,\n",
" }\n",
" ]\n",
" temp_extraction_rulesets = {\n",
" \"url_filters\": url_filters,\n",
" \"rules\": ruleset,\n",
" }\n",
"\n",
" print(\n",
" f\"{extr_count}.) Crawler {config_oid} has extraction rules {temp_extraction_rulesets}\\n\"\n",
Expand All @@ -324,7 +330,7 @@
"\n",
" inflight_configuration_data[config_oid][\"domains_temp\"][domain_oid][\n",
" \"extraction_rulesets\"\n",
" ] = temp_extraction_rulesets"
" ].append(temp_extraction_rulesets)"
]
},
{
Expand Down

0 comments on commit 8cf33a7

Please sign in to comment.