Skip to content

Commit

Permalink
Cleanup and more output
Browse files Browse the repository at this point in the history
  • Loading branch information
mattnowzari committed Feb 24, 2025
1 parent 356f845 commit c7debe3
Showing 1 changed file with 19 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,8 @@
" # populate the in-memory data structure\n",
" inflight_configuration_data[crawler_oid][\"domains_temp\"][\n",
" domain_oid\n",
" ] = temp_domain_conf"
" ] = temp_domain_conf\n",
" print()"
]
},
{
Expand All @@ -258,7 +259,7 @@
" _source=[\"configuration_oid\", \"domain_oid\", \"rules\", \"url_filters\"],\n",
")\n",
"\n",
"extr_count = 0\n",
"extr_count = 1\n",
"for exr_rule in extraction_rules[\"hits\"][\"hits\"]:\n",
" source = exr_rule[\"_source\"]\n",
"\n",
Expand Down Expand Up @@ -306,12 +307,14 @@
" \"rules\": ruleset,\n",
" }\n",
" ]\n",
"\n",
" print(\n",
" f\"{extr_count}.) Crawler {config_oid} has extraction rules {temp_extraction_rulesets}\\n\"\n",
" )\n",
" extr_count += 1\n",
" inflight_configuration_data[config_oid][\"domains_temp\"][domain_oid][\n",
" \"extraction_rulesets\"\n",
" ] = temp_extraction_rulesets\n",
"\n",
"print(f\"{extr_count} total extraction rules found!\")"
" ] = temp_extraction_rulesets"
]
},
{
Expand Down Expand Up @@ -348,7 +351,6 @@
"\n",
" # ? comes from Quartz Cron, regular cron doesn't handle it well\n",
" repackaged_definition = repackaged_definition.replace(\"?\", \"*\")\n",
" print(repackaged_definition)\n",
" return repackaged_definition\n",
"\n",
"\n",
Expand Down Expand Up @@ -402,15 +404,16 @@
"source": [
"# Final transform of the in-memory data structure to a form we can dump to YAML\n",
"# for each crawler, collect all of its domain configurations into a list\n",
"for crawler_config in inflight_configuration_data.values():\n",
"for crawler_oid, crawler_config in inflight_configuration_data.items():\n",
" all_crawler_domains = []\n",
"\n",
" for domain_config in crawler_config[\"domains_temp\"].values():\n",
" all_crawler_domains.append(domain_config)\n",
" # create a new key called \"domains\" that points to a list of domain configs only - no domain_oid values as keys\n",
" crawler_config[\"domains\"] = all_crawler_domains\n",
" # delete the temporary domain key\n",
" del crawler_config[\"domains_temp\"]"
" del crawler_config[\"domains_temp\"]\n",
" print(f\"Transform for {crawler_oid} complete!\")"
]
},
{
Expand Down Expand Up @@ -518,6 +521,14 @@
" \"--------------------------------------------------------------------------------\"\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7aaee4e8-c388-4b22-a8ad-a657550d92c7",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down

0 comments on commit c7debe3

Please sign in to comment.