|
72 | 72 | "The following data sets need to be processed:\n",
|
73 | 73 | "- Daily weather data (daily_weather.parquet)\n",
|
74 | 74 | "- Cities (cities.csv)\n",
|
75 |
| - "- Countries (countries.csv)" |
| 75 | + "- Countries (countries.csv)\n", |
| 76 | + "\n", |
| 77 | + "The subsequent code cell acquires the dataset directly from kaggle.com.\n", |
| 78 | + "To properly configure the notebook to use the corresponding credentials\n", |
| 79 | + "after signing up on Kaggle, define the `KAGGLE_USERNAME` and\n", |
| 80 | + "`KAGGLE_KEY` environment variables. Alternatively, put them into the\n", |
| 81 | + "file `~/.kaggle/kaggle.json` in your home folder, like this:\n", |
| 82 | + "```json\n", |
| 83 | + "{\n", |
| 84 | + " \"username\": \"acme\",\n", |
| 85 | + " \"key\": \"2b1dac2af55caaf1f34df76236fada4a\"\n", |
| 86 | + "}\n", |
| 87 | + "```\n", |
| 88 | + "Another variant is to acquire the dataset files manually, and extract\n", |
| 89 | + "them into a folder called `DOWNLOAD`. In this case, you can deactivate\n", |
| 90 | + "those two lines of code to skip automatic dataset acquisition." |
76 | 91 | ]
|
77 | 92 | },
|
| 93 | + { |
| 94 | + "cell_type": "code", |
| 95 | + "execution_count": null, |
| 96 | + "outputs": [], |
| 97 | + "source": [ |
| 98 | + "from cratedb_toolkit.datasets import load_dataset\n", |
| 99 | + "\n", |
| 100 | + "dataset = load_dataset(\"kaggle://guillemservera/global-daily-climate-data/daily_weather.parquet\")\n", |
| 101 | + "dataset.acquire()" |
| 102 | + ], |
| 103 | + "metadata": { |
| 104 | + "collapsed": false |
| 105 | + } |
| 106 | + }, |
78 | 107 | {
|
79 | 108 | "cell_type": "code",
|
80 | 109 | "execution_count": 88,
|
81 |
| - "id": "fa24e753", |
82 |
| - "metadata": {}, |
83 | 110 | "outputs": [],
|
84 | 111 | "source": [
|
85 | 112 | "from dask import dataframe as dd\n",
|
|
88 | 115 | "# Show a progress bar for dask activities\n",
|
89 | 116 | "pbar = ProgressBar()\n",
|
90 | 117 | "pbar.register()"
|
91 |
| - ] |
| 118 | + ], |
| 119 | + "metadata": { |
| 120 | + "collapsed": false |
| 121 | + } |
92 | 122 | },
|
93 | 123 | {
|
94 | 124 | "cell_type": "code",
|
|
288 | 318 | ],
|
289 | 319 | "source": [
|
290 | 320 | "# Load the parquet file. Please adjust the file path as needed.\n",
|
291 |
| - "df_kaggle = dd.read_parquet('DOWNLOAD_PATH/daily_weather.parquet')\n", |
| 321 | + "df_kaggle = dd.read_parquet('DOWNLOAD/daily_weather.parquet')\n", |
292 | 322 | "\n",
|
293 | 323 | "# Show info about the data.\n",
|
294 | 324 | "df_kaggle.info(verbose=True, memory_usage=True)\n",
|
|
421 | 451 | ],
|
422 | 452 | "source": [
|
423 | 453 | "# Read cities, adapt the path to the files accordingly\n",
|
424 |
| - "cities = dd.read_csv(\"DOWNLOAD_PATH/cities.csv\",dtype={'station_id': 'object'})\n", |
| 454 | + "cities = dd.read_csv(\"DOWNLOAD/cities.csv\",dtype={'station_id': 'object'})\n", |
425 | 455 | "\n",
|
426 | 456 | "# Modify lon and lat of cities into an array that can be interpreted directly by CrateDB\n",
|
427 | 457 | "def create_location_column(df):\n",
|
|
442 | 472 | "outputs": [],
|
443 | 473 | "source": [
|
444 | 474 | "# Read countries, adapt the path to the files accordingly\n",
|
445 |
| - "countries = dd.read_csv(\"DOWNLOAD_PATH/countries.csv\")" |
| 475 | + "countries = dd.read_csv(\"DOWNLOAD/countries.csv\")" |
446 | 476 | ]
|
447 | 477 | },
|
448 | 478 | {
|
|
476 | 506 | "metadata": {},
|
477 | 507 | "outputs": [],
|
478 | 508 | "source": [
|
| 509 | + "import os\n", |
479 | 510 | "import sqlalchemy as sa\n",
|
480 | 511 | "from crate.client.sqlalchemy.support import insert_bulk\n",
|
481 | 512 | "\n",
|
482 |
| - "# Connect to CrateDB\n", |
483 |
| - "# For a database running in the cloud, please use a connection string like this:\n", |
484 |
| - "dburi = 'crate://USER:PASSWORD@HOST:4200?ssl=true'\n", |
| 513 | + "# Define database address when using CrateDB Cloud.\n", |
| 514 | + "# Please find these settings on your cluster overview page.\n", |
| 515 | + "CONNECTION_STRING = os.environ.get(\n", |
| 516 | + " \"CRATEDB_CONNECTION_STRING\",\n", |
| 517 | + " \"crate://<USER>:<PASSWORD>@<CRATEDB_HOST>/?ssl=true\",\n", |
| 518 | + ")\n", |
485 | 519 | "\n",
|
486 |
| - "# For a database running locally, please use the following connection string:\n", |
487 |
| - "# dburi = 'crate://localhost:4200?ssl=false'\n", |
| 520 | + "# Define database address when using CrateDB on localhost.\n", |
| 521 | + "#CONNECTION_STRING = os.environ.get(\n", |
| 522 | + "# \"CRATEDB_CONNECTION_STRING\",\n", |
| 523 | + "# \"crate://crate@localhost/\",\n", |
| 524 | + "#)\n", |
488 | 525 | "\n",
|
489 |
| - "engine = sa.create_engine(dburi, echo=False)\n", |
| 526 | + "# Connect to CrateDB using SQLAlchemy.\n", |
| 527 | + "engine = sa.create_engine(CONNECTION_STRING, echo=False)\n", |
490 | 528 | "connection = engine.connect()"
|
491 | 529 | ]
|
492 | 530 | },
|
|
520 | 558 | ],
|
521 | 559 | "source": [
|
522 | 560 | "connection.execute(sa.text(\"\"\"\n",
|
523 |
| - "CREATE TABLE IF NOT EXISTS \"doc\".\"weather_data\" (\n", |
| 561 | + "CREATE TABLE IF NOT EXISTS \"weather_data\" (\n", |
524 | 562 | " \"station_id\" TEXT,\n",
|
525 | 563 | " \"city_name\" TEXT,\n",
|
526 | 564 | " \"date\" TIMESTAMP WITHOUT TIME ZONE,\n",
|
|
567 | 605 | ],
|
568 | 606 | "source": [
|
569 | 607 | "connection.execute(sa.text(\"\"\"\n",
|
570 |
| - "CREATE TABLE \"doc\".\"cities\" (\n", |
| 608 | + "CREATE TABLE \"cities\" (\n", |
571 | 609 | " \"station_id\" TEXT,\n",
|
572 | 610 | " \"city_name\" TEXT,\n",
|
573 | 611 | " \"country\" TEXT,\n",
|
|
626 | 664 | "# Uncomment the following lines to process the actual weather data.\n",
|
627 | 665 | "# They have been disabled in order to avoid long-running operations.\n",
|
628 | 666 | "# df_kaggle = df_kaggle.repartition(26)\n",
|
629 |
| - "# df_kaggle.to_sql(name='weather_data', uri=dburi, schema='doc', if_exists='append', \n", |
| 667 | + "# df_kaggle.to_sql(name='weather_data', uri=CONNECTION_STRING, if_exists='append',\n", |
630 | 668 | "# index=False, chunksize=10000, parallel=True, method=insert_bulk)"
|
631 | 669 | ]
|
632 | 670 | },
|
|
659 | 697 | }
|
660 | 698 | ],
|
661 | 699 | "source": [
|
662 |
| - "countries.to_sql('countries', dburi, schema='doc', if_exists='append', \n", |
| 700 | + "countries.to_sql('countries', CONNECTION_STRING, if_exists='append',\n", |
663 | 701 | " index=False, chunksize=1000, parallel=True, method=insert_bulk)"
|
664 | 702 | ]
|
665 | 703 | },
|
|
692 | 730 | }
|
693 | 731 | ],
|
694 | 732 | "source": [
|
695 |
| - "cities.to_sql('cities', dburi, schema='doc', if_exists='append', \n", |
| 733 | + "cities.to_sql('cities', CONNECTION_STRING, if_exists='append',\n", |
696 | 734 | " index=False, chunksize=1000, parallel=True, method=insert_bulk)"
|
697 | 735 | ]
|
698 | 736 | }
|
|
0 commit comments