
Commit af3cf3a (parent: 46a4fdd)

Time Series QA: Make Dask notebook self-contained and testable

Include data acquisition from Kaggle.

File tree: 2 files changed (+68, -19)

topic/timeseries/dask-weather-data-import.ipynb (+56, -18)
@@ -72,14 +72,41 @@
     "The following data sets need to be processed:\n",
     "- Daily weather data (daily_weather.parquet)\n",
     "- Cities (cities.csv)\n",
-    "- Countries (countries.csv)"
+    "- Countries (countries.csv)\n",
+    "\n",
+    "The subsequent code cell acquires the dataset directly from kaggle.com.\n",
+    "To properly configure the notebook to use corresponding credentials\n",
+    "after signing up on Kaggle, define the `KAGGLE_USERNAME` and\n",
+    "`KAGGLE_KEY` environment variables. Alternatively, put them into the\n",
+    "file `~/.kaggle/kaggle.json` in your home folder, like this:\n",
+    "```json\n",
+    "{\n",
+    "  \"username\": \"acme\",\n",
+    "  \"key\": \"2b1dac2af55caaf1f34df76236fada4a\"\n",
+    "}\n",
+    "```\n",
+    "Another variant is to acquire the dataset files manually, and extract\n",
+    "them into a folder called `DOWNLOAD`. In this case, you can deactivate\n",
+    "those two lines of code, in order to skip automatic dataset acquisition."
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [],
+   "source": [
+    "from cratedb_toolkit.datasets import load_dataset\n",
+    "\n",
+    "dataset = load_dataset(\"kaggle://guillemservera/global-daily-climate-data/daily_weather.parquet\")\n",
+    "dataset.acquire()"
+   ],
+   "metadata": {
+    "collapsed": false
+   }
+  },
   {
    "cell_type": "code",
    "execution_count": 88,
-   "id": "fa24e753",
-   "metadata": {},
    "outputs": [],
    "source": [
     "from dask import dataframe as dd\n",
@@ -88,7 +115,10 @@
     "# Show a progress bar for dask activities\n",
     "pbar = ProgressBar()\n",
     "pbar.register()"
-   ]
+   ],
+   "metadata": {
+    "collapsed": false
+   }
   },
   {
    "cell_type": "code",
@@ -288,7 +318,7 @@
    ],
    "source": [
     "# Load the parquet file. Please adjust the file path as needed.\n",
-    "df_kaggle = dd.read_parquet('DOWNLOAD_PATH/daily_weather.parquet')\n",
+    "df_kaggle = dd.read_parquet('DOWNLOAD/daily_weather.parquet')\n",
     "\n",
     "# Show info about the data.\n",
     "df_kaggle.info(verbose=True, memory_usage=True)\n",
@@ -421,7 +451,7 @@
    ],
    "source": [
     "# Read cities, adapt the path to the files accordingly\n",
-    "cities = dd.read_csv(\"DOWNLOAD_PATH/cities.csv\",dtype={'station_id': 'object'})\n",
+    "cities = dd.read_csv(\"DOWNLOAD/cities.csv\",dtype={'station_id': 'object'})\n",
     "\n",
     "# Modify lon and lat of cities into an array that can be interpreted directly by CrateDB\n",
     "def create_location_column(df):\n",
@@ -442,7 +472,7 @@
    "outputs": [],
    "source": [
     "# Read countries, adapt the path to the files accordingly\n",
-    "countries = dd.read_csv(\"DOWNLOAD_PATH/countries.csv\")"
+    "countries = dd.read_csv(\"DOWNLOAD/countries.csv\")"
    ]
   },
   {
@@ -476,17 +506,25 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "import os\n",
     "import sqlalchemy as sa\n",
     "from crate.client.sqlalchemy.support import insert_bulk\n",
     "\n",
-    "# Connect to CrateDB\n",
-    "# For a database running in the cloud, please use a connection string like this:\n",
-    "dburi = 'crate://USER:PASSWORD@HOST:4200?ssl=true'\n",
+    "# Define database address when using CrateDB Cloud.\n",
+    "# Please find these settings on your cluster overview page.\n",
+    "CONNECTION_STRING = os.environ.get(\n",
+    "    \"CRATEDB_CONNECTION_STRING\",\n",
+    "    \"crate://<USER>:<PASSWORD>@<CRATEDB_HOST>/?ssl=true\",\n",
+    ")\n",
     "\n",
-    "# For a database running locally, please use the following connection string:\n",
-    "# dburi = 'crate://localhost:4200?ssl=false'\n",
+    "# Define database address when using CrateDB on localhost.\n",
+    "#CONNECTION_STRING = os.environ.get(\n",
+    "#    \"CRATEDB_CONNECTION_STRING\",\n",
+    "#    \"crate://crate@localhost/\",\n",
+    "#)\n",
     "\n",
-    "engine = sa.create_engine(dburi, echo=False)\n",
+    "# Connect to CrateDB using SQLAlchemy.\n",
+    "engine = sa.create_engine(CONNECTION_STRING, echo=False)\n",
     "connection = engine.connect()"
    ]
   },
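Because the connection string is now sourced from the environment, a quick
connectivity check before creating tables can save a round of debugging. A
minimal sketch, assuming CONNECTION_STRING is defined as in the cell above:

    import sqlalchemy as sa

    # Smoke test: open a connection and run a trivial query.
    engine = sa.create_engine(CONNECTION_STRING, echo=False)
    with engine.connect() as conn:
        print(conn.execute(sa.text("SELECT 1")).scalar())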
@@ -520,7 +558,7 @@
    ],
    "source": [
     "connection.execute(sa.text(\"\"\"\n",
-    "CREATE TABLE IF NOT EXISTS \"doc\".\"weather_data\" (\n",
+    "CREATE TABLE IF NOT EXISTS \"weather_data\" (\n",
     "   \"station_id\" TEXT,\n",
     "   \"city_name\" TEXT,\n",
     "   \"date\" TIMESTAMP WITHOUT TIME ZONE,\n",
@@ -567,7 +605,7 @@
    ],
    "source": [
     "connection.execute(sa.text(\"\"\"\n",
-    "CREATE TABLE \"doc\".\"cities\" (\n",
+    "CREATE TABLE \"cities\" (\n",
     "   \"station_id\" TEXT,\n",
     "   \"city_name\" TEXT,\n",
     "   \"country\" TEXT,\n",
@@ -626,7 +664,7 @@
     "# Uncomment the following lines to process the actual weather data.\n",
     "# They have been disabled in order to avoid long-running operations.\n",
     "# df_kaggle = df_kaggle.repartition(26)\n",
-    "# df_kaggle.to_sql(name='weather_data', uri=dburi, schema='doc', if_exists='append', \n",
+    "# df_kaggle.to_sql(name='weather_data', uri=CONNECTION_STRING, if_exists='append',\n",
     "#    index=False, chunksize=10000, parallel=True, method=insert_bulk)"
    ]
   },
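The commented repartition step bounds ingest parallelism: fewer, larger
partitions mean fewer concurrent insert tasks. A sketch of the idea, using
the partition count from the cell above:

    # Consolidate into 26 partitions before inserting, so that at most
    # 26 bulk-insert tasks run against the database concurrently.
    df_kaggle = df_kaggle.repartition(npartitions=26)
    print(df_kaggle.npartitions)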
@@ -659,7 +697,7 @@
    }
   ],
   "source": [
-    "countries.to_sql('countries', dburi, schema='doc', if_exists='append', \n",
+    "countries.to_sql('countries', CONNECTION_STRING, if_exists='append',\n",
     "                  index=False, chunksize=1000, parallel=True, method=insert_bulk)"
    ]
   },
@@ -692,7 +730,7 @@
    }
   ],
   "source": [
-    "cities.to_sql('cities', dburi, schema='doc', if_exists='append', \n",
+    "cities.to_sql('cities', CONNECTION_STRING, if_exists='append',\n",
     "               index=False, chunksize=1000, parallel=True, method=insert_bulk)"
    ]
  }
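The countries and cities cells share one loading pattern; a generalized
sketch, where frame stands for any Dask DataFrame, and the table name and
chunk size are illustrative:

    from crate.client.sqlalchemy.support import insert_bulk

    frame.to_sql(
        "table_name",           # target table, created beforehand
        CONNECTION_STRING,      # SQLAlchemy database address
        if_exists="append",     # keep the pre-created table schema
        index=False,            # do not store the Dask index
        chunksize=1000,         # rows per bulk operation
        parallel=True,          # insert partitions concurrently
        method=insert_bulk,     # use CrateDB's bulk insert support
    )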

topic/timeseries/test.py (+12, -1)
@@ -1,3 +1,6 @@
+import os
+from pathlib import Path
+
 import pytest
 from testbook import testbook
 
@@ -7,6 +10,14 @@ def test_notebook(notebook):
     Execute Jupyter Notebook, one test case per .ipynb file.
     """
     if notebook.name == "dask-weather-data-import.ipynb":
-        raise pytest.skip("Depends on DOWNLOAD_PATH/daily_weather.parquet")
+
+        # Skip Kaggle tests when no authentication information is available.
+        kaggle_auth_exists = Path("~/.kaggle/kaggle.json").expanduser().exists() or (
+            "KAGGLE_USERNAME" in os.environ and "KAGGLE_KEY" in os.environ
+        )
+        if not kaggle_auth_exists:
+            raise pytest.skip("Kaggle dataset cannot be tested "
+                              f"without authentication: {notebook.name}")
+
     with testbook(notebook) as tb:
         tb.execute()
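With credentials configured either way, the guard no longer trips and the
notebook is executed end to end, for example by running
"pytest topic/timeseries/test.py".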
