Skip to content
This repository was archived by the owner on Mar 5, 2025. It is now read-only.

Adding to_datetime code #21

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
340 changes: 340 additions & 0 deletions local/to_datetime_code.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,340 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "ec38f3d5-6fb5-4975-818a-1d5a7243a136",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from io import StringIO\n",
"\n",
"from dask.dataframe import from_pandas\n",
"from pandas import read_csv\n",
"\n",
"data = StringIO(\n",
" \"\"\"timestamp_start,time_worked\n",
" 2021-01-01 9:25 AM,3 hours 12 minutes\n",
" 2021-02-03 4:25 PM,2 hours\n",
" 2021-03-05 1:25 PM,15 minutes\n",
" 2021-03-05 11:25 PM,55 minutes\n",
" \"\"\"\n",
")\n",
"df = read_csv(data)\n",
"ddf = from_pandas(df, npartitions=2)\n",
"\n",
"print(ddf.dtypes)\n",
"# timestamp_start object\n",
"# time_worked object\n",
"# dtype: object"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "57882342-cc3b-47c8-92c4-b1dcc960772e",
"metadata": {},
"outputs": [],
"source": [
"from dask.dataframe import to_datetime\n",
"\n",
"ddf[\"converted_timestamp_start\"] = to_datetime(ddf[\"timestamp_start\"])\n",
"\n",
"print(ddf.dtypes)\n",
"# timestamp_start object\n",
"# time_worked object\n",
"# converted_timestamp_start datetime64[ns]\n",
"# dtype: object"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "40457ca3-d467-46a0-b617-68b0d9587e29",
"metadata": {},
"outputs": [],
"source": [
"ddf[\"day_of_week\"] = ddf[\"converted_timestamp_start\"].dt.dayofweek\n",
"\n",
"print(ddf[[\"converted_timestamp_start\", \"day_of_week\"]].compute())\n",
"# converted_timestamp_start day_of_week\n",
"# 0 2021-01-01 09:25:00 4\n",
"# 1 2021-02-03 16:25:00 2\n",
"# 2 2021-03-05 13:25:00 4\n",
"# 3 2021-03-05 23:25:00 4"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "80dd5017-32cb-4a45-90f4-3d520f84f0cd",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from pandas import Timedelta, to_timedelta\n",
"\n",
"ddf[\"converted_time_worked\"] = (\n",
" ddf[\"time_worked\"].apply(lambda x: to_timedelta(x), meta=Timedelta).compute()\n",
")\n",
"\n",
"print(ddf[[\"converted_timestamp_start\", \"converted_time_worked\"]].compute())\n",
"# converted_timestamp_start converted_time_worked\n",
"# 0 2021-01-01 09:25:00 0 days 03:12:00\n",
"# 1 2021-02-03 16:25:00 0 days 02:00:00\n",
"# 2 2021-03-05 13:25:00 0 days 00:15:00\n",
"# 3 2021-03-05 23:25:00 0 days 00:55:00"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "61d13f66-c822-4607-a603-d33b12ae4c73",
"metadata": {},
"outputs": [],
"source": [
"ddf[\"work_completed\"] = ddf[\"converted_timestamp_start\"] + ddf[\"converted_time_worked\"]\n",
"\n",
"print(\n",
" ddf[\n",
" [\"converted_timestamp_start\", \"converted_time_worked\", \"work_completed\"]\n",
" ].compute()\n",
")\n",
"# converted_timestamp_start converted_time_worked work_completed\n",
"# 0 2021-01-01 09:25:00 0 days 03:12:00 2021-01-01 12:37:00\n",
"# 1 2021-02-03 16:25:00 0 days 02:00:00 2021-02-03 18:25:00\n",
"# 2 2021-03-05 13:25:00 0 days 00:15:00 2021-03-05 13:40:00\n",
"# 3 2021-03-05 23:25:00 0 days 00:55:00 2021-03-06 00:20:00"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7c1d19aa-f31e-45d9-8a59-84fe94fe5ecb",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"ddf[\"converted_timestamp_start\"].dt.floor(\"15 min\").compute()\n",
"# 0 2021-01-01 09:15:00\n",
"# 1 2021-02-03 16:15:00\n",
"# 2 2021-03-05 13:15:00\n",
"# 3 2021-03-05 23:15:00\n",
"# Name: converted_timestamp_start, dtype: datetime64[ns]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7532fbcf-132e-4135-98e2-3449fca86a5d",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from io import StringIO\n",
"\n",
"from pandas import read_csv\n",
"\n",
"data = StringIO(\n",
" \"\"\"timestamp_start,time_worked\n",
" 2021-01-01 9:25 AM,3 hours 12 minutes\n",
" 2021-02-03 4:25 PM,2 hours\n",
" missing ,15 minutes\n",
" 2021-03-05 11:?? PM,55 minutes\n",
" \"\"\"\n",
")\n",
"\n",
"df = read_csv(data)\n",
"ddf = from_pandas(df, npartitions=2)\n",
"\n",
"print(ddf.dtypes)\n",
"# timestamp_start object\n",
"# time_worked object\n",
"# dtype: object"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "540f36d1-b64c-4ba7-9051-ee5af9df3ea8",
"metadata": {},
"outputs": [],
"source": [
"print(to_datetime(ddf[\"timestamp_start\"], errors=\"coerce\").compute())\n",
"# 0 2021-01-01 09:25:00\n",
"# 1 2021-02-03 14:25:00\n",
"# 2 NaT\n",
"# 3 NaT\n",
"# dtype: datetime64[ns]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5c55679c-0c4f-4c5e-8456-c6778ade481d",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"print(to_datetime(ddf[\"timestamp_start\"], errors=\"ignore\").compute())\n",
"# 0 2021-01-01 09:25:00\n",
"# 1 2021-02-03 16:25:00\n",
"# 2 missing\n",
"# 3 2021-03-05 11:?? PM\n",
"# dtype: object"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0016b448-9635-4e36-a407-f9e1ed39cbea",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from io import StringIO\n",
"\n",
"from dask.dataframe import from_pandas, to_datetime\n",
"from pandas import read_csv\n",
"\n",
"data = StringIO(\n",
" \"\"\"timestamp_start,time_worked\n",
"2021-01-01 9:25 AM,3 hours 12 minutes\n",
"\"Thursday, October 9, 2022 14:25\",2 hours\n",
"\"January 12, 2022 14:25\",15 minutes\n",
" \"\"\"\n",
")\n",
"\n",
"df = read_csv(data)\n",
"ddf = from_pandas(df, npartitions=2)\n",
"\n",
"ddf[\"converted_timestamp_start\"] = to_datetime(ddf[\"timestamp_start\"])\n",
"print(ddf[[\"timestamp_start\", \"converted_timestamp_start\"]].compute())\n",
"# timestamp_start converted_timestamp_start\n",
"# 0 2021-01-01 9:25 AM 2021-01-01 09:25:00\n",
"# 1 Thursday, October 9, 2022 14:25 2022-10-09 14:25:00\n",
"# 2 January 12, 2022 14:25 2022-01-12 14:25:00"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "29f40937-9bea-499a-a3cb-c17c14b7be71",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from io import StringIO\n",
"\n",
"from dask.dataframe import from_pandas, to_datetime\n",
"from pandas import read_csv\n",
"\n",
"data = StringIO(\n",
" \"\"\"timestamp_start,time_worked\n",
"year 2021: 01/01 9:25 AM,3 hours 12 minutes\n",
"year 2021: 01/03 3:25 PM,2 hours\n",
"year 2021: 01/05 11:25 AM,2 hours\n",
"\"\"\"\n",
")\n",
"df = read_csv(data)\n",
"ddf = from_pandas(df, npartitions=2)\n",
"\n",
"ddf[\"converted_timestamp_start\"] = to_datetime(\n",
" ddf[\"timestamp_start\"], format=\"year %Y: %m/%d %I:%M %p\"\n",
")\n",
"print(ddf[[\"timestamp_start\", \"converted_timestamp_start\"]].compute())\n",
"# timestamp_start converted_timestamp_start\n",
"# 0 year 2021: 01/01 9:25 AM 2021-01-01 09:25:00\n",
"# 1 year 2021: 01/03 3:25 PM 2021-01-03 15:25:00\n",
"# 2 year 2021: 01/05 11:25 AM 2021-01-05 11:25:00"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "110ce98e-d8bf-4501-a767-064f6bee56a3",
"metadata": {},
"outputs": [],
"source": [
"from io import StringIO\n",
"\n",
"from dask.dataframe import from_pandas, to_datetime\n",
"from pandas import read_csv\n",
"\n",
"data = StringIO(\n",
" \"\"\"timestamp_local,location\n",
"2021-01-01 09:01:12,Asia/Almaty\n",
"2021-01-01 09:01:12,Europe/London\n",
"2021-01-01 09:01:12,America/New_York\n",
"\"\"\"\n",
")\n",
"df = read_csv(data)\n",
"ddf = from_pandas(df, npartitions=2)\n",
"\n",
"ddf[\"converted_date\"] = to_datetime(ddf[\"timestamp_local\"], utc=False)\n",
"print(ddf[[\"timestamp_local\", \"converted_date\"]].compute())\n",
"# timestamp_local converted_date\n",
"# 0 2021-01-01 09:01:12 2021-01-01 09:01:12\n",
"# 1 2021-01-01 09:01:12 2021-01-01 09:01:12\n",
"# 2 2021-01-01 09:01:12 2021-01-01 09:01:12"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e2234c76-cf8d-47ed-bf9e-05a3c299f120",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"def convert_tz(datetime_object, local_timezone):\n",
" timezone_aware = datetime_object.tz_localize(local_timezone)\n",
" timezone_est = timezone_aware.tz_convert(\"America/New_York\")\n",
" return timezone_est\n",
"\n",
"\n",
"ddf[\"converted_date_tz_aware\"] = ddf[[\"converted_date\", \"location\"]].apply(\n",
" lambda row: convert_tz(row[\"converted_date\"], row[\"location\"]),\n",
" axis=1,\n",
" meta=(\"converted_date_tz_aware\", \"float\"),\n",
")\n",
"\n",
"print(ddf[[\"location\", \"converted_date_tz_aware\"]].compute())\n",
"# location converted_date_tz_aware\n",
"# 0 Asia/Almaty 2020-12-31 22:01:12-05:00\n",
"# 1 Europe/London 2021-01-01 04:01:12-05:00\n",
"# 2 America/New_York 2021-01-01 09:01:12-05:00"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}