-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdemo_notebook.py
More file actions
108 lines (76 loc) · 1.93 KB
/
demo_notebook.py
File metadata and controls
108 lines (76 loc) · 1.93 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import marimo
# Version string recorded in the notebook file (presumably the marimo release
# that last saved it -- confirm before editing by hand).
__generated_with = "0.11.12"
# The application object every cell below registers with via `@app.cell`.
app = marimo.App(width="medium")
@app.cell
def _():
    # Shared imports for the notebook.
    # `duckdb` must be exported from here: the cell that opens the database
    # connection takes `duckdb` as an input, and with the import commented out
    # no cell defined that name.
    import duckdb
    import marimo as mo
    return duckdb, mo
@app.cell
def _(duckdb):
    # Connect to the `taxi_trips.ddb` database and expose the connection as
    # `c` for the SQL cells that follow.
    _database_file = "taxi_trips.ddb"
    c = duckdb.connect(_database_file)
    return (c,)
@app.cell
def _(c):
    # Look up the `threads` row of `duckdb_settings()` on connection `c`.
    # The call stays the final expression of the cell; its result is not bound.
    _threads_setting_query = "select * from duckdb_settings() where name = 'threads';"
    c.sql(_threads_setting_query)
    return
@app.cell
def _(c):
    # Set `enable_progress_bar` to true on connection `c`.
    _progress_bar_statement = "SET enable_progress_bar = true;"
    c.sql(_progress_bar_statement)
    return
@app.cell
def _():
    # Pieces of the Azure blob location holding the parquet files.
    blob_account_name = "azureopendatastorage"
    blob_container_name = "nyctlc"
    blob_relative_path = "yellow"

    # Assemble the `az://` glob that matches every parquet file under the
    # relative path, at any depth.
    _storage_root = f"az://{blob_account_name}.blob.core.windows.net"
    url_path = "/".join(
        [_storage_root, blob_container_name, blob_relative_path, "**", "*.parquet"]
    )
    # Prints: az://azureopendatastorage.blob.core.windows.net/nyctlc/yellow/**/*.parquet
    print(url_path)
    return blob_account_name, blob_container_name, blob_relative_path, url_path
@app.cell
def _(c, mo, url_path):
    # Select every row from the parquet files matched by `url_path`, running
    # the statement through `mo.sql` on the DuckDB connection `c`.
    # The former `null` input was removed: no cell defines a variable with that
    # name and nothing in this body reads it.
    # The two `--` lines are SQL comments kept as alternative spellings of the
    # same query with the URL written out literally.
    _df = mo.sql(
        f"""
        -- select * from 'az://azureopendatastorage.blob.core.windows.net/nyctlc/yellow/**/*.parquet'
        -- select * from read_parquet('az://azureopendatastorage.blob.core.windows.net/nyctlc/yellow/**/*.parquet')
        select * from '{url_path}'
        """,
        engine=c
    )
    return
@app.cell
def _(mo):
    # Markdown heading that opens the smallpond part of the notebook.
    mo.md(r"""## Smallpond""")
    return
@app.cell
def _():
    # Imports for the smallpond section; both modules are exported so other
    # cells can take them as inputs.
    import graphviz
    import smallpond
    return graphviz, smallpond
@app.cell
def _(smallpond):
    # Initialise smallpond and keep the returned object as `sp`; later cells
    # use it to read parquet data and to run SQL.
    sp = smallpond.init()
    return (sp,)
@app.cell
def _(sp):
    # Evaluate `sp.config` as the cell's final expression (presumably so the
    # notebook shows the active configuration -- confirm in the marimo UI).
    sp.config
    return
@app.cell
def _(sp, url_path):
    # Point smallpond at the parquet glob in `url_path` and expose the result
    # as `taxi_trips`.
    taxi_trips = sp.read_parquet(url_path)
    return (taxi_trips,)
@app.cell
def _(taxi_trips):
    # Repartition the taxi data and expose the result as `df`.
    # NOTE(review): the argument is presumably the target number of
    # partitions -- confirm against the smallpond API.
    _target_partitions = 10_000
    df = taxi_trips.repartition(_target_partitions)
    return (df,)
@app.cell
def _(df):
    # Call `count()` on the repartitioned data; the result is left as the
    # cell's final expression rather than bound to a name.
    df.count()
    return
@app.cell
def _(df, sp):
    # Count rows with SQL through smallpond.
    # `partial_sql` binds the DataFrames passed after the query to positional
    # placeholders ({0}, {1}, ...), so `df` has to be referenced as `{0}`.
    # The previous query named `taxi_trips`, which is only a Python variable in
    # another cell, not a table the SQL engine can resolve.
    # NOTE(review): `partial_sql` is documented as running per partition, so
    # this presumably yields one count per partition rather than one grand
    # total -- confirm and aggregate if a single number is wanted.
    query = "select count(*) from {0}"
    count_trips = sp.partial_sql(query, df)
    return count_trips, query
# Run the notebook's app when the file is executed directly as a script.
if __name__ == "__main__":
    app.run()