Skip to content

Commit 73fe825

Browse files
authored
Merge pull request #399 from ClickHouse/make-mv-data-catchup-configurable
Make mv data catchup configurable
2 parents edad093 + e2ab918 commit 73fe825

File tree

6 files changed

+301
-195
lines changed

6 files changed

+301
-195
lines changed

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
### Release [1.8.6], 2024-12-05
2+
3+
### Improvement
4+
* Previously, when a materialized view (MV) model was created, the target table was always populated with historical data using the query defined for the MV. This catchup mechanism is now controlled by the `catchup` config flag, which is enabled by default, so existing behavior is unchanged. ([#399](https://github.com/ClickHouse/dbt-clickhouse/pull/399))
5+
16
### Release [1.8.5], 2024-11-19
27

38
### New Features

README.md

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -264,7 +264,22 @@ select a,b,c from {{ source('raw', 'table_2') }}
264264
>
265265
> When updating a model with multiple materialized views (MVs), especially when renaming one of the MV names, dbt-clickhouse does not automatically drop the old MV. Instead,
266266
> you will encounter the following warning: `Warning - Table <previous table name> was detected with the same pattern as model name <your model name> but was not found in this run. In case it is a renamed mv that was previously part of this model, drop it manually (!!!) `
267-
267+
268+
## Data catchup
269+
Currently, when creating a materialized view (MV), the target table is first populated with historical data before the MV itself is created.
270+
271+
In other words, dbt-clickhouse initially creates the target table and preloads it with historical data based on the query defined for the MV. Only after this step is the MV created.
272+
273+
If you prefer not to preload historical data during MV creation, you can disable this behavior by setting the `catchup` config to `False`:
274+
275+
```python
276+
{{config(
277+
materialized='materialized_view',
278+
engine='MergeTree()',
279+
order_by='(id)',
280+
catchup=False
281+
)}}
282+
```
268283

269284

270285
# Dictionary materializations (experimental)
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
version = '1.8.5'
1+
version = '1.8.6'

dbt/include/clickhouse/macros/materializations/materialized_view.sql

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,8 @@
5050

5151
{% if backup_relation is none %}
5252
{{ log('Creating new materialized view ' + target_relation.name )}}
53-
{{ clickhouse__get_create_materialized_view_as_sql(target_relation, sql, views) }}
53+
{% set catchup_data = config.get("catchup", True) %}
54+
{{ clickhouse__get_create_materialized_view_as_sql(target_relation, sql, views, catchup_data) }}
5455
{% elif existing_relation.can_exchange %}
5556
{{ log('Replacing existing materialized view ' + target_relation.name) }}
5657
-- in this section, we look for mvs that has the same pattern as this model, but for some reason,
@@ -132,9 +133,15 @@
132133
2. Create a materialized view using the SQL in the model that inserts
133134
data into the table created during step 1
134135
#}
135-
{% macro clickhouse__get_create_materialized_view_as_sql(relation, sql, views) -%}
136+
{% macro clickhouse__get_create_materialized_view_as_sql(relation, sql, views, catchup=True ) -%}
136137
{% call statement('main') %}
138+
{% if catchup == True %}
137139
{{ get_create_table_as_sql(False, relation, sql) }}
140+
{% else %}
141+
{{ log('Catchup data config was set to false, skipping mv-target-table initial insertion ')}}
142+
{% set has_contract = config.get('contract').enforced %}
143+
{{ create_table_or_empty(False, relation, sql, has_contract) }}
144+
{% endif %}
138145
{% endcall %}
139146
{%- set cluster_clause = on_cluster_clause(relation) -%}
140147
{%- set mv_relation = relation.derivative('_mv', 'materialized_view') -%}

tests/integration/adapter/materialized_view/test_materialized_view.py

Lines changed: 22 additions & 191 deletions
Original file line numberDiff line numberDiff line change
@@ -28,10 +28,11 @@
2828
materialized='materialized_view',
2929
engine='MergeTree()',
3030
order_by='(id)',
31-
schema='custom_schema',
31+
schema='catchup' if var('run_type', '') == 'catchup' else 'custom_schema',
32+
**({'catchup': False} if var('run_type', '') == 'catchup' else {})
3233
) }}
3334
34-
{% if var('run_type', '') == '' %}
35+
{% if var('run_type', '') in ['', 'catchup'] %}
3536
select
3637
id,
3738
name,
@@ -60,74 +61,6 @@
6061
{% endif %}
6162
"""
6263

63-
MULTIPLE_MV_MODEL = """
64-
{{ config(
65-
materialized='materialized_view',
66-
engine='MergeTree()',
67-
order_by='(id)',
68-
schema='custom_schema_for_multiple_mv',
69-
) }}
70-
71-
{% if var('run_type', '') == '' %}
72-
73-
--mv1:begin
74-
select
75-
id,
76-
name,
77-
case
78-
when name like 'Dade' then 'crash_override'
79-
when name like 'Kate' then 'acid burn'
80-
else 'N/A'
81-
end as hacker_alias
82-
from {{ source('raw', 'people') }}
83-
where department = 'engineering'
84-
--mv1:end
85-
86-
union all
87-
88-
--mv2:begin
89-
select
90-
id,
91-
name,
92-
-- sales people are not cool enough to have a hacker alias
93-
'N/A' as hacker_alias
94-
from {{ source('raw', 'people') }}
95-
where department = 'sales'
96-
--mv2:end
97-
98-
{% elif var('run_type', '') == 'extended_schema' %}
99-
100-
--mv1:begin
101-
select
102-
id,
103-
name,
104-
case
105-
-- Dade wasn't always known as 'crash override'!
106-
when name like 'Dade' and age = 11 then 'zero cool'
107-
when name like 'Dade' and age != 11 then 'crash override'
108-
when name like 'Kate' then 'acid burn'
109-
else 'N/A'
110-
end as hacker_alias
111-
from {{ source('raw', 'people') }}
112-
where department = 'engineering'
113-
--mv1:end
114-
115-
union all
116-
117-
--mv2:begin
118-
select
119-
id,
120-
name,
121-
-- sales people are not cool enough to have a hacker alias
122-
'N/A' as hacker_alias
123-
from {{ source('raw', 'people') }}
124-
where department = 'sales'
125-
--mv2:end
126-
127-
{% endif %}
128-
"""
129-
130-
13164
SEED_SCHEMA_YML = """
13265
version: 2
13366
@@ -197,116 +130,30 @@ def test_create(self, project):
197130
result = project.run_sql(f"select count(*) from {schema}.hackers", fetch="all")
198131
assert result[0][0] == 4
199132

200-
201-
class TestUpdateMV:
202-
@pytest.fixture(scope="class")
203-
def seeds(self):
204-
"""
205-
we need a base table to pull from
206-
"""
207-
return {
208-
"people.csv": PEOPLE_SEED_CSV,
209-
"schema.yml": SEED_SCHEMA_YML,
210-
}
211-
212-
@pytest.fixture(scope="class")
213-
def models(self):
214-
return {
215-
"hackers.sql": MV_MODEL,
216-
}
217-
218-
def test_update_incremental(self, project):
219-
schema = quote_identifier(project.test_schema + "_custom_schema")
220-
# create our initial materialized view
221-
run_dbt(["seed"])
222-
run_dbt()
223-
224-
# re-run dbt but this time with the new MV SQL
225-
run_vars = {"run_type": "extended_schema"}
226-
run_dbt(["run", "--vars", json.dumps(run_vars)])
227-
228-
project.run_sql(
229-
f"""
230-
insert into {quote_identifier(project.test_schema)}.people ("id", "name", "age", "department")
231-
values (1232,'Dade',11,'engineering'), (9999,'eugene',40,'malware');
232-
"""
233-
)
234-
235-
# assert that we now have both of Dade's aliases in our hackers table
236-
result = project.run_sql(
237-
f"select distinct hacker_alias from {schema}.hackers where name = 'Dade'", fetch="all"
238-
)
239-
assert len(result) == 2
240-
241-
def test_update_full_refresh(self, project):
242-
schema = quote_identifier(project.test_schema + "_custom_schema")
243-
# create our initial materialized view
244-
run_dbt(["seed"])
245-
run_dbt()
246-
247-
# re-run dbt but this time with the new MV SQL
248-
run_vars = {"run_type": "extended_schema"}
249-
run_dbt(["run", "--full-refresh", "--vars", json.dumps(run_vars)])
250-
251-
project.run_sql(
252-
f"""
253-
insert into {quote_identifier(project.test_schema)}.people ("id", "name", "age", "department")
254-
values (1232,'Dade',11,'engineering'), (9999,'eugene',40,'malware');
255-
"""
256-
)
257-
258-
# assert that we now have both of Dade's aliases in our hackers table
259-
result = project.run_sql(
260-
f"select distinct hacker_alias from {schema}.hackers where name = 'Dade'", fetch="all"
261-
)
262-
assert len(result) == 2
263-
264-
265-
class TestMultipleMV:
266-
@pytest.fixture(scope="class")
267-
def seeds(self):
268-
"""
269-
we need a base table to pull from
270-
"""
271-
return {
272-
"people.csv": PEOPLE_SEED_CSV,
273-
"schema.yml": SEED_SCHEMA_YML,
274-
}
275-
276-
@pytest.fixture(scope="class")
277-
def models(self):
278-
return {
279-
"hackers.sql": MULTIPLE_MV_MODEL,
280-
}
281-
282-
def test_create(self, project):
133+
def test_disabled_catchup(self, project):
283134
"""
284135
1. create a base table via dbt seed
285-
2. create a model as a materialized view, selecting from the table created in (1)
136+
2. create a model with catchup disabled as a materialized view, selecting from the table created in (1)
286137
3. insert data into the base table and make sure it's there in the target table created in (2)
287138
"""
288-
schema = quote_identifier(project.test_schema + "_custom_schema_for_multiple_mv")
139+
schema = quote_identifier(project.test_schema + "_catchup")
289140
results = run_dbt(["seed"])
290141
assert len(results) == 1
291142
columns = project.run_sql("DESCRIBE TABLE people", fetch="all")
292143
assert columns[0][1] == "Int32"
293144

294-
# create the model
295-
run_dbt(["run"])
145+
# create the model with catchup disabled
146+
run_vars = {"run_type": "catchup"}
147+
run_dbt(["run", "--vars", json.dumps(run_vars)])
148+
# check that we only have the new row, without the historical data
296149
assert len(results) == 1
297150

298151
columns = project.run_sql(f"DESCRIBE TABLE {schema}.hackers", fetch="all")
299152
assert columns[0][1] == "Int32"
300153

301-
columns = project.run_sql(f"DESCRIBE {schema}.hackers_mv1", fetch="all")
302-
assert columns[0][1] == "Int32"
303-
304-
columns = project.run_sql(f"DESCRIBE {schema}.hackers_mv2", fetch="all")
154+
columns = project.run_sql(f"DESCRIBE {schema}.hackers_mv", fetch="all")
305155
assert columns[0][1] == "Int32"
306156

307-
with pytest.raises(Exception):
308-
columns = project.run_sql(f"DESCRIBE {schema}.hackers_mv", fetch="all")
309-
310157
check_relation_types(
311158
project.adapter,
312159
{
@@ -318,25 +165,16 @@ def test_create(self, project):
318165
# insert some data and make sure it reaches the target table
319166
project.run_sql(
320167
f"""
321-
insert into {quote_identifier(project.test_schema)}.people ("id", "name", "age", "department")
322-
values (4000,'Dave',40,'sales'), (9999,'Eugene',40,'engineering');
323-
"""
168+
insert into {quote_identifier(project.test_schema)}.people ("id", "name", "age", "department")
169+
values (1232,'Dade',16,'engineering'), (9999,'eugene',40,'malware');
170+
"""
324171
)
325172

326-
result = project.run_sql(f"select * from {schema}.hackers order by id", fetch="all")
327-
assert result == [
328-
(1000, 'Alfie', 'N/A'),
329-
(1231, 'Dade', 'crash_override'),
330-
(2000, 'Bill', 'N/A'),
331-
(3000, 'Charlie', 'N/A'),
332-
(4000, 'Dave', 'N/A'),
333-
(6666, 'Ksenia', 'N/A'),
334-
(8888, 'Kate', 'acid burn'),
335-
(9999, 'Eugene', 'N/A'),
336-
]
173+
result = project.run_sql(f"select count(*) from {schema}.hackers", fetch="all")
174+
assert result[0][0] == 1
337175

338176

339-
class TestUpdateMultipleMV:
177+
class TestUpdateMV:
340178
@pytest.fixture(scope="class")
341179
def seeds(self):
342180
"""
@@ -350,11 +188,11 @@ def seeds(self):
350188
@pytest.fixture(scope="class")
351189
def models(self):
352190
return {
353-
"hackers.sql": MULTIPLE_MV_MODEL,
191+
"hackers.sql": MV_MODEL,
354192
}
355193

356194
def test_update_incremental(self, project):
357-
schema = quote_identifier(project.test_schema + "_custom_schema_for_multiple_mv")
195+
schema = quote_identifier(project.test_schema + "_custom_schema")
358196
# create our initial materialized view
359197
run_dbt(["seed"])
360198
run_dbt()
@@ -372,15 +210,12 @@ def test_update_incremental(self, project):
372210

373211
# assert that we now have both of Dade's aliases in our hackers table
374212
result = project.run_sql(
375-
f"select distinct hacker_alias from {schema}.hackers where name = 'Dade' order by hacker_alias",
376-
fetch="all",
213+
f"select distinct hacker_alias from {schema}.hackers where name = 'Dade'", fetch="all"
377214
)
378215
assert len(result) == 2
379-
assert result[0][0] == "crash_override"
380-
assert result[1][0] == "zero cool"
381216

382217
def test_update_full_refresh(self, project):
383-
schema = quote_identifier(project.test_schema + "_custom_schema_for_multiple_mv")
218+
schema = quote_identifier(project.test_schema + "_custom_schema")
384219
# create our initial materialized view
385220
run_dbt(["seed"])
386221
run_dbt()
@@ -398,10 +233,6 @@ def test_update_full_refresh(self, project):
398233

399234
# assert that we now have both of Dade's aliases in our hackers table
400235
result = project.run_sql(
401-
f"select distinct hacker_alias from {schema}.hackers where name = 'Dade' order by hacker_alias",
402-
fetch="all",
236+
f"select distinct hacker_alias from {schema}.hackers where name = 'Dade'", fetch="all"
403237
)
404-
print(result)
405238
assert len(result) == 2
406-
assert result[0][0] == "crash override"
407-
assert result[1][0] == "zero cool"

0 commit comments

Comments
 (0)