From 67de1c4f9d016b9e69aee9d19b48d40abcb5c488 Mon Sep 17 00:00:00 2001 From: Joe S Date: Fri, 21 Nov 2025 17:12:59 -0800 Subject: [PATCH 1/5] always set query_id --- CHANGELOG.md | 1 + clickhouse_connect/common.py | 1 + clickhouse_connect/driver/asyncclient.py | 8 +++-- clickhouse_connect/driver/client.py | 8 +++-- clickhouse_connect/driver/httpclient.py | 26 +++++++++++++++ tests/integration_tests/test_client.py | 41 ++++++++++++++++++++++++ 6 files changed, 81 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index eb4bbdf4..98e0e0db 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,7 @@ The supported method of passing ClickHouse server settings is to prefix such arg ## UNRELEASED ### Bug Fixes ### Improvements +- Always generate query_id from the client side as a UUID4 if it not explicitly set. Closes [#596](https://github.com/ClickHouse/clickhouse-connect/issues/596) ## 0.10.0, 2025-11-14 diff --git a/clickhouse_connect/common.py b/clickhouse_connect/common.py index dc599302..04f43524 100644 --- a/clickhouse_connect/common.py +++ b/clickhouse_connect/common.py @@ -69,6 +69,7 @@ def _init_common(name: str, options: Sequence[Any], default: Any) -> None: _init_common('autogenerate_session_id', (True, False), True) +_init_common('autogenerate_query_id', (True, False), True) _init_common('dict_parameter_format', ('json', 'map'), 'json') _init_common('invalid_setting_action', ('send', 'drop', 'error'), 'error') _init_common('max_connection_age', (), 10 * 60) # Max time in seconds to keep reusing a database TCP connection diff --git a/clickhouse_connect/driver/asyncclient.py b/clickhouse_connect/driver/asyncclient.py index 41244bbb..f40b3598 100644 --- a/clickhouse_connect/driver/asyncclient.py +++ b/clickhouse_connect/driver/asyncclient.py @@ -518,7 +518,9 @@ def create_query_context(self, Creates or updates a reusable QueryContext object :param query: Query statement/format string :param parameters: Optional dictionary used to format the query - :param settings: Optional dictionary of ClickHouse settings (key/string values) + :param settings: Optional dictionary of ClickHouse settings (key/string values). This includes both + server settings (e.g., max_threads, max_memory_usage) and HTTP interface parameters (e.g., query_id, + session_id, database). All settings are sent as URL query parameters. :param query_formats: See QueryContext __init__ docstring :param column_formats: See QueryContext __init__ docstring :param encoding: See QueryContext __init__ docstring @@ -543,7 +545,9 @@ def create_query_context(self, :param use_extended_dtypes: Only relevant to Pandas Dataframe queries. Use Pandas "missing types", such as pandas.NA and pandas.NaT for ClickHouse NULL values, as well as extended Pandas dtypes such as IntegerArray and StringArray. Defaulted to True for query_df methods - :param transport_settings: Optional dictionary of transport level settings (HTTP headers, etc.) + :param transport_settings: Optional dictionary of transport level settings sent as HTTP headers. Use this for + custom headers (e.g., X-Workload) for load balancers or proxies. ClickHouse parameters like query_id should + go in the settings dict, not here. :return: Reusable QueryContext """ diff --git a/clickhouse_connect/driver/client.py b/clickhouse_connect/driver/client.py index 1a7a53a6..a3e474f5 100644 --- a/clickhouse_connect/driver/client.py +++ b/clickhouse_connect/driver/client.py @@ -473,7 +473,9 @@ def create_query_context(self, Creates or updates a reusable QueryContext object :param query: Query statement/format string :param parameters: Optional dictionary used to format the query - :param settings: Optional dictionary of ClickHouse settings (key/string values) + :param settings: Optional dictionary of ClickHouse settings (key/string values). This includes both + server settings (e.g., max_threads, max_memory_usage) and HTTP interface parameters (e.g., query_id, + session_id, database). All settings are sent as URL query parameters. :param query_formats: See QueryContext __init__ docstring :param column_formats: See QueryContext __init__ docstring :param encoding: See QueryContext __init__ docstring @@ -500,7 +502,9 @@ def create_query_context(self, :param use_extended_dtypes: Only relevant to Pandas Dataframe queries. Use Pandas "missing types", such as pandas.NA and pandas.NaT for ClickHouse NULL values, as well as extended Pandas dtypes such as IntegerArray and StringArray. Defaulted to True for query_df methods - :param transport_settings: Optional dictionary of transport level settings (HTTP headers, etc.) + :param transport_settings: Optional dictionary of transport level settings sent as HTTP headers. Use this for + custom headers (e.g., X-Workload) for load balancers or proxies. ClickHouse parameters like query_id should + go in the settings dict, not here. :return: Reusable QueryContext """ resolved_utc_tz_aware = self.utc_tz_aware if utc_tz_aware is None else utc_tz_aware diff --git a/clickhouse_connect/driver/httpclient.py b/clickhouse_connect/driver/httpclient.py index 0f0ff45e..10e2bc4b 100644 --- a/clickhouse_connect/driver/httpclient.py +++ b/clickhouse_connect/driver/httpclient.py @@ -79,6 +79,7 @@ def __init__(self, utc_tz_aware: Optional[bool] = None, show_clickhouse_errors: Optional[bool] = None, autogenerate_session_id: Optional[bool] = None, + autogenerate_query_id: Optional[bool] = None, tls_mode: Optional[str] = None, proxy_path: str = '', form_encode_query_params: bool = False, @@ -161,6 +162,11 @@ def __init__(self, elif 'session_id' not in ch_settings and _autogenerate_session_id: ch_settings['session_id'] = str(uuid.uuid4()) + # allow to override the global autogenerate_query_id setting via the constructor params + self._autogenerate_query_id = common.get_setting('autogenerate_query_id') \ + if autogenerate_query_id is None \ + else autogenerate_query_id + if coerce_bool(compress): compression = ','.join(available_compression) self.write_compression = available_compression[0] @@ -191,6 +197,18 @@ def __init__(self, self._setting_status('http_headers_progress_interval_ms').is_writable: self._progress_interval = str(min(120000, max(10000, (send_receive_timeout - 5) * 1000))) + def _ensure_query_id(self, settings: Optional[dict]) -> Optional[dict]: + if not self._autogenerate_query_id: + return settings + + if settings is None: + settings = {} + + if "query_id" not in settings: + settings["query_id"] = str(uuid.uuid4()) + + return settings + def set_client_setting(self, key, value): str_value = self._validate_setting(key, value, common.get_setting('invalid_setting_action')) if str_value is not None: @@ -215,6 +233,8 @@ def _prep_query(self, context: QueryContext): return final_query + fmt def _query_with_context(self, context: QueryContext) -> QueryResult: + context.settings = self._ensure_query_id(context.settings) + headers = {} params = {} if self.database: @@ -307,6 +327,8 @@ def data_insert(self, context: InsertContext) -> QuerySummary: logger.debug('No data included in insert, skipping') return QuerySummary() + context.settings = self._ensure_query_id(context.settings) + def error_handler(resp: HTTPResponse): # If we actually had a local exception when building the insert, throw that instead if context.insert_exception: @@ -342,6 +364,8 @@ def raw_insert(self, table: str = None, """ See BaseClient doc_string for this method """ + settings = self._ensure_query_id(settings) + params = {} headers = {'Content-Type': 'application/octet-stream'} if compression: @@ -385,6 +409,8 @@ def command(self, """ See BaseClient doc_string for this method """ + settings = self._ensure_query_id(settings) + cmd, params = bind_query(cmd, parameters, self.server_tz) headers = {} payload = None diff --git a/tests/integration_tests/test_client.py b/tests/integration_tests/test_client.py index 00451285..faf2eb0b 100644 --- a/tests/integration_tests/test_client.py +++ b/tests/integration_tests/test_client.py @@ -1,6 +1,7 @@ from pathlib import Path from time import sleep from typing import Callable +import uuid import pytest @@ -382,3 +383,43 @@ def test_role_setting_works(test_client: Client, test_config: TestConfig): ) res = role_client.query('SELECT currentRoles()') assert res.result_rows == [([role_limited],)] + + +# pylint: disable=protected-access +def test_autogenerate_query_id(test_client: Client, test_table_engine: str, test_config: TestConfig): + def _is_valid_uuid_v4_string(id_string: str) -> bool: + parsed_uuid = uuid.UUID(id_string) + return parsed_uuid.version == 4 + + result = test_client.query("SELECT 1") + assert _is_valid_uuid_v4_string(result.query_id) + + manual_query_id = "test_manual_query_id_12345" + result = test_client.query("SELECT 2", settings={"query_id": manual_query_id}) + assert result.query_id == manual_query_id + + summary = test_client.command("DROP TABLE IF EXISTS does_not_exist") + assert _is_valid_uuid_v4_string(summary.query_id()) + + test_client.command("DROP TABLE IF EXISTS test_autogen_query_id") + test_client.command(f"CREATE TABLE test_autogen_query_id (id UInt32) ENGINE {test_table_engine} ORDER BY id") + summary = test_client.insert("test_autogen_query_id", [[1], [2], [3]], column_names=["id"]) + assert _is_valid_uuid_v4_string(summary.query_id()) + test_client.command("DROP TABLE test_autogen_query_id") + + # Create client with autogenerate_query_id disabled + client_no_autogen = create_client( + host=test_config.host, + port=test_config.port, + username=test_config.username, + password=test_config.password, + autogenerate_query_id=False, + ) + result = client_no_autogen.query("SELECT 4") + # Even with autogen disabled the server generates a query_id + # so we still expect a query_id in the result, it's just generated by the server... + # not sure how to verify that specifically though. + assert _is_valid_uuid_v4_string(result.query_id) + + # We can verify that the setting on the client is False though + assert client_no_autogen._autogenerate_query_id is False From 478060d889a8e0ee49eed4bdc4c87cd37cf183af Mon Sep 17 00:00:00 2001 From: Joe S Date: Thu, 4 Dec 2025 09:40:00 -0800 Subject: [PATCH 2/5] improve tests and refactor --- clickhouse_connect/driver/httpclient.py | 24 ++----- tests/integration_tests/test_client.py | 91 ++++++++++++++++++------- 2 files changed, 71 insertions(+), 44 deletions(-) diff --git a/clickhouse_connect/driver/httpclient.py b/clickhouse_connect/driver/httpclient.py index 10e2bc4b..4c58a30d 100644 --- a/clickhouse_connect/driver/httpclient.py +++ b/clickhouse_connect/driver/httpclient.py @@ -197,18 +197,6 @@ def __init__(self, self._setting_status('http_headers_progress_interval_ms').is_writable: self._progress_interval = str(min(120000, max(10000, (send_receive_timeout - 5) * 1000))) - def _ensure_query_id(self, settings: Optional[dict]) -> Optional[dict]: - if not self._autogenerate_query_id: - return settings - - if settings is None: - settings = {} - - if "query_id" not in settings: - settings["query_id"] = str(uuid.uuid4()) - - return settings - def set_client_setting(self, key, value): str_value = self._validate_setting(key, value, common.get_setting('invalid_setting_action')) if str_value is not None: @@ -233,8 +221,6 @@ def _prep_query(self, context: QueryContext): return final_query + fmt def _query_with_context(self, context: QueryContext) -> QueryResult: - context.settings = self._ensure_query_id(context.settings) - headers = {} params = {} if self.database: @@ -327,8 +313,6 @@ def data_insert(self, context: InsertContext) -> QuerySummary: logger.debug('No data included in insert, skipping') return QuerySummary() - context.settings = self._ensure_query_id(context.settings) - def error_handler(resp: HTTPResponse): # If we actually had a local exception when building the insert, throw that instead if context.insert_exception: @@ -364,8 +348,6 @@ def raw_insert(self, table: str = None, """ See BaseClient doc_string for this method """ - settings = self._ensure_query_id(settings) - params = {} headers = {'Content-Type': 'application/octet-stream'} if compression: @@ -409,8 +391,6 @@ def command(self, """ See BaseClient doc_string for this method """ - settings = self._ensure_query_id(settings) - cmd, params = bind_query(cmd, parameters, self.server_tz) headers = {} payload = None @@ -521,6 +501,10 @@ def _raw_request(self, final_params['http_headers_progress_interval_ms'] = self._progress_interval final_params = dict_copy(self.params, final_params) final_params = dict_copy(final_params, params) + + if self._autogenerate_query_id and "query_id" not in final_params: + final_params["query_id"] = str(uuid.uuid4()) + url = f'{self.url}?{urlencode(final_params)}' kwargs = { 'headers': headers, diff --git a/tests/integration_tests/test_client.py b/tests/integration_tests/test_client.py index 4dc5afbb..1407d3e9 100644 --- a/tests/integration_tests/test_client.py +++ b/tests/integration_tests/test_client.py @@ -20,6 +20,15 @@ klm,1,""" +def _is_valid_uuid_v4(id_string: str) -> bool: + """Helper function to validate that a string is a valid UUID v4""" + try: + parsed_uuid = uuid.UUID(id_string) + return parsed_uuid.version == 4 + except (ValueError, AttributeError): + return False + + def test_ping(test_client: Client): assert test_client.ping() is True @@ -388,29 +397,31 @@ def test_role_setting_works(test_client: Client, test_config: TestConfig): assert res.result_rows == [([role_limited],)] -# pylint: disable=protected-access -def test_autogenerate_query_id(test_client: Client, test_table_engine: str, test_config: TestConfig): - def _is_valid_uuid_v4_string(id_string: str) -> bool: - parsed_uuid = uuid.UUID(id_string) - return parsed_uuid.version == 4 - +def test_query_id_autogeneration(test_client: Client, test_table_engine: str): + """Test that query_id is auto-generated for query(), command(), and insert() methods""" result = test_client.query("SELECT 1") - assert _is_valid_uuid_v4_string(result.query_id) + assert _is_valid_uuid_v4(result.query_id) - manual_query_id = "test_manual_query_id_12345" - result = test_client.query("SELECT 2", settings={"query_id": manual_query_id}) - assert result.query_id == manual_query_id + summary = test_client.command("DROP TABLE IF EXISTS test_query_id_nonexistent") + assert _is_valid_uuid_v4(summary.query_id()) - summary = test_client.command("DROP TABLE IF EXISTS does_not_exist") - assert _is_valid_uuid_v4_string(summary.query_id()) + test_client.command("DROP TABLE IF EXISTS test_query_id_insert") + test_client.command(f"CREATE TABLE test_query_id_insert (id UInt32) ENGINE {test_table_engine} ORDER BY id") + summary = test_client.insert("test_query_id_insert", [[1], [2], [3]], column_names=["id"]) + assert _is_valid_uuid_v4(summary.query_id()) + test_client.command("DROP TABLE test_query_id_insert") + + +def test_query_id_manual_override(test_client: Client): + """Test that manually specified query_id is respected and not overwritten""" + manual_query_id = "test_manual_query_id_override" + result = test_client.query("SELECT 1", settings={"query_id": manual_query_id}) + assert result.query_id == manual_query_id - test_client.command("DROP TABLE IF EXISTS test_autogen_query_id") - test_client.command(f"CREATE TABLE test_autogen_query_id (id UInt32) ENGINE {test_table_engine} ORDER BY id") - summary = test_client.insert("test_autogen_query_id", [[1], [2], [3]], column_names=["id"]) - assert _is_valid_uuid_v4_string(summary.query_id()) - test_client.command("DROP TABLE test_autogen_query_id") - # Create client with autogenerate_query_id disabled +# pylint: disable=protected-access +def test_query_id_disabled(test_config: TestConfig): + """Test that autogenerate_query_id=False works correctly""" client_no_autogen = create_client( host=test_config.host, port=test_config.port, @@ -418,11 +429,43 @@ def _is_valid_uuid_v4_string(id_string: str) -> bool: password=test_config.password, autogenerate_query_id=False, ) - result = client_no_autogen.query("SELECT 4") - # Even with autogen disabled the server generates a query_id - # so we still expect a query_id in the result, it's just generated by the server... - # not sure how to verify that specifically though. - assert _is_valid_uuid_v4_string(result.query_id) - # We can verify that the setting on the client is False though assert client_no_autogen._autogenerate_query_id is False + + # Even with autogen disabled, server generates a query_id + result = client_no_autogen.query("SELECT 1") + assert _is_valid_uuid_v4(result.query_id) + + client_no_autogen.close() + + +def test_query_id_in_query_logs(test_client: Client, test_config: TestConfig): + """Test that query_id appears in ClickHouse's system.query_log for observability""" + if test_config.cloud: + pytest.skip("Skipping query_log test in cloud environment") + + def check_in_logs(test_query_id): + max_retries = 30 + for _ in range(max_retries): + log_result = test_client.query( + f"SELECT query_id FROM system.query_log WHERE query_id = '{test_query_id}' AND " "event_time > now() - 30 LIMIT 1" + ) + + if len(log_result.result_set) > 0: + assert log_result.result_set[0][0] == test_query_id + return + + sleep(0.1) + + # If we get here, query_id never appeared in logs + pytest.fail(f"query_id '{test_query_id}' did not appear in system.query_log after {max_retries * 0.1}s") + + # Manual override check + test_query_id_manual = "test_query_id_in_logs" + test_client.query("SELECT 1 as num", settings={"query_id": test_query_id_manual}) + check_in_logs(test_query_id_manual) + + # Autogen check + result = test_client.query("SELECT 2 as num") + test_query_id_auto = result.query_id + check_in_logs(test_query_id_auto) From cb157ed56a6bee924d242dee59aabcab47bd12e4 Mon Sep 17 00:00:00 2001 From: Joe S Date: Thu, 4 Dec 2025 09:49:33 -0800 Subject: [PATCH 3/5] revert docstring changes --- clickhouse_connect/driver/asyncclient.py | 8 ++------ clickhouse_connect/driver/client.py | 8 ++------ 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/clickhouse_connect/driver/asyncclient.py b/clickhouse_connect/driver/asyncclient.py index f40b3598..41244bbb 100644 --- a/clickhouse_connect/driver/asyncclient.py +++ b/clickhouse_connect/driver/asyncclient.py @@ -518,9 +518,7 @@ def create_query_context(self, Creates or updates a reusable QueryContext object :param query: Query statement/format string :param parameters: Optional dictionary used to format the query - :param settings: Optional dictionary of ClickHouse settings (key/string values). This includes both - server settings (e.g., max_threads, max_memory_usage) and HTTP interface parameters (e.g., query_id, - session_id, database). All settings are sent as URL query parameters. + :param settings: Optional dictionary of ClickHouse settings (key/string values) :param query_formats: See QueryContext __init__ docstring :param column_formats: See QueryContext __init__ docstring :param encoding: See QueryContext __init__ docstring @@ -545,9 +543,7 @@ def create_query_context(self, :param use_extended_dtypes: Only relevant to Pandas Dataframe queries. Use Pandas "missing types", such as pandas.NA and pandas.NaT for ClickHouse NULL values, as well as extended Pandas dtypes such as IntegerArray and StringArray. Defaulted to True for query_df methods - :param transport_settings: Optional dictionary of transport level settings sent as HTTP headers. Use this for - custom headers (e.g., X-Workload) for load balancers or proxies. ClickHouse parameters like query_id should - go in the settings dict, not here. + :param transport_settings: Optional dictionary of transport level settings (HTTP headers, etc.) :return: Reusable QueryContext """ diff --git a/clickhouse_connect/driver/client.py b/clickhouse_connect/driver/client.py index a3e474f5..1a7a53a6 100644 --- a/clickhouse_connect/driver/client.py +++ b/clickhouse_connect/driver/client.py @@ -473,9 +473,7 @@ def create_query_context(self, Creates or updates a reusable QueryContext object :param query: Query statement/format string :param parameters: Optional dictionary used to format the query - :param settings: Optional dictionary of ClickHouse settings (key/string values). This includes both - server settings (e.g., max_threads, max_memory_usage) and HTTP interface parameters (e.g., query_id, - session_id, database). All settings are sent as URL query parameters. + :param settings: Optional dictionary of ClickHouse settings (key/string values) :param query_formats: See QueryContext __init__ docstring :param column_formats: See QueryContext __init__ docstring :param encoding: See QueryContext __init__ docstring @@ -502,9 +500,7 @@ def create_query_context(self, :param use_extended_dtypes: Only relevant to Pandas Dataframe queries. Use Pandas "missing types", such as pandas.NA and pandas.NaT for ClickHouse NULL values, as well as extended Pandas dtypes such as IntegerArray and StringArray. Defaulted to True for query_df methods - :param transport_settings: Optional dictionary of transport level settings sent as HTTP headers. Use this for - custom headers (e.g., X-Workload) for load balancers or proxies. ClickHouse parameters like query_id should - go in the settings dict, not here. + :param transport_settings: Optional dictionary of transport level settings (HTTP headers, etc.) :return: Reusable QueryContext """ resolved_utc_tz_aware = self.utc_tz_aware if utc_tz_aware is None else utc_tz_aware From 0f494081b6ece1035b3ae5adbe66d1a4e82fd748 Mon Sep 17 00:00:00 2001 From: "Joe S." Date: Fri, 5 Dec 2025 09:03:07 -0800 Subject: [PATCH 4/5] Apply suggestions from code review Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Signed-off-by: Joe S. --- CHANGELOG.md | 2 +- tests/integration_tests/test_client.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a15e5053..82da9d5d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,7 +29,7 @@ The supported method of passing ClickHouse server settings is to prefix such arg ### Improvements - Add support for QBit data type. Closes [#570](https://github.com/ClickHouse/clickhouse-connect/issues/570) - Add the ability to create table from PyArrow objects. Addresses [#588](https://github.com/ClickHouse/clickhouse-connect/issues/588) -- Always generate query_id from the client side as a UUID4 if it not explicitly set. Closes [#596](https://github.com/ClickHouse/clickhouse-connect/issues/596) +- Always generate query_id from the client side as a UUID4 if it is not explicitly set. Closes [#596](https://github.com/ClickHouse/clickhouse-connect/issues/596) ## 0.10.0, 2025-11-14 diff --git a/tests/integration_tests/test_client.py b/tests/integration_tests/test_client.py index 1407d3e9..f3ee5e18 100644 --- a/tests/integration_tests/test_client.py +++ b/tests/integration_tests/test_client.py @@ -448,7 +448,8 @@ def check_in_logs(test_query_id): max_retries = 30 for _ in range(max_retries): log_result = test_client.query( - f"SELECT query_id FROM system.query_log WHERE query_id = '{test_query_id}' AND " "event_time > now() - 30 LIMIT 1" + "SELECT query_id FROM system.query_log WHERE query_id = {query_id} AND event_time > now() - 30 LIMIT 1", + parameters={"query_id": test_query_id} ) if len(log_result.result_set) > 0: From 3b9aa36f3d100190ae5f13599806224db45d576c Mon Sep 17 00:00:00 2001 From: "Joe S." Date: Fri, 5 Dec 2025 09:19:36 -0800 Subject: [PATCH 5/5] specify type in binding Signed-off-by: Joe S. --- tests/integration_tests/test_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration_tests/test_client.py b/tests/integration_tests/test_client.py index f3ee5e18..eb24a6b5 100644 --- a/tests/integration_tests/test_client.py +++ b/tests/integration_tests/test_client.py @@ -448,7 +448,7 @@ def check_in_logs(test_query_id): max_retries = 30 for _ in range(max_retries): log_result = test_client.query( - "SELECT query_id FROM system.query_log WHERE query_id = {query_id} AND event_time > now() - 30 LIMIT 1", + "SELECT query_id FROM system.query_log WHERE query_id = {query_id:String} AND event_time > now() - 30 LIMIT 1", parameters={"query_id": test_query_id} )