Skip to content

Commit 49644f2

Browse files
committed
SQL: Stronger read-only mode, using sqlparse
1 parent 418af7a commit 49644f2

File tree

11 files changed

+274
-13
lines changed

11 files changed

+274
-13
lines changed

CHANGES.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,3 +14,4 @@
1414
- MCP docs: Used Hishel for transparently caching documentation resources,
1515
by default for one hour. Use the `CRATEDB_MCP_DOCS_CACHE_TTL` environment
1616
variable to adjust (default: 3600)
17+
- SQL: Stronger read-only mode, using `sqlparse`

README.md

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -40,10 +40,12 @@ LLMs can still hallucinate and give incorrect answers.
4040
* What is the storage consumption of my tables, give it in a graph.
4141
* How can I format a timestamp column to '2019 Jan 21'.
4242

43-
# Data integrity
44-
We do not recommend letting the LLMs insert data or modify columns by itself, as such we tell the
45-
LLMs to only allow 'SELECT' statements in the `cratedb_sql` tool, you can jailbreak this against
46-
our recommendation.
43+
# Read-only access
44+
We do not recommend letting LLM-based agents insert or modify data by itself.
45+
As such, only `SELECT` statements are permitted and forwarded to the database.
46+
All other operations will raise a `ValueError` exception, unless the
47+
`CRATEDB_MCP_PERMIT_ALL_STATEMENTS` environment variable is set to a
48+
truthy value.
4749

4850
# Install
4951
```shell

cratedb_mcp/__main__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
from .knowledge import DocumentationIndex, Queries, documentation_url_permitted
66
from .settings import DOCS_CACHE_TTL, HTTP_TIMEOUT, HTTP_URL
7+
from .util.sql import sql_is_permitted
78

89
# Configure Hishel, an httpx client with caching.
910
# Define one hour of caching time.
@@ -25,7 +26,7 @@ def query_cratedb(query: str) -> list[dict]:
2526
@mcp.tool(description="Send a SQL query to CrateDB, only 'SELECT' queries are allows, queries that"
2627
" modify data, columns or are otherwise deemed un-safe are rejected.")
2728
def query_sql(query: str):
28-
if 'select' not in query.lower():
29+
if not sql_is_permitted(query):
2930
raise ValueError('Only queries that have a SELECT statement are allowed.')
3031
return query_cratedb(query)
3132

cratedb_mcp/settings.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
import os
22
import warnings
33

4+
from attr.converters import to_bool
5+
46
HTTP_URL: str = os.getenv("CRATEDB_MCP_HTTP_URL", "http://localhost:4200")
57

68
# Configure cache lifetime for documentation resources.
@@ -15,3 +17,11 @@
1517

1618
# Configure HTTP timeout for all conversations.
1719
HTTP_TIMEOUT = 10.0
20+
21+
# Whether to permit all statements. By default, only SELECT operations are permitted.
22+
PERMIT_ALL_STATEMENTS: bool = to_bool(os.getenv("CRATEDB_MCP_PERMIT_ALL_STATEMENTS", "false"))
23+
24+
# TODO: Refactor into code which is not on the module level. Use OOM early.
25+
if PERMIT_ALL_STATEMENTS: # pragma: no cover
26+
warnings.warn("All types of SQL statements are permitted. This means the LLM "
27+
"agent can write and modify the connected database", category=UserWarning, stacklevel=2)

cratedb_mcp/util/__init__.py

Whitespace-only changes.

cratedb_mcp/util/sql.py

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
import dataclasses
2+
import logging
3+
import typing as t
4+
5+
import sqlparse
6+
from sqlparse.tokens import Keyword
7+
8+
from cratedb_mcp.settings import PERMIT_ALL_STATEMENTS
9+
10+
logger = logging.getLogger(__name__)
11+
12+
13+
def sql_is_permitted(expression: str) -> bool:
14+
"""
15+
Validate the SQL expression, only permit read queries by default.
16+
17+
When the `CRATEDB_MCP_PERMIT_ALL_STATEMENTS` environment variable is set,
18+
allow all types of statements.
19+
20+
FIXME: Revisit implementation, it might be too naive or weak.
21+
Issue: https://github.com/crate/cratedb-mcp/issues/10
22+
Question: Does SQLAlchemy provide a solid read-only mode, or any other library?
23+
"""
24+
is_dql = SqlStatementClassifier(expression=expression, permit_all=PERMIT_ALL_STATEMENTS).is_dql
25+
if is_dql:
26+
logger.info(f"Permitted SQL expression: {expression and expression[:50]}...")
27+
else:
28+
logger.warning(f"Denied SQL expression: {expression and expression[:50]}...")
29+
return is_dql
30+
31+
32+
@dataclasses.dataclass
33+
class SqlStatementClassifier:
34+
"""
35+
Helper to classify an SQL statement.
36+
37+
Here, most importantly: Provide the `is_dql` property that
38+
signals truthfulness for read-only SQL SELECT statements only.
39+
"""
40+
expression: str
41+
permit_all: bool = False
42+
43+
_parsed_sqlparse: t.Any = dataclasses.field(init=False, default=None)
44+
45+
def __post_init__(self) -> None:
46+
if self.expression is None:
47+
self.expression = ""
48+
if self.expression:
49+
self.expression = self.expression.strip()
50+
51+
def parse_sqlparse(self) -> t.List[sqlparse.sql.Statement]:
52+
"""
53+
Parse expression using traditional `sqlparse` library.
54+
"""
55+
if self._parsed_sqlparse is None:
56+
self._parsed_sqlparse = sqlparse.parse(self.expression)
57+
return self._parsed_sqlparse
58+
59+
@property
60+
def is_dql(self) -> bool:
61+
"""
62+
Whether the statement is a DQL statement, which effectively invokes read-only operations only.
63+
"""
64+
65+
if not self.expression:
66+
return False
67+
68+
if self.permit_all:
69+
return True
70+
71+
# Check if the expression is valid and if it's a DQL/SELECT statement,
72+
# also trying to consider `SELECT ... INTO ...` and evasive
73+
# `SELECT * FROM users; \uff1b DROP TABLE users` statements.
74+
return self.is_select and not self.is_camouflage
75+
76+
@property
77+
def is_select(self) -> bool:
78+
"""
79+
Whether the expression is an SQL SELECT statement.
80+
"""
81+
return self.operation == 'SELECT'
82+
83+
@property
84+
def operation(self) -> str:
85+
"""
86+
The SQL operation: SELECT, INSERT, UPDATE, DELETE, CREATE, etc.
87+
"""
88+
parsed = self.parse_sqlparse()
89+
return parsed[0].get_type().upper()
90+
91+
@property
92+
def is_camouflage(self) -> bool:
93+
"""
94+
Innocent-looking `SELECT` statements can evade filters.
95+
"""
96+
return self.is_select_into or self.is_evasive
97+
98+
@property
99+
def is_select_into(self) -> bool:
100+
"""
101+
Use traditional `sqlparse` for catching `SELECT ... INTO ...` statements.
102+
Examples:
103+
SELECT * INTO foobar FROM bazqux
104+
SELECT * FROM bazqux INTO foobar
105+
"""
106+
# Flatten all tokens (including nested ones) and match on type+value.
107+
statement = self.parse_sqlparse()[0]
108+
return any(
109+
token.ttype is Keyword and token.value.upper() == "INTO"
110+
for token in statement.flatten()
111+
)
112+
113+
@property
114+
def is_evasive(self) -> bool:
115+
"""
116+
Use traditional `sqlparse` for catching evasive SQL statements.
117+
118+
A practice picked up from CodeRabbit was to reject multiple statements
119+
to prevent potential SQL injections. Is it a viable suggestion?
120+
121+
Examples:
122+
123+
SELECT * FROM users; \uff1b DROP TABLE users
124+
"""
125+
parsed = self.parse_sqlparse()
126+
return len(parsed) > 1

docs/backlog.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# CrateDB MCP backlog
2+
3+
## Iteration +1
4+
- Docs: HTTP caching
5+
- Docs: Load documentation index from custom outline file
6+
- SQL: Extract `SqlFilter` or `SqlGateway` functionality to `cratedb-sqlparse`
7+
8+
## Done
9+
- SQL: Stronger read-only mode

pyproject.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,12 @@ dynamic = [
2020
"version",
2121
]
2222
dependencies = [
23+
"attrs",
2324
"cachetools<6",
2425
"cratedb-about==0.0.4",
2526
"hishel<0.2",
2627
"mcp[cli]>=1.5.0",
28+
"sqlparse<0.6",
2729
]
2830
optional-dependencies.develop = [
2931
"mypy<1.16",

tests/test_knowledge.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,3 +20,5 @@ def test_queries():
2020
assert "sys.health" in Queries.TABLES_METADATA
2121
assert "WITH partitions_health" in Queries.TABLES_METADATA
2222
assert "LEFT JOIN" in Queries.TABLES_METADATA
23+
24+

tests/test_mcp.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -29,19 +29,20 @@ def test_fetch_docs_permitted_cratedb_com():
2929
assert "initcap" in response
3030

3131

32-
def test_query_sql_forbidden():
32+
def test_query_sql_permitted():
33+
assert query_sql("SELECT 42")["rows"] == [[42]]
34+
35+
36+
def test_query_sql_forbidden_easy():
3337
with pytest.raises(ValueError) as ex:
3438
assert "RelationUnknown" in str(query_sql("INSERT INTO foobar (id) VALUES (42) RETURNING id"))
3539
assert ex.match("Only queries that have a SELECT statement are allowed")
3640

3741

38-
def test_query_sql_permitted():
39-
assert query_sql("SELECT 42")["rows"] == [[42]]
40-
41-
42-
def test_query_sql_permitted_exploit():
43-
# FIXME: Read-only protection must become stronger.
44-
query_sql("INSERT INTO foobar (operation) VALUES ('select')")
42+
def test_query_sql_forbidden_sneak_value():
43+
with pytest.raises(ValueError) as ex:
44+
query_sql("INSERT INTO foobar (operation) VALUES ('select')")
45+
assert ex.match("Only queries that have a SELECT statement are allowed")
4546

4647

4748
def test_get_table_metadata():

tests/test_util.py

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
import cratedb_mcp
2+
from cratedb_mcp.util.sql import sql_is_permitted
3+
4+
5+
def test_sql_select_permitted():
6+
"""Regular SQL SELECT statements are permitted"""
7+
assert sql_is_permitted("SELECT 42") is True
8+
assert sql_is_permitted(" SELECT 42") is True
9+
assert sql_is_permitted("select 42") is True
10+
11+
12+
def test_sql_select_rejected():
13+
"""Bogus SQL SELECT statements are rejected"""
14+
assert sql_is_permitted(r"--\; select 42") is False
15+
16+
17+
def test_sql_insert_allowed(mocker):
18+
"""When explicitly allowed, permit any kind of statement"""
19+
mocker.patch.object(cratedb_mcp.util.sql, "PERMIT_ALL_STATEMENTS", True)
20+
assert sql_is_permitted("INSERT INTO foobar") is True
21+
22+
23+
def test_sql_select_multiple_rejected():
24+
"""Multiple SQL statements are rejected"""
25+
assert sql_is_permitted("SELECT 42; SELECT 42;") is False
26+
27+
28+
def test_sql_create_rejected():
29+
"""DDL statements are rejected"""
30+
assert sql_is_permitted("CREATE TABLE foobar AS SELECT 42") is False
31+
32+
33+
def test_sql_insert_rejected():
34+
"""DML statements are rejected"""
35+
assert sql_is_permitted("INSERT INTO foobar") is False
36+
37+
38+
def test_sql_select_into_rejected():
39+
"""SELECT+DML statements are rejected"""
40+
assert sql_is_permitted("SELECT * INTO foobar FROM bazqux") is False
41+
assert sql_is_permitted("SELECT * FROM bazqux INTO foobar") is False
42+
assert sql_is_permitted("WITH FOO AS (SELECT * FROM (SELECT * bazqux INTO foobar))") is False
43+
44+
45+
def test_sql_select_into_permitted():
46+
"""Forbidden keywords do not contribute to classification when used as labels"""
47+
assert sql_is_permitted('SELECT * FROM "into"') is True
48+
assert sql_is_permitted('SELECT * FROM "INTO"') is True
49+
50+
51+
def test_sql_empty_rejected():
52+
"""Empty statements are rejected"""
53+
assert sql_is_permitted("") is False
54+
55+
56+
def test_sql_almost_empty_rejected():
57+
"""Quasi-empty statements are rejected"""
58+
assert sql_is_permitted(" ") is False
59+
60+
61+
def test_sql_none_rejected():
62+
"""Void statements are rejected"""
63+
assert sql_is_permitted(None) is False
64+
65+
66+
def test_sql_multiple_statements_rejected():
67+
assert sql_is_permitted("SELECT 42; INSERT INTO foo VALUES (1)") is False
68+
69+
70+
def test_sql_with_comments_rejected():
71+
assert sql_is_permitted(
72+
"/* Sneaky comment */ INSERT /* another comment */ INTO foo VALUES (1)") is False
73+
74+
75+
def test_sql_update_rejected():
76+
"""UPDATE statements are rejected"""
77+
assert sql_is_permitted("UPDATE foobar SET column = 'value'") is False
78+
79+
80+
def test_sql_delete_rejected():
81+
"""DELETE statements are rejected"""
82+
assert sql_is_permitted("DELETE FROM foobar") is False
83+
84+
85+
def test_sql_truncate_rejected():
86+
"""TRUNCATE statements are rejected"""
87+
assert sql_is_permitted("TRUNCATE TABLE foobar") is False
88+
89+
90+
def test_sql_drop_rejected():
91+
"""DROP statements are rejected"""
92+
assert sql_is_permitted("DROP TABLE foobar") is False
93+
94+
95+
def test_sql_alter_rejected():
96+
"""ALTER statements are rejected"""
97+
assert sql_is_permitted("ALTER TABLE foobar ADD COLUMN newcol INTEGER") is False
98+
99+
100+
def test_sql_case_manipulation_rejected():
101+
"""Statements with case manipulation to hide intent are rejected"""
102+
assert sql_is_permitted("SeLeCt * FrOm users; DrOp TaBlE users") is False
103+
104+
105+
def test_sql_unicode_evasion_rejected():
106+
"""Statements with unicode characters to evade filters are rejected"""
107+
assert sql_is_permitted("SELECT * FROM users; \uFF1B DROP TABLE users") is False

0 commit comments

Comments
 (0)