Skip to content

Commit 52e3025

Browse files
author
Augusto Hack
authored
Merge pull request #362 from aiven/jjaakola-aiven-avro-schema-anonymized-export
Export anonymized Avro schema with schema backup tool #362
2 parents a90987a + 31ae379 commit 52e3025

5 files changed

Lines changed: 842 additions & 29 deletions

File tree

karapace/anonymize_schemas/__init__.py

Whitespace-only changes.
Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
"""
2+
karapace - anonymize avro
3+
4+
Copyright (c) 2022 Aiven Ltd
5+
See LICENSE for details
6+
"""
7+
from typing import Any, Dict, List, Union
8+
9+
import hashlib
10+
11+
# Avro schema attribute names. Values under these keys may carry names that
# must be anonymized, so ``anonymize`` recurses into them.
ALIASES = "aliases"
DEFAULT = "default"
DOC = "doc"
ENUM = "enum"
FIELDS = "fields"
ITEMS = "items"
LOGICAL_TYPE = "logicalType"
NAME = "name"
NAMESPACE = "namespace"
ORDER = "order"
PRECISION = "precision"
SCALE = "scale"
SIZE = "size"
SYMBOLS = "symbols"
TYPE = "type"
VALUES = "values"


# The ``doc`` and ``order`` fields are special cases and not included:
# ``doc`` is dropped entirely and ``order`` is validated against
# ORDER_VALID_VALUES before being kept (see ``anonymize``).
KEYWORDS = [
    ALIASES,
    DEFAULT,
    ENUM,
    FIELDS,
    ITEMS,
    LOGICAL_TYPE,
    NAME,
    NAMESPACE,
    PRECISION,
    SCALE,
    SIZE,
    SYMBOLS,
    TYPE,
    VALUES,
]
# Avro primitive type names — these are keywords, not user names, and are
# kept as-is by ``anonymize``.
PRIMITIVE_TYPES = ["null", "boolean", "int", "long", "float", "double", "bytes", "string"]
# Avro logical type names, likewise kept as-is.
LOGICAL_TYPES = [
    "date",
    "decimal",
    "duration",
    "local-timestamp-micros",
    "local-timestamp-millis",
    "time-micros",
    "time-millis",
    "timestamp-micros",
    "timestamp-millis",
    "uuid",
]
# Every recognised Avro type name: primitives, logical types and the
# complex type keywords.
ALL_TYPES = PRIMITIVE_TYPES + LOGICAL_TYPES + ["array", ENUM, "fixed", "map", "record"]

# Allowed values for a field's ``order`` attribute per the Avro spec.
ORDER_VALID_VALUES = ["ascending", "descending", "ignore"]
62+
63+
64+
def anonymize_name(name: str) -> str:
    """Anonymize a (possibly dot-separated) Avro name.

    The name is split on dots into its logical elements and each element is
    replaced by its SHA-1 hex digest. The first character of every digest is
    forced to ``"a"`` so the result conforms to the Avro name format:
    * starts with [A-Za-z_]
    * subsequently contains only [A-Za-z0-9_]

    Forcing the first character breaks the hash value itself, but the mapping
    stays consistent (equal inputs map to equal outputs), which is all that
    anonymization requires.

    Returns the anonymized name.
    """
    return ".".join(
        # SHA-1 digests may start with a digit; prefix-replace with 'a'.
        "a" + hashlib.sha1(element.encode("utf-8")).hexdigest()[1:]
        for element in name.split(".")
    )
85+
86+
87+
# A JSON-decoded Avro schema node: a type-name string, a schema object,
# or a union (list of schemas).
Schema = Union[str, Dict[str, Any], List[Any]]


def anonymize(input_schema: Schema) -> Schema:
    """Recursively anonymize an Avro schema.

    Strings that are recognised Avro type names are kept as-is; any other
    string is treated as a user-supplied name and hashed. Lists (unions,
    aliases, symbols) are anonymized element-wise. For mapping nodes:
    * values under known Avro keywords are anonymized recursively,
    * ``doc`` is dropped because it may contain non-public, sensitive data,
    * ``order`` is kept only when it is a valid Avro order value,
    * any other key is itself anonymized along with its value.

    Falsy input (``None``, empty containers) and unrecognised node types
    (e.g. numeric or boolean default values) are returned unchanged.
    """
    if not input_schema:
        return input_schema
    if isinstance(input_schema, str):
        if input_schema in ALL_TYPES:
            return input_schema
        return anonymize_name(input_schema)
    # Use builtin list/dict here: isinstance() with typing.List/typing.Dict
    # is deprecated (PEP 585) and the builtins are equivalent at runtime.
    if isinstance(input_schema, list):
        return [anonymize(value) for value in input_schema]
    if isinstance(input_schema, dict):
        output_schema: Dict[str, Any] = {}
        for key, value in input_schema.items():
            if key in KEYWORDS:
                output_schema[key] = anonymize(value)
            elif key == DOC:
                # Doc attribute may contain non-public and sensitive data.
                continue
            elif key == ORDER:
                # Keep a valid order verbatim; anonymize anything unexpected.
                output_schema[key] = value if value in ORDER_VALID_VALUES else anonymize(value)
            else:
                if isinstance(key, str):
                    key = anonymize_name(key)
                output_schema[key] = anonymize(value)
        return output_schema
    # Non-str/list/dict leaf (e.g. int/bool default): nothing to anonymize.
    return input_schema

karapace/schema_backup.py

Lines changed: 69 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,11 @@
88
from kafka.admin import KafkaAdminClient
99
from kafka.errors import NoBrokersAvailable, NodeNotReadyError, TopicAlreadyExistsError
1010
from karapace import constants
11+
from karapace.anonymize_schemas import anonymize_avro
1112
from karapace.config import Config, read_config
1213
from karapace.schema_reader import KafkaSchemaReader
1314
from karapace.utils import json_encode, KarapaceKafkaClient
14-
from typing import Optional
15+
from typing import Dict, List, Optional, Tuple
1516

1617
import argparse
1718
import logging
@@ -138,33 +139,7 @@ def close(self):
138139
self.admin_client = None
139140

140141
def request_backup(self):
141-
if not self.consumer:
142-
self.init_consumer()
143-
self.log.info("Starting schema backup read for topic: %r", self.topic_name)
144-
145-
values = []
146-
topic_fully_consumed = False
147-
148-
while not topic_fully_consumed:
149-
150-
raw_msg = self.consumer.poll(timeout_ms=self.timeout_ms)
151-
topic_fully_consumed = len(raw_msg) == 0
152-
153-
for _, messages in raw_msg.items():
154-
for message in messages:
155-
key = message.key.decode("utf8")
156-
try:
157-
key = ujson.loads(key)
158-
except ValueError:
159-
self.log.debug("Invalid JSON in message.key: %r, value: %r", message.key, message.value)
160-
value = None
161-
if message.value:
162-
value = message.value.decode("utf8")
163-
try:
164-
value = ujson.loads(value)
165-
except ValueError:
166-
self.log.debug("Invalid JSON in message.value: %r, key: %r", message.value, message.key)
167-
values.append((key, value))
142+
values = self._export()
168143

169144
ser = ujson.dumps(values)
170145
if self.backup_location:
@@ -203,6 +178,65 @@ def restore_backup(self):
203178
self.log.debug("Sent kafka msg key: %r, value: %r, offset: %r", key, value, msg.offset)
204179
self.close()
205180

181+
def export_anonymized_avro_schemas(self):
    """Export all Avro schemas from the schemas topic in anonymized form.

    The export is written as JSON to ``self.backup_location`` when set,
    otherwise to stdout. Non-Avro schemas and messages without a schema
    are skipped.
    """
    anonymized_schemas = []

    for key, value in self._export():
        # The schemas topic contains all changes to schema metadata.
        # Only keep messages that carry a `schema` and whose type is Avro;
        # Avro schemas may omit the `schemaType` key, so it defaults to AVRO.
        if not (value and "schema" in value and value.get("schemaType", "AVRO") == "AVRO"):
            continue
        anonymized_schema = anonymize_avro.anonymize(ujson.loads(value.get("schema")))
        if not anonymized_schema:
            continue
        # Subject names are identifying, so they are hashed as well.
        if "subject" in key:
            key["subject"] = anonymize_avro.anonymize_name(key["subject"])
        if "subject" in value:
            value["subject"] = anonymize_avro.anonymize_name(value["subject"])
        value["schema"] = anonymized_schema
        anonymized_schemas.append((key, value))

    ser = ujson.dumps(anonymized_schemas)
    if self.backup_location:
        with open(self.backup_location, mode="w", encoding="utf8") as fp:
            fp.write(ser)
        self.log.info("Anonymized Avro schema export written to %r", self.backup_location)
    else:
        print(ser)
        self.log.info("Anonymized Avro schema export written to stdout")
    self.close()
208+
209+
def _export(self) -> List[Tuple[str, Dict[str, str]]]:
    """Consume the schemas topic to its end and return all (key, value) pairs.

    Each message part is utf-8 decoded and JSON-parsed; when parsing fails
    the raw decoded string is kept (and the failure is logged at debug
    level). A missing message value yields ``None``.

    NOTE(review): the declared return type looks narrower than reality —
    keys/values may be plain strings on JSON failure, and values may be
    ``None``; confirm before tightening or relying on the annotation.
    """
    if not self.consumer:
        self.init_consumer()
    self.log.info("Starting schema backup read for topic: %r", self.topic_name)

    values = []
    topic_fully_consumed = False

    while not topic_fully_consumed:

        raw_msg = self.consumer.poll(timeout_ms=self.timeout_ms)
        # An empty poll result means no more messages arrived within the
        # timeout — treat the topic as fully consumed.
        topic_fully_consumed = len(raw_msg) == 0

        # poll() returns a mapping of topic-partition -> message batch.
        for _, messages in raw_msg.items():
            for message in messages:
                key = message.key.decode("utf8")
                try:
                    key = ujson.loads(key)
                except ValueError:
                    # Keep the raw string key when it is not valid JSON.
                    self.log.debug("Invalid JSON in message.key: %r, value: %r", message.key, message.value)
                value = None
                if message.value:
                    value = message.value.decode("utf8")
                    try:
                        value = ujson.loads(value)
                    except ValueError:
                        # Keep the raw string value when it is not valid JSON.
                        self.log.debug("Invalid JSON in message.value: %r, key: %r", message.value, message.key)
                values.append((key, value))

    return values
239+
206240

207241
def encode_value(value):
208242
if value == "null":
@@ -218,7 +252,10 @@ def parse_args():
218252

219253
parser_get = subparsers.add_parser("get", help="Store the schema backup into a file")
220254
parser_restore = subparsers.add_parser("restore", help="Restore the schema backup from a file")
221-
for p in [parser_get, parser_restore]:
255+
parser_export_anonymized_avro_schemas = subparsers.add_parser(
256+
"export-anonymized-avro-schemas", help="Export anonymized Avro schemas into a file"
257+
)
258+
for p in [parser_get, parser_restore, parser_export_anonymized_avro_schemas]:
222259
p.add_argument("--config", help="Configuration file path", required=True)
223260
p.add_argument("--location", default="", help="File path for the backup file")
224261
p.add_argument("--topic", help="Kafka topic name to be used", required=False)
@@ -240,6 +277,9 @@ def main() -> int:
240277
if args.command == "restore":
241278
sb.restore_backup()
242279
return 0
280+
if args.command == "export-anonymized-avro-schemas":
281+
sb.export_anonymized_avro_schemas()
282+
return 0
243283
return 1
244284

245285

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
"""
2+
karapace - test schema backup
3+
4+
Copyright (c) 2019 Aiven Ltd
5+
See LICENSE for details
6+
"""
7+
from karapace.config import set_config_defaults
8+
from karapace.schema_backup import SchemaBackup
9+
from karapace.utils import Client
10+
from pathlib import Path
11+
from tests.utils import KafkaServers
12+
from typing import Any, Dict
13+
14+
import os
15+
import ujson
16+
17+
baseurl = "http://localhost:8081"


# Subjects used in the test, plus their expected anonymized (SHA-1-based,
# first hex digit forced to 'a') hashes.
JSON_SUBJECT = "json-schemas"
JSON_SUBJECT_HASH = "a2a0483c6ce0d38798ef218420e3f132608dbebf"
# A JSON-type schema: must be excluded from the anonymized Avro export.
JSON_SCHEMA = {
    "type": "object",
    "title": "JSON-schema",
    "description": "example",
    "properties": {"test": {"type": "integer", "title": "my test number", "default": 5}},
}

AVRO_SUBJECT = "avro-schemas"
AVRO_SUBJECT_HASH = "a801beafef1fb8c03907b44ec7baca341a58420d"
AVRO_SCHEMA = {
    "type": "record",
    "namespace": "io.aiven",
    "name": "myrecord",
    "fields": [
        {
            "type": "string",
            "name": "f1",
        },
    ],
}
# AVRO_SCHEMA with every user-defined name/namespace element replaced by its
# anonymized hash; type keywords ("record", "string") are kept verbatim.
EXPECTED_AVRO_SCHEMA = {
    "type": "record",
    "namespace": "aa258230180d9c643f761089d7e33b8b52288ed3.ae02f26b082c5f3bc7027f72335dd1186a2cd382",
    "name": "afe8733e983101f1f4ff50d24152890d0da71418",
    "fields": [
        {
            "type": "string",
            "name": "a09bb890b096f7306f688cc6d1dad34e7e52a223",
        },
    ],
}
53+
54+
55+
async def insert_data(c: Client, schemaType: str, subject: str, data: Dict[str, Any]) -> None:
    """Register *data* as a new schema version under *subject*.

    The schema is JSON-serialized because the registry API expects the
    ``schema`` field to be a string, not an object. Asserts that the
    registration succeeded and returned a schema id.
    """
    schema_string = ujson.dumps(data)
    res = await c.post(
        "subjects/{}/versions".format(subject),
        # schema_string is already a str — no need to wrap it in an f-string.
        json={"schema": schema_string, "schemaType": schemaType},
    )
    assert res.status == 200
    assert "id" in res.json()
64+
65+
async def test_export_anonymized_avro_schemas(
    registry_async_client: Client, kafka_servers: KafkaServers, tmp_path: Path
) -> None:
    """End-to-end check of the anonymized Avro schema export.

    Registers one JSON and one Avro schema, runs the export, and verifies
    that only the Avro schema appears in the export file — with its subject
    and all names anonymized as expected.
    """
    await insert_data(registry_async_client, "JSON", JSON_SUBJECT, JSON_SCHEMA)
    await insert_data(registry_async_client, "AVRO", AVRO_SUBJECT, AVRO_SCHEMA)

    # Run the export against a temporary file.
    export_location = tmp_path / "export.log"
    config = set_config_defaults({"bootstrap_uri": kafka_servers.bootstrap_servers})
    sb = SchemaBackup(config, str(export_location))
    sb.export_anonymized_avro_schemas()

    # The export file has been created (Path.exists() — no need for os.path
    # on a pathlib.Path object).
    assert export_location.exists()

    with export_location.open("r") as fp:
        exported_data = ujson.load(fp)

    expected_subject_hash_found = False
    json_schema_subject_hash_found = False
    for msg in exported_data:
        # Each exported entry is a (key, value) pair.
        assert len(msg) == 2
        key, schema_data = msg
        subject_hash = key.get("subject", None)
        if subject_hash == AVRO_SUBJECT_HASH:
            expected_subject_hash_found = True
            assert schema_data["subject"] == AVRO_SUBJECT_HASH
            assert schema_data["schema"] == EXPECTED_AVRO_SCHEMA
        if subject_hash == JSON_SUBJECT_HASH:
            json_schema_subject_hash_found = True

    assert expected_subject_hash_found
    # JSON schemas must not leak into the Avro-only export.
    assert not json_schema_subject_hash_found

0 commit comments

Comments
 (0)