Skip to content

Commit 52e3025

Browse files
author
Augusto Hack
authored
Merge pull request #362 from aiven/jjaakola-aiven-avro-schema-anonymized-export
Export anonymized Avro schema with schema backup tool #362
2 parents a90987a + 31ae379 commit 52e3025

5 files changed

Lines changed: 842 additions & 29 deletions

File tree

karapace/anonymize_schemas/__init__.py

Whitespace-only changes.
Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
"""
2+
karapace - anonymize avro
3+
4+
Copyright (c) 2022 Aiven Ltd
5+
See LICENSE for details
6+
"""
7+
from typing import Any, Dict, List, Union
8+
9+
import hashlib
10+
11+
# Avro schema attribute names. Values under these keys may carry names that
# must be anonymized, so ``anonymize`` recurses into them.
ALIASES = "aliases"
DEFAULT = "default"
DOC = "doc"
ENUM = "enum"
FIELDS = "fields"
ITEMS = "items"
LOGICAL_TYPE = "logicalType"
NAME = "name"
NAMESPACE = "namespace"
ORDER = "order"
PRECISION = "precision"
SCALE = "scale"
SIZE = "size"
SYMBOLS = "symbols"
TYPE = "type"
VALUES = "values"


# The ``doc`` and ``order`` fields are special cases and not included:
# ``doc`` is dropped entirely and ``order`` is validated against
# ORDER_VALID_VALUES before being kept (see ``anonymize``).
KEYWORDS = [
    ALIASES,
    DEFAULT,
    ENUM,
    FIELDS,
    ITEMS,
    LOGICAL_TYPE,
    NAME,
    NAMESPACE,
    PRECISION,
    SCALE,
    SIZE,
    SYMBOLS,
    TYPE,
    VALUES,
]
# Avro primitive type names — these are keywords, not user names, and are
# kept as-is by ``anonymize``.
PRIMITIVE_TYPES = ["null", "boolean", "int", "long", "float", "double", "bytes", "string"]
# Avro logical type names, likewise kept as-is.
LOGICAL_TYPES = [
    "date",
    "decimal",
    "duration",
    "local-timestamp-micros",
    "local-timestamp-millis",
    "time-micros",
    "time-millis",
    "timestamp-micros",
    "timestamp-millis",
    "uuid",
]
# Every recognised Avro type name: primitives, logical types and the
# complex type keywords.
ALL_TYPES = PRIMITIVE_TYPES + LOGICAL_TYPES + ["array", ENUM, "fixed", "map", "record"]

# Allowed values for a field's ``order`` attribute per the Avro spec.
ORDER_VALID_VALUES = ["ascending", "descending", "ignore"]
62+
63+
64+
def anonymize_name(name: str) -> str:
    """Anonymize a (possibly dot-separated) Avro name.

    The name is split on dots into its logical elements and each element is
    replaced by its SHA-1 hex digest. The first character of every digest is
    forced to ``"a"`` so the result conforms to the Avro name format:
    * starts with [A-Za-z_]
    * subsequently contains only [A-Za-z0-9_]

    Forcing the first character breaks the hash value itself, but the mapping
    stays consistent (equal inputs map to equal outputs), which is all that
    anonymization requires.

    Returns the anonymized name.
    """
    return ".".join(
        # SHA-1 digests may start with a digit; prefix-replace with 'a'.
        "a" + hashlib.sha1(element.encode("utf-8")).hexdigest()[1:]
        for element in name.split(".")
    )
85+
86+
87+
# A JSON-decoded Avro schema node: a type-name string, a schema object,
# or a union (list of schemas).
Schema = Union[str, Dict[str, Any], List[Any]]


def anonymize(input_schema: Schema) -> Schema:
    """Recursively anonymize an Avro schema.

    Strings that are recognised Avro type names are kept as-is; any other
    string is treated as a user-supplied name and hashed. Lists (unions,
    aliases, symbols) are anonymized element-wise. For mapping nodes:
    * values under known Avro keywords are anonymized recursively,
    * ``doc`` is dropped because it may contain non-public, sensitive data,
    * ``order`` is kept only when it is a valid Avro order value,
    * any other key is itself anonymized along with its value.

    Falsy input (``None``, empty containers) and unrecognised node types
    (e.g. numeric or boolean default values) are returned unchanged.
    """
    if not input_schema:
        return input_schema
    if isinstance(input_schema, str):
        if input_schema in ALL_TYPES:
            return input_schema
        return anonymize_name(input_schema)
    # Use builtin list/dict here: isinstance() with typing.List/typing.Dict
    # is deprecated (PEP 585) and the builtins are equivalent at runtime.
    if isinstance(input_schema, list):
        return [anonymize(value) for value in input_schema]
    if isinstance(input_schema, dict):
        output_schema: Dict[str, Any] = {}
        for key, value in input_schema.items():
            if key in KEYWORDS:
                output_schema[key] = anonymize(value)
            elif key == DOC:
                # Doc attribute may contain non-public and sensitive data.
                continue
            elif key == ORDER:
                # Keep a valid order verbatim; anonymize anything unexpected.
                output_schema[key] = value if value in ORDER_VALID_VALUES else anonymize(value)
            else:
                if isinstance(key, str):
                    key = anonymize_name(key)
                output_schema[key] = anonymize(value)
        return output_schema
    # Non-str/list/dict leaf (e.g. int/bool default): nothing to anonymize.
    return input_schema

karapace/schema_backup.py

Lines changed: 69 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,11 @@
88
from kafka.admin import KafkaAdminClient
99
from kafka.errors import NoBrokersAvailable, NodeNotReadyError, TopicAlreadyExistsError
1010
from karapace import constants
11+
from karapace.anonymize_schemas import anonymize_avro
1112
from karapace.config import Config, read_config
1213
from karapace.schema_reader import KafkaSchemaReader
1314
from karapace.utils import json_encode, KarapaceKafkaClient
14-
from typing import Optional
15+
from typing import Dict, List, Optional, Tuple
1516

1617
import argparse
1718
import logging
@@ -138,33 +139,7 @@ def close(self):
138139
self.admin_client = None
139140

140141
def request_backup(self):
141-
if not self.consumer:
142-
self.init_consumer()
143-
self.log.info("Starting schema backup read for topic: %r", self.topic_name)
144-
145-
values = []
146-
topic_fully_consumed = False
147-
148-
while not topic_fully_consumed:
149-
150-
raw_msg = self.consumer.poll(timeout_ms=self.timeout_ms)
151-
topic_fully_consumed = len(raw_msg) == 0
152-
153-
for _, messages in raw_msg.items():
154-
for message in messages:
155-
key = message.key.decode("utf8")
156-
try:
157-
key = ujson.loads(key)
158-
except ValueError:
159-
self.log.debug("Invalid JSON in message.key: %r, value: %r", message.key, message.value)
160-
value = None
161-
if message.value:
162-
value = message.value.decode("utf8")
163-
try:
164-
value = ujson.loads(value)
165-
except ValueError:
166-
self.log.debug("Invalid JSON in message.value: %r, key: %r", message.value, message.key)
167-
values.append((key, value))
142+
values = self._export()
168143

169144
ser = ujson.dumps(values)
170145
if self.backup_location:
@@ -203,6 +178,65 @@ def restore_backup(self):
203178
self.log.debug("Sent kafka msg key: %r, value: %r, offset: %r", key, value, msg.offset)
204179
self.close()
205180

181+
def export_anonymized_avro_schemas(self):
    """Export all Avro schemas from the schemas topic in anonymized form.

    The export is written as JSON to ``self.backup_location`` when set,
    otherwise to stdout. Non-Avro schemas and messages without a schema
    are skipped.
    """
    anonymized_schemas = []

    for key, value in self._export():
        # The schemas topic contains all changes to schema metadata.
        # Only keep messages that carry a `schema` and whose type is Avro;
        # Avro schemas may omit the `schemaType` key, so it defaults to AVRO.
        if not (value and "schema" in value and value.get("schemaType", "AVRO") == "AVRO"):
            continue
        anonymized_schema = anonymize_avro.anonymize(ujson.loads(value.get("schema")))
        if not anonymized_schema:
            continue
        # Subject names are identifying, so they are hashed as well.
        if "subject" in key:
            key["subject"] = anonymize_avro.anonymize_name(key["subject"])
        if "subject" in value:
            value["subject"] = anonymize_avro.anonymize_name(value["subject"])
        value["schema"] = anonymized_schema
        anonymized_schemas.append((key, value))

    ser = ujson.dumps(anonymized_schemas)
    if self.backup_location:
        with open(self.backup_location, mode="w", encoding="utf8") as fp:
            fp.write(ser)
        self.log.info("Anonymized Avro schema export written to %r", self.backup_location)
    else:
        print(ser)
        self.log.info("Anonymized Avro schema export written to stdout")
    self.close()
208+
209+
def _export(self) -> List[Tuple[str, Dict[str, str]]]:
    """Consume the schemas topic to its end and return all (key, value) pairs.

    Each message part is utf-8 decoded and JSON-parsed; when parsing fails
    the raw decoded string is kept (and the failure is logged at debug
    level). A missing message value yields ``None``.

    NOTE(review): the declared return type looks narrower than reality —
    keys/values may be plain strings on JSON failure, and values may be
    ``None``; confirm before tightening or relying on the annotation.
    """
    if not self.consumer:
        self.init_consumer()
    self.log.info("Starting schema backup read for topic: %r", self.topic_name)

    values = []
    topic_fully_consumed = False

    while not topic_fully_consumed:

        raw_msg = self.consumer.poll(timeout_ms=self.timeout_ms)
        # An empty poll result means no more messages arrived within the
        # timeout — treat the topic as fully consumed.
        topic_fully_consumed = len(raw_msg) == 0

        # poll() returns a mapping of topic-partition -> message batch.
        for _, messages in raw_msg.items():
            for message in messages:
                key = message.key.decode("utf8")
                try:
                    key = ujson.loads(key)
                except ValueError:
                    # Keep the raw string key when it is not valid JSON.
                    self.log.debug("Invalid JSON in message.key: %r, value: %r", message.key, message.value)
                value = None
                if message.value:
                    value = message.value.decode("utf8")
                    try:
                        value = ujson.loads(value)
                    except ValueError:
                        # Keep the raw string value when it is not valid JSON.
                        self.log.debug("Invalid JSON in message.value: %r, key: %r", message.value, message.key)
                values.append((key, value))

    return values
239+
206240

207241
def encode_value(value):
208242
if value == "null":
@@ -218,7 +252,10 @@ def parse_args():
218252

219253
parser_get = subparsers.add_parser("get", help="Store the schema backup into a file")
220254
parser_restore = subparsers.add_parser("restore", help="Restore the schema backup from a file")
221-
for p in [parser_get, parser_restore]:
255+
parser_export_anonymized_avro_schemas = subparsers.add_parser(
256+
"export-anonymized-avro-schemas", help="Export anonymized Avro schemas into a file"
257+
)
258+
for p in [parser_get, parser_restore, parser_export_anonymized_avro_schemas]:
222259
p.add_argument("--config", help="Configuration file path", required=True)
223260
p.add_argument("--location", default="", help="File path for the backup file")
224261
p.add_argument("--topic", help="Kafka topic name to be used", required=False)
@@ -240,6 +277,9 @@ def main() -> int:
240277
if args.command == "restore":
241278
sb.restore_backup()
242279
return 0
280+
if args.command == "export-anonymized-avro-schemas":
281+
sb.export_anonymized_avro_schemas()
282+
return 0
243283
return 1
244284

245285

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
"""
2+
karapace - test schema backup
3+
4+
Copyright (c) 2019 Aiven Ltd
5+
See LICENSE for details
6+
"""
7+
from karapace.config import set_config_defaults
8+
from karapace.schema_backup import SchemaBackup
9+
from karapace.utils import Client
10+
from pathlib import Path
11+
from tests.utils import KafkaServers
12+
from typing import Any, Dict
13+
14+
import os
15+
import ujson
16+
17+
baseurl = "http://localhost:8081"


# Subjects used in the test, plus their expected anonymized (SHA-1-based,
# first hex digit forced to 'a') hashes.
JSON_SUBJECT = "json-schemas"
JSON_SUBJECT_HASH = "a2a0483c6ce0d38798ef218420e3f132608dbebf"
# A JSON-type schema: must be excluded from the anonymized Avro export.
JSON_SCHEMA = {
    "type": "object",
    "title": "JSON-schema",
    "description": "example",
    "properties": {"test": {"type": "integer", "title": "my test number", "default": 5}},
}

AVRO_SUBJECT = "avro-schemas"
AVRO_SUBJECT_HASH = "a801beafef1fb8c03907b44ec7baca341a58420d"
AVRO_SCHEMA = {
    "type": "record",
    "namespace": "io.aiven",
    "name": "myrecord",
    "fields": [
        {
            "type": "string",
            "name": "f1",
        },
    ],
}
# AVRO_SCHEMA with every user-defined name/namespace element replaced by its
# anonymized hash; type keywords ("record", "string") are kept verbatim.
EXPECTED_AVRO_SCHEMA = {
    "type": "record",
    "namespace": "aa258230180d9c643f761089d7e33b8b52288ed3.ae02f26b082c5f3bc7027f72335dd1186a2cd382",
    "name": "afe8733e983101f1f4ff50d24152890d0da71418",
    "fields": [
        {
            "type": "string",
            "name": "a09bb890b096f7306f688cc6d1dad34e7e52a223",
        },
    ],
}
53+
54+
55+
async def insert_data(c: Client, schemaType: str, subject: str, data: Dict[str, Any]) -> None:
    """Register *data* as a new schema version under *subject*.

    The schema is JSON-serialized because the registry API expects the
    ``schema`` field to be a string, not an object. Asserts that the
    registration succeeded and returned a schema id.
    """
    schema_string = ujson.dumps(data)
    res = await c.post(
        "subjects/{}/versions".format(subject),
        # schema_string is already a str — no need to wrap it in an f-string.
        json={"schema": schema_string, "schemaType": schemaType},
    )
    assert res.status == 200
    assert "id" in res.json()
64+
65+
async def test_export_anonymized_avro_schemas(
    registry_async_client: Client, kafka_servers: KafkaServers, tmp_path: Path
) -> None:
    """End-to-end check of the anonymized Avro schema export.

    Registers one JSON and one Avro schema, runs the export, and verifies
    that only the Avro schema appears in the export file — with its subject
    and all names anonymized as expected.
    """
    await insert_data(registry_async_client, "JSON", JSON_SUBJECT, JSON_SCHEMA)
    await insert_data(registry_async_client, "AVRO", AVRO_SUBJECT, AVRO_SCHEMA)

    # Run the export against a temporary file.
    export_location = tmp_path / "export.log"
    config = set_config_defaults({"bootstrap_uri": kafka_servers.bootstrap_servers})
    sb = SchemaBackup(config, str(export_location))
    sb.export_anonymized_avro_schemas()

    # The export file has been created (Path.exists() — no need for os.path
    # on a pathlib.Path object).
    assert export_location.exists()

    with export_location.open("r") as fp:
        exported_data = ujson.load(fp)

    expected_subject_hash_found = False
    json_schema_subject_hash_found = False
    for msg in exported_data:
        # Each exported entry is a (key, value) pair.
        assert len(msg) == 2
        key, schema_data = msg
        subject_hash = key.get("subject", None)
        if subject_hash == AVRO_SUBJECT_HASH:
            expected_subject_hash_found = True
            assert schema_data["subject"] == AVRO_SUBJECT_HASH
            assert schema_data["schema"] == EXPECTED_AVRO_SCHEMA
        if subject_hash == JSON_SUBJECT_HASH:
            json_schema_subject_hash_found = True

    assert expected_subject_hash_found
    # JSON schemas must not leak into the Avro-only export.
    assert not json_schema_subject_hash_found

0 commit comments

Comments
 (0)