
Commit eb85d9b

Merge pull request #182 from marklogic/develop
Merge develop into master for 1.9 release
2 parents: b892912 + d37e553

71 files changed: +1053, -519 lines


CONTRIBUTING.md

+135, -177 (large diff not rendered)

build.gradle

+18, -106
@@ -5,7 +5,7 @@ plugins {
     id "com.github.jk1.dependency-license-report" version "1.19"

     // Only used for testing
-    id 'com.marklogic.ml-gradle' version '4.6.0'
+    id 'com.marklogic.ml-gradle' version '4.8.0'
     id 'jacoco'
     id "org.sonarqube" version "4.4.1.3373"

@@ -31,24 +31,26 @@ configurations {
 }

 ext {
-    kafkaVersion = "3.5.1"
+    // Even though Kafka Connect 3.7.0 is out, we're staying with 3.6.1 in order to continue
+    // using the third-party Kafka JUnit tool. See https://github.com/mguenther/kafka-junit?tab=readme-ov-file
+    kafkaVersion = "3.6.1"
 }

 dependencies {
     compileOnly "org.apache.kafka:connect-api:${kafkaVersion}"
     compileOnly "org.apache.kafka:connect-json:${kafkaVersion}"
     compileOnly "org.apache.kafka:connect-runtime:${kafkaVersion}"
-    compileOnly "org.slf4j:slf4j-api:1.7.36"
+    compileOnly "org.slf4j:slf4j-api:2.0.13"

-    implementation 'com.marklogic:ml-javaclient-util:4.6.0'
+    implementation 'com.marklogic:ml-javaclient-util:4.8.0'
     // Force DHF to use the latest version of ml-app-deployer, which minimizes security vulnerabilities
-    implementation "com.marklogic:ml-app-deployer:4.6.0"
+    implementation "com.marklogic:ml-app-deployer:4.8.0"

-    implementation "com.fasterxml.jackson.dataformat:jackson-dataformat-csv:2.15.2"
+    implementation "com.fasterxml.jackson.dataformat:jackson-dataformat-csv:2.15.3"

     // Note that in general, the version of the DHF jar must match that of the deployed DHF instance. Different versions
     // may work together, but that behavior is not guaranteed.
-    implementation("com.marklogic:marklogic-data-hub:5.8.0") {
+    implementation("com.marklogic:marklogic-data-hub:6.0.0") {
         exclude module: "marklogic-client-api"
         exclude module: "ml-javaclient-util"
         exclude module: "ml-app-deployer"
@@ -63,13 +65,13 @@ dependencies {

     testImplementation "org.apache.kafka:connect-api:${kafkaVersion}"
     testImplementation "org.apache.kafka:connect-json:${kafkaVersion}"
-    testImplementation 'net.mguenther.kafka:kafka-junit:3.5.1'
+    testImplementation 'net.mguenther.kafka:kafka-junit:3.6.0'

-    testImplementation "org.apache.avro:avro-compiler:1.11.1"
+    testImplementation "org.apache.avro:avro-compiler:1.11.3"

     // Forcing logback to be used for test logging
-    testImplementation "ch.qos.logback:logback-classic:1.3.5"
-    testImplementation "org.slf4j:jcl-over-slf4j:1.7.36"
+    testImplementation "ch.qos.logback:logback-classic:1.3.14"
+    testImplementation "org.slf4j:jcl-over-slf4j:2.0.13"

     documentation files('LICENSE.txt')
     documentation files('NOTICE.txt')
@@ -115,29 +117,6 @@ shadowJar {
     exclude "scaffolding/**"
 }

-task copyJarToKafka(type: Copy, dependsOn: shadowJar) {
-    description = "Used for local development and testing; copies the jar to your local Kafka install"
-    from "build/libs"
-    into "${kafkaHome}/libs"
-}
-
-task copyPropertyFilesToKafka(type: Copy) {
-    description = "Used for local development and testing; copies the properties files to your local Kafka install"
-    from "config"
-    into "${kafkaHome}/config"
-    filter { String line ->
-        line.startsWith('ml.connection.username=') ? 'ml.connection.username=' + kafkaMlUsername : line
-    }
-    filter { String line ->
-        line.startsWith('ml.connection.password=') ? 'ml.connection.password=' + kafkaMlPassword : line
-    }
-}
-
-task deploy {
-    description = "Used for local development and testing; builds the jar and copies it and the properties files to your local Kafka install"
-    dependsOn = ["copyJarToKafka", "copyPropertyFilesToKafka"]
-}
-
 ext {
     confluentArchiveGroup = "Confluent Connector Archive"
     confluentTestingGroup = "Confluent Platform Local Testing"
@@ -212,77 +191,10 @@ task connectorArchive(type: Zip, dependsOn: connectorArchive_BuildDirectory, gro
     destinationDirectory = file('build/distro')
 }

-task installConnectorInConfluent(type: Exec, group: confluentTestingGroup, dependsOn: [connectorArchive]) {
-    description = "Uses 'Confluent-hub' to install the connector in your local Confluent Platform"
-    commandLine "confluent-hub", "install", "--no-prompt", "build/distro/${baseArchiveName}.zip"
-    ignoreExitValue = true
-}
-
-// See https://docs.confluent.io/confluent-cli/current/command-reference/local/confluent_local_destroy.html
-task destroyLocalConfluent(type: Exec, group: confluentTestingGroup) {
-    description = "Destroy the local Confluent Platform instance"
-    commandLine "confluent", "local", "destroy"
-    // Main reason this will fail is because Confluent is not running, which shouldn't cause a failure
-    ignoreExitValue = true
-}
-
-// See https://docs.confluent.io/confluent-cli/current/command-reference/local/services/confluent_local_services_start.html
-task startLocalConfluent(type: Exec, group: confluentTestingGroup) {
-    description = "Convenience task for starting a local instance of Confluent Platform"
-    commandLine "confluent", "local", "services", "start"
-}
-
-task loadDatagenPurchasesConnector(type: Exec, group: confluentTestingGroup) {
-    description = "Load an instance of the Datagen connector into Confluent Platform for sending JSON documents to " +
-        "the 'purchases' topic"
-    commandLine "confluent", "local", "services", "connect", "connector", "load", "datagen-purchases-source", "-c",
-        "src/test/resources/confluent/datagen-purchases-source.json"
-}
-
-task loadMarkLogicPurchasesSinkConnector(type: Exec, group: confluentTestingGroup) {
-    description = "Load an instance of the MarkLogic Kafka connector into Confluent Platform for writing data to " +
-        "MarkLogic from the 'purchases' topic"
-    commandLine "confluent", "local", "services", "connect", "connector", "load", "marklogic-purchases-sink", "-c",
-        "src/test/resources/confluent/marklogic-purchases-sink.json"
-}
-
-task loadMarkLogicPurchasesSourceConnector(type: Exec, group: confluentTestingGroup) {
-    description = "Load an instance of the MarkLogic Kafka connector into Confluent Platform for reading rows from " +
-        "the demo/purchases view"
-    commandLine "confluent", "local", "services", "connect", "connector", "load", "marklogic-purchases-source", "-c",
-        "src/test/resources/confluent/marklogic-purchases-source.json"
-}
-
-task loadMarkLogicAuthorsSourceConnector(type: Exec, group: confluentTestingGroup) {
-    description = "Loads a source connector that retrieves authors from the citations.xml file, which is also used for " +
-        "all the automated tests"
-    commandLine "confluent", "local", "services", "connect", "connector", "load", "marklogic-authors-source", "-c",
-        "src/test/resources/confluent/marklogic-authors-source.json"
-}
+// Tasks for using the connector with Confluent Platform on Docker

-task loadMarkLogicEmployeesSourceConnector(type: Exec, group: confluentTestingGroup) {
-    commandLine "confluent", "local", "services", "connect", "connector", "load", "marklogic-employees-source", "-c",
-        "src/test/resources/confluent/marklogic-employees-source.json"
-}
-
-task setupLocalConfluent(group: confluentTestingGroup) {
-    description = "Start a local Confluent Platform instance and load the Datagen and MarkLogic connectors"
-}
-
-// Temporarily only loading the source connector to make manual testing easier, will re-enable all of these before 1.8.0
-//setupLocalConfluent.dependsOn startLocalConfluent, loadDatagenPurchasesConnector, loadMarkLogicPurchasesSinkConnector, loadMarkLogicPurchasesSourceConnector
-setupLocalConfluent.dependsOn startLocalConfluent, loadMarkLogicEmployeesSourceConnector
-
-loadDatagenPurchasesConnector.mustRunAfter startLocalConfluent
-loadMarkLogicPurchasesSinkConnector.mustRunAfter startLocalConfluent
-loadMarkLogicPurchasesSourceConnector.mustRunAfter startLocalConfluent
-loadMarkLogicAuthorsSourceConnector.mustRunAfter startLocalConfluent
-loadMarkLogicEmployeesSourceConnector.mustRunAfter startLocalConfluent
-
-task insertAuthors(type: Test) {
-    useJUnitPlatform()
-    systemProperty "AUTHOR_IDS", authorIds
-    description = "Insert a new author into the kafka-test-content database via a new citations XML document; " +
-        "use e.g. -PauthorIds=7,8,9 to insert 3 new authors with IDs of 7, 8, and 9"
-    include "com/marklogic/kafka/connect/source/debug/InsertAuthorsTest.class"
+task copyConnectorToDockerVolume(type: Copy, dependsOn: connectorArchive, group: confluentTestingGroup) {
+    description = "Copies the connector's archive directory to the Docker volume shared with the Connect server"
+    from "build/connectorArchive"
+    into "test-app/docker/confluent-marklogic-components"
 }
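With the tasks for copying the connector into a local Kafka or Confluent install removed, local testing now goes through Docker. As a minimal sketch, assuming the Docker Compose setup under test-app/docker (not part of this diff) mounts the confluent-marklogic-components directory into the Connect container, the new task would be run as:

```
# Builds the connector archive (via the task's dependsOn) and copies it into the
# directory shared with the Dockerized Connect server; the Compose file itself is
# assumed here and is not shown in this commit.
./gradlew copyConnectorToDockerVolume
```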

docker-compose.yml

-50
This file was deleted.

examples/ScalingConsiderations.md renamed to docs/ScalingConsiderations.md

+11, -2
@@ -1,3 +1,9 @@
+---
+layout: default
+title: Scaling Considerations
+nav_order: 7
+---
+
 # Scaling Considerations
 Each of the three parts of parts (Kafka, MarkLogic, and this connector) of this system maybe easily scaled to handle
 your throughput requirements. To use the connector in a clustered environment you only need to ensure a couple of
@@ -11,8 +17,8 @@ started or shutdown, this information is also relayed to the connectors so that
 ## MarkLogic
 MarkLogic is designed to be used in large clusters of servers. In order to spread the load of data I/O across the
 cluster, a load balancer is typically used. In this case, the connector should be configured to be aware of the use
-of a load balancer. This is accomplished by setting the "ml.connection.host" to point to the load balancer, and by setting "ml.connection.type" to "gateway" in the marklogic-sink.properties
-file.
+of a load balancer. This is accomplished by setting the "ml.connection.host" to point to the load balancer, and by
+setting "ml.connection.type" to "gateway" in the marklogic-sink.properties file.

 <pre><code># A MarkLogic host to connect to. The connector uses the Data Movement SDK, and thus it will connect to each of the
 # hosts in a cluster.
@@ -22,6 +28,9 @@ ml.connection.host=MarkLogic-LoadBalancer-1024238516.us-east-1.elb.amazonaws.com
 # See https://docs.marklogic.com/guide/java/data-movement#id_26583 for more information.
 ml.connection.type=gateway</code></pre>

+For additional information regarding scaling a MarkLogic cluster, please see the MarkLogic
+[Scalability, Availability, and Failover Guide](https://docs.marklogic.com/guide/cluster/scalability).
+
 ## Connector
 When configuring multiple instances of the connector to consume the same topic(s), the Kafka Connect framework
 automatically handles dividing up the connections by assigning specific topic partitions (spread across the Kafka

docs/writing-data.md

+26, -1
@@ -113,6 +113,24 @@ This will result in the following pieces of Kafka record metadata being in each
 - `kafka.partition` = the partition of the Kafka record
 - `kafka.timestamp` = the timestamp of the Kafka record

+### Including Kafka headers
+
+Each Kafka record passed to the MarkLogic connector also has headers that may contain useful information which can be
+included in the metadata written with documents. This includes the headers that are included in Kafka records by
+default as well as any custom headers. Kafka headers can be included in each document by configuring the following
+property:
+
+- `ml.dmsdk.includeKafkaHeaders` = `true` to include Kafka headers
+
+When the headers are added to the document metadata, they are simply given the same name as the key for the header.
+However, you may also specify a prefix that will be prepended to each header key. To set that prefix, use the following
+property:
+
+- `ml.dmsdk.includeKafkaHeaders.prefix` = `<prefix>` to be prepended to header keys in the metadata.
+
+The headers that are on the Kafka records will depend on the Kafka distribution you are using and the message producer
+configuration.
+
 ### Configuring DMSDK performance

 The performance of how data is written to MarkLogic can be configured via the following properties:
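As a side note on the header options introduced above, a minimal marklogic-sink.properties sketch using them might look like the following; the `kafka.header.` prefix value is purely illustrative and not part of this commit:

```
# Copy each Kafka record's headers into the written document's metadata.
ml.dmsdk.includeKafkaHeaders=true
# Optional prefix prepended to each header key in the metadata (illustrative value).
ml.dmsdk.includeKafkaHeaders.prefix=kafka.header.
```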
@@ -211,7 +229,7 @@ endpointConstants = fn.head(xdmp.fromJSON(endpointConstants));

 for (let item of inputSequence) {
   item = fn.head(xdmp.fromJSON(item));
-  // TODO Determine what to do with each item
+  // Determine what to do with each item
 }
 ```

@@ -311,3 +329,10 @@ required to catch any error that occurs, an unexpected error in the sink connect
 and logged by Kafka. However, nothing will be sent to the user-configured DLQ topic in this scenario as the error will
 not be associated with a particular sink record. Kafka and MarkLogic server logs should be examined to determine the
 cause of the error.
+
+## JSON-based Connector Configuration
+
+Some Kafka environments permit REST-based instantiation of connectors. Confluent is one of those environments.
+[Please see the Confluent documentation](https://docs.confluent.io/kafka-connectors/maprdb/current/map_r_d_b_sink_connector_example.html)
+to read about this technique. Examples of JSON files to use with the REST service can be found in
+examples/ConfluentConnectorConfigs.
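The JSON files referenced above use the standard Kafka Connect REST payload shape (a connector name plus a config map), so against a typical Connect server they could be registered with a call along these lines; the host and port (8083 is the Connect REST default) depend on your environment:

```
# Register the example source connector with the Connect REST API.
curl -X POST -H "Content-Type: application/json" \
  --data @examples/ConfluentConnectorConfigs/marklogic-purchases-source.json \
  http://localhost:8083/connectors
```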
New file: marklogic-purchases-sink connector configuration

+23

@@ -0,0 +1,23 @@
+{
+  "name": "marklogic-purchases-sink",
+  "config": {
+    "topics": "purchases",
+    "connector.class": "com.marklogic.kafka.connect.sink.MarkLogicSinkConnector",
+    "key.converter": "org.apache.kafka.connect.storage.StringConverter",
+    "value.converter": "org.apache.kafka.connect.storage.StringConverter",
+    "tasks.max": "1",
+    "ml.connection.host": "marklogic",
+    "ml.connection.port": 8011,
+    "ml.connection.username": "kafka-test-user",
+    "ml.connection.password": "kafkatest",
+    "ml.connection.securityContextType": "DIGEST",
+    "ml.document.format": "JSON",
+    "ml.document.uriPrefix": "/purchase/",
+    "ml.document.uriSuffix": ".json",
+    "ml.document.collections": "purchases,kafka-data",
+    "ml.document.permissions": "kafka-test-minimal-user,read,kafka-test-minimal-user,update",
+    "ml.dmsdk.includeKafkaMetadata": "false",
+    "ml.dmsdk.includeKafkaHeaders": "true",
+    "ml.dmsdk.includeKafkaHeaders.prefix": ""
+  }
+}

src/test/resources/confluent/marklogic-purchases-source.json renamed to examples/ConfluentConnectorConfigs/marklogic-purchases-source.json

+2, -2
@@ -5,8 +5,8 @@
     "key.converter": "org.apache.kafka.connect.storage.StringConverter",
     "value.converter": "org.apache.kafka.connect.storage.StringConverter",
     "tasks.max": "1",
-    "ml.connection.host": "localhost",
-    "ml.connection.port": 8018,
+    "ml.connection.host": "marklogic",
+    "ml.connection.port": 8011,
     "ml.connection.username": "kafka-test-user",
     "ml.connection.password": "kafkatest",
     "ml.connection.securityContextType": "DIGEST",
-1.71 MB (binary file not shown)

examples/dmsdk-transform/installTransform.sh

-1
This file was deleted.
