Skip to content

Commit 8bcb270

Browse files
Merge branch 'Azure:master' into master
2 parents ab0f42e + d3dc644 commit 8bcb270

File tree

87 files changed

+6790
-2641
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

87 files changed

+6790
-2641
lines changed

.github/workflows/build.yml

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
name: AzureKustoSpark
2+
on:
3+
push:
4+
branches: [ "**" ]
5+
pull_request:
6+
branches: [ "**" ]
7+
8+
jobs:
9+
build:
10+
name: Build
11+
runs-on: ubuntu-latest
12+
environment: build
13+
permissions:
14+
checks: write
15+
pull-requests: write
16+
id-token: write
17+
contents: read
18+
steps:
19+
- name: Check out code into the Spark module directory
20+
uses: actions/checkout@v4
21+
- name: Setup java
22+
uses: actions/setup-java@v4
23+
with:
24+
distribution: 'adopt'
25+
java-version: 8
26+
cache: 'maven'
27+
cache-dependency-path: |
28+
pom.xml
29+
connector/pom.xml
30+
samples/pom.xml
31+
- name: Azure login
32+
uses: azure/login@v2
33+
with:
34+
client-id: ${{ secrets.APP_ID }}
35+
tenant-id: ${{ secrets.TENANT_ID }}
36+
subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
37+
- name: "Run az commands"
38+
run: |
39+
access_token=$(az account get-access-token --resource=${{ secrets.APP_ID }} --scope=${{ secrets.CLUSTER }}/.default --query accessToken -o tsv)
40+
echo "ACCESS_TOKEN=$access_token" >> $GITHUB_ENV
41+
- name: Run the Maven verify phase
42+
env:
43+
kustoAadAuthorityID: ${{ secrets.TENANT_ID }}
44+
kustoDatabase: ${{ secrets.DATABASE }}
45+
kustoCluster: ${{ secrets.CLUSTER }}
46+
kustoAadAppId: ${{secrets.APP_ID}}
47+
accessToken: ${{env.ACCESS_TOKEN}}
48+
storageAccountUrl: '${{secrets.STORAGE_CONTAINER_URL}}'
49+
ingestStorageUrl: ${{ secrets.INGEST_STORAGE_URL }}
50+
ingestStorageContainer: ${{ secrets.INGEST_STORAGE_CONTAINER }}
51+
run: |
52+
mvn clean verify -DkustoAadAppId=${{ secrets.APP_ID }} -DkustoAadAuthorityID=${{ secrets.TENANT_ID }} -DkustoDatabase=${{ secrets.DATABASE }} -DkustoCluster=${{ secrets.CLUSTER }} -DaccessToken=${{env.ACCESS_TOKEN}} -DingestStorageUrl=${{secrets.INGEST_STORAGE_URL}} -DingestStorageContainer=${{secrets.INGEST_STORAGE_CONTAINER}}
53+
54+
- name: Publish Unit Test Results
55+
uses: EnricoMi/publish-unit-test-result-action@v2
56+
if: always()
57+
with:
58+
files: |
59+
connector/target/surefire-reports/*.xml
60+
- name: Upload coverage to Codecov
61+
uses: codecov/codecov-action@v2

.github/workflows/release.yml

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
name: release
2+
permissions:
3+
checks: write
4+
pull-requests: write
5+
packages: write
6+
deployments: write
7+
contents: write
8+
id-token: write
9+
10+
on:
11+
push:
12+
tags:
13+
- "v(2.4|3.0)_[0-9]+.[0-9]+.[0-9]+"
14+
workflow_dispatch:
15+
inputs:
16+
tag:
17+
description: 'Tag'
18+
required: true
19+
default: 'refs/tags/v0.0.0'
20+
upload_to_azure:
21+
description: 'Upload to Azure storage'
22+
required: false
23+
default: false
24+
github_release:
25+
description: 'Create Github release'
26+
required: false
27+
default: false
28+
29+
jobs:
30+
release:
31+
runs-on: ubuntu-latest
32+
environment: build
33+
strategy:
34+
matrix:
35+
java: [ '8' ]
36+
name: Java ${{ matrix.java }}
37+
steps:
38+
- uses: actions/checkout@v4
39+
with:
40+
ref: ${{ github.event.inputs.tag || github.ref }}
41+
- name: Azure login
42+
uses: azure/login@v2
43+
with:
44+
client-id: ${{ secrets.APP_ID }}
45+
tenant-id: ${{ secrets.TENANT_ID }}
46+
subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
47+
- name: Setup java ${{ matrix.java }}
48+
uses: actions/setup-java@v4
49+
with:
50+
distribution: 'adopt'
51+
java-version: ${{ matrix.java }}
52+
cache: 'maven'
53+
- name: "Run az commands"
54+
run: |
55+
access_token=$(az account get-access-token --resource=${{ secrets.APP_ID }} --scope=${{ secrets.CLUSTER }}/.default --query accessToken -o tsv)
56+
echo "ACCESS_TOKEN=$access_token" >> $GITHUB_ENV
57+
- name: Run the Maven verify phase
58+
env:
59+
kustoAadAuthorityID: ${{ secrets.TENANT_ID }}
60+
kustoDatabase: ${{ secrets.DATABASE }}
61+
kustoCluster: ${{ secrets.CLUSTER }}
62+
kustoAadAppId: ${{secrets.APP_ID}}
63+
accessToken: ${{env.ACCESS_TOKEN}}
64+
storageAccountUrl: ${{ secrets.STORAGE_CONTAINER_URL }}
65+
run: |
66+
mvn clean verify -DkustoAadAppId=${{ secrets.APP_ID }} -DkustoAadAuthorityID=${{ secrets.TENANT_ID }} -DkustoDatabase=${{ secrets.DATABASE }} -DkustoCluster=${{ secrets.CLUSTER }} -DaccessToken=${{env.ACCESS_TOKEN}}
67+
- name: Get versions
68+
id: get_version
69+
run: |
70+
echo ::set-output name=VERSION::$(mvn help:evaluate -Dexpression=project.version -q -DforceStdout)
71+
echo ::set-output name=SCALA_VERSION::$(mvn help:evaluate -Dexpression=scala.version.major -q -DforceStdout)
72+
echo ::set-output name=SPARK_VERSION::$(mvn help:evaluate -Dexpression=spark.version.major -q -DforceStdout)
73+
- name: Move artifacts to staging
74+
run: |
75+
version=${{ steps.get_version.outputs.VERSION }}
76+
scalaversion=${{steps.get_version.outputs.SCALA_VERSION}}
77+
sparkversion=${{steps.get_version.outputs.SPARK_VERSION}}
78+
mkdir staging
79+
echo ${{steps.get_version.outputs.SPARK_VERSION}}_${{steps.get_version.outputs.SCALA_VERSION}}-$version
80+
cp connector/target/*.jar staging
81+
cp connector/.flattened-pom.xml staging/kusto-spark_${{steps.get_version.outputs.SPARK_VERSION}}_${{steps.get_version.outputs.SCALA_VERSION}}-$version.pom
82+
- name: Github Release
83+
uses: docker://antonyurchenko/git-release:v6
84+
env:
85+
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
86+
with:
87+
args: |
88+
staging/kusto-spark_${{ steps.get_version.outputs.SPARK_VERSION }}_${{ steps.get_version.outputs.SCALA_VERSION }}-${{ steps.get_version.outputs.VERSION }}-jar-with-dependencies.jar
89+
staging/kusto-spark_${{ steps.get_version.outputs.SPARK_VERSION }}_${{ steps.get_version.outputs.SCALA_VERSION }}-${{ steps.get_version.outputs.VERSION }}-sources.jar
90+
continue-on-error: true
91+
if: ${{ github.event_name == 'push' || (github.event_name == 'workflow_dispatch' && github.event.inputs.github_release == 'true') }}
92+
- name: Azure login for azuresdkpartnerdrops SA
93+
uses: azure/login@v2
94+
with:
95+
client-id: ${{ secrets.AZURESDKPARTNERDROPS_CLIENT_ID }}
96+
tenant-id: ${{ secrets.TENANT_ID }}
97+
subscription-id:
98+
${{ secrets.AZURESDKPARTNERDROPS_SUBSCRIPTION_ID }}
99+
- name: Upload file to Blob Storage
100+
run: |
101+
az storage blob upload-batch \
102+
--account-name ${{ secrets.AZURE_RELEASE_STORAGE_ACCOUNT }} \
103+
--destination ${{ secrets.AZURE_STORAGE_CONTAINER }}/kusto/spark/${{ steps.get_version.outputs.SPARK_VERSION }}_${{ steps.get_version.outputs.VERSION }} \
104+
--source staging \
105+
--auth-mode login
106+
if: ${{ github.event_name == 'push' || (github.event_name == 'workflow_dispatch' && github.event.inputs.upload_to_azure == 'true') }}

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,3 +51,7 @@ src_managed/
5151
project/boot/
5252
project/plugins/project/
5353

54+
# VS Code and extensions
55+
.vscode/
56+
.bloop/
57+
.metals/

.scalafix.conf

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
rules = [
2+
NoAutoTupling,
3+
RemoveUnused,
4+
DisableSyntax,
5+
LeakingImplicitClassVal,
6+
NoValInForComprehension,
7+
ProcedureSyntax
8+
]
9+
10+
DisableSyntax.noVars = true
11+
DisableSyntax.noThrows = true
12+
DisableSyntax.noNulls = true
13+
DisableSyntax.noReturns = true
14+
DisableSyntax.noWhileLoops = true
15+
DisableSyntax.noAsInstanceOf = true
16+
DisableSyntax.noIsInstanceOf = true
17+
DisableSyntax.noXml = true
18+
DisableSyntax.noDefaultArgs = true
19+
DisableSyntax.noFinalVal = true
20+
DisableSyntax.noFinalize = true
21+
DisableSyntax.noValPatterns = true

.scalafmt.conf

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
#
2+
# Licensed to the Apache Software Foundation (ASF) under one or more
3+
# contributor license agreements. See the NOTICE file distributed with
4+
# this work for additional information regarding copyright ownership.
5+
# The ASF licenses this file to You under the Apache License, Version 2.0
6+
# (the "License"); you may not use this file except in compliance with
7+
# the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
#
17+
18+
align = none
19+
align.openParenDefnSite = false
20+
align.openParenCallSite = false
21+
align.tokens = []
22+
optIn = {
23+
configStyleArguments = false
24+
}
25+
danglingParentheses.preset = false
26+
docstrings.style = Asterisk
27+
maxColumn = 98
28+
runner.dialect = scala212
29+
fileOverride {
30+
"glob:**/src/**/scala-2.13/**.scala" {
31+
runner.dialect = scala213
32+
}
33+
}
34+
version = 3.5.9

README.md

Lines changed: 50 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -3,22 +3,30 @@
33
</p>
44

55
# Azure Data Explorer Connector for Apache Spark
6+
7+
master: [![Build](https://github.com/Azure/azure-kusto-spark/actions/workflows/build.yml/badge.svg?branch=master)](https://github.com/Azure/azure-kusto-spark/actions/workflows/build.yml)
68

7-
master: [![Build status](https://msazure.visualstudio.com/One/_apis/build/status/Custom/Kusto/azure-kusto-spark%20ci?branchName=master)](https://msazure.visualstudio.com/One/_build/latest?definitionId=58677)
8-
99
This library contains the source code for Azure Data Explorer Data Source and Data Sink Connector for Apache Spark.
1010

1111
Azure Data Explorer (A.K.A. [Kusto](https://azure.microsoft.com/services/data-explorer/)) is a lightning-fast indexing and querying service.
1212

1313
[Spark](https://spark.apache.org/) is a unified analytics engine for large-scale data processing.
1414

15-
Making Azure Data Explorer and Spark work together enables building fast and scalable applications, targeting a variety of Machine Learning, Extract-Transform-Load, Log Analytics and other data-driven scenarios.
15+
Making Azure Data Explorer and Spark work together enables building fast and scalable applications, targeting a variety of Machine Learning, Extract-Transform-Load, Log Analytics and other data-driven scenarios.
16+
17+
This connector works with the following spark environments:
18+
[Azure Databricks](https://azure.microsoft.com/products/databricks),
19+
[Azure Synapse Data Explorer](https://docs.microsoft.com/azure/synapse-analytics/data-explorer/data-explorer-overview) and
20+
[Real time analytics in Fabric](https://learn.microsoft.com/fabric/real-time-analytics/overview)
1621

1722
## Changelog
1823

24+
**Breaking changes in versions 5.2.x** - From these versions, the published packages are shaded and packaged as a self-contained jar. This is to avoid issues with common OSS libraries, Spark runtimes and/or application dependencies.
25+
1926
For major changes from previous releases, please refer to [Releases](https://github.com/Azure/azure-kusto-spark/releases).
2027
For known or new issues, please refer to the [issues](https://github.com/Azure/azure-kusto-spark/issues) section.
21-
> Note: Use the 4.x series only if you are using JDK 11 and 3.x in JDK 8
28+
> Note: Use the 4.x series only if you are using JDK 11. Versions 3.x and 5.x work with JDK 8 and all later JDK versions.
29+
From versions 5.2.0 and up, the connector is packaged as an uber jar to avoid conflicts with other jars that are added as part of the spark job definitions.
2230

2331
## Usage
2432

@@ -33,14 +41,14 @@ link your application with the artifact below to use the Azure Data Explorer Con
3341
```
3442
groupId = com.microsoft.azure.kusto
3543
artifactId = kusto-spark_3.0_2.12
36-
version = 4.0.2
44+
version = 5.3.0
3745
```
3846

3947
**In Maven**:
4048

4149
Look for the following coordinates:
4250
```
43-
com.microsoft.azure.kusto:kusto-spark_3.0_2.12:4.0.2
51+
com.microsoft.azure.kusto:kusto-spark_3.0_2.12:5.3.0
4452
```
4553

4654
Or clone this repository and build it locally to add it to your local Maven repository.
@@ -50,15 +58,15 @@ The jar can also be found under the [released package](https://github.com/Azure/
5058
<dependency>
5159
<groupId>com.microsoft.azure.kusto</groupId>
5260
<artifactId>kusto-spark_3.0_2.12</artifactId>
53-
<version>4.0.2</version>
61+
<version>5.2.2</version>
5462
</dependency>
5563
```
5664

5765
**In SBT**:
5866

5967
```scala
6068
libraryDependencies ++= Seq(
61-
"com.microsoft.azure.kusto" %% "kusto-spark_3.0" % "4.0.2"
69+
"com.microsoft.azure.kusto" %% "kusto-spark_3.0" % "5.2.2"
6270
)
6371
```
6472

@@ -67,7 +75,7 @@ libraryDependencies ++= Seq(
6775
Libraries -> Install New -> Maven -> copy the following coordinates:
6876

6977
```
70-
com.microsoft.azure.kusto:kusto-spark_3.0_2.12:4.0.2
78+
com.microsoft.azure.kusto:kusto-spark_3.0_2.12:5.2.2
7179
```
7280

7381
#### Building Samples Module
@@ -95,23 +103,50 @@ To use the connector, you need:
95103
> Note: when working with Spark version 2.3 or lower, build the jar locally from branch "2.4" and
96104
simply change the spark version in the pom file.
97105

106+
## Local Run - Build Setup
107+
108+
The newer connector options have tests pertaining to Blob storage, covering user-impersonation-based data export and the use of a custom blob storage account for ingestion.
109+
110+
These are already set up on the CI. To configure them on a local machine, some one-time setup is required. The following commands use the Azure CLI; the same setup can also be done through the Azure portal.
111+
112+
```
113+
az login
114+
az ad signed-in-user show --query "id" --output json
115+
```
116+
This will usually output a GUID
117+
118+
```
119+
"10ac405f-8d3f-4f95-a012-201801b257d2"
120+
```
121+
This ID can then be used to grant access to storage as follows
122+
123+
```shell
124+
az role assignment create --assignee 10ac405f-8d3f-4f95-a012-201801b257d2 --role "Storage Blob Delegator" --scope /subscriptions/<sub-id>/resourceGroups/<rg-name>/providers/Microsoft.Storage/storageAccounts/<storageacc>
125+
126+
az role assignment create --assignee 10ac405f-8d3f-4f95-a012-201801b257d2 --role "Storage Blob Data Contributor" --scope /subscriptions/<sub-id>/resourceGroups/<rg-name>/providers/Microsoft.Storage/storageAccounts/<storageacc>/containers/<container-name>
127+
```
128+
129+
These commands grant the storage access required by the tests.
130+
131+
Once this is set up, you can use the following commands to build and run the tests
132+
98133
## Build Commands
99134

100135
```shell
101-
// Builds jar and runs all tests
102-
mvn clean package
136+
mvn clean package -DkustoCluster='https://cluster.westus2.kusto.windows.net' -DkustoDatabase='spark' -DkustoAadAuthorityID='72f988bf-86f1-41af-91ab-2d7cd011db47' -DkustoIngestionUri='https://ingest-cluster.westus2.kusto.windows.net' -DingestStorageUrl='https://storageacc.blob.core.windows.net' -DingestStorageContainer='ingestcontainer' -DstorageAccountUrl='https://storageacc.blob.core.windows.net/synapseppe\;impersonate'
137+
138+
139+
# You can pass all the properties as env variables too
140+
export kustoCluster="https://cluster.westus2.kusto.windows.net"
103141

104-
// Builds jar, runs all tests, and installs jar to your local maven repository
105-
mvn clean install
106142
```
107143

108144
## Pre-Compiled Libraries
109145
To facilitate ramp-up from local jar on platforms such as Azure Databricks, pre-compiled libraries
110146
are published under [GitHub Releases](https://github.com/Azure/azure-kusto-spark/releases).
111147
These libraries include:
112148
* Azure Data Explorer connector library
113-
* User may also need to include Kusto Java SDK libraries (kusto-data and kusto-ingest), which are published under
114-
[GitHub Releases](https://github.com/Azure/azure-kusto-java/releases)
149+
* Version 5.2.0 and up of the library publish uber jars to maven. This is because of conflicts between custom jars that are added as part of the job and the exclude/include process that has to be followed to avoid conflicts.
115150

116151
## Dependencies
117152
Spark Azure Data Explorer connector depends on [Azure Data Explorer Data Client Library](https://mvnrepository.com/artifact/com.microsoft.azure.kusto/kusto-data)

0 commit comments

Comments
 (0)