Skip to content

Commit 2c9bbd1

Browse files
author
theodoresiu
authored
Merge pull request #45 from santhh/gcs-s3-inspect
solution publish update
2 parents 1e77d38 + 5c7f300 commit 2c9bbd1

10 files changed

+144
-712
lines changed

README.md

+39-688
Large diffs are not rendered by default.

build.gradle

+1-1
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ dependencies {
8181
compile group: 'org.slf4j', name: 'slf4j-jdk14', version: '1.7.5'
8282
compile 'com.google.cloud:google-cloud-kms:0.70.0-beta'
8383
compile 'com.google.guava:guava:27.0-jre'
84-
compile group: 'com.google.cloud', name: 'google-cloud-dlp', version: '0.100.0-beta'
84+
compile group: 'com.google.cloud', name: 'google-cloud-dlp', version: '0.99.0-beta'
8585
compile 'com.google.api-client:google-api-client:1.27.0'
8686
compile group: 'com.google.apis', name: 'google-api-services-cloudkms', version: 'v1-rev53-1.23.0'
8787
compile group: 'org.apache.beam', name: 'beam-sdks-java-io-amazon-web-services', version: dataflowBeamVersion

create-df-template.sh

+6-6
Original file line numberDiff line numberDiff line change
@@ -51,29 +51,29 @@ INSPECT_TEMPLATE_NAME=$(jq -c '.name' ${INSPECT_TEMPLATE_OUTPUT})
5151
DYNAMIC_TEMPLATE_BUCKET_SPEC=gs://dynamic-template/dynamic_template_dlp_inspect.json
5252
# Unique job name: fixed prefix plus a date-time-nanoseconds stamp.
JOB_NAME="dlp-inspect-pipeline-$(date +%Y%m%d-%H%M%S-%N)"
5353
echo $JOB_NAME
54-
GCS_STAGING_LOCATION=gs://dynamic-template/log
55-
TEMP_LOCATION=gs://dynamic-template/temp
54+
GCS_STAGING_LOCATION=$GCS_BUCKET_URL/log
55+
TEMP_LOCATION=$GCS_BUCKET_URL/temp
5656
PARAMETERS_CONFIG='{
5757
"jobName":"'$JOB_NAME'",
5858
"parameters":{
5959
"streaming":"true",
6060
"enableStreamingEngine":"true",
6161
"autoscalingAlgorithm":"NONE",
6262
"workerMachineType": "n1-standard-8",
63-
"numWorkers":"50",
64-
"maxNumWorkers":"50",
63+
"numWorkers":"3",
64+
"maxNumWorkers":"3",
6565
"awsAccessKey":"'$AWS_ACCESS_KEY'",
6666
"awsSecretKey":"'$AWS_SECRET_KEY'",
6767
"s3BucketUrl":"'$S3_BUCKET_URL'",
68-
"gcsBucketUrl":"'$GCS_BUCKET_URL'",
68+
"gcsBucketUrl":"'$GCS_BUCKET_URL'/*.csv",
6969
"inspectTemplateName":'$INSPECT_TEMPLATE_NAME',
7070
"s3ThreadPoolSize":"1000",
7171
"maxConnections":"1000000",
7272
"socketTimeout":"100",
7373
"connectionTimeout":"100",
7474
"tempLocation":"'$TEMP_LOCATION'",
7575
"awsRegion":"'$AWS_REGION'",
76-
"dataSetId":"'$BQ_DATASET'",
76+
"dataSetId":"'$BQ_DATASET'"
7777
}
7878
}'
7979
DF_API_ROOT_URL="https://dataflow.googleapis.com"

create-kek.sh

+1-14
Original file line numberDiff line numberDiff line change
@@ -12,20 +12,7 @@
1212
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1313
# See the License for the specific language governing permissions and
1414
# limitations under the License.
15-
# Copyright 2019 Google Inc.
16-
#
17-
# Licensed under the Apache License, Version 2.0 (the "License");
18-
# you may not use this file except in compliance with the License.
19-
# You may obtain a copy of the License at
20-
#
21-
# http://www.apache.org/licenses/LICENSE-2.0
22-
#
23-
# Unless required by applicable law or agreed to in writing, software
24-
# distributed under the License is distributed on an "AS IS" BASIS,
25-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
26-
# See the License for the specific language governing permissions and
27-
# limitations under the License.
28-
15+
2916
set -x
3017
PROJECT_ID=$1
3118
KEY_RING_NAME=$2

deploy-data-tokeninzation-solution.sh

+62
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
#!/usr/bin/env bash
2+
# Copyright 2019 Google Inc.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
# Make sure you have the Owner role on your project before running this script.
17+
set -x
18+
# export some env variables
19+
export PROJECT_ID=$(gcloud config get-value project)
20+
export DATA_STORAGE_BUCKET=${PROJECT_ID}-demo-data
21+
export TEK=$(openssl rand -base64 32)
22+
export KEY_RING_NAME=demo-key-ring
23+
export KEY_NAME=demo-key
24+
export KEK_FILE_NAME=kek.json
25+
# Resolve the numeric project number of the active project. `describe` targets
# exactly one project by ID, whereas `projects list --filter=<id>` is a
# bare-term filter that can match several projects and return several numbers.
export PROJECT_NUMBER=$(gcloud projects describe "${PROJECT_ID}" --format="value(projectNumber)")
26+
export SERVICE_ACCOUNT_NAME=demo-service-account
27+
export REGION=us-central1
28+
export BQ_DATASET_NAME=demo_dataset
29+
# enable the required APIs
30+
gcloud services enable dlp.googleapis.com
31+
gcloud services enable cloudkms.googleapis.com
32+
gcloud services enable bigquery
33+
gcloud services enable storage_component
34+
gcloud services enable dataflow
35+
gcloud services enable cloudbuild.googleapis.com
36+
# Create the BigQuery dataset. The table will be generated dynamically by the Dataflow pipeline.
37+
bq --location=US mk -d --description "De-Identified PII Dataset" ${BQ_DATASET_NAME}
38+
# create a data bucket to store the PII data
39+
gsutil mb -c standard -l ${REGION} gs://${DATA_STORAGE_BUCKET}
40+
# Grant the Cloud Build service account the additional Cloud KMS roles it needs.
41+
gcloud projects add-iam-policy-binding ${PROJECT_ID} --member serviceAccount:$PROJECT_NUMBER@cloudbuild.gserviceaccount.com --role roles/cloudkms.cryptoKeyEncrypter
42+
gcloud projects add-iam-policy-binding ${PROJECT_ID} --member serviceAccount:$PROJECT_NUMBER@cloudbuild.gserviceaccount.com --role roles/cloudkms.admin
43+
# trigger the first cloud build script to create the KEK
44+
# Submit the Cloud Build job that creates the KEK. Tracing is switched off around
# this command so the token-encryption key and the OAuth access token are not
# echoed by `set -x`; it is switched back on for the remaining steps.
set +x
gcloud builds submit . --config dlp-demo-part-1-crypto-key.yaml --substitutions "_GCS_BUCKET_NAME=gs://${DATA_STORAGE_BUCKET},_KEY_RING_NAME=${KEY_RING_NAME},_KEY_NAME=${KEY_NAME},_TEK=${TEK},_KEK=${KEK_FILE_NAME},_API_KEY=$(gcloud auth print-access-token)"
set -x
45+
# DLP requires a service account for its API calls.
46+
gcloud iam service-accounts create ${SERVICE_ACCOUNT_NAME} --display-name "DLP Demo Service Account"
47+
gcloud projects add-iam-policy-binding ${PROJECT_ID} --member serviceAccount:${SERVICE_ACCOUNT_NAME}@${PROJECT_ID}.iam.gserviceaccount.com --role roles/editor
48+
gcloud projects add-iam-policy-binding ${PROJECT_ID} --member serviceAccount:${SERVICE_ACCOUNT_NAME}@${PROJECT_ID}.iam.gserviceaccount.com --role roles/storage.admin
49+
gcloud iam service-accounts keys create --iam-account ${SERVICE_ACCOUNT_NAME}@${PROJECT_ID}.iam.gserviceaccount.com demo_key.json --user-output-enabled
50+
gcloud auth activate-service-account --key-file demo_key.json
51+
# trigger the cloud build script to create DLP templates
52+
# Submit the Cloud Build job that creates the DLP templates. Tracing is switched
# off around this command so the OAuth access token is not echoed by `set -x`;
# it is switched back on for the remaining steps.
set +x
gcloud builds submit . --config dlp-demo-part-2-dlp-template.yaml --substitutions "_KEK_CONFIG_FILE=gs://${DATA_STORAGE_BUCKET}/${KEK_FILE_NAME},_GCS_BUCKET_NAME=gs://${DATA_STORAGE_BUCKET},_API_KEY=$(gcloud auth print-access-token)"
set -x
53+
# Download the JSON files so the template names can be parsed with jq.
54+
gsutil cp gs://${DATA_STORAGE_BUCKET}/deid-template.json .
55+
gsutil cp gs://${DATA_STORAGE_BUCKET}/inspect-template.json .
56+
export DEID_TEMPLATE_NAME=$(jq -r '.name' deid-template.json)
57+
export INSPECT_TEMPLATE_NAME=$(jq -r '.name' inspect-template.json)
58+
# trigger the dataflow pipeline
59+
# Unique Dataflow job name: fixed prefix plus a date-time stamp.
export jobId="demo-dlp-deid-pipeline-$(date +%Y%m%d-%H%M%S)"
60+
# Launch the streaming de-identification template. --region is a flag of
# `gcloud dataflow jobs run`, not a template parameter, so it is passed on its
# own instead of inside the --parameters list. REGION is the same region the
# data bucket was created in.
gcloud dataflow jobs run "${jobId}" --gcs-location gs://dataflow-templates/latest/Stream_DLP_GCS_Text_to_BigQuery --region="${REGION}" --parameters "inputFilePattern=gs://${DATA_STORAGE_BUCKET}/CCRecords_1564602825.csv,dlpProjectId=${PROJECT_ID},deidentifyTemplateName=${DEID_TEMPLATE_NAME},inspectTemplateName=${INSPECT_TEMPLATE_NAME},datasetName=${BQ_DATASET_NAME},batchSize=500"
61+
62+

deploy-s3-inspect-solution.sh

+30
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# Copyright 2019 Google Inc.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
set -x
16+
export PROJECT_ID=$(gcloud config get-value project)
17+
gcloud services enable dlp.googleapis.com
18+
gcloud services enable cloudkms.googleapis.com
19+
gcloud services enable bigquery
20+
gcloud services enable storage_component
21+
gcloud services enable dataflow
22+
gcloud services enable cloudbuild.googleapis.com
23+
export AWS_ACCESS_KEY="<access_key>"
24+
export AWS_SECRET_KEY="<secret_key>"
25+
export S3_BUCKET_URL="s3://<path>"
26+
export GCS_BUCKET_URL="gs://<path>"
27+
export AWS_REGION="<region>"
28+
export BQ_DATASET="dlp_inspection"
29+
30+
# Submit the Cloud Build job that deploys the S3/GCS inspection pipeline.
# Tracing is switched off first so the AWS credentials and the OAuth access
# token in the substitutions are not echoed by `set -x` for this command.
set +x
gcloud builds submit . --config dlp-demo-s3-gcs-inspect.yaml --substitutions "_AWS_ACCESS_KEY=${AWS_ACCESS_KEY},_API_KEY=$(gcloud auth print-access-token),_AWS_SECRET_KEY=${AWS_SECRET_KEY},_S3_BUCKET_URL=${S3_BUCKET_URL},_GCS_BUCKET_URL=${GCS_BUCKET_URL},_AWS_REGION=${AWS_REGION},_BQ_DATASET=${BQ_DATASET}"

diagrams/ref_arch_solution.png

106 KB
Loading

dlp-demo-part-1-crypto-key.yaml

+2-2
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
args: ['kms', 'keys', 'create' ,'${_KEY_NAME}', '--location=global','--purpose=encryption','--keyring=${_KEY_RING_NAME}']
2020
- name: 'gcr.io/cloud-builders/docker'
2121
entrypoint: 'bash'
22-
args: ['-c', 'sh create-kek.sh ${PROJECT_ID} ${_KEY_RING_NAME} ${_KEY_NAME} ${_TEK} ${_KEK} ${_API_KEY}']
22+
args: ['-c', 'sh create-kek.sh ${PROJECT_ID} ${_KEY_RING_NAME} ${_KEY_NAME} ${_TEK} ${_KEK} ${_API_KEY}']
2323
- name: gcr.io/cloud-builders/curl
2424
args: ['http://storage.googleapis.com/dataflow-dlp-solution-sample-data/sample_data_scripts.tar.gz', '-o', 'sample_data_scripts.tar.gz']
2525
- name: 'gcr.io/cloud-builders/docker'
@@ -28,4 +28,4 @@
2828
- name: gcr.io/cloud-builders/gsutil
2929
args: ['cp', '${_KEK}', '${_GCS_BUCKET_NAME}']
3030
- name: gcr.io/cloud-builders/gsutil
31-
args: ['-m', 'cp', './solution-test/*.csv', '${_GCS_BUCKET_NAME}']
31+
args: ['-m', 'cp', './solution-test/*.csv', '${_GCS_BUCKET_NAME}']

dlp-demo-part-2-dlp-template.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -24,4 +24,4 @@ steps:
2424
- name: gcr.io/cloud-builders/gsutil
2525
args: ['cp', 'deid-template.json','${_GCS_BUCKET_NAME}/deid-template.json']
2626
- name: gcr.io/cloud-builders/gsutil
27-
args: ['cp', 'inspect-template.json','${_GCS_BUCKET_NAME}/inspect-template.json']
27+
args: ['cp', 'inspect-template.json','${_GCS_BUCKET_NAME}/inspect-template.json']

dlp-demo-s3-gcs-inspect.yaml

+2
Original file line numberDiff line numberDiff line change
@@ -13,5 +13,7 @@
1313
# See the License for the specific language governing permissions and
1414
# limitations under the License.
1515
steps:
16+
- name: gcr.io/cloud-solutions-images/bq
17+
args: ['mk', '-d', '--description', 'DLP Inspection', '--location=US', '${_BQ_DATASET}']
1618
- name: 'ubuntu'
1719
args: ['bash', '-c','apt-get -q update && apt-get install -qqy curl && apt-get install -qqy jq;sh create-df-template.sh ${PROJECT_ID} ${_AWS_ACCESS_KEY} ${_API_KEY} ${_AWS_SECRET_KEY} ${_S3_BUCKET_URL} ${_GCS_BUCKET_URL} ${_AWS_REGION} ${_BQ_DATASET}']

0 commit comments

Comments
 (0)