Skip to content

Commit ea44778

Browse files
committed
feat(firestore-bigquery-export): prepare RC (#2206)
* chore(firestore-bigquery-changetracker): bump version * fix(firestore-bigquery-export): added ts-expect-error and TODOs in the import script * feat: try to immediately write to bq first * chore: remove legacy backfill code * feat: add max enqueue attempts param * test: add flags to test, remove unused resource * feat: add backup to gcs * chore(firestore-bigquery-export): temporarily disable GCS * chore: bump ext version * fix(firestore-bigquery-export): comment out unused role for now and use logging * fix(firestore-bigquery-export): implemented RC changes including logging keys * chore(firestore-bigquery-export): update README and CHANGELOG * chore(firestore-bigquery-export): update CHANGELOG
1 parent f7561e5 commit ea44778

File tree

16 files changed

+673
-292
lines changed

16 files changed

+673
-292
lines changed

_emulator/.firebaserc

+8
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,13 @@
11
{
22
"projects": {
33
"default": "demo-test"
4+
},
5+
"targets": {},
6+
"etags": {
7+
"dev-extensions-testing": {
8+
"extensionInstances": {
9+
"firestore-bigquery-export": "02acbd8b443b9635716d52d65758a78db1e51140191caecaaf60d932d314a62a"
10+
}
11+
}
412
}
513
}

firestore-bigquery-export/CHANGELOG.md

+10
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,13 @@
1+
## Version 0.1.56
2+
3+
feat - improve sync strategy by immediately writing to BQ, and using cloud tasks only as a last resort
4+
5+
refactor - improve observability/logging of events
6+
7+
chore - remove legacy backfill code
8+
9+
fix - improved usage of the types from change tracker package
10+
111
## Version 0.1.55
212

313
feat - log failed queued tasks

firestore-bigquery-export/README.md

+2-6
Original file line numberDiff line numberDiff line change
@@ -126,8 +126,6 @@ To install an extension, your project must be on the [Blaze (pay as you go) plan
126126

127127
* Collection path: What is the path of the collection that you would like to export? You may use `{wildcard}` notation to match a subcollection of all documents in a collection (for example: `chatrooms/{chatid}/posts`). Parent Firestore Document IDs from `{wildcards}` can be returned in `path_params` as a JSON formatted string.
128128

129-
* Enable logging failed exports: If enabled, the extension will log event exports that failed to enqueue to Cloud Logging, to mitigate data loss.
130-
131129
* Enable Wildcard Column field with Parent Firestore Document IDs: If enabled, creates a column containing a JSON object of all wildcard ids from a documents path.
132130

133131
* Dataset ID: What ID would you like to use for your BigQuery dataset? This extension will create the dataset, if it doesn't already exist.
@@ -158,18 +156,16 @@ essential for the script to insert data into an already partitioned table.)
158156

159157
* Exclude old data payloads: If enabled, table rows will never contain old data (document snapshot before the Firestore onDocumentUpdate event: `change.before.data()`). The reduction in data should be more performant, and avoid potential resource limitations.
160158

161-
* Use Collection Group query: Do you want to use a [collection group](https://firebase.google.com/docs/firestore/query-data/queries#collection-group-query) query for importing existing documents? You have to enable collectionGroup query if your import path contains subcollections. Warning: A collectionGroup query will target every collection in your Firestore project that matches the 'Existing documents collection'. For example, if you have 10,000 documents with a subcollection named: landmarks, this will query every document in 10,000 landmarks collections.
162-
163159
* Cloud KMS key name: Instead of Google managing the key encryption keys that protect your data, you control and manage key encryption keys in Cloud KMS. If this parameter is set, the extension will specify the KMS key name when creating the BQ table. See the PREINSTALL.md for more details.
164160

161+
* Maximum number of enqueue attempts: This parameter will set the maximum number of attempts to enqueue a document to cloud tasks for export to BigQuery. If the maximum number of attempts is reached, the failed export will be handled according to the `LOG_FAILED_EXPORTS` parameter.
162+
165163

166164

167165
**Cloud Functions:**
168166

169167
* **fsexportbigquery:** Listens for document changes in your specified Cloud Firestore collection, then exports the changes into BigQuery.
170168

171-
* **fsimportexistingdocs:** Imports existing documents from the specified collection into BigQuery. Imported documents will have a special changelog with the operation of `IMPORT` and the timestamp of epoch.
172-
173169
* **syncBigQuery:** A task-triggered function that gets called on BigQuery sync
174170

175171
* **initBigQuerySync:** Runs configuration for syncing with BigQuery

firestore-bigquery-export/extension.yaml

+36-95
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
# limitations under the License.
1414

1515
name: firestore-bigquery-export
16-
version: 0.1.55
16+
version: 0.1.56
1717
specVersion: v1beta
1818

1919
displayName: Stream Firestore to BigQuery
@@ -48,6 +48,9 @@ roles:
4848
- role: datastore.user
4949
reason: Allows the extension to write updates to the database.
5050

51+
# - role: storage.objectAdmin
52+
# reason: Allows the extension to create objects in the storage bucket.
53+
5154
resources:
5255
- name: fsexportbigquery
5356
type: firebaseextensions.v1beta.function
@@ -60,19 +63,6 @@ resources:
6063
eventType: providers/cloud.firestore/eventTypes/document.write
6164
resource: projects/${param:PROJECT_ID}/databases/(default)/documents/${param:COLLECTION_PATH}/{documentId}
6265

63-
- name: fsimportexistingdocs
64-
type: firebaseextensions.v1beta.function
65-
description:
66-
Imports existing documents from the specified collection into BigQuery.
67-
Imported documents will have a special changelog with the operation of
68-
`IMPORT` and the timestamp of epoch.
69-
properties:
70-
runtime: nodejs18
71-
taskQueueTrigger:
72-
retryConfig:
73-
maxAttempts: 15
74-
minBackoffSeconds: 60
75-
7666
- name: syncBigQuery
7767
type: firebaseextensions.v1beta.function
7868
description: >-
@@ -206,19 +196,6 @@ params:
206196
default: posts
207197
required: true
208198

209-
- param: LOG_FAILED_EXPORTS
210-
label: Enable logging failed exports
211-
description: >-
212-
If enabled, the extension will log event exports that failed to enqueue to
213-
Cloud Logging, to mitigate data loss.
214-
type: select
215-
options:
216-
- label: Yes
217-
value: yes
218-
- label: No
219-
value: no
220-
required: true
221-
222199
- param: WILDCARD_IDS
223200
label: Enable Wildcard Column field with Parent Firestore Document IDs
224201
description: >-
@@ -409,74 +386,6 @@ params:
409386
- label: No
410387
value: no
411388

412-
# - param: DO_BACKFILL
413-
# label: Import existing Firestore documents into BigQuery?
414-
# description: >-
415-
# Do you want to import existing documents from your Firestore collection
416-
# into BigQuery? These documents will have each have a special changelog
417-
# with the operation of `IMPORT` and the timestamp of epoch. This ensures
418-
# that any operation on an imported document supersedes the import record.
419-
# type: select
420-
# required: true
421-
# default: no
422-
# options:
423-
# - label: Yes
424-
# value: yes
425-
# - label: No
426-
# value: no
427-
428-
# - param: IMPORT_COLLECTION_PATH
429-
# label: Existing Documents Collection
430-
# description: >-
431-
# Specify the path of the Cloud Firestore Collection you would like to
432-
# import from. This may or may not be the same Collection for which you plan
433-
# to mirror changes. If you want to use a collectionGroup query, provide the
434-
# collection name value here, and set 'Use Collection Group query' to true.
435-
# You may use `{wildcard}` notation with an enabled collectionGroup query to
436-
# match a subcollection of all documents in a collection (e.g.,
437-
# `chatrooms/{chatid}/posts`).
438-
# type: string
439-
# validationRegex: "^[^/]+(/[^/]+/[^/]+)*$"
440-
# validationErrorMessage:
441-
# Firestore collection paths must be an odd number of segments separated by
442-
# slashes, e.g. "path/to/collection".
443-
# example: posts
444-
# required: false
445-
446-
- param: USE_COLLECTION_GROUP_QUERY
447-
label: Use Collection Group query
448-
description: >-
449-
Do you want to use a [collection
450-
group](https://firebase.google.com/docs/firestore/query-data/queries#collection-group-query)
451-
query for importing existing documents? You have to enable collectionGroup
452-
query if your import path contains subcollections. Warning: A
453-
collectionGroup query will target every collection in your Firestore
454-
project that matches the 'Existing documents collection'. For example, if
455-
you have 10,000 documents with a subcollection named: landmarks, this will
456-
query every document in 10,000 landmarks collections.
457-
type: select
458-
default: no
459-
options:
460-
- label: Yes
461-
value: yes
462-
- label: No
463-
value: no
464-
465-
# - param: DOCS_PER_BACKFILL
466-
# label: Docs per backfill
467-
# description: >-
468-
# When importing existing documents, how many should be imported at once?
469-
# The default value of 200 should be ok for most users. If you are using a
470-
# transform function or have very large documents, you may need to set this
471-
# to a lower number. If the lifecycle event function times out, lower this
472-
# value.
473-
# type: string
474-
# example: 200
475-
# validationRegex: "^[1-9][0-9]*$"
476-
# validationErrorMessage: Must be a postive integer.
477-
# default: 200
478-
# required: true
479-
480389
- param: KMS_KEY_NAME
481390
label: Cloud KMS key name
482391
description: >-
@@ -491,6 +400,38 @@ params:
491400
'projects/PROJECT_NAME/locations/KEY_RING_LOCATION/keyRings/KEY_RING_ID/cryptoKeys/KEY_ID'.
492401
required: false
493402

403+
- param: MAX_ENQUEUE_ATTEMPTS
404+
label: Maximum number of enqueue attempts
405+
description: >-
406+
This parameter will set the maximum number of attempts to enqueue a
407+
document to cloud tasks for export to BigQuery. If the maximum number of
408+
attempts is reached, the failed export will be handled according to the
409+
`LOG_FAILED_EXPORTS` parameter.
410+
type: string
411+
validationRegex: ^(10|[1-9])$
412+
validationErrorMessage: Please select an integer between 1 and 10
413+
default: 3
414+
415+
# - param: BACKUP_TO_GCS
416+
# label: Backup to GCS
417+
# description: >-
418+
# If enabled, failed BigQuery updates will be written to a GCS bucket.
419+
# type: select
420+
# options:
421+
# - label: Yes
422+
# value: yes
423+
# - label: No
424+
# value: no
425+
# default: no
426+
# required: true
427+
428+
# - param: BACKUP_GCS_BUCKET
429+
# label: Backup GCS Bucket Name
430+
# description: >-
431+
# This (optional) parameter will allow you to specify a GCS bucket for which
432+
# failed BigQuery updates will be written to, if this feature is enabled.
433+
# type: string
434+
494435
events:
495436
- type: firebase.extensions.firestore-counter.v1.onStart
496437
description:

firestore-bigquery-export/functions/__tests__/__snapshots__/config.test.ts.snap

+4-4
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,10 @@
22

33
exports[`extension config config loaded from environment variables 1`] = `
44
Object {
5+
"backupBucketName": "undefined.appspot.com",
56
"backupCollectionId": undefined,
7+
"backupDir": "_firestore-bigquery-export",
8+
"backupToGCS": false,
69
"bqProjectId": undefined,
710
"clustering": Array [
811
"data",
@@ -12,23 +15,20 @@ Object {
1215
"databaseId": "(default)",
1316
"datasetId": "my_dataset",
1417
"datasetLocation": undefined,
15-
"doBackfill": false,
16-
"docsPerBackfill": 200,
1718
"excludeOldData": false,
1819
"importCollectionPath": undefined,
1920
"initialized": false,
2021
"instanceId": undefined,
2122
"kmsKeyName": "test",
2223
"location": "us-central1",
23-
"logFailedExportData": false,
2424
"maxDispatchesPerSecond": 10,
25+
"maxEnqueueAttempts": 3,
2526
"tableId": "my_table",
2627
"timePartitioning": null,
2728
"timePartitioningField": undefined,
2829
"timePartitioningFieldType": undefined,
2930
"timePartitioningFirestoreField": undefined,
3031
"transformFunction": "",
31-
"useCollectionGroupQuery": false,
3232
"useNewSnapshotQuerySyntax": false,
3333
"wildcardIds": false,
3434
}

firestore-bigquery-export/functions/__tests__/e2e.test.ts

+4-4
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,9 @@ import * as admin from "firebase-admin";
22
import { BigQuery } from "@google-cloud/bigquery";
33

44
/** Set defaults */
5-
const bqProjectId = "dev-extensions-testing";
6-
const datasetId = "firestore_export";
7-
const tableId = "bq_e2e_test_raw_changelog";
5+
const bqProjectId = process.env.BQ_PROJECT_ID || "dev-extensions-testing";
6+
const datasetId = process.env.DATASET_ID || "firestore_export";
7+
const tableId = process.env.TABLE_ID || "bq_e2e_test_raw_changelog";
88

99
/** Init resources */
1010
admin.initializeApp({ projectId: bqProjectId });
@@ -34,7 +34,7 @@ describe("e2e", () => {
3434

3535
/** Get the latest record from this table */
3636
const [changeLogQuery] = await bq.createQueryJob({
37-
query: `SELECT * FROM \`${bqProjectId}.${datasetId}.${tableId}\` ORDER BY timestamp DESC \ LIMIT 1`,
37+
query: `SELECT * FROM \`${bqProjectId}.${datasetId}.${tableId}\` ORDER BY timestamp DESC LIMIT 1`,
3838
});
3939

4040
const [rows] = await changeLogQuery.getQueryResults();

firestore-bigquery-export/functions/__tests__/functions.test.ts

+1
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ jest.mock("firebase-admin/functions", () => ({
3737
}));
3838

3939
jest.mock("../src/logs", () => ({
40+
...jest.requireActual("../src/logs"),
4041
start: jest.fn(() =>
4142
logger.log("Started execution of extension with configuration", config)
4243
),

firestore-bigquery-export/functions/package-lock.json

+4-4
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

firestore-bigquery-export/functions/package.json

+8-8
Original file line numberDiff line numberDiff line change
@@ -13,31 +13,31 @@
1313
"author": "Jan Wyszynski <[email protected]>",
1414
"license": "Apache-2.0",
1515
"dependencies": {
16-
"@firebaseextensions/firestore-bigquery-change-tracker": "^1.1.37",
16+
"@firebaseextensions/firestore-bigquery-change-tracker": "^1.1.38",
1717
"@google-cloud/bigquery": "^7.6.0",
1818
"@types/chai": "^4.1.6",
1919
"@types/express-serve-static-core": "4.17.30",
20+
"@types/jest": "29.5.0",
2021
"@types/node": "^20.4.4",
2122
"chai": "^4.2.0",
2223
"firebase-admin": "^12.0.0",
2324
"firebase-functions": "^4.9.0",
2425
"firebase-functions-test": "^0.3.3",
2526
"generate-schema": "^2.6.0",
2627
"inquirer": "^6.4.0",
28+
"jest": "29.5.0",
29+
"jest-config": "29.5.0",
2730
"lodash": "^4.17.14",
2831
"nyc": "^14.0.0",
2932
"rimraf": "^2.6.3",
3033
"sql-formatter": "^2.3.3",
34+
"ts-jest": "29.1.2",
3135
"ts-node": "^9.0.0",
32-
"typescript": "^4.8.4",
33-
"@types/jest": "29.5.0",
34-
"jest": "29.5.0",
35-
"jest-config": "29.5.0",
36-
"ts-jest": "29.1.2"
36+
"typescript": "^4.8.4"
3737
},
3838
"private": true,
3939
"devDependencies": {
40-
"mocked-env": "^1.3.2",
41-
"faker": "^5.1.0"
40+
"faker": "^5.1.0",
41+
"mocked-env": "^1.3.2"
4242
}
4343
}

firestore-bigquery-export/functions/src/config.ts

+8-4
Original file line numberDiff line numberDiff line change
@@ -32,13 +32,10 @@ export function clustering(clusters: string | undefined) {
3232
}
3333

3434
export default {
35-
logFailedExportData: process.env.LOG_FAILED_EXPORTS === "yes",
3635
bqProjectId: process.env.BIGQUERY_PROJECT_ID,
3736
databaseId: "(default)",
3837
collectionPath: process.env.COLLECTION_PATH,
3938
datasetId: process.env.DATASET_ID,
40-
doBackfill: process.env.DO_BACKFILL === "yes",
41-
docsPerBackfill: parseInt(process.env.DOCS_PER_BACKFILL) || 200,
4239
tableId: process.env.TABLE_ID,
4340
location: process.env.LOCATION,
4441
initialized: false,
@@ -63,5 +60,12 @@ export default {
6360
process.env.MAX_DISPATCHES_PER_SECOND || "10"
6461
),
6562
kmsKeyName: process.env.KMS_KEY_NAME,
66-
useCollectionGroupQuery: process.env.USE_COLLECTION_GROUP_QUERY === "yes",
63+
maxEnqueueAttempts: isNaN(parseInt(process.env.MAX_ENQUEUE_ATTEMPTS))
64+
? 3
65+
: parseInt(process.env.MAX_ENQUEUE_ATTEMPTS),
66+
// backup bucket defaults to default firebase cloud storage bucket
67+
backupToGCS: process.env.BACKUP_TO_GCS === "yes" ? true : false,
68+
backupBucketName:
69+
process.env.BACKUP_GCS_BUCKET || `${process.env.PROJECT_ID}.appspot.com`,
70+
backupDir: `_${process.env.INSTANCE_ID || "firestore-bigquery-export"}`,
6771
};

0 commit comments

Comments
 (0)