#!/usr/bin/env python3
from __future__ import annotations
import asyncio
import concurrent.futures
import json
import sys
from copy import deepcopy
import singer
import singer.utils as singer_utils
from singer import metadata, metrics
import tap_salesforce.salesforce
from tap_salesforce.sync import (sync_stream, resume_syncing_bulk_query, get_stream_version)
from tap_salesforce.salesforce import Salesforce
from tap_salesforce.salesforce.exceptions import (
    TapSalesforceException, TapSalesforceQuotaExceededException)
from tap_salesforce.salesforce.credentials import (
    OAuthCredentials,
    PasswordCredentials,
    parse_credentials
)

LOGGER = singer.get_logger()

# the tap requires these keys
REQUIRED_CONFIG_KEYS = ['api_type',
                        'select_fields_by_default']

# and either one of these sets of credentials
# OAuth:
# - client_id
# - client_secret
# - refresh_token
OAUTH_CONFIG_KEYS = OAuthCredentials._fields

# Password:
# - username
# - password
# - security_token
PASSWORD_CONFIG_KEYS = PasswordCredentials._fields

CONFIG = {
    'refresh_token': None,
    'client_id': None,
    'client_secret': None,
    'start_date': None
}

FORCED_FULL_TABLE = {
    'BackgroundOperationResult'  # Does not support ordering by CreatedDate
}
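
# A minimal illustrative config satisfying REQUIRED_CONFIG_KEYS with OAuth
# credentials. The accepted api_type values are defined in
# tap_salesforce.salesforce (conventionally "BULK" or "REST"); treat the
# values below as placeholders, not a canonical example:
#
#     {
#         "api_type": "BULK",
#         "select_fields_by_default": true,
#         "client_id": "...",
#         "client_secret": "...",
#         "refresh_token": "...",
#         "start_date": "2021-01-01T00:00:00Z"
#     }
#
# A password-based config supplies username/password/security_token instead.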


def get_replication_key(sobject_name, fields):
    if sobject_name in FORCED_FULL_TABLE:
        return None

    fields_list = [f['name'] for f in fields]

    if 'SystemModstamp' in fields_list:
        return 'SystemModstamp'
    elif 'LastModifiedDate' in fields_list:
        return 'LastModifiedDate'
    elif 'CreatedDate' in fields_list:
        return 'CreatedDate'
    elif 'LoginTime' in fields_list and sobject_name == 'LoginHistory':
        return 'LoginTime'
    return None
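
# For example, with the precedence implemented above (field lists trimmed to
# the relevant names for illustration):
#
#     get_replication_key('Account', [{'name': 'SystemModstamp'},
#                                     {'name': 'CreatedDate'}])
#     # -> 'SystemModstamp' (preferred over CreatedDate)
#     get_replication_key('BackgroundOperationResult', [{'name': 'CreatedDate'}])
#     # -> None (forced full table)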


def stream_is_selected(mdata):
    return mdata.get((), {}).get('selected', False)


def build_state(raw_state, catalog):
    state = {}

    for catalog_entry in catalog['streams']:
        tap_stream_id = catalog_entry['tap_stream_id']
        catalog_metadata = metadata.to_map(catalog_entry['metadata'])
        replication_method = catalog_metadata.get((), {}).get('replication-method')

        version = singer.get_bookmark(raw_state,
                                      tap_stream_id,
                                      'version')

        # Preserve state that deals with resuming an incomplete bulk job
        if singer.get_bookmark(raw_state, tap_stream_id, 'JobID'):
            job_id = singer.get_bookmark(raw_state, tap_stream_id, 'JobID')
            batches = singer.get_bookmark(raw_state, tap_stream_id, 'BatchIDs')
            current_bookmark = singer.get_bookmark(raw_state, tap_stream_id, 'JobHighestBookmarkSeen')
            state = singer.write_bookmark(state, tap_stream_id, 'JobID', job_id)
            state = singer.write_bookmark(state, tap_stream_id, 'BatchIDs', batches)
            state = singer.write_bookmark(state, tap_stream_id, 'JobHighestBookmarkSeen', current_bookmark)

        if replication_method == 'INCREMENTAL':
            replication_key = catalog_metadata.get((), {}).get('replication-key')
            replication_key_value = singer.get_bookmark(raw_state,
                                                        tap_stream_id,
                                                        replication_key)
            if version is not None:
                state = singer.write_bookmark(
                    state, tap_stream_id, 'version', version)
            if replication_key_value is not None:
                state = singer.write_bookmark(
                    state, tap_stream_id, replication_key, replication_key_value)
        elif replication_method == 'FULL_TABLE' and version is None:
            state = singer.write_bookmark(state, tap_stream_id, 'version', version)

    return state
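
# Sketch of the state this produces for one incremental stream. The stream
# name and values are illustrative; the key layout mirrors the
# singer.write_bookmark calls above:
#
#     {
#         "bookmarks": {
#             "Account": {
#                 "version": 1640000000000,
#                 "SystemModstamp": "2021-12-20T00:00:00.000Z"
#             }
#         }
#     }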


# pylint: disable=undefined-variable
def create_property_schema(field, mdata):
    field_name = field['name']

    if field_name == "Id":
        mdata = metadata.write(
            mdata, ('properties', field_name), 'inclusion', 'automatic')
    else:
        mdata = metadata.write(
            mdata, ('properties', field_name), 'inclusion', 'available')

    # `salesforce` resolves at runtime: importing the tap_salesforce.salesforce
    # submodule above binds it as an attribute of this package's namespace
    # (this module *is* the package), which pylint cannot see - hence the disable.
    property_schema, mdata = salesforce.field_to_property_schema(field, mdata)

    return (property_schema, mdata)


# pylint: disable=too-many-branches,too-many-statements
def do_discover(sf: Salesforce, streams: list[str]):
    """Describes a Salesforce instance's objects and generates a JSON schema for each field."""
    if not streams:
        LOGGER.info("Start discovery for all streams")
        global_description = sf.describe()
        objects_to_discover = {o['name'] for o in global_description['sobjects']}
    else:
        LOGGER.info(f"Start discovery: {streams=}")
        objects_to_discover = streams

    key_properties = ['Id']

    sf_custom_setting_objects = []
    object_to_tag_references = {}

    # For each SF Object describe it, loop its fields and build a schema
    entries = []
    for sobject_name in objects_to_discover:

        # Skip blacklisted SF objects depending on the api_type in use
        # ChangeEvent objects are not queryable via Bulk or REST (undocumented)
        if sobject_name in sf.get_blacklisted_objects() \
                or sobject_name.endswith("ChangeEvent"):
            continue

        sobject_description = sf.describe(sobject_name)

        # Cache customSetting and Tag objects to check for blacklisting after
        # all objects have been described
        if sobject_description.get("customSetting"):
            sf_custom_setting_objects.append(sobject_name)
        elif sobject_name.endswith("__Tag"):
            relationship_field = next(
                (f for f in sobject_description["fields"] if f.get("relationshipName") == "Item"),
                None)
            if relationship_field:
                # Map {"Object":"Object__Tag"}
                object_to_tag_references[relationship_field["referenceTo"][0]] = sobject_name

        fields = sobject_description['fields']
        replication_key = get_replication_key(sobject_name, fields)

        unsupported_fields = set()
        properties = {}
        mdata = metadata.new()

        found_id_field = False

        # Loop over the object's fields
        for f in fields:
            field_name = f['name']
            field_type = f['type']

            if field_name == "Id":
                found_id_field = True

            property_schema, mdata = create_property_schema(
                f, mdata)

            # Compound Address fields cannot be queried by the Bulk API
            if f['type'] in ("address", "location") and sf.api_type == tap_salesforce.salesforce.BULK_API_TYPE:
                unsupported_fields.add(
                    (field_name, 'cannot query compound address fields with bulk API'))

            # we haven't been able to observe any records with a json field, so we
            # are marking it as unavailable until we have an example to work with
            if f['type'] == "json":
                unsupported_fields.add(
                    (field_name, 'do not currently support json fields - please contact support'))

            # Blacklisted fields are dependent on the api_type being used
            field_pair = (sobject_name, field_name)
            if field_pair in sf.get_blacklisted_fields():
                unsupported_fields.add(
                    (field_name, sf.get_blacklisted_fields()[field_pair]))

            inclusion = metadata.get(
                mdata, ('properties', field_name), 'inclusion')

            if sf.select_fields_by_default and inclusion != 'unsupported':
                mdata = metadata.write(
                    mdata, ('properties', field_name), 'selected-by-default', True)

            properties[field_name] = property_schema

            # *History objects' OldValue/NewValue fields hold arbitrary types;
            # emit them as nullable strings
            if field_name in ('OldValue', 'NewValue') and 'History' in sobject_name:
                properties[field_name] = {'type': ['null', 'string']}

        if replication_key:
            mdata = metadata.write(
                mdata, ('properties', replication_key), 'inclusion', 'automatic')

        # There are cases where compound fields are referenced by the associated
        # subfields but are not actually present in the field list
        field_name_set = {f['name'] for f in fields}
        filtered_unsupported_fields = [f for f in unsupported_fields if f[0] in field_name_set]
        missing_unsupported_field_names = [f[0] for f in unsupported_fields if f[0] not in field_name_set]

        if missing_unsupported_field_names:
            LOGGER.info("Ignoring the following unsupported fields for object %s as they are missing from the field list: %s",
                        sobject_name,
                        ', '.join(sorted(missing_unsupported_field_names)))

        if filtered_unsupported_fields:
            LOGGER.info("Not syncing the following unsupported fields for object %s: %s",
                        sobject_name,
                        ', '.join(sorted([k for k, _ in filtered_unsupported_fields])))

        # Salesforce Objects are skipped when they do not have an Id field
        if not found_id_field:
            LOGGER.info(
                "Skipping Salesforce Object %s, as it has no Id field",
                sobject_name)
            continue

        # Any property added to unsupported_fields has metadata generated and
        # removed
        for prop, description in filtered_unsupported_fields:
            if metadata.get(mdata, ('properties', prop),
                            'selected-by-default'):
                metadata.delete(
                    mdata, ('properties', prop), 'selected-by-default')

            mdata = metadata.write(
                mdata, ('properties', prop), 'unsupported-description', description)
            mdata = metadata.write(
                mdata, ('properties', prop), 'inclusion', 'unsupported')

        if replication_key:
            mdata = metadata.write(
                mdata, (), 'valid-replication-keys', [replication_key])
            mdata = metadata.write(
                mdata, (), 'replication-key', replication_key
            )
            mdata = metadata.write(
                mdata, (), 'replication-method', "INCREMENTAL"
            )
        else:
            mdata = metadata.write(
                mdata,
                (),
                'forced-replication-method',
                {
                    'replication-method': 'FULL_TABLE',
                    'reason': 'No replication keys found from the Salesforce API'})

        mdata = metadata.write(mdata, (), 'table-key-properties', key_properties)

        schema = {
            'type': 'object',
            'additionalProperties': False,
            'properties': properties
        }

        entry = {
            'stream': sobject_name,
            'tap_stream_id': sobject_name,
            'schema': schema,
            'metadata': metadata.to_list(mdata)
        }

        entries.append(entry)

    # For each custom setting field, remove its associated tag from entries
    # See Blacklisting.md for more information
    unsupported_tag_objects = [object_to_tag_references[f]
                               for f in sf_custom_setting_objects if f in object_to_tag_references]
    if unsupported_tag_objects:
        LOGGER.info(
            "Skipping the following Tag objects, Tags on Custom Settings Salesforce objects "
            "are not supported by the Bulk API:")
        LOGGER.info(unsupported_tag_objects)
        entries = [e for e in entries if e['stream']
                   not in unsupported_tag_objects]

    result = {'streams': entries}
    json.dump(result, sys.stdout, indent=4)


def is_object_type(property_schema):
    """
    Return True if the JSON Schema type is an object, or None if detection fails.

    This code is based on the Meltano SDK:
    https://github.com/meltano/sdk/blob/c9c0967b0caca51fe7c87082f9e7c5dd54fa5dfa/singer_sdk/helpers/_typing.py#L50
    """
    if "anyOf" not in property_schema and "type" not in property_schema:
        return None  # Could not detect data type
    for property_type in property_schema.get("anyOf", [property_schema.get("type")]):
        if "object" in property_type or property_type == "object":
            return True
    return False
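
# Illustrative results of the detection rules above:
#
#     is_object_type({"type": "object"})            # True
#     is_object_type({"type": ["null", "object"]})  # True
#     is_object_type({"type": "string"})            # False
#     is_object_type({})                            # None (cannot detect)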


def is_property_selected(  # noqa: C901  # ignore 'too complex'
    stream_name,
    metadata_map,
    breadcrumb
) -> bool:
    """
    Return True if the property is selected for extract.

    Breadcrumb of `[]` or `None` indicates the stream itself. Otherwise, the
    breadcrumb is the path to a property within the stream.

    The code is based on the Meltano SDK:
    https://github.com/meltano/sdk/blob/c9c0967b0caca51fe7c87082f9e7c5dd54fa5dfa/singer_sdk/helpers/_catalog.py#L63
    """
    breadcrumb = breadcrumb or ()
    if isinstance(breadcrumb, str):
        breadcrumb = (breadcrumb,)

    if not metadata_map:
        # Default to true if no metadata to say otherwise
        return True

    md_entry = metadata_map.get(breadcrumb, {})
    parent_value = None

    if len(breadcrumb) > 0:
        parent_breadcrumb = tuple(list(breadcrumb)[:-2])
        parent_value = is_property_selected(
            stream_name, metadata_map, parent_breadcrumb
        )
    if parent_value is False:
        return parent_value

    selected = md_entry.get("selected")
    selected_by_default = md_entry.get("selected-by-default")
    inclusion = md_entry.get("inclusion")

    if inclusion == "unsupported":
        if selected is True:
            LOGGER.debug(
                "Property '%s' was selected but is not supported. "
                "Ignoring selected==True input.",
                ":".join(breadcrumb),
            )
        return False

    if inclusion == "automatic":
        if selected is False:
            LOGGER.debug(
                "Property '%s' was deselected while also set "
                "for automatic inclusion. Ignoring selected==False input.",
                ":".join(breadcrumb),
            )
        return True

    if selected is not None:
        return selected

    if selected_by_default is not None:
        return selected_by_default

    LOGGER.debug(
        "Selection metadata omitted for '%s':'%s'. "
        "Using parent value of selected=%s.",
        stream_name,
        breadcrumb,
        parent_value,
    )
    return parent_value or False
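
# A sketch of the precedence above against a hand-built metadata_map
# (breadcrumb tuple -> metadata dict); the stream and field names are
# illustrative:
#
#     md = {
#         (): {"selected": True},
#         ("properties", "Id"): {"inclusion": "automatic"},
#         ("properties", "Fax"): {"inclusion": "available",
#                                 "selected-by-default": True},
#         ("properties", "Raw"): {"inclusion": "unsupported", "selected": True},
#     }
#     is_property_selected("Account", md, ("properties", "Id"))   # True
#     is_property_selected("Account", md, ("properties", "Fax"))  # True (by default)
#     is_property_selected("Account", md, ("properties", "Raw"))  # False (unsupported wins)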


def pop_deselected_schema(
    schema,
    stream_name,
    breadcrumb,
    metadata_map
):
    """Remove anything from schema that is not selected.

    Walk through schema, starting at the index in breadcrumb, recursively updating in
    place.

    This code is based on https://github.com/meltano/sdk/blob/c9c0967b0caca51fe7c87082f9e7c5dd54fa5dfa/singer_sdk/helpers/_catalog.py#L146
    """
    for property_name, val in list(schema.get("properties", {}).items()):
        property_breadcrumb = tuple(
            list(breadcrumb) + ["properties", property_name]
        )
        selected = is_property_selected(
            stream_name, metadata_map, property_breadcrumb
        )
        LOGGER.info("%s.%s - selected=%s", stream_name, property_name, selected)
        if not selected:
            schema["properties"].pop(property_name)
            continue

        if is_object_type(val):
            # call recursively in case any subproperties are deselected.
            pop_deselected_schema(
                val, stream_name, property_breadcrumb, metadata_map
            )
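
# Continuing the illustrative metadata above, a call such as
#
#     pop_deselected_schema(schema, "Account", (), md)
#
# with schema = {"properties": {"Id": {...}, "Raw": {...}}} removes "Raw" in
# place (unsupported properties are never selected) and keeps "Id".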


async def sync_catalog_entry(sf, catalog_entry, state):
    stream_version = get_stream_version(catalog_entry, state)
    stream = catalog_entry['stream']
    stream_alias = catalog_entry.get('stream_alias')
    stream_name = catalog_entry["tap_stream_id"]
    activate_version_message = singer.ActivateVersionMessage(
        stream=(stream_alias or stream), version=stream_version)

    catalog_metadata = metadata.to_map(catalog_entry['metadata'])
    replication_key = catalog_metadata.get((), {}).get('replication-key')

    mdata = metadata.to_map(catalog_entry['metadata'])

    if not stream_is_selected(mdata):
        LOGGER.debug("%s: Skipping - not selected", stream_name)
        return

    LOGGER.info("%s: Starting", stream_name)

    singer.write_state(state)
    key_properties = metadata.to_map(catalog_entry['metadata']).get((), {}).get('table-key-properties')

    # Filter the schema for selected fields
    schema = deepcopy(catalog_entry['schema'])
    pop_deselected_schema(schema, stream_name, (), mdata)

    singer.write_schema(
        stream,
        schema,
        key_properties,
        replication_key,
        stream_alias)

    loop = asyncio.get_running_loop()
    job_id = singer.get_bookmark(state, catalog_entry['tap_stream_id'], 'JobID')
    if job_id:
        with metrics.record_counter(stream) as counter:
            LOGGER.info("Found JobID from previous Bulk Query. Resuming sync for job: %s", job_id)
            # Resuming a sync should clear out the remaining state once finished
            await loop.run_in_executor(None, resume_syncing_bulk_query, sf, catalog_entry, job_id, state, counter)
            LOGGER.info("Completed sync for %s", stream_name)
            state.get('bookmarks', {}).get(catalog_entry['tap_stream_id'], {}).pop('JobID', None)
            state.get('bookmarks', {}).get(catalog_entry['tap_stream_id'], {}).pop('BatchIDs', None)
            bookmark = state.get('bookmarks', {}).get(catalog_entry['tap_stream_id'], {}).pop('JobHighestBookmarkSeen', None)
            state = singer.write_bookmark(
                state,
                catalog_entry['tap_stream_id'],
                replication_key,
                bookmark)
            singer.write_state(state)
    else:
        state_msg_threshold = CONFIG.get('state_message_threshold', 1000)

        # Tables with a replication_key or an empty bookmark will emit an
        # activate_version at the beginning of their sync
        bookmark_is_empty = state.get('bookmarks', {}).get(
            catalog_entry['tap_stream_id']) is None

        if replication_key or bookmark_is_empty:
            singer.write_message(activate_version_message)
            state = singer.write_bookmark(state,
                                          catalog_entry['tap_stream_id'],
                                          'version',
                                          stream_version)
        await loop.run_in_executor(None, sync_stream, sf, catalog_entry, state, state_msg_threshold)
        LOGGER.info("Completed sync for %s", stream_name)


def do_sync(sf, catalog, state):
    LOGGER.info("Starting sync")

    max_workers = CONFIG.get('max_workers', 8)
    executor = concurrent.futures.ThreadPoolExecutor(max_workers=max_workers)
    loop = asyncio.get_event_loop()
    loop.set_default_executor(executor)
    try:
        streams_to_sync = catalog["streams"]
        # Schedule one task for each catalog entry to be extracted
        # and run them concurrently.
        sync_tasks = (sync_catalog_entry(sf, catalog_entry, state)
                      for catalog_entry in streams_to_sync)
        tasks = asyncio.gather(*sync_tasks)
        loop.run_until_complete(tasks)
    finally:
        loop.run_until_complete(loop.shutdown_asyncgens())
        loop.close()

    singer.write_state(state)
    LOGGER.info("Finished sync")


def main_impl():
    args = singer_utils.parse_args(REQUIRED_CONFIG_KEYS)
    CONFIG.update(args.config)

    credentials = parse_credentials(CONFIG)
    sf = None
    try:
        sf = Salesforce(
            credentials=credentials,
            quota_percent_total=CONFIG.get('quota_percent_total'),
            quota_percent_per_run=CONFIG.get('quota_percent_per_run'),
            is_sandbox=CONFIG.get('is_sandbox'),
            select_fields_by_default=CONFIG.get('select_fields_by_default'),
            default_start_date=CONFIG.get('start_date'),
            api_type=CONFIG.get('api_type'))
        sf.login()

        if args.discover:
            do_discover(sf, CONFIG.get("streams_to_discover", []))
        elif args.properties or args.catalog:
            catalog = args.properties or args.catalog.to_dict()
            state = build_state(args.state, catalog)
            do_sync(sf, catalog, state)
    finally:
        if sf:
            if sf.rest_requests_attempted > 0:
                LOGGER.debug(
                    "This job used %s REST requests towards the Salesforce quota.",
                    sf.rest_requests_attempted)
            if sf.jobs_completed > 0:
                LOGGER.debug(
                    "Replication used %s Bulk API jobs towards the Salesforce quota.",
                    sf.jobs_completed)
            if sf.auth.login_timer:
                sf.auth.login_timer.cancel()


def main():
    try:
        main_impl()
    except TapSalesforceQuotaExceededException as e:
        LOGGER.critical(e)
        sys.exit(2)
    except TapSalesforceException as e:
        LOGGER.critical(e)
        sys.exit(1)
    except Exception as e:
        LOGGER.critical(e)
        raise
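
# Typical Singer invocations, assuming the console script installed for this
# package is named `tap-salesforce` (check setup.py for the actual entry point):
#
#     tap-salesforce --config config.json --discover > catalog.json
#     tap-salesforce --config config.json --catalog catalog.json --state state.json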