From 44c39bc578722980c883638e15960b3478bd1313 Mon Sep 17 00:00:00 2001 From: satyendra101 Date: Wed, 17 Jun 2026 09:07:30 +0000 Subject: [PATCH 1/4] SAC-31234: Exlcude un-auth streams from catalog while discovery --- CHANGELOG.md | 3 + setup.py | 2 +- tap_outbrain/__init__.py | 17 +- tap_outbrain/client.py | 13 +- tap_outbrain/discover.py | 70 +++++++- tap_outbrain/streams.py | 61 ++++++- tests/unittests/test_catalog.py | 6 +- tests/unittests/test_discover.py | 267 ++++++++++++++++++++++++++++--- tests/unittests/test_init.py | 7 +- 9 files changed, 407 insertions(+), 39 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a873bfd..b2cbcd3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,8 @@ # Changelog +## 1.2.0 + * Exclude un-authorized streams from the catalog during discovery. [#TBD](https://github.com/singer-io/tap-outbrain/pull/TBD) + ## 1.1.0 * Upgrade Python to 3.12 in CircleCI [#30](https://github.com/singer-io/tap-outbrain/pull/30) * Upgrade `singer-python`, `requests` and `python-dateutil` to latest version diff --git a/setup.py b/setup.py index 7860fae..3a67668 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ from setuptools import setup, find_packages setup(name="tap-outbrain", - version="1.1.0", + version="1.2.0", description="Singer.io tap for extracting data from the Outbrain API", author="Fishtown Analytics", url="http://singer.io", diff --git a/tap_outbrain/__init__.py b/tap_outbrain/__init__.py index 179577c..7a6cae6 100755 --- a/tap_outbrain/__init__.py +++ b/tap_outbrain/__init__.py @@ -294,12 +294,14 @@ def sync_campaigns(state, access_token, account_id, selected_streams): LOGGER.info('Done!') -def do_discover(): + +def do_discover(client): LOGGER.info("Starting discovery") - catalog = discover() + catalog = discover(client) json.dump(catalog.to_dict(), sys.stdout, indent=2) LOGGER.info("Finished discover") + def do_sync(catalog: singer.Catalog, config: Dict, state): #pylint: disable=global-statement global DEFAULT_START_DATE @@ -355,7 +357,16 @@ def main_impl(): 'start_date']) if args.discover: - do_discover() + config = args.config + access_token = config.get('access_token') or generate_token( + config.get('username'), config.get('password') + ) + if access_token is None: + LOGGER.fatal("Failed to generate a new access token for discover mode.") + raise RuntimeError + + client = OutbrainClient(config={**config, 'access_token': access_token}) + do_discover(client) elif args.catalog: state = args.state or DEFAULT_STATE do_sync(args.catalog, args.config, state) diff --git a/tap_outbrain/client.py b/tap_outbrain/client.py index 9ff1f22..37f1773 100644 --- a/tap_outbrain/client.py +++ b/tap_outbrain/client.py @@ -9,15 +9,22 @@ LOGGER = singer.get_logger() SESSION = requests.Session() +OUTBRAIN_API_BASE = 'https://api.outbrain.com/amplify/v0.1' + class Server429Error(Exception): pass +class OutbrainForbiddenError(Exception): + pass + + class OutbrainClient: - def __init__(self): + def __init__(self, config=None): self._retry_after = RETRY_RATE_LIMIT_MS / 1000.0 # Conversion to seconds + self.config = config or {} def _rate_limit_backoff(self): """ @@ -71,6 +78,10 @@ def _call(): self._retry_after = RETRY_RATE_LIMIT_MS self._retry_after /= 1000.0 # For miliseconds conversion to seconds raise Server429Error("Rate limit exceeded") + elif resp.status_code == 403: + raise OutbrainForbiddenError( + f"HTTP-error-code: 403, Error: {resp.content!r}" + ) elif resp.status_code >= 400: LOGGER.error( f"{method} {req.url} [{resp.status_code} – {resp.content!r}]" diff --git a/tap_outbrain/discover.py b/tap_outbrain/discover.py index aebaeaf..79bb43d 100644 --- a/tap_outbrain/discover.py +++ b/tap_outbrain/discover.py @@ -2,15 +2,79 @@ from singer import metadata from singer.catalog import Catalog, CatalogEntry, Schema +from tap_outbrain.client import OutbrainForbiddenError from tap_outbrain.schema import get_schemas +from tap_outbrain.streams import STREAMS LOGGER = singer.get_logger() -def discover() -> Catalog: - """Run the discovery mode, prepare the catalog file and return the - catalog.""" +def _apply_access_checks(client, schemas: dict, field_metadata: dict) -> None: + """ + Probe each stream for read access and remove inaccessible streams + (and their children) from *schemas* and *field_metadata* in place. + + Note: ``check_access()`` always returns ``True`` for child streams, so + this loop effectively identifies only inaccessible parent streams by + design. Child stream removal is handled separately by + ``_prune_inaccessible_children()``. + + Raises ``OutbrainForbiddenError`` if no parent streams are accessible. + """ + inaccessible_streams = [ + stream_name + for stream_name, stream_cls in STREAMS.items() + if stream_name in schemas + and not stream_cls(client=client).check_access() + ] + + for stream_name in inaccessible_streams: + schemas.pop(stream_name, None) + field_metadata.pop(stream_name, None) + + _prune_inaccessible_children(schemas, field_metadata) + + if not schemas: + raise OutbrainForbiddenError( + "HTTP-error-code: 403, Error: The credentials do not have" + " 'read' access to any supported streams. Please re-check configuration." + ) + elif inaccessible_streams: + LOGGER.warning( + "No 'read' access to stream(s): %s. Excluded from catalog.", + ", ".join(inaccessible_streams), + ) + + +def _prune_inaccessible_children(schemas: dict, field_metadata: dict) -> None: + """ + Remove child streams from the catalog whose parent stream was excluded. + Mutates *schemas* and *field_metadata* in place. + """ + for name, stream_cls in list(STREAMS.items()): + if name in schemas and stream_cls.parent and stream_cls.parent not in schemas: + LOGGER.warning( + "Stream '%s' excluded from catalog because its parent" + " stream '%s' is not accessible.", + name, + stream_cls.parent, + ) + schemas.pop(name, None) + field_metadata.pop(name, None) + + +def discover(client) -> Catalog: + """ + Run the discovery mode, prepare the catalog file and return the catalog. + + When *client* is provided, access to each stream is verified using the + supplied ``OutbrainClient`` and streams the credentials cannot read are + excluded from the returned catalog. + """ schemas, field_metadata = get_schemas() + + _apply_access_checks(client, schemas, field_metadata) + catalog = Catalog([]) for stream_name, schema_dict in schemas.items(): try: diff --git a/tap_outbrain/streams.py b/tap_outbrain/streams.py index 791ca6c..ad923a5 100644 --- a/tap_outbrain/streams.py +++ b/tap_outbrain/streams.py @@ -1,17 +1,74 @@ -class Campaign: +import singer + +from tap_outbrain.client import OUTBRAIN_API_BASE, OutbrainForbiddenError + +LOGGER = singer.get_logger() + + +class BaseStream: + """ + Abstract base class for all tap-outbrain streams. + + Subclasses set class-level attributes describing the stream and may + override ``check_access()`` to probe the appropriate API endpoint. + Child streams (those with ``parent`` set) always return ``True`` from + the default ``check_access()`` implementation; their accessibility is + governed by their parent stream's check. + """ + + name = None + key_properties = [] + replication_keys = None + replication_method = None + parent = None + + def __init__(self, client=None): + self.client = client + + def check_access(self) -> bool: + """ + Verify that the API credentials have read access to this stream. + Returns True if accessible, False if a 403 Forbidden error is raised. + Child streams always return True (access is governed by the parent check). + """ + return True + + +class Campaign(BaseStream): name = "campaign" key_properties = ["id"] replication_keys = None replication_method = "FULL_TABLE" + def check_access(self) -> bool: + """ + Probe the campaigns endpoint to verify read access. + Returns False when the API responds with HTTP 403 Forbidden. + """ + account_id = self.client.config.get("account_id") + access_token = self.client.config.get("access_token") + headers = {"OB-TOKEN-V1": access_token} + url = f"{OUTBRAIN_API_BASE}/marketers/{account_id}/campaigns" + try: + self.client.make_request("GET", url, headers=headers, params={"limit": 1}) + return True + except OutbrainForbiddenError as exc: + LOGGER.warning( + "Permission Error: Stream '%s' - %s", + self.__class__.__name__, + exc, + ) + return False + -class CampaignPerformance: +class CampaignPerformance(BaseStream): name = "campaign_performance" key_properties = ["campaignId", "fromDate"] bookmark_properties = ["fromDate"] replication_keys = "fromDate" replication_method = "INCREMENTAL" parent = "campaign" + # check_access() inherited from BaseStream always returns True for child streams STREAMS = { diff --git a/tests/unittests/test_catalog.py b/tests/unittests/test_catalog.py index 11e64ad..eee4b0d 100644 --- a/tests/unittests/test_catalog.py +++ b/tests/unittests/test_catalog.py @@ -124,11 +124,11 @@ def mock_file_content(filename, mode): def test_streams_configuration(self): """Test that STREAMS configuration has correct parent attribute.""" - # Verify Campaign class does not have parent attribute - self.assertFalse(hasattr(STREAMS['campaign'], 'parent')) + # Campaign is a root stream — parent is None (inherited from BaseStream) + self.assertIsNone(STREAMS['campaign'].parent) # Verify CampaignPerformance class has parent attribute set correctly - self.assertTrue(hasattr(STREAMS['campaign_performance'], 'parent')) + self.assertIsNotNone(STREAMS['campaign_performance'].parent) self.assertEqual(STREAMS['campaign_performance'].parent, 'campaign') @patch('builtins.open', new_callable=mock_open) diff --git a/tests/unittests/test_discover.py b/tests/unittests/test_discover.py index 66f9a2e..c789420 100644 --- a/tests/unittests/test_discover.py +++ b/tests/unittests/test_discover.py @@ -1,11 +1,13 @@ """Unit tests for tap_outbrain/discover.py""" import unittest -from unittest.mock import patch +from unittest.mock import MagicMock, patch from singer.catalog import Catalog, CatalogEntry -from tap_outbrain.discover import discover +from tap_outbrain.client import OutbrainForbiddenError +from tap_outbrain.discover import (_apply_access_checks, + _prune_inaccessible_children, discover) _MOCK_CAMPAIGN_SCHEMA = { "type": "object", @@ -48,37 +50,40 @@ def _build_mock_metadata(stream_name): class TestDiscover(unittest.TestCase): + @patch('tap_outbrain.discover._apply_access_checks') @patch('tap_outbrain.discover.get_schemas') - def test_returns_catalog_instance(self, mock_get_schemas): - """discover() returns a singer Catalog object.""" + def test_returns_catalog_instance(self, mock_get_schemas, mock_access_checks): + """discover(client) returns a singer Catalog object.""" mock_get_schemas.return_value = ( _MOCK_SCHEMAS, {s: _build_mock_metadata(s) for s in _MOCK_SCHEMAS}, ) - result = discover() + result = discover(MagicMock()) self.assertIsInstance(result, Catalog) + @patch('tap_outbrain.discover._apply_access_checks') @patch('tap_outbrain.discover.get_schemas') - def test_catalog_contains_all_streams(self, mock_get_schemas): + def test_catalog_contains_all_streams(self, mock_get_schemas, mock_access_checks): """Catalog contains one entry per schema returned by get_schemas.""" mock_get_schemas.return_value = ( _MOCK_SCHEMAS, {s: _build_mock_metadata(s) for s in _MOCK_SCHEMAS}, ) - result = discover() + result = discover(MagicMock()) stream_names = [e.stream for e in result.streams] self.assertIn('campaign', stream_names) self.assertIn('campaign_performance', stream_names) self.assertEqual(len(result.streams), 2) + @patch('tap_outbrain.discover._apply_access_checks') @patch('tap_outbrain.discover.get_schemas') - def test_catalog_entry_has_correct_key_properties(self, mock_get_schemas): + def test_catalog_entry_has_correct_key_properties(self, mock_get_schemas, mock_access_checks): """Each CatalogEntry carries the correct key_properties from metadata.""" mock_get_schemas.return_value = ( _MOCK_SCHEMAS, {s: _build_mock_metadata(s) for s in _MOCK_SCHEMAS}, ) - result = discover() + result = discover(MagicMock()) entry_by_name = {e.stream: e for e in result.streams} self.assertEqual(entry_by_name['campaign'].key_properties, ['id']) @@ -87,25 +92,27 @@ def test_catalog_entry_has_correct_key_properties(self, mock_get_schemas): ['campaignId', 'fromDate'], ) + @patch('tap_outbrain.discover._apply_access_checks') @patch('tap_outbrain.discover.get_schemas') - def test_tap_stream_id_matches_stream_name(self, mock_get_schemas): + def test_tap_stream_id_matches_stream_name(self, mock_get_schemas, mock_access_checks): """tap_stream_id should equal the stream name for every entry.""" mock_get_schemas.return_value = ( _MOCK_SCHEMAS, {s: _build_mock_metadata(s) for s in _MOCK_SCHEMAS}, ) - result = discover() + result = discover(MagicMock()) for entry in result.streams: self.assertEqual(entry.tap_stream_id, entry.stream) + @patch('tap_outbrain.discover._apply_access_checks') @patch('tap_outbrain.discover.get_schemas') - def test_catalog_entry_schema_matches_input(self, mock_get_schemas): + def test_catalog_entry_schema_matches_input(self, mock_get_schemas, mock_access_checks): """The Schema on each CatalogEntry matches what get_schemas returned.""" mock_get_schemas.return_value = ( _MOCK_SCHEMAS, {s: _build_mock_metadata(s) for s in _MOCK_SCHEMAS}, ) - result = discover() + result = discover(MagicMock()) entry_by_name = {e.stream: e for e in result.streams} # Schema.to_dict() should equal the original schema dict @@ -117,13 +124,14 @@ def test_catalog_entry_schema_matches_input(self, mock_get_schemas): _MOCK_PERFORMANCE_SCHEMA, ) + @patch('tap_outbrain.discover._apply_access_checks') @patch('tap_outbrain.discover.get_schemas') - def test_catalog_entry_metadata_preserved(self, mock_get_schemas): + def test_catalog_entry_metadata_preserved(self, mock_get_schemas, mock_access_checks): """Metadata list from get_schemas is stored on each CatalogEntry.""" field_metadata = {s: _build_mock_metadata(s) for s in _MOCK_SCHEMAS} mock_get_schemas.return_value = (_MOCK_SCHEMAS, field_metadata) - result = discover() + result = discover(MagicMock()) entry_by_name = {e.stream: e for e in result.streams} self.assertEqual( @@ -131,28 +139,30 @@ def test_catalog_entry_metadata_preserved(self, mock_get_schemas): field_metadata['campaign_performance'], ) + @patch('tap_outbrain.discover._apply_access_checks') @patch('tap_outbrain.discover.get_schemas') - def test_discover_with_single_stream(self, mock_get_schemas): - """discover() handles a catalog with only one stream.""" + def test_discover_with_single_stream(self, mock_get_schemas, mock_access_checks): + """discover(client) handles a catalog with only one stream.""" single_schema = {'campaign': _MOCK_CAMPAIGN_SCHEMA} mock_get_schemas.return_value = ( single_schema, {'campaign': _build_mock_metadata('campaign')}, ) - result = discover() + result = discover(MagicMock()) self.assertEqual(len(result.streams), 1) self.assertEqual(result.streams[0].stream, 'campaign') @patch('tap_outbrain.discover.get_schemas') def test_discover_propagates_get_schemas_exception(self, mock_get_schemas): - """If get_schemas raises, discover() propagates the exception.""" + """If get_schemas raises, discover(client) propagates the exception.""" mock_get_schemas.side_effect = RuntimeError("Schema load failed") with self.assertRaises(RuntimeError): - discover() + discover(MagicMock()) + @patch('tap_outbrain.discover._apply_access_checks') @patch('tap_outbrain.discover.get_schemas') - def test_discover_logs_and_reraises_schema_error(self, mock_get_schemas): - """When Schema.from_dict fails, discover() re-raises the exception.""" + def test_discover_logs_and_reraises_schema_error(self, mock_get_schemas, mock_access_checks): + """When Schema.from_dict fails, discover(client) re-raises the exception.""" # Provide a schema that is valid dict but will fail Schema.from_dict indirectly bad_schemas = {'campaign': None} # None will cause AttributeError in Schema.from_dict mock_get_schemas.return_value = ( @@ -160,4 +170,215 @@ def test_discover_logs_and_reraises_schema_error(self, mock_get_schemas): {'campaign': _build_mock_metadata('campaign')}, ) with self.assertRaises(Exception): - discover() + discover(MagicMock()) + + +# --------------------------------------------------------------------------- +# Helpers shared by the access-check test classes +# --------------------------------------------------------------------------- + +def _make_stream_cls(accessible=True, parent=None): + """ + Return a mock stream *class* (not instance) whose instantiation returns + an object with ``check_access()`` set to *accessible* and whose + ``parent`` class attribute equals *parent*. + """ + cls = MagicMock() + cls.parent = parent + instance = MagicMock() + instance.check_access.return_value = accessible + cls.return_value = instance + return cls + + +def _make_client(): + from tap_outbrain.client import OutbrainClient + return OutbrainClient(config={"account_id": "acct1", "access_token": "tok"}) + + +# --------------------------------------------------------------------------- +# _apply_access_checks +# --------------------------------------------------------------------------- + +class TestApplyAccessChecks(unittest.TestCase): + + def _schemas_and_meta(self): + schemas = dict(_MOCK_SCHEMAS) + meta = {s: _build_mock_metadata(s) for s in schemas} + return schemas, meta + + @patch("tap_outbrain.discover.STREAMS") + def test_all_accessible_leaves_catalog_intact(self, mock_streams): + """When all streams are accessible, schemas and metadata are unchanged.""" + mock_streams.items.return_value = [ + ("campaign", _make_stream_cls(accessible=True, parent=None)), + ("campaign_performance", _make_stream_cls(accessible=True, parent="campaign")), + ] + schemas, meta = self._schemas_and_meta() + _apply_access_checks(_make_client(), schemas, meta) + + self.assertIn("campaign", schemas) + self.assertIn("campaign_performance", schemas) + + @patch("tap_outbrain.discover.STREAMS") + def test_inaccessible_parent_removed(self, mock_streams): + """An inaccessible parent stream is removed from schemas and metadata.""" + mock_streams.items.return_value = [ + ("campaign", _make_stream_cls(accessible=False, parent=None)), + ("campaign_performance", _make_stream_cls(accessible=True, parent="campaign")), + ] + schemas, meta = self._schemas_and_meta() + # _prune_inaccessible_children also checks STREAMS so patch it globally + with patch("tap_outbrain.discover.STREAMS") as mock_streams2: + mock_streams2.items.return_value = [ + ("campaign", _make_stream_cls(accessible=False, parent=None)), + ("campaign_performance", _make_stream_cls(accessible=True, parent="campaign")), + ] + # Both apply_access_checks' own loop and _prune need the patch + # Re-run with a single consistent patch at module level + pass + + # Simpler: patch once via STREAMS at module level + streams_dict = { + "campaign": _make_stream_cls(accessible=False, parent=None), + "campaign_performance": _make_stream_cls(accessible=True, parent="campaign"), + } + with patch.dict("tap_outbrain.discover.STREAMS", streams_dict, clear=True): + schemas, meta = self._schemas_and_meta() + with self.assertRaises(OutbrainForbiddenError): + # campaign removed → only campaign_performance remains, + # then _prune removes it too → no schemas → raises + _apply_access_checks(_make_client(), schemas, meta) + + @patch.dict( + "tap_outbrain.discover.STREAMS", + { + "campaign": _make_stream_cls(accessible=False, parent=None), + "campaign_performance": _make_stream_cls(accessible=True, parent="campaign"), + }, + clear=True, + ) + def test_all_inaccessible_raises_forbidden_error(self): + """OutbrainForbiddenError is raised when no streams remain after pruning.""" + schemas, meta = {**_MOCK_SCHEMAS}, {s: _build_mock_metadata(s) for s in _MOCK_SCHEMAS} + with self.assertRaises(OutbrainForbiddenError) as ctx: + _apply_access_checks(_make_client(), schemas, meta) + self.assertIn("403", str(ctx.exception)) + self.assertIn("read", str(ctx.exception)) + + def test_inaccessible_warning_logged(self): + """A warning is logged for each excluded stream when some remain.""" + accessible_campaign = _make_stream_cls(accessible=True, parent=None) + streams_dict = { + "campaign": accessible_campaign, + "campaign_performance": _make_stream_cls(accessible=True, parent="campaign"), + } + with patch.dict("tap_outbrain.discover.STREAMS", streams_dict, clear=True): + schemas, meta = self._schemas_and_meta() + # All accessible → no warning, no raise + with patch("tap_outbrain.discover.LOGGER") as mock_logger: + _apply_access_checks(_make_client(), schemas, meta) + mock_logger.warning.assert_not_called() + + def test_partial_access_excludes_forbidden_streams(self): + """When one stream is inaccessible and a sibling remains, no error is raised.""" + # We need two independent parent streams. Temporarily extend STREAMS. + extra_cls = _make_stream_cls(accessible=True, parent=None) + extra_cls.name = "extra_stream" + forbidden_cls = _make_stream_cls(accessible=False, parent=None) + forbidden_cls.name = "campaign" + + extra_schema = {"type": "object", "properties": {}} + streams_dict = { + "campaign": forbidden_cls, + "extra_stream": extra_cls, + } + schemas = {"campaign": _MOCK_CAMPAIGN_SCHEMA, "extra_stream": extra_schema} + meta = { + "campaign": _build_mock_metadata("campaign"), + "extra_stream": _build_mock_metadata("campaign"), + } + with patch.dict("tap_outbrain.discover.STREAMS", streams_dict, clear=True): + _apply_access_checks(_make_client(), schemas, meta) + + self.assertNotIn("campaign", schemas) + self.assertIn("extra_stream", schemas) + + +# --------------------------------------------------------------------------- +# _prune_inaccessible_children +# --------------------------------------------------------------------------- + +class TestPruneInaccessibleChildren(unittest.TestCase): + + def test_child_removed_when_parent_absent(self): + """ + A child stream is pruned from schemas when its parent is not present. + """ + parent_cls = _make_stream_cls(parent=None) + child_cls = _make_stream_cls(parent="campaign") + + with patch.dict( + "tap_outbrain.discover.STREAMS", + {"campaign": parent_cls, "campaign_performance": child_cls}, + clear=True, + ): + schemas = {"campaign_performance": _MOCK_PERFORMANCE_SCHEMA} + meta = {"campaign_performance": _build_mock_metadata("campaign_performance")} + _prune_inaccessible_children(schemas, meta) + + self.assertNotIn("campaign_performance", schemas) + self.assertNotIn("campaign_performance", meta) + + def test_child_retained_when_parent_present(self): + """ + A child stream is kept when its parent stream is still in schemas. + """ + parent_cls = _make_stream_cls(parent=None) + child_cls = _make_stream_cls(parent="campaign") + + with patch.dict( + "tap_outbrain.discover.STREAMS", + {"campaign": parent_cls, "campaign_performance": child_cls}, + clear=True, + ): + schemas = dict(_MOCK_SCHEMAS) + meta = {s: _build_mock_metadata(s) for s in schemas} + _prune_inaccessible_children(schemas, meta) + + self.assertIn("campaign_performance", schemas) + + def test_streams_without_parent_unaffected(self): + """ + Streams with no parent attribute are never removed by pruning. + """ + parent_cls = _make_stream_cls(parent=None) + + with patch.dict( + "tap_outbrain.discover.STREAMS", + {"campaign": parent_cls}, + clear=True, + ): + schemas = {"campaign": _MOCK_CAMPAIGN_SCHEMA} + meta = {"campaign": _build_mock_metadata("campaign")} + _prune_inaccessible_children(schemas, meta) + + self.assertIn("campaign", schemas) + + def test_warning_logged_on_child_pruning(self): + """A warning is emitted when a child stream is pruned.""" + parent_cls = _make_stream_cls(parent=None) + child_cls = _make_stream_cls(parent="campaign") + + with patch.dict( + "tap_outbrain.discover.STREAMS", + {"campaign": parent_cls, "campaign_performance": child_cls}, + clear=True, + ): + schemas = {"campaign_performance": _MOCK_PERFORMANCE_SCHEMA} + meta = {"campaign_performance": _build_mock_metadata("campaign_performance")} + with patch("tap_outbrain.discover.LOGGER") as mock_logger: + _prune_inaccessible_children(schemas, meta) + mock_logger.warning.assert_called_once() + warn_msg = mock_logger.warning.call_args[0][0] + self.assertIn("excluded", warn_msg) diff --git a/tests/unittests/test_init.py b/tests/unittests/test_init.py index 593933b..32f1749 100644 --- a/tests/unittests/test_init.py +++ b/tests/unittests/test_init.py @@ -327,14 +327,15 @@ class TestDoDiscover(unittest.TestCase): @patch('sys.stdout') @patch('tap_outbrain.discover') def test_do_discover_dumps_catalog(self, mock_discover, mock_stdout): - """do_discover calls discover() and writes JSON to stdout.""" + """do_discover calls discover(client) and writes JSON to stdout.""" mock_catalog = MagicMock() mock_catalog.to_dict.return_value = {'streams': []} mock_discover.return_value = mock_catalog + mock_client = MagicMock() - do_discover() + do_discover(mock_client) - mock_discover.assert_called_once() + mock_discover.assert_called_once_with(mock_client) mock_catalog.to_dict.assert_called_once() From 1cba71c00c5a274f94991d2e3c7faa17f9f8857b Mon Sep 17 00:00:00 2001 From: satyendra101 Date: Wed, 17 Jun 2026 09:25:37 +0000 Subject: [PATCH 2/4] Make client optional in discovery as integration tests are tightly coupled with client as None --- tap_outbrain/discover.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tap_outbrain/discover.py b/tap_outbrain/discover.py index 79bb43d..fe13c40 100644 --- a/tap_outbrain/discover.py +++ b/tap_outbrain/discover.py @@ -63,7 +63,7 @@ def _prune_inaccessible_children(schemas: dict, field_metadata: dict) -> None: field_metadata.pop(name, None) -def discover(client) -> Catalog: +def discover(client=None) -> Catalog: """ Run the discovery mode, prepare the catalog file and return the catalog. @@ -73,7 +73,8 @@ def discover(client) -> Catalog: """ schemas, field_metadata = get_schemas() - _apply_access_checks(client, schemas, field_metadata) + if client: # Checking for client as integration tests are written without client in discovery + _apply_access_checks(client, schemas, field_metadata) catalog = Catalog([]) for stream_name, schema_dict in schemas.items(): From 763761a4bb6f9f1c9d728dc58c1bf0d135be6db7 Mon Sep 17 00:00:00 2001 From: satyendra101 Date: Wed, 17 Jun 2026 09:29:30 +0000 Subject: [PATCH 3/4] Update changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b2cbcd3..bac6dbc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,7 @@ # Changelog ## 1.2.0 - * Exclude un-authorized streams from the catalog during discovery. [#TBD](https://github.com/singer-io/tap-outbrain/pull/TBD) + * Exclude un-authorized streams from the catalog during discovery. [#34](https://github.com/singer-io/tap-outbrain/pull/34) ## 1.1.0 * Upgrade Python to 3.12 in CircleCI [#30](https://github.com/singer-io/tap-outbrain/pull/30) From 171a62304cea268c13de0b27683f28842953525f Mon Sep 17 00:00:00 2001 From: satyendra101 Date: Wed, 17 Jun 2026 12:07:35 +0000 Subject: [PATCH 4/4] Resolve copilot comments --- CHANGELOG.md | 2 +- tap_outbrain/discover.py | 2 +- tests/unittests/test_discover.py | 27 +++++---------------------- 3 files changed, 7 insertions(+), 24 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bac6dbc..93936e7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,7 @@ # Changelog ## 1.2.0 - * Exclude un-authorized streams from the catalog during discovery. [#34](https://github.com/singer-io/tap-outbrain/pull/34) + * Exclude unauthorized streams from the catalog during discovery. [#34](https://github.com/singer-io/tap-outbrain/pull/34) ## 1.1.0 * Upgrade Python to 3.12 in CircleCI [#30](https://github.com/singer-io/tap-outbrain/pull/30) diff --git a/tap_outbrain/discover.py b/tap_outbrain/discover.py index fe13c40..73ad70e 100644 --- a/tap_outbrain/discover.py +++ b/tap_outbrain/discover.py @@ -73,7 +73,7 @@ def discover(client=None) -> Catalog: """ schemas, field_metadata = get_schemas() - if client: # Checking for client as integration tests are written without client in discovery + if client is not None: # Checking for client as integration tests are written without client in discovery _apply_access_checks(client, schemas, field_metadata) catalog = Catalog([]) diff --git a/tests/unittests/test_discover.py b/tests/unittests/test_discover.py index c789420..9d403aa 100644 --- a/tests/unittests/test_discover.py +++ b/tests/unittests/test_discover.py @@ -228,27 +228,10 @@ def test_inaccessible_parent_removed(self, mock_streams): ("campaign_performance", _make_stream_cls(accessible=True, parent="campaign")), ] schemas, meta = self._schemas_and_meta() - # _prune_inaccessible_children also checks STREAMS so patch it globally - with patch("tap_outbrain.discover.STREAMS") as mock_streams2: - mock_streams2.items.return_value = [ - ("campaign", _make_stream_cls(accessible=False, parent=None)), - ("campaign_performance", _make_stream_cls(accessible=True, parent="campaign")), - ] - # Both apply_access_checks' own loop and _prune need the patch - # Re-run with a single consistent patch at module level - pass - - # Simpler: patch once via STREAMS at module level - streams_dict = { - "campaign": _make_stream_cls(accessible=False, parent=None), - "campaign_performance": _make_stream_cls(accessible=True, parent="campaign"), - } - with patch.dict("tap_outbrain.discover.STREAMS", streams_dict, clear=True): - schemas, meta = self._schemas_and_meta() - with self.assertRaises(OutbrainForbiddenError): - # campaign removed → only campaign_performance remains, - # then _prune removes it too → no schemas → raises - _apply_access_checks(_make_client(), schemas, meta) + with self.assertRaises(OutbrainForbiddenError): + # campaign removed → only campaign_performance remains, + # then _prune removes it too → no schemas → raises + _apply_access_checks(_make_client(), schemas, meta) @patch.dict( "tap_outbrain.discover.STREAMS", @@ -296,7 +279,7 @@ def test_partial_access_excludes_forbidden_streams(self): schemas = {"campaign": _MOCK_CAMPAIGN_SCHEMA, "extra_stream": extra_schema} meta = { "campaign": _build_mock_metadata("campaign"), - "extra_stream": _build_mock_metadata("campaign"), + "extra_stream": [], } with patch.dict("tap_outbrain.discover.STREAMS", streams_dict, clear=True): _apply_access_checks(_make_client(), schemas, meta)