Commit 2fee322

Authored by jjoyce0510 (John Joyce), with gabe-lyons (Gabe Lyons)
feat(): Introducing support for DataHub Events Source (#142)
* Wrapping it up
* Adding event source
* Adding more tests
* Adding datahub cloud events source
* Adding prerequisites
* Adding docs
* Addressing gabes comments
* Update hello_world_datahub_cloud.yaml
* Update datahub-actions/src/datahub_actions/plugin/source/acryl/datahub_cloud_event_source.py
* Remove monkeypatched method

Co-authored-by: John Joyce <[email protected]>
Co-authored-by: Gabe Lyons <[email protected]>
1 parent c0fc442 commit 2fee322

16 files changed: +1434 -24 lines

datahub-actions/setup.py

+1

@@ -127,6 +127,7 @@ def get_long_description():
     "jsonpickle",
     "build",
     "twine",
+    "tenacity",
     *list(
         dependency
         for plugin in [
datahub-actions/src/datahub_actions/cli/actions.py

+8 -24
@@ -17,16 +17,11 @@
 import signal
 import sys
 import time
-import unittest
 from typing import Any, List
 
 import click
 from click_default_group import DefaultGroup
-from datahub.configuration.config_loader import (
-    Environ,
-    _resolve_element,
-    load_config_file,
-)
+from datahub.configuration.config_loader import load_config_file
 
 import datahub_actions as datahub_actions_package
 from datahub_actions.pipeline.pipeline import Pipeline
@@ -39,13 +34,6 @@
 pipeline_manager = PipelineManager()
 
 
-def best_effort_resolve_element(x: str, environ: Environ) -> str:
-    try:
-        return _resolve_element(x, environ=environ)
-    except Exception:
-        return x
-
-
 def pipeline_config_to_pipeline(pipeline_config: dict) -> Pipeline:
     logger.debug(
         f"Attempting to create Actions Pipeline using config {pipeline_config.get('name')}"
@@ -97,17 +85,13 @@ def run(ctx: Any, config: List[str], debug: bool) -> None:
     if config is not None:
         for pipeline_config in config:
            pipeline_config_file = pathlib.Path(pipeline_config)
-            with unittest.mock.patch(
-                "datahub.configuration.config_loader._resolve_element"
-            ) as mock_resolve_element:
-                mock_resolve_element.side_effect = best_effort_resolve_element
-                pipeline_config_dict = load_config_file(pipeline_config_file)
-                enabled = pipeline_config_dict.get("enabled", True)
-                if enabled == "false" or enabled is False:
-                    logger.warning(
-                        f"Skipping pipeline {pipeline_config_dict.get('name')} as it is not enabled"
-                    )
-                    continue
+            pipeline_config_dict = load_config_file(pipeline_config_file)
+            enabled = pipeline_config_dict.get("enabled", True)
+            if enabled == "false" or enabled is False:
+                logger.warning(
+                    f"Skipping pipeline {pipeline_config_dict.get('name')} as it is not enabled"
+                )
+                continue
 
             # now load the config with variable expansion
             pipeline_config_dict = load_config_file(pipeline_config_file)
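
Net effect of this change: the CLI previously monkeypatched datahub.configuration.config_loader._resolve_element so that unresolvable ${VAR} references were passed through rather than failing; pipeline configs are now loaded with plain load_config_file, which performs variable expansion itself (per the "now load the config with variable expansion" comment above). A minimal sketch of the simplified loading path (the YAML filename is taken from this commit's example, used here hypothetically):

    import pathlib

    from datahub.configuration.config_loader import load_config_file

    # Load a pipeline config with environment-variable expansion applied.
    config = load_config_file(pathlib.Path("hello_world_datahub_cloud.yaml"))

    # 'enabled' may arrive as the string "false" after variable expansion,
    # hence the two-way check mirrored from the CLI above.
    enabled = config.get("enabled", True)
    if enabled == "false" or enabled is False:
        print(f"Pipeline {config.get('name')} is disabled")
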

datahub-actions/src/datahub_actions/plugin/source/acryl/__init__.py

Whitespace-only changes.

datahub-actions/src/datahub_actions/plugin/source/acryl/constants.py

+2

@@ -0,0 +1,2 @@
+PLATFORM_EVENT_TOPIC_NAME = "PlatformEvent_v1"
+ENTITY_CHANGE_EVENT_NAME = "entityChangeEvent"

datahub-actions/src/datahub_actions/plugin/source/acryl/datahub_cloud_event_source.py

+206

@@ -0,0 +1,206 @@
+import json
+import logging
+import time
+from dataclasses import dataclass
+from typing import Iterable, List, Optional
+
+from datahub.configuration import ConfigModel
+from datahub.emitter.serialization_helper import post_json_transform
+
+# DataHub imports.
+from datahub.metadata.schema_classes import GenericPayloadClass
+
+from datahub_actions.event.event_envelope import EventEnvelope
+from datahub_actions.event.event_registry import (
+    ENTITY_CHANGE_EVENT_V1_TYPE,
+    EntityChangeEvent,
+)
+
+# May or may not need these.
+from datahub_actions.pipeline.pipeline_context import PipelineContext
+from datahub_actions.plugin.source.acryl.constants import (
+    ENTITY_CHANGE_EVENT_NAME,
+    PLATFORM_EVENT_TOPIC_NAME,
+)
+from datahub_actions.plugin.source.acryl.datahub_cloud_events_ack_manager import (
+    AckManager,
+)
+from datahub_actions.plugin.source.acryl.datahub_cloud_events_consumer import (
+    DataHubEventsConsumer,
+    ExternalEvent,
+)
+from datahub_actions.source.event_source import EventSource
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+# Converts a DataHub Events Message to an EntityChangeEvent.
+def build_entity_change_event(payload: GenericPayloadClass) -> EntityChangeEvent:
+    try:
+        return EntityChangeEvent.from_json(payload.get("value"))
+    except Exception as e:
+        raise ValueError("Failed to parse into EntityChangeEvent") from e
+
+
+class DataHubEventsSourceConfig(ConfigModel):
+    topic: str = PLATFORM_EVENT_TOPIC_NAME
+    consumer_id: Optional[str]  # Used to store offset for the consumer.
+    lookback_days: Optional[int] = None
+    reset_offsets: Optional[bool] = False
+
+    # Time and Exit Conditions.
+    kill_after_idle_timeout: bool = False
+    idle_timeout_duration_seconds: int = 30
+    event_processing_time_max_duration_seconds: int = 60
+
+
+# This is the custom DataHub-based Event Source.
+@dataclass
+class DataHubEventSource(EventSource):
+    running = False
+    source_config: DataHubEventsSourceConfig
+    ctx: PipelineContext
+
+    @staticmethod
+    def _get_pipeline_urn(pipeline_name: str) -> str:
+        if pipeline_name.startswith("urn:li:dataHubAction:"):
+            return pipeline_name
+        else:
+            return f"urn:li:dataHubAction:{pipeline_name}"
+
+    def __init__(self, config: DataHubEventsSourceConfig, ctx: PipelineContext):
+        self.ctx = ctx
+        self.source_config = config
+        self.consumer_id = DataHubEventSource._get_pipeline_urn(self.ctx.pipeline_name)
+
+        # Ensure a Graph Instance was provided.
+        assert self.ctx.graph is not None
+
+        self.datahub_events_consumer: DataHubEventsConsumer = DataHubEventsConsumer(
+            # TODO: This PipelineContext provides an Acryl Graph Instance
+            graph=self.ctx.graph.graph,
+            consumer_id=self.consumer_id,
+            lookback_days=self.source_config.lookback_days,
+            reset_offsets=self.source_config.reset_offsets,
+        )
+        self.ack_manager = AckManager()
+        self.safe_to_ack_offset: Optional[str] = None
+
+    @classmethod
+    def create(cls, config_dict: dict, ctx: PipelineContext) -> "EventSource":
+        config = DataHubEventsSourceConfig.parse_obj(config_dict)
+        return cls(config, ctx)
+
+    def events(self) -> Iterable[EventEnvelope]:
+        logger.info("Starting DataHub Cloud events source...")
+        logger.info(f"Subscribing to the following topic: {self.source_config.topic}")
+        self.running = True
+        yield from self._poll_and_process_events()
+
+    def _poll_and_process_events(self) -> Iterable[EventEnvelope]:
+        """Poll and process events in the main loop."""
+        last_idle_response_timestamp = 0
+        while self.running:
+            try:
+                sleeps_to_go = (
+                    self.source_config.event_processing_time_max_duration_seconds
+                )
+
+                while self.ack_manager.outstanding_acks():
+                    time.sleep(1)
+                    sleeps_to_go -= 1
+                    logger.debug(f"Sleeps to go: {sleeps_to_go}")
+
+                    if sleeps_to_go == 0:
+                        self.running = False
+                        raise Exception(
+                            f"Failed to process all events successfully after specified time {self.source_config.event_processing_time_max_duration_seconds}! If more time is required, please increase the timeout using this config. {self.ack_manager.acks.values()}",
+                        )
+                logger.debug(
+                    f"Successfully processed events up to offset id {self.safe_to_ack_offset}"
+                )
+                self.safe_to_ack_offset = self.datahub_events_consumer.offset_id
+                logger.debug(f"Safe to ack offset: {self.safe_to_ack_offset}")
+
+                events_response = self.datahub_events_consumer.poll_events(
+                    topic=self.source_config.topic, poll_timeout_seconds=2
+                )
+
+                # Handle Idle Timeout
+                num_events = len(events_response.events)
+
+                if num_events == 0:
+                    if last_idle_response_timestamp == 0:
+                        last_idle_response_timestamp = (
+                            self._get_current_timestamp_seconds()
+                        )
+                    if self._should_idle_timeout(
+                        num_events, last_idle_response_timestamp
+                    ):
+                        logger.info("Exiting main loop due to idle timeout")
+                        return
+                else:
+                    self.ack_manager.new_batch()
+                    last_idle_response_timestamp = 0  # Reset the idle timeout
+
+                event_envelopes: List[EventEnvelope] = []
+                for msg in events_response.events:
+                    for event_envelope in self.handle_pe(msg):
+                        event_envelope.meta = self.ack_manager.get_meta(event_envelope)
+                        event_envelopes.append(event_envelope)
+
+                yield from event_envelopes
+
+            except Exception as e:
+                logger.exception(f"DataHub Events consumer error: {e}")
+                self.running = False
+
+        logger.info("DataHub Events consumer exiting main loop")
+
+    @staticmethod
+    def handle_pe(msg: ExternalEvent) -> Iterable[EventEnvelope]:
+        value: dict = json.loads(msg.value)
+        payload: GenericPayloadClass = GenericPayloadClass.from_obj(
+            post_json_transform(value["payload"])
+        )
+        if ENTITY_CHANGE_EVENT_NAME == value["name"]:
+            event = build_entity_change_event(payload)
+            yield EventEnvelope(ENTITY_CHANGE_EVENT_V1_TYPE, event, {})
+
+    def close(self) -> None:
+        if self.datahub_events_consumer:
+            self.running = False
+            if self.safe_to_ack_offset:
+                self.datahub_events_consumer.commit_offsets(
+                    offset_id=self.safe_to_ack_offset
+                )
+            self.datahub_events_consumer.close()
+
+    def ack(self, event: EventEnvelope, processed: bool = True) -> None:
+        self.ack_manager.ack(event.meta, processed=processed)
+        logger.debug(f"Actions acked event {event} as processed {processed}")
+
+    def _should_idle_timeout(
+        self, num_events: int, last_idle_response_timestamp: int
+    ) -> bool:
+        """Handle idle timeout logic and decide if the loop should exit."""
+        if num_events > 0:
+            return False  # Continue processing
+
+        current_timestamp_seconds = self._get_current_timestamp_seconds()
+
+        if (
+            self.source_config.kill_after_idle_timeout
+            and current_timestamp_seconds - last_idle_response_timestamp
+            > self.source_config.idle_timeout_duration_seconds
+        ):
+            logger.info(
+                f"Shutting down due to idle timeout of {self.source_config.idle_timeout_duration_seconds} seconds"
+            )
+            self.running = False
+            return True  # Signal that we should exit
+        return False  # Continue processing
+
+    def _get_current_timestamp_seconds(self) -> int:
+        return int(time.time())
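
A minimal usage sketch of the new source (hypothetical standalone wiring; in practice the Actions framework constructs the PipelineContext from a pipeline YAML and drives this loop itself):

    # Assumes `ctx` is a PipelineContext whose graph is connected to DataHub Cloud.
    config_dict = {
        "topic": "PlatformEvent_v1",
        "lookback_days": 7,  # hypothetical: replay up to a week of events
        "kill_after_idle_timeout": True,
        "idle_timeout_duration_seconds": 30,
    }
    source = DataHubEventSource.create(config_dict, ctx)
    for envelope in source.events():
        # ... dispatch envelope.event to the configured actions ...
        source.ack(envelope, processed=True)  # clears the pending ack
    source.close()  # commits the safe-to-ack offset before shutdown
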

datahub-actions/src/datahub_actions/plugin/source/acryl/datahub_cloud_events_ack_manager.py

+35

@@ -0,0 +1,35 @@
+import logging
+from typing import Any, Dict, Tuple
+
+logger = logging.getLogger(__name__)
+
+
+class AckManager:
+    """
+    An internal Ack manager which hands out fake message ids but ensures we
+    track all acks and processing
+    """
+
+    def __init__(self) -> None:
+        self.batch_id = 0
+        self.msg_id = 0
+        self.acks: Dict[Tuple[int, int], bool] = {}
+
+    def new_batch(self) -> None:
+        self.batch_id += 1
+        self.msg_id = 0
+
+    def get_meta(self, event: Any) -> Dict[str, Any]:
+        self.msg_id += 1
+        self.acks[(self.batch_id, self.msg_id)] = event
+        return {"batch_id": self.batch_id, "msg_id": self.msg_id}
+
+    def ack(self, meta: dict, processed: bool) -> None:
+        batch_id, msg_id = (meta["batch_id"], meta["msg_id"])
+        if processed:
+            self.acks.pop((batch_id, msg_id))
+        else:
+            logger.warning(f"Whoops - we didn't process {meta}")
+
+    def outstanding_acks(self) -> int:
+        return len(self.acks)
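
The manager underpins the ack-gated offset commits in the event source above: get_meta registers a pending entry per event, ack(..., processed=True) clears it, and the source only advances its committed offset once outstanding_acks() drains to zero. A standalone sketch of that lifecycle:

    manager = AckManager()
    manager.new_batch()                       # start batch 1
    meta = manager.get_meta("event-payload")  # {"batch_id": 1, "msg_id": 1}
    assert manager.outstanding_acks() == 1
    manager.ack(meta, processed=True)         # removes the pending entry
    assert manager.outstanding_acks() == 0    # now safe to commit offsets
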
