You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Copy file name to clipboardExpand all lines: metadata-ingestion/docs/sources/hex/README.md
+9-1
Original file line number
Diff line number
Diff line change
@@ -20,4 +20,12 @@ Currently, the [Hex API](https://learn.hex.tech/docs/api/api-reference) has some
20
20
21
21
2. **Metadata Access**: There is no direct method to retrieve metadata for Collections, Status, or Categories. This information is only available indirectly through references within Projects and Components.
22
22
23
-
Please keep these limitations in mind when working with the Hex connector.
23
+
Please keep these limitations in mind when working with the Hex connector.
24
+
25
+
For the Dataset - Hex Project lineage, the connector relies on the _Hex query metadata_ feature.
Therefore, in order to extract lineage information, the required setup must include:
28
+
29
+
- A separate warehouse ingestor (_e.g._ BigQuery, Snowflake, Redshift, ...) with `use_queries_v2` enabled in order to fetch Queries.
30
+
This will ingest the queries into DataHub as `Query` entities, and the ones triggered by Hex will include the corresponding _Hex query metadata_.
31
+
- A DataHub server with version >= SaaS `0.3.10` or > OSS `1.0.0`, so that the `Query` entities are properly indexed by source (Hex in this case) and can therefore be fetched and processed by the Hex ingestor in order to emit the Dataset - Project lineage.
Copy file name to clipboardExpand all lines: metadata-ingestion/src/datahub/ingestion/source/hex/hex.py
+150-22
Original file line number
Diff line number
Diff line change
@@ -1,9 +1,12 @@
1
+
from dataclasses import dataclass
2
+
from datetime import datetime, timedelta, timezone
1
3
from typing import Any, Dict, Iterable, List, Optional
2
4
3
-
from pydantic import Field, SecretStr
5
+
from pydantic import Field, SecretStr, root_validator
4
6
from typing_extensions import assert_never
5
7
6
8
from datahub.configuration.common import AllowDenyPattern
9
+
from datahub.configuration.datetimes import parse_user_datetime
7
10
from datahub.configuration.source_common import (
8
11
EnvConfigMixin,
9
12
PlatformInstanceConfigMixin,
@@ -21,22 +24,28 @@
21
24
from datahub.ingestion.api.workunit import MetadataWorkUnit
22
25
from datahub.ingestion.source.hex.api import HexApi, HexApiReport
23
26
from datahub.ingestion.source.hex.constants import (
27
+
DATAHUB_API_PAGE_SIZE_DEFAULT,
24
28
HEX_API_BASE_URL_DEFAULT,
25
29
HEX_API_PAGE_SIZE_DEFAULT,
26
30
HEX_PLATFORM_NAME,
27
31
)
28
32
from datahub.ingestion.source.hex.mapper import Mapper
29
33
from datahub.ingestion.source.hex.model import Component, Project
34
+
from datahub.ingestion.source.hex.query_fetcher import (
35
+
HexQueryFetcher,
36
+
HexQueryFetcherReport,
37
+
)
30
38
from datahub.ingestion.source.state.stale_entity_removal_handler import (
31
39
StaleEntityRemovalHandler,
32
40
StaleEntityRemovalSourceReport,
33
41
StatefulStaleMetadataRemovalConfig,
34
42
)
35
43
from datahub.ingestion.source.state.stateful_ingestion_base import (
36
44
StatefulIngestionConfigBase,
37
-
StatefulIngestionReport,
38
45
StatefulIngestionSourceBase,
39
46
)
47
+
from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
48
+
from datahub.sdk.main_client import DataHubClient
40
49
41
50
42
51
class HexSourceConfig(
@@ -93,9 +102,73 @@ class HexSourceConfig(
93
102
default=True,
94
103
description="Set ownership identity from owner/creator email",
95
104
)
105
+
include_lineage: bool = Field(
106
+
default=True,
107
+
description='Include Hex lineage, being fetched from DataHub. See "Limitations" section in the docs for more details about the limitations of this feature.',
108
+
)
109
+
lineage_start_time: Optional[datetime] = Field(
110
+
default=None,
111
+
description="Earliest date of lineage to consider. Default: 1 day before lineage end time. You can specify absolute time like '2023-01-01' or relative time like '-7 days' or '-7d'.",
112
+
)
113
+
lineage_end_time: Optional[datetime] = Field(
114
+
default=None,
115
+
description="Latest date of lineage to consider. Default: Current time in UTC. You can specify absolute time like '2023-01-01' or relative time like '-1 day' or '-1d'.",
116
+
)
117
+
datahub_page_size: int = Field(
118
+
default=DATAHUB_API_PAGE_SIZE_DEFAULT,
119
+
description="Number of items to fetch per DataHub API call.",
0 commit comments