Skip to content

Commit ce78ee5

Browse files
authored
Merge branch 'main' into relud-experiment-monitoring
2 parents 7df41bc + 3742cad commit ce78ee5

File tree

407 files changed

+4939
-612
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

407 files changed

+4939
-612
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ sql/**/docs
1616
sql/**/mkdocs.yml
1717
generated_docs/
1818
generated-sql/
19+
CLAUDE.md
1920

2021
# ignore Bigeye artifacts
2122
moz-fx-data-shared-prod_APPLY.yml

CODEOWNERS

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ dags.yaml
1616
/sql/**/active_users_aggregates_v3 @mozilla/kpi_table_reviewers
1717
/sql_generators/active_users_aggregates_v4/templates/ @mozilla/kpi_table_reviewers
1818
/sql/**/active_users_aggregates_v4 @mozilla/kpi_table_reviewers
19+
/sql/moz-fx-data-shared-prod/telemetry/active_users_aggregates @mozilla/kpi_table_reviewers
1920
# Search
2021
/sql/moz-fx-data-shared-prod/search_terms @whd @jasonthomas
2122
/sql/moz-fx-data-shared-prod/search_terms_derived @whd @jasonthomas

bigquery_etl/cli/metadata.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,7 @@ def publish(
234234
table_metadata_files = paths_matching_name_pattern(
235235
name, sql_dir, project_id=project_id, files=["metadata.yaml"]
236236
)
237+
skip_deploy = ConfigLoader.get("metadata", "deploy", "skip", fallback=[])
237238

238239
if parallelism > 0:
239240
credentials = get_credentials()
@@ -245,7 +246,10 @@ def publish(
245246
)
246247
else:
247248
for metadata_file in table_metadata_files:
248-
_publish_metadata(project_id, credentials=None, metadata_file=metadata_file)
249+
if str(metadata_file) not in skip_deploy:
250+
_publish_metadata(
251+
project_id, credentials=None, metadata_file=metadata_file
252+
)
249253

250254

251255
def _publish_metadata(project_id, credentials, metadata_file):

bigquery_etl/cli/stage.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -295,7 +295,7 @@ def _view_dependencies(artifact_files, sql_dir):
295295
)
296296
project, dataset, name = dependency_components
297297

298-
file_path = Path(view.path).parent.parent.parent / dataset / name
298+
file_path = Path(sql_dir) / project / dataset / name
299299

300300
file_exists_for_dependency = False
301301
for file in [VIEW_FILE, QUERY_FILE, QUERY_SCRIPT, MATERIALIZED_VIEW]:

bigquery_etl/cli/utils.py

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,13 @@
44
import os
55
import re
66
from fnmatch import fnmatchcase
7+
from functools import cache
78
from glob import glob
89
from pathlib import Path
9-
from typing import Iterator, List, Optional, Tuple
10+
from typing import Dict, Iterator, List, Optional, Tuple
1011

1112
import click
13+
import requests
1214
from google.auth.exceptions import DefaultCredentialsError
1315
from google.cloud import bigquery
1416

@@ -23,6 +25,7 @@
2325
r"^.*/([a-zA-Z0-9-]+)/([a-zA-Z0-9_]+)/([a-zA-Z0-9_]+(_v[0-9]+)?)/"
2426
r"(?:checks\.sql)$"
2527
)
28+
GLEAN_APP_LISTINGS_URL = "https://probeinfo.telemetry.mozilla.org/v2/glean/app-listings"
2629

2730

2831
def is_valid_dir(ctx, param, value):
@@ -250,3 +253,26 @@ def temp_dataset_option(
250253
help="Dataset where intermediate query results will be temporarily stored, "
251254
"formatted as PROJECT_ID.DATASET_ID",
252255
)
256+
257+
258+
@cache
def get_glean_app_id_to_app_name_mapping() -> Dict[str, str]:
    """Return a dict where key is the channel app id and the value is the shared app name.

    The app listings are downloaded from GLEAN_APP_LISTINGS_URL. Every listing
    that carries both a ``bq_dataset_family`` and an ``app_name`` contributes
    one entry, keyed by its ``bq_dataset_family``; listings missing either key
    are ignored.

    A successful result is memoized by ``functools.cache``, so later calls
    return the very same dict object without another HTTP request. Callers
    should treat the returned dict as read-only.

    e.g. {
        "org_mozilla_firefox": "fenix",
        "org_mozilla_firefox_beta": "fenix",
        "org_mozilla_ios_firefox": "firefox_ios",
        "org_mozilla_ios_firefoxbeta": "firefox_ios",
    }

    Raises:
        requests.HTTPError: if the endpoint answers with an error status.
        requests.Timeout: if the endpoint does not respond within the timeout.
    """
    # Without an explicit timeout, requests waits indefinitely on a stalled
    # connection, which would hang every caller of this helper.
    response = requests.get(GLEAN_APP_LISTINGS_URL, timeout=30)
    response.raise_for_status()

    app_listings = response.json()

    return {
        app["bq_dataset_family"]: app["app_name"]
        for app in app_listings
        if "bq_dataset_family" in app and "app_name" in app
    }

bigquery_etl/copy_deduplicate.py

Lines changed: 28 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -20,14 +20,17 @@
2020
from google.api_core.exceptions import BadRequest
2121
from google.cloud import bigquery
2222

23-
from bigquery_etl.cli.utils import table_matches_patterns
23+
from bigquery_etl.cli.utils import (
24+
get_glean_app_id_to_app_name_mapping,
25+
parallelism_option,
26+
project_id_option,
27+
table_matches_patterns,
28+
)
2429
from bigquery_etl.config import ConfigLoader
2530
from bigquery_etl.util.bigquery_id import sql_table_id
2631
from bigquery_etl.util.client_queue import ClientQueue
2732
from bigquery_etl.util.common import TempDatasetReference
2833

29-
from .cli.utils import parallelism_option, project_id_option
30-
3134
QUERY_TEMPLATE = """
3235
WITH
3336
-- Distinct document_ids and their minimum submission_timestamp today
@@ -97,33 +100,43 @@ def _has_field_path(schema: List[bigquery.SchemaField], path: List[str]) -> bool
97100
def _select_geo(live_table: str, client: bigquery.Client) -> str:
98101
"""Build a SELECT REPLACE clause that NULLs metadata.geo.* if applicable."""
99102
_, dataset_id, table_id = live_table.split(".")
103+
channel_to_app_name = get_glean_app_id_to_app_name_mapping()
104+
app_id = re.sub("_live$", "", dataset_id)
105+
106+
excluded_apps = set(ConfigLoader.get("geo_deprecation", "skip_apps", fallback=[]))
107+
app_name = channel_to_app_name.get(app_id)
108+
if app_name in excluded_apps:
109+
return ""
100110

101111
excluded_tables = set(
102112
ConfigLoader.get("geo_deprecation", "skip_tables", fallback=[])
103113
)
104114
if re.sub(r"_v\d+$", "", table_id) in excluded_tables:
105115
return ""
106116

107-
app_id = dataset_id.removesuffix("_live")
108-
included_apps = set(
109-
ConfigLoader.get("geo_deprecation", "include_app_ids", fallback=[])
110-
)
111-
if app_id not in included_apps:
112-
return ""
113-
114117
table = client.get_table(live_table)
115118

119+
# Only deprecate the geo fields for Glean apps; legacy tables will be deprecated after the Glean migration.
120+
if app_id not in channel_to_app_name.keys():
121+
return ""
122+
123+
# only glean tables have this label
116124
include_client_id = table.labels.get("include_client_id") == "true"
117125
if not include_client_id:
118126
return ""
119127

120-
# Check schema to ensure geo fields exists
128+
# Check schema to ensure required fields exist
121129
schema = table.schema
122-
required_fields = ("city", "subdivision1", "subdivision2")
123-
has_required_fields = all(
124-
_has_field_path(schema, ["metadata", "geo", field]) for field in required_fields
130+
has_client_id_field = _has_field_path(schema, ["client_info", "client_id"])
131+
if not has_client_id_field:
132+
return ""
133+
134+
required_geo_fields = ("city", "subdivision1", "subdivision2")
135+
has_required_geo_fields = all(
136+
_has_field_path(schema, ["metadata", "geo", field])
137+
for field in required_geo_fields
125138
)
126-
if not has_required_fields:
139+
if not has_required_geo_fields:
127140
return ""
128141

129142
return """

bigquery_etl/metadata/parse_metadata.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,14 @@ def literal_presenter(dumper, data):
3737
yaml.add_representer(Literal, literal_presenter)
3838

3939

40+
@enum.unique
class AssetLevel(enum.Enum):
    """Represents BigQuery table level based on requirements for quality and maturity."""

    # @enum.unique makes a duplicated value raise at class creation instead of
    # silently turning the second member into an alias of the first.
    GOLD = "gold"
    SILVER = "silver"
    BRONZE = "bronze"
46+
47+
4048
class PartitionType(enum.Enum):
4149
"""Represents BigQuery table partition types."""
4250

@@ -198,13 +206,31 @@ class Metadata:
198206
deletion_date: Optional[date] = attr.ib(None)
199207
monitoring: Optional[MonitoringMetadata] = attr.ib(None)
200208
require_column_descriptions: bool = attr.ib(False)
209+
level: Optional[str] = attr.ib(None)
201210

202211
@owners.validator
203212
def validate_owners(self, attribute, value):
204213
"""Check that provided email addresses or github identities for owners are valid."""
205214
if not all(map(lambda e: is_email_or_github_identity(e), value)):
206215
raise ValueError(f"Invalid email or Github identity for owners: {value}.")
207216

217+
@level.validator
def validate_level(self, attribute, value):
    """Validate the optional ``level`` attribute.

    ``None`` is accepted as "not set". Any other value must be a string equal
    to one of the ``AssetLevel`` values; otherwise ``ValueError`` is raised.
    """
    # The level is optional, so an unset value needs no further checks.
    if value is None:
        return

    # Reject non-string values before comparing against the allowed labels.
    if not isinstance(value, str):
        raise ValueError(
            f"ERROR. Invalid level in metadata with type '{type(value).__name__}'. Must be a string."
        )

    allowed = [e.value for e in AssetLevel]
    if value in allowed:
        return

    raise ValueError(
        f"ERROR. Invalid level in metadata: {value}. Must be only one of {sorted(allowed)}."
    )
233+
208234
@labels.validator
209235
def validate_labels(self, attribute, value):
210236
"""Check that labels are valid."""
@@ -276,6 +302,7 @@ def from_file(cls, metadata_file):
276302
deletion_date = None
277303
monitoring = None
278304
require_column_descriptions = False
305+
level = None
279306

280307
with open(metadata_file, "r") as yaml_stream:
281308
try:
@@ -363,6 +390,9 @@ def from_file(cls, metadata_file):
363390
"require_column_descriptions"
364391
]
365392

393+
if "level" in metadata:
394+
level = metadata["level"]
395+
366396
return cls(
367397
friendly_name,
368398
description,
@@ -378,6 +408,7 @@ def from_file(cls, metadata_file):
378408
deletion_date,
379409
monitoring,
380410
require_column_descriptions,
411+
level,
381412
)
382413
except yaml.YAMLError as e:
383414
raise e

0 commit comments

Comments
 (0)