Skip to content

Commit 5230702

Browse files
authored
Merge pull request #209 from HTTPArchive/upgrade-to-new-datamodel
Upgrade to new datamodel
2 parents 30201f7 + 477d54f commit 5230702

File tree

129 files changed

+1133
-1429
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

129 files changed

+1133
-1429
lines changed

sql/.sqlfluff

+3-1
Original file line numberDiff line numberDiff line change
@@ -11,16 +11,18 @@ templater = jinja
1111
## Comma separated list of rules to check, or None for all
1212
rules = None
1313
## Comma separated list of rules to exclude, or None
14-
exclude_rules = AL01,AL04,AL07,AL09,AM03,AM05,CP02,CP03,CV02,LT05,LT09,RF01,RF02,RF03,RF04,ST01,ST02,ST05,ST06,ST07
14+
exclude_rules = AL01,AL04,AL07,AL09,AM03,AM05,CP02,CP03,CV02,CV12,LT05,LT09,LT14,RF01,RF02,RF03,RF04,ST01,ST02,ST05,ST06,ST07
1515
# AL04 - Asks for unique table aliases, so it complains when selecting from two 2021_07_01 tables: the implicit alias is the table name (not fully qualified), so both aliases are the same.
1616
# AL07 - Avoid aliases in from and join - why?
1717
# AM03 - if using DESC in one ORDER BY column, then insist on ASC/DESC for all.
1818
# AM05 - INNER JOIN must be fully qualified. Probably should use this but not our style.
1919
# CP02 - Unquoted identifiers (e.g. column names) will be mixed case so don't enforce case
2020
# CP03 - Function names will be mixed case so don't enforce case
2121
# CV02 - Use COALESCE instead of IFNULL or NVL. We think IFNULL is clearer.
22+
# CV12 - Doesn't work with UNNEST. https://github.com/sqlfluff/sqlfluff/issues/6558
2223
# LT05 - We allow longer lines as some of our queries are complex. Maybe should limit in future?
2324
# LT09 - Select targets should be on new lines but sub clauses don't always obey this. Maybe revisit in future?
25+
# LT14 - Keywords on newline. We have some simple, single line joins
2426
# RF01 - BigQuery uses STRUCTS which can look like incorrect table references
2527
# RF02 - Asks for qualified columns for ambiguous ones, but we do not qualify our columns, and they are not really ambiguous (or BigQuery would complain)
2628
# RF03 - Insists on references in column names even if not ambiguous. Bit OTT.

sql/.sqlfluffignore

+3
Original file line numberDiff line numberDiff line change
@@ -1 +1,4 @@
11
/lens/*/crux_histograms.sql
2+
/lens/*/crux_timeseries.sql
3+
/lens/*/histograms.sql
4+
/lens/*/timeseries.sql

sql/generate_reports.sh

+95-98
Large diffs are not rendered by default.

sql/histograms/bootupJs.sql

+6-3
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,14 @@ FROM (
88
volume / SUM(volume) OVER (PARTITION BY client) AS pdf
99
FROM (
1010
SELECT
11-
_TABLE_SUFFIX AS client,
11+
client,
1212
COUNT(0) AS volume,
13-
FLOOR(CAST(IFNULL(JSON_EXTRACT(report, '$.audits.bootup-time.numericValue'), JSON_EXTRACT(report, '$.audits.bootup-time.rawValue')) AS FLOAT64) / 100) / 10 AS bin
13+
FLOOR(FLOAT64(IFNULL(lighthouse.audits['bootup-time'].numericValue, lighthouse.audits['bootup-time'].rawValue)) / 100) / 10 AS bin
1414
FROM
15-
`httparchive.lighthouse.${YYYY_MM_DD}_*`
15+
`httparchive.crawl.pages`
16+
WHERE
17+
date = '${YYYY-MM-DD}' AND
18+
is_root_page
1619
GROUP BY
1720
bin,
1821
client

sql/histograms/bytesCss.sql

+6-3
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,14 @@ FROM (
88
volume / SUM(volume) OVER (PARTITION BY client) AS pdf
99
FROM (
1010
SELECT
11-
_TABLE_SUFFIX AS client,
11+
client,
1212
COUNT(0) AS volume,
13-
CAST(FLOOR(bytesCSS / 10240) * 10 AS INT64) AS bin
13+
CAST(FLOOR(FLOAT64(summary.bytesCss) / 10240) * 10 AS INT64) AS bin
1414
FROM
15-
`httparchive.summary_pages.${YYYY_MM_DD}_*`
15+
`httparchive.crawl.pages`
16+
WHERE
17+
date = '${YYYY-MM-DD}' AND
18+
is_root_page
1619
GROUP BY
1720
bin,
1821
client

sql/histograms/bytesFont.sql

+6-3
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,14 @@ FROM (
88
volume / SUM(volume) OVER (PARTITION BY client) AS pdf
99
FROM (
1010
SELECT
11-
_TABLE_SUFFIX AS client,
11+
client,
1212
COUNT(0) AS volume,
13-
CAST(FLOOR(bytesFont / 10240) * 10 AS INT64) AS bin
13+
CAST(FLOOR(FLOAT64(summary.bytesFont) / 10240) * 10 AS INT64) AS bin
1414
FROM
15-
`httparchive.summary_pages.${YYYY_MM_DD}_*`
15+
`httparchive.crawl.pages`
16+
WHERE
17+
date = '${YYYY-MM-DD}' AND
18+
is_root_page
1619
GROUP BY
1720
bin,
1821
client

sql/histograms/bytesHtml.sql

+6-3
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,14 @@ FROM (
88
volume / SUM(volume) OVER (PARTITION BY client) AS pdf
99
FROM (
1010
SELECT
11-
_TABLE_SUFFIX AS client,
11+
client,
1212
COUNT(0) AS volume,
13-
CAST(FLOOR(bytesHtml / 10240) * 10 AS INT64) AS bin
13+
CAST(FLOOR(FLOAT64(summary.bytesHtml) / 10240) * 10 AS INT64) AS bin
1414
FROM
15-
`httparchive.summary_pages.${YYYY_MM_DD}_*`
15+
`httparchive.crawl.pages`
16+
WHERE
17+
date = '${YYYY-MM-DD}' AND
18+
is_root_page
1619
GROUP BY
1720
bin,
1821
client

sql/histograms/bytesImg.sql

+6-3
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,14 @@ FROM (
88
volume / SUM(volume) OVER (PARTITION BY client) AS pdf
99
FROM (
1010
SELECT
11-
_TABLE_SUFFIX AS client,
11+
client,
1212
COUNT(0) AS volume,
13-
CAST(FLOOR(bytesImg / 102400) * 100 AS INT64) AS bin
13+
CAST(FLOOR(FLOAT64(summary.bytesImg) / 102400) * 100 AS INT64) AS bin
1414
FROM
15-
`httparchive.summary_pages.${YYYY_MM_DD}_*`
15+
`httparchive.crawl.pages`
16+
WHERE
17+
date = '${YYYY-MM-DD}' AND
18+
is_root_page
1619
GROUP BY
1720
bin,
1821
client

sql/histograms/bytesJs.sql

+6-3
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,14 @@ FROM (
88
volume / SUM(volume) OVER (PARTITION BY client) AS pdf
99
FROM (
1010
SELECT
11-
_TABLE_SUFFIX AS client,
11+
client,
1212
COUNT(0) AS volume,
13-
CAST(FLOOR(bytesJS / 10240) * 10 AS INT64) AS bin
13+
CAST(FLOOR(FLOAT64(summary.bytesJS) / 10240) * 10 AS INT64) AS bin
1414
FROM
15-
`httparchive.summary_pages.${YYYY_MM_DD}_*`
15+
`httparchive.crawl.pages`
16+
WHERE
17+
date = '${YYYY-MM-DD}' AND
18+
is_root_page
1619
GROUP BY
1720
bin,
1821
client

sql/histograms/bytesOther.sql

+6-3
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,14 @@ FROM (
88
volume / SUM(volume) OVER (PARTITION BY client) AS pdf
99
FROM (
1010
SELECT
11-
_TABLE_SUFFIX AS client,
11+
client,
1212
COUNT(0) AS volume,
13-
CAST(FLOOR(bytesOther / 10240) * 10 AS INT64) AS bin
13+
CAST(FLOOR(FLOAT64(summary.bytesOther) / 10240) * 10 AS INT64) AS bin
1414
FROM
15-
`httparchive.summary_pages.${YYYY_MM_DD}_*`
15+
`httparchive.crawl.pages`
16+
WHERE
17+
date = '${YYYY-MM-DD}' AND
18+
is_root_page
1619
GROUP BY
1720
bin,
1821
client

sql/histograms/bytesTotal.sql

+6-3
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,14 @@ FROM (
88
volume / SUM(volume) OVER (PARTITION BY client) AS pdf
99
FROM (
1010
SELECT
11-
_TABLE_SUFFIX AS client,
11+
client,
1212
COUNT(0) AS volume,
13-
CAST(FLOOR(bytesTotal / 102400) * 100 AS INT64) AS bin
13+
CAST(FLOOR(FLOAT64(summary.bytesTotal) / 102400) * 100 AS INT64) AS bin
1414
FROM
15-
`httparchive.summary_pages.${YYYY_MM_DD}_*`
15+
`httparchive.crawl.pages`
16+
WHERE
17+
date = '${YYYY-MM-DD}' AND
18+
is_root_page
1619
GROUP BY
1720
bin,
1821
client

sql/histograms/bytesVideo.sql

+6-3
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,14 @@ FROM (
88
volume / SUM(volume) OVER (PARTITION BY client) AS pdf
99
FROM (
1010
SELECT
11-
_TABLE_SUFFIX AS client,
11+
client,
1212
COUNT(0) AS volume,
13-
CAST(FLOOR(bytesVideo / 10240) * 10 AS INT64) AS bin
13+
CAST(FLOOR(FLOAT64(summary.bytesVideo) / 10240) * 10 AS INT64) AS bin
1414
FROM
15-
`httparchive.summary_pages.${YYYY_MM_DD}_*`
15+
`httparchive.crawl.pages`
16+
WHERE
17+
date = '${YYYY-MM-DD}' AND
18+
is_root_page
1619
GROUP BY
1720
bin,
1821
client

sql/histograms/compileJs.sql

+6-3
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,14 @@ FROM (
88
volume / SUM(volume) OVER (PARTITION BY client) AS pdf
99
FROM (
1010
SELECT
11-
_TABLE_SUFFIX AS client,
11+
client,
1212
COUNT(0) AS volume,
13-
CAST(JSON_EXTRACT(payload, "$['_cpu.v8.compile']") AS INT64) AS bin
13+
INT64(payload['_cpu.v8.compile']) AS bin
1414
FROM
15-
`httparchive.pages.${YYYY_MM_DD}_*`
15+
`httparchive.crawl.pages`
16+
WHERE
17+
date = '${YYYY-MM-DD}' AND
18+
is_root_page
1619
GROUP BY
1720
bin,
1821
client

sql/histograms/cruxFid.sql

-49
This file was deleted.

sql/histograms/cruxShopifyThemes.sql

+15-7
Original file line numberDiff line numberDiff line change
@@ -12,19 +12,27 @@ CREATE TEMP FUNCTION IS_NON_ZERO(good FLOAT64, needs_improvement FLOAT64, poor F
1212
good + needs_improvement + poor > 0
1313
);
1414

15+
-- Test CrUX data exists
16+
WITH crux_test AS ( -- noqa: ST03
17+
SELECT
18+
1
19+
FROM
20+
`chrome-ux-report.all.${YYYYMM}`
21+
),
22+
1523
-- All Shopify shops in HTTPArchive
16-
WITH archive_pages AS (
24+
archive_pages AS (
1725
SELECT
1826
client,
1927
page AS url,
20-
JSON_VALUE(custom_metrics, '$.ecommerce.Shopify.theme.name') AS theme_name,
21-
JSON_VALUE(custom_metrics, '$.ecommerce.Shopify.theme.theme_store_id') AS theme_store_id
28+
JSON_VALUE(custom_metrics.ecommerce.Shopify.theme.name) AS theme_name,
29+
JSON_VALUE(custom_metrics.ecommerce.Shopify.theme.theme_store_id) AS theme_store_id
2230
FROM
23-
`httparchive.all.pages`
31+
`httparchive.crawl.pages`
2432
WHERE
25-
date = DATE(REPLACE('${YYYY_MM_DD}', '_', '-')) AND
33+
date = '${YYYY-MM-DD}' AND
2634
is_root_page AND
27-
JSON_VALUE(custom_metrics, '$.ecommerce.Shopify.theme.name') IS NOT NULL --first grab all shops for market share
35+
JSON_VALUE(custom_metrics.ecommerce.Shopify.theme.name) IS NOT NULL --first grab all shops for market share
2836
)
2937

3038
SELECT
@@ -176,7 +184,7 @@ JOIN (
176184
-- Include null theme store ids so that we can get full market share within CrUX
177185
ON IFNULL(theme_names.theme_store_id, 'N/A') = IFNULL(archive_pages.theme_store_id, 'N/A')
178186
WHERE
179-
date = DATE(REPLACE('${YYYY_MM_DD}', '_', '-')) AND
187+
date = '${YYYY-MM-DD}' AND
180188
theme_names.rank = 1
181189
GROUP BY
182190
client,

sql/histograms/dcl.sql

+6-4
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,15 @@ FROM (
88
volume / SUM(volume) OVER (PARTITION BY client) AS pdf
99
FROM (
1010
SELECT
11-
_TABLE_SUFFIX AS client,
11+
client,
1212
COUNT(0) AS volume,
13-
FLOOR(onContentLoaded / 1000) AS bin
13+
FLOOR(FLOAT64(summary.onContentLoaded) / 1000) AS bin
1414
FROM
15-
`httparchive.summary_pages.${YYYY_MM_DD}_*`
15+
`httparchive.crawl.pages`
1616
WHERE
17-
onContentLoaded > 0
17+
date = '${YYYY-MM-DD}' AND
18+
is_root_page AND
19+
FLOAT64(summary.onContentLoaded) > 0
1820
GROUP BY
1921
bin,
2022
client

sql/histograms/evalJs.sql

+9-3
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,17 @@ FROM (
88
volume / SUM(volume) OVER (PARTITION BY client) AS pdf
99
FROM (
1010
SELECT
11-
_TABLE_SUFFIX AS client,
11+
client,
1212
COUNT(0) AS volume,
13-
CAST(CAST(JSON_EXTRACT(payload, "$['_cpu.EvaluateScript']") AS FLOAT64) / 20 AS INT64) * 20 AS bin
13+
CAST(FLOAT64(r.payload['_cpu.EvaluateScript']) / 20 AS INT64) * 20 AS bin
1414
FROM
15-
`httparchive.requests.${YYYY_MM_DD}_*`
15+
`httparchive.crawl.requests` r
16+
INNER JOIN
17+
`httparchive.crawl.pages`
18+
USING (date, client, is_root_page, rank, page)
19+
WHERE
20+
date = '${YYYY-MM-DD}' AND
21+
is_root_page
1622
GROUP BY
1723
bin,
1824
client

sql/histograms/fcp.sql

+6-3
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,14 @@ FROM (
88
volume / SUM(volume) OVER (PARTITION BY client) AS pdf
99
FROM (
1010
SELECT
11-
_TABLE_SUFFIX AS client,
11+
client,
1212
COUNT(0) AS volume,
13-
CAST(FLOOR(CAST(JSON_EXTRACT(payload, "$['_chromeUserTiming.firstContentfulPaint']") AS FLOAT64) / 1000) AS INT64) AS bin
13+
CAST(FLOOR(FLOAT64(payload['_chromeUserTiming.firstContentfulPaint']) / 1000) AS INT64) AS bin
1414
FROM
15-
`httparchive.pages.${YYYY_MM_DD}_*`
15+
`httparchive.crawl.pages`
16+
WHERE
17+
date = '${YYYY-MM-DD}' AND
18+
is_root_page
1619
GROUP BY
1720
bin,
1821
client

sql/histograms/gzipSavings.sql

+6-3
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,14 @@ FROM (
88
volume / SUM(volume) OVER (PARTITION BY client) AS pdf
99
FROM (
1010
SELECT
11-
_TABLE_SUFFIX AS client,
11+
client,
1212
COUNT(0) AS volume,
13-
CAST(FLOOR(CAST(JSON_EXTRACT(payload, '$._gzip_savings') AS FLOAT64) / (1024 * 2)) * 2 AS INT64) AS bin
13+
CAST(FLOOR(FLOAT64(payload._gzip_savings) / (1024 * 2)) * 2 AS INT64) AS bin
1414
FROM
15-
`httparchive.pages.${YYYY_MM_DD}_*`
15+
`httparchive.crawl.pages`
16+
WHERE
17+
date = '${YYYY-MM-DD}' AND
18+
is_root_page
1619
GROUP BY
1720
bin,
1821
client

0 commit comments

Comments
 (0)