Open
Description
But there is often the values that are corrupted and have a negative impact on downstream transformations, e.g.: too big values to write into a Firestore document, corrupted values popping up in aggregations.
We can do some efforts to fix the technologies detections on wptagent side.
But in the meantime and if not feasible to achieve 100% valid technologies on wptagent - we should add a cleanup stage for the production crawl.pages
table.
It will save the time when working with the crawl data.
SQL draft:
DECLARE crawl_month DATE DEFAULT DATE('2024-11-01');
CREATE TEMP TABLE technologies_cleaned AS (
WITH wappalyzer AS (
SELECT
name as technology,
category
FROM wappalyzer.apps,
UNNEST(categories) AS category
), pages AS (
SELECT
date,
client,
page,
technologies
FROM crawl.pages
WHERE date = crawl_month
), impacted_pages AS (
SELECT DISTINCT
date,
client,
page
FROM pages,
UNNEST (technologies) AS tech,
UNNEST (tech.categories) AS category
LEFT JOIN wappalyzer
USING (technology, category)
WHERE wappalyzer.category IS NULL OR
wappalyzer.technology IS NULL
), flattened_technologies AS (
SELECT
date,
client,
page,
technology,
category,
info
FROM pages,
UNNEST(technologies) AS tech,
UNNEST(tech.categories) AS category
WHERE page IN (SELECT DISTINCT page FROM impacted_pages)
), whitelisted_technologies AS (
SELECT
date,
client,
page,
f.technology,
f.category,
f.info
FROM flattened_technologies f
INNER JOIN wappalyzer
USING (technology, category)
), reconstructed_technologies AS (
SELECT
date,
client,
page,
ARRAY_AGG(STRUCT(
technology,
categories,
info
)) AS technologies
FROM (
SELECT
date,
client,
page,
technology,
ARRAY_AGG(DISTINCT category IGNORE NULLS) AS categories,
info
FROM whitelisted_technologies
GROUP BY date, client, page, technology, info
)
GROUP BY date, client, page
)
SELECT
date,
client,
page,
r.technologies
FROM impacted_pages
LEFT JOIN reconstructed_technologies r
USING (date, client, page)
);
UPDATE crawl.pages
SET technologies = technologies_cleaned.technologies
FROM technologies_cleaned
WHERE pages.date = crawl_month AND
pages.client = technologies_cleaned.client AND
pages.page = technologies_cleaned.page;
Metadata
Assignees
Labels
No labels