Skip to content

Commit 6b73ba3

Browse files
committed
Merge remote-tracking branch 'origin/main' into SITES-40741
2 parents 8d403f0 + 2b26dc7 commit 6b73ba3

File tree

7 files changed

+262
-10
lines changed

7 files changed

+262
-10
lines changed

CHANGELOG.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,10 @@
1+
## [1.338.1](https://github.com/adobe/spacecat-audit-worker/compare/v1.338.0...v1.338.1) (2026-03-03)
2+
3+
4+
### Bug Fixes
5+
6+
* LLMO-3181 Content summarization: dynamic content by nature should no… ([#2022](https://github.com/adobe/spacecat-audit-worker/issues/2022)) ([5a48019](https://github.com/adobe/spacecat-audit-worker/commit/5a48019894e0cafed7ca50d3aa81627b8c1b8f32))
7+
18
# [1.338.0](https://github.com/adobe/spacecat-audit-worker/compare/v1.337.5...v1.338.0) (2026-03-03)
29

310

package-lock.json

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "@adobe/spacecat-audit-worker",
3-
"version": "1.338.0",
3+
"version": "1.338.1",
44
"description": "SpaceCat Audit Worker",
55
"main": "src/index.js",
66
"type": "module",
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
/*
2+
* Copyright 2025 Adobe. All rights reserved.
3+
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
4+
* you may not use this file except in compliance with the License. You may obtain a copy
5+
* of the License at http://www.apache.org/licenses/LICENSE-2.0
6+
*
7+
* Unless required by applicable law or agreed to in writing, software distributed under
8+
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
9+
* OF ANY KIND, either express or implied. See the License for the specific language
10+
* governing permissions and limitations under the License.
11+
*/
12+
13+
/**
 * Path segments that indicate dynamic or transient content by nature.
 * Such pages should not be summarized (LLMO-3181).
 * Entries are lower-case; path segments are lower-cased before lookup.
 */
const DYNAMIC_PATH_SEGMENTS = new Set([
  'search',
  'filter',
  'results',
  'feed',
  'feeds',
  'dashboard',
  'cart',
  'checkout',
  'login',
  'signin',
  'sign-in',
  'signup',
  'sign-up',
  'account',
  'accounts',
  'admin',
  'api',
  'compare',
  'wishlist',
  'payment',
  'payments',
]);

/**
 * Returns true if the URL path suggests dynamic content (search, filter, feed, cart, etc.).
 *
 * Only the path is inspected: the host, query string and fragment never contribute
 * segments. A segment must equal a known dynamic segment exactly (case-insensitive);
 * substrings such as "searching" do not match.
 *
 * @param {string} url - Full http(s) URL, or a pathname (with or without leading slash,
 *   optionally followed by a query string and/or fragment)
 * @returns {boolean} true when any path segment is a known dynamic segment; false for
 *   empty, non-string, or unparseable input
 */
export function isDynamicPageUrl(url) {
  if (!url || typeof url !== 'string') return false;

  let pathname;
  // URL schemes are case-insensitive, so "HTTPS://..." must be parsed as a URL too;
  // otherwise its host would be mistaken for a path segment.
  if (/^https?:\/\//i.test(url)) {
    try {
      ({ pathname } = new URL(url));
    } catch {
      // Malformed absolute URL: treat as not dynamic rather than failing the caller.
      return false;
    }
  } else {
    // Bare path: drop the query string / fragment so "/search?q=foo" still matches
    // and "/about?next=/login" does not.
    [pathname] = url.split(/[?#]/, 1);
  }

  // Empty segments (leading, trailing or doubled slashes) are never in the set.
  return pathname
    .split('/')
    .some((segment) => DYNAMIC_PATH_SEGMENTS.has(segment.toLowerCase()));
}
61+
62+
/**
 * Drops every URL that isDynamicPageUrl classifies as dynamic.
 * Surviving URLs keep their original relative order.
 * @param {string[]} urls - Candidate URLs
 * @returns {string[]} New array with the non-dynamic URLs; [] when urls is not an array
 */
export function filterOutDynamicUrls(urls) {
  const isStaticUrl = (candidate) => !isDynamicPageUrl(candidate);
  return Array.isArray(urls) ? urls.filter(isStaticUrl) : [];
}

src/summarization/handler.js

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ import { Audit } from '@adobe/spacecat-shared-data-access';
1414
import { AuditBuilder } from '../common/audit-builder.js';
1515
import { wwwUrlResolver } from '../common/index.js';
1616
import { getTopAgenticUrlsFromAthena } from '../utils/agentic-urls.js';
17+
import { filterOutDynamicUrls } from './dynamic-content-filter.js';
1718

1819
const { AUDIT_STEP_DESTINATIONS } = Audit;
1920
const SCRAPE_AVAILABILITY_THRESHOLD = 0.5; // 50%
@@ -23,8 +24,11 @@ const MAX_PAGES_TO_MYSTIQUE = 100;
2324
/**
2425
* Step 1: Import top pages (Athena first, then Ahrefs fallback)
2526
*/
27+
/* c8 ignore next 1 - function declaration line often not attributed when called from tests */
2628
export async function importTopPages(context) {
27-
const { site, dataAccess, log } = context;
29+
const {
30+
site, dataAccess, log,
31+
} = context;
2832
const { SiteTopPage } = dataAccess;
2933

3034
try {
@@ -43,7 +47,7 @@ export async function importTopPages(context) {
4347
};
4448
}
4549

46-
log.info(`[SUMMARIZATION] Found ${topPages.length} top pages for site ${site.getId()}`);
50+
log.info(`[SUMMARIZATION] Found ${topPages.length} top pages for site ${site.getId()} (using max ${MAX_TOP_PAGES})`);
4751

4852
return {
4953
type: 'top-pages',
@@ -98,13 +102,24 @@ export async function submitForScraping(context) {
98102
log.warn('[SUMMARIZATION] No top pages to submit for scraping');
99103
throw new Error('No top pages to submit for scraping');
100104
}
101-
const topPagesToScrape = topPageUrls.slice(0, MAX_TOP_PAGES);
105+
106+
const staticUrls = filterOutDynamicUrls(topPageUrls);
107+
const excludedCount = topPageUrls.length - staticUrls.length;
108+
if (excludedCount > 0) {
109+
log.info(`[SUMMARIZATION] Excluded ${excludedCount} dynamic page(s) from summarization`);
110+
}
111+
if (staticUrls.length === 0) {
112+
log.warn('[SUMMARIZATION] No static pages left after filtering dynamic content');
113+
throw new Error('No top pages to submit for scraping (all excluded as dynamic)');
114+
}
115+
const topPagesToScrape = staticUrls.slice(0, MAX_TOP_PAGES);
102116

103117
log.info(`[SUMMARIZATION] Submitting ${topPagesToScrape.length} pages for scraping`);
104118

105119
return {
106120
urls: topPagesToScrape.map((url) => ({ url })),
107121
siteId: site.getId(),
122+
/* c8 ignore next 3 - return object tail covered by submitForScraping tests */
108123
type: 'summarization',
109124
};
110125
}
@@ -143,7 +158,7 @@ export async function sendToMystique(context) {
143158
log.warn('[SUMMARIZATION] No top pages found, skipping Mystique message');
144159
throw new Error('No top pages found');
145160
}
146-
const topPagesScraped = topPageUrls.slice(0, MAX_TOP_PAGES);
161+
const topPagesScraped = topPageUrls.slice(0, MAX_PAGES_TO_MYSTIQUE);
147162

148163
// Verify scrape availability before sending to Mystique
149164
if (!scrapeResultPaths || scrapeResultPaths.size === 0) {
@@ -166,9 +181,10 @@ export async function sendToMystique(context) {
166181
);
167182
}
168183

169-
// Use URLs from scrapeResultPaths Map (these are the URLs that actually have scrape data)
184+
// Use URLs from scrapeResultPaths Map; exclude dynamic pages (defense in depth)
170185
const scrapedUrls = Array.from(scrapeResultPaths.keys());
171-
const scrapedUrlsToSend = scrapedUrls.slice(0, MAX_PAGES_TO_MYSTIQUE);
186+
const staticScrapedUrls = filterOutDynamicUrls(scrapedUrls);
187+
const scrapedUrlsToSend = staticScrapedUrls.slice(0, MAX_PAGES_TO_MYSTIQUE);
172188
const topPagesPayload = scrapedUrlsToSend.map((url) => ({
173189
page_url: url,
174190
keyword: '',
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
/*
2+
* Copyright 2025 Adobe. All rights reserved.
3+
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
4+
* you may not use this file except in compliance with the License. You may obtain a copy
5+
* of the License at http://www.apache.org/licenses/LICENSE-2.0
6+
*
7+
* Unless required by applicable law or agreed to in writing, software distributed under
8+
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
9+
* OF ANY KIND, either express or implied. See the License for the specific language
10+
* governing permissions and limitations under the License.
11+
*/
12+
13+
/* eslint-env mocha */
14+
15+
import { expect } from 'chai';
16+
import { isDynamicPageUrl, filterOutDynamicUrls } from '../../../src/summarization/dynamic-content-filter.js';
17+
18+
describe('summarization dynamic-content-filter', () => {
  // Asserts that isDynamicPageUrl yields `expected` for every entry of `inputs`.
  const expectVerdict = (inputs, expected) => {
    inputs.forEach((input) => {
      expect(isDynamicPageUrl(input), String(input)).to.equal(expected);
    });
  };

  describe('isDynamicPageUrl', () => {
    it('returns false for empty or non-string', () => {
      expectVerdict(['', null, undefined, 123], false);
    });

    it('returns false for static content URLs', () => {
      expectVerdict([
        'https://example.com/',
        'https://example.com/about',
        'https://example.com/products/shoes',
        '/blog/post-1',
      ], false);
    });

    it('returns true for URLs with dynamic path segments', () => {
      expectVerdict([
        'https://example.com/search',
        'https://example.com/search?q=foo',
        'https://example.com/cart',
        'https://example.com/checkout',
        'https://example.com/login',
        'https://example.com/account',
        'https://example.com/admin',
        'https://example.com/feed',
        'https://example.com/dashboard',
        'https://example.com/filter/category',
        '/api/users',
      ], true);
    });

    it('matches segment case-insensitively', () => {
      expectVerdict(['https://example.com/SEARCH', 'https://example.com/Cart'], true);
    });

    it('matches segment anywhere in path', () => {
      expectVerdict(['https://example.com/shop/cart', 'https://example.com/help/search'], true);
    });

    it('returns false for invalid URL when used as path', () => {
      expectVerdict(['not-a-url'], false);
    });

    it('returns false for invalid full URL (URL constructor throws)', () => {
      expectVerdict(['https://', 'http://['], false);
    });
  });

  describe('filterOutDynamicUrls', () => {
    it('returns empty array for non-array input', () => {
      [null, undefined].forEach((notAnArray) => {
        expect(filterOutDynamicUrls(notAnArray)).to.deep.equal([]);
      });
    });

    it('keeps only static URLs and preserves order', () => {
      const mixedUrls = [
        'https://example.com/',
        'https://example.com/search',
        'https://example.com/about',
        'https://example.com/cart',
        'https://example.com/products',
      ];
      const expectedStaticUrls = [
        'https://example.com/',
        'https://example.com/about',
        'https://example.com/products',
      ];

      expect(filterOutDynamicUrls(mixedUrls)).to.deep.equal(expectedStaticUrls);
    });

    it('returns all URLs when none are dynamic', () => {
      const staticOnly = [
        'https://example.com/',
        'https://example.com/about',
        'https://example.com/contact',
      ];

      expect(filterOutDynamicUrls(staticOnly)).to.deep.equal(staticOnly);
    });

    it('returns empty array when all URLs are dynamic', () => {
      const dynamicOnly = [
        'https://example.com/search',
        'https://example.com/cart',
        'https://example.com/login',
      ];

      expect(filterOutDynamicUrls(dynamicOnly)).to.deep.equal([]);
    });
  });
});

test/audits/summarization/handler.test.js

Lines changed: 61 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,7 @@ describe('Summarization Handler', () => {
123123
'ahrefs',
124124
'global',
125125
);
126-
expect(log.info).to.have.been.calledWith('[SUMMARIZATION] Found 3 top pages for site site-id-123');
126+
expect(log.info).to.have.been.calledWith('[SUMMARIZATION] Found 3 top pages for site site-id-123 (using max 200)');
127127
});
128128

129129
it('should handle when no top pages are found', async () => {
@@ -573,6 +573,66 @@ describe('Summarization Handler - Athena/Ahrefs fallback', () => {
573573
);
574574
});
575575

576+
it('should exclude dynamic page URLs in submitForScraping', async () => {
  // Two of the four Athena URLs (search, cart) carry dynamic path segments.
  const athenaUrls = [
    'https://adobe.com/search',
    'https://adobe.com/about',
    'https://adobe.com/cart',
    'https://adobe.com/contact',
  ];
  mockGetTopAgenticUrlsFromAthena = sandbox.stub().resolves(athenaUrls);

  const mockedHandler = await esmock('../../../src/summarization/handler.js', {
    '../../../src/utils/agentic-urls.js': {
      getTopAgenticUrlsFromAthena: mockGetTopAgenticUrlsFromAthena,
    },
  });

  const log = {
    info: sandbox.stub(),
    warn: sandbox.stub(),
    error: sandbox.stub(),
  };
  const site = {
    getBaseURL: () => 'https://adobe.com',
    getId: () => 'site-123',
  };
  const context = {
    log,
    site,
    audit: { getAuditResult: () => ({ success: true }) },
    dataAccess: { SiteTopPage: { allBySiteIdAndSourceAndGeo: sandbox.stub() } },
  };

  const { urls: submittedUrls } = await mockedHandler.submitForScraping(context);

  // Only the static pages survive, in their original order.
  expect(submittedUrls).to.deep.equal([
    { url: 'https://adobe.com/about' },
    { url: 'https://adobe.com/contact' },
  ]);
  expect(log.info).to.have.been.calledWith(
    '[SUMMARIZATION] Excluded 2 dynamic page(s) from summarization',
  );
});
607+
608+
it('should throw when all URLs are dynamic in submitForScraping', async () => {
  // Every Athena URL carries a dynamic path segment, so nothing is left to scrape.
  const dynamicOnlyUrls = [
    'https://adobe.com/search',
    'https://adobe.com/cart',
    'https://adobe.com/login',
  ];
  mockGetTopAgenticUrlsFromAthena = sandbox.stub().resolves(dynamicOnlyUrls);

  const mockedHandler = await esmock('../../../src/summarization/handler.js', {
    '../../../src/utils/agentic-urls.js': {
      getTopAgenticUrlsFromAthena: mockGetTopAgenticUrlsFromAthena,
    },
  });

  const log = {
    info: sandbox.stub(),
    warn: sandbox.stub(),
    error: sandbox.stub(),
  };
  const site = {
    getBaseURL: () => 'https://adobe.com',
    getId: () => 'site-123',
  };
  const context = {
    log,
    site,
    audit: { getAuditResult: () => ({ success: true }) },
    dataAccess: { SiteTopPage: { allBySiteIdAndSourceAndGeo: sandbox.stub() } },
  };

  const submission = mockedHandler.submitForScraping(context);

  await expect(submission).to.be.rejectedWith(
    'No top pages to submit for scraping (all excluded as dynamic)',
  );
  expect(log.warn).to.have.been.calledWith(
    '[SUMMARIZATION] No static pages left after filtering dynamic content',
  );
});
635+
576636
it('should use Athena URLs in sendToMystique when available', async () => {
577637
mockGetTopAgenticUrlsFromAthena = sandbox.stub().resolves([
578638
'https://adobe.com/athena-page1',

0 commit comments

Comments
 (0)