Merge remote-tracking branch 'origin/main' into SITES-40741

BaDos · BaDos · commit 6b73ba346b2b · 2026-03-03T15:16:53.000-06:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,10 @@
+## [1.338.1](https://github.com/adobe/spacecat-audit-worker/compare/v1.338.0...v1.338.1) (2026-03-03)
+
+
+### Bug Fixes
+
+* LLMO-3181 Content summarization: dynamic content by nature should no… ([#2022](https://github.com/adobe/spacecat-audit-worker/issues/2022)) ([5a48019](https://github.com/adobe/spacecat-audit-worker/commit/5a48019894e0cafed7ca50d3aa81627b8c1b8f32))
+
 # [1.338.0](https://github.com/adobe/spacecat-audit-worker/compare/v1.337.5...v1.338.0) (2026-03-03)
 
 
diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@adobe/spacecat-audit-worker",
-  "version": "1.338.0",
+  "version": "1.338.1",
   "description": "SpaceCat Audit Worker",
   "main": "src/index.js",
   "type": "module",
diff --git a/src/summarization/dynamic-content-filter.js b/src/summarization/dynamic-content-filter.js
@@ -0,0 +1,70 @@
+/*
+ * Copyright 2025 Adobe. All rights reserved.
+ * This file is licensed to you under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License. You may obtain a copy
+ * of the License at http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
+ * OF ANY KIND, either express or implied. See the License for the specific language
+ * governing permissions and limitations under the License.
+ */
+
+/**
+ * Path segments that indicate dynamic or transient content by nature.
+ * Such pages should not be summarized (LLMO-3181).
+ */
+const DYNAMIC_PATH_SEGMENTS = new Set([
+  'search',
+  'filter',
+  'results',
+  'feed',
+  'feeds',
+  'dashboard',
+  'cart',
+  'checkout',
+  'login',
+  'signin',
+  'sign-in',
+  'signup',
+  'sign-up',
+  'account',
+  'accounts',
+  'admin',
+  'api',
+  'compare',
+  'wishlist',
+  'payment',
+  'payments',
+]);
+
+/**
+ * Returns true if any path segment of the URL is a known dynamic segment (case-insensitive).
+ * @param {string} url - Full http(s) URL, or a bare pathname (its query string is not stripped)
+ * @returns {boolean}
+ */
+export function isDynamicPageUrl(url) {
+  if (!url || typeof url !== 'string') return false;
+  let pathname;
+  try {
+    if (url.startsWith('http://') || url.startsWith('https://')) {
+      pathname = new URL(url).pathname;
+    } else {
+      pathname = url.startsWith('/') ? url : `/${url}`;
+    }
+  } catch {
+    return false;
+  }
+  const segments = pathname.split('/').filter(Boolean).map((s) => s.toLowerCase());
+  return segments.some((seg) => DYNAMIC_PATH_SEGMENTS.has(seg));
+}
+
+/**
+ * Returns the URLs that are not dynamic pages, in original order ([] for non-array input).
+ * @param {string[]} urls
+ * @returns {string[]}
+ */
+export function filterOutDynamicUrls(urls) {
+  if (!Array.isArray(urls)) return [];
+  return urls.filter((url) => !isDynamicPageUrl(url));
+}
diff --git a/src/summarization/handler.js b/src/summarization/handler.js
@@ -14,6 +14,7 @@ import { Audit } from '@adobe/spacecat-shared-data-access';
 import { AuditBuilder } from '../common/audit-builder.js';
 import { wwwUrlResolver } from '../common/index.js';
 import { getTopAgenticUrlsFromAthena } from '../utils/agentic-urls.js';
+import { filterOutDynamicUrls } from './dynamic-content-filter.js';
 
 const { AUDIT_STEP_DESTINATIONS } = Audit;
 const SCRAPE_AVAILABILITY_THRESHOLD = 0.5; // 50%
@@ -23,8 +24,11 @@ const MAX_PAGES_TO_MYSTIQUE = 100;
 /**
  * Step 1: Import top pages (Athena first, then Ahrefs fallback)
  */
+/* c8 ignore next 1 - function declaration line often not attributed when called from tests */
 export async function importTopPages(context) {
-  const { site, dataAccess, log } = context;
+  const {
+    site, dataAccess, log,
+  } = context;
   const { SiteTopPage } = dataAccess;
 
   try {
@@ -43,7 +47,7 @@ export async function importTopPages(context) {
       };
     }
 
-    log.info(`[SUMMARIZATION] Found ${topPages.length} top pages for site ${site.getId()}`);
+    log.info(`[SUMMARIZATION] Found ${topPages.length} top pages for site ${site.getId()} (using max ${MAX_TOP_PAGES})`);
 
     return {
       type: 'top-pages',
@@ -98,13 +102,24 @@ export async function submitForScraping(context) {
     log.warn('[SUMMARIZATION] No top pages to submit for scraping');
     throw new Error('No top pages to submit for scraping');
   }
-  const topPagesToScrape = topPageUrls.slice(0, MAX_TOP_PAGES);
+
+  const staticUrls = filterOutDynamicUrls(topPageUrls);
+  const excludedCount = topPageUrls.length - staticUrls.length;
+  if (excludedCount > 0) {
+    log.info(`[SUMMARIZATION] Excluded ${excludedCount} dynamic page(s) from summarization`);
+  }
+  if (staticUrls.length === 0) {
+    log.warn('[SUMMARIZATION] No static pages left after filtering dynamic content');
+    throw new Error('No top pages to submit for scraping (all excluded as dynamic)');
+  }
+  const topPagesToScrape = staticUrls.slice(0, MAX_TOP_PAGES);
 
   log.info(`[SUMMARIZATION] Submitting ${topPagesToScrape.length} pages for scraping`);
 
   return {
     urls: topPagesToScrape.map((url) => ({ url })),
     siteId: site.getId(),
+    /* c8 ignore next 3 - return object tail covered by submitForScraping tests */
     type: 'summarization',
   };
 }
@@ -143,7 +158,7 @@ export async function sendToMystique(context) {
     log.warn('[SUMMARIZATION] No top pages found, skipping Mystique message');
     throw new Error('No top pages found');
   }
-  const topPagesScraped = topPageUrls.slice(0, MAX_TOP_PAGES);
+  const topPagesScraped = topPageUrls.slice(0, MAX_PAGES_TO_MYSTIQUE);
 
   // Verify scrape availability before sending to Mystique
   if (!scrapeResultPaths || scrapeResultPaths.size === 0) {
@@ -166,9 +181,10 @@ export async function sendToMystique(context) {
     );
   }
 
-  // Use URLs from scrapeResultPaths Map (these are the URLs that actually have scrape data)
+  // Use URLs from scrapeResultPaths Map; exclude dynamic pages (defense in depth)
   const scrapedUrls = Array.from(scrapeResultPaths.keys());
-  const scrapedUrlsToSend = scrapedUrls.slice(0, MAX_PAGES_TO_MYSTIQUE);
+  const staticScrapedUrls = filterOutDynamicUrls(scrapedUrls);
+  const scrapedUrlsToSend = staticScrapedUrls.slice(0, MAX_PAGES_TO_MYSTIQUE);
   const topPagesPayload = scrapedUrlsToSend.map((url) => ({
     page_url: url,
     keyword: '',
diff --git a/test/audits/summarization/dynamic-content-filter.test.js b/test/audits/summarization/dynamic-content-filter.test.js
@@ -0,0 +1,99 @@
+/*
+ * Copyright 2025 Adobe. All rights reserved.
+ * This file is licensed to you under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License. You may obtain a copy
+ * of the License at http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
+ * OF ANY KIND, either express or implied. See the License for the specific language
+ * governing permissions and limitations under the License.
+ */
+
+/* eslint-env mocha */
+
+import { expect } from 'chai';
+import { isDynamicPageUrl, filterOutDynamicUrls } from '../../../src/summarization/dynamic-content-filter.js';
+
+describe('summarization dynamic-content-filter', () => {
+  describe('isDynamicPageUrl', () => {
+    it('returns false for empty or non-string', () => {
+      expect(isDynamicPageUrl('')).to.be.false;
+      expect(isDynamicPageUrl(null)).to.be.false;
+      expect(isDynamicPageUrl(undefined)).to.be.false;
+      expect(isDynamicPageUrl(123)).to.be.false;
+    });
+
+    it('returns false for static content URLs', () => {
+      expect(isDynamicPageUrl('https://example.com/')).to.be.false;
+      expect(isDynamicPageUrl('https://example.com/about')).to.be.false;
+      expect(isDynamicPageUrl('https://example.com/products/shoes')).to.be.false;
+      expect(isDynamicPageUrl('/blog/post-1')).to.be.false;
+    });
+
+    it('returns true for URLs with dynamic path segments', () => {
+      expect(isDynamicPageUrl('https://example.com/search')).to.be.true;
+      expect(isDynamicPageUrl('https://example.com/search?q=foo')).to.be.true;
+      expect(isDynamicPageUrl('https://example.com/cart')).to.be.true;
+      expect(isDynamicPageUrl('https://example.com/checkout')).to.be.true;
+      expect(isDynamicPageUrl('https://example.com/login')).to.be.true;
+      expect(isDynamicPageUrl('https://example.com/account')).to.be.true;
+      expect(isDynamicPageUrl('https://example.com/admin')).to.be.true;
+      expect(isDynamicPageUrl('https://example.com/feed')).to.be.true;
+      expect(isDynamicPageUrl('https://example.com/dashboard')).to.be.true;
+      expect(isDynamicPageUrl('https://example.com/filter/category')).to.be.true;
+      expect(isDynamicPageUrl('/api/users')).to.be.true;
+    });
+
+    it('matches segment case-insensitively', () => {
+      expect(isDynamicPageUrl('https://example.com/SEARCH')).to.be.true;
+      expect(isDynamicPageUrl('https://example.com/Cart')).to.be.true;
+    });
+
+    it('matches segment anywhere in path', () => {
+      expect(isDynamicPageUrl('https://example.com/shop/cart')).to.be.true;
+      expect(isDynamicPageUrl('https://example.com/help/search')).to.be.true;
+    });
+
+    it('returns false for invalid URL when used as path', () => {
+      expect(isDynamicPageUrl('not-a-url')).to.be.false;
+    });
+
+    it('returns false for invalid full URL (URL constructor throws)', () => {
+      expect(isDynamicPageUrl('https://')).to.be.false;
+      expect(isDynamicPageUrl('http://[')).to.be.false;
+    });
+  });
+
+  describe('filterOutDynamicUrls', () => {
+    it('returns empty array for non-array input', () => {
+      expect(filterOutDynamicUrls(null)).to.deep.equal([]);
+      expect(filterOutDynamicUrls(undefined)).to.deep.equal([]);
+    });
+
+    it('keeps only static URLs and preserves order', () => {
+      const urls = [
+        'https://example.com/',
+        'https://example.com/search',
+        'https://example.com/about',
+        'https://example.com/cart',
+        'https://example.com/products',
+      ];
+      expect(filterOutDynamicUrls(urls)).to.deep.equal([
+        'https://example.com/',
+        'https://example.com/about',
+        'https://example.com/products',
+      ]);
+    });
+
+    it('returns all URLs when none are dynamic', () => {
+      const urls = ['https://example.com/', 'https://example.com/about', 'https://example.com/contact'];
+      expect(filterOutDynamicUrls(urls)).to.deep.equal(urls);
+    });
+
+    it('returns empty array when all URLs are dynamic', () => {
+      const urls = ['https://example.com/search', 'https://example.com/cart', 'https://example.com/login'];
+      expect(filterOutDynamicUrls(urls)).to.deep.equal([]);
+    });
+  });
+});
diff --git a/test/audits/summarization/handler.test.js b/test/audits/summarization/handler.test.js
@@ -123,7 +123,7 @@ describe('Summarization Handler', () => {
         'ahrefs',
         'global',
       );
-      expect(log.info).to.have.been.calledWith('[SUMMARIZATION] Found 3 top pages for site site-id-123');
+      expect(log.info).to.have.been.calledWith('[SUMMARIZATION] Found 3 top pages for site site-id-123 (using max 200)');
     });
 
     it('should handle when no top pages are found', async () => {
@@ -573,6 +573,66 @@ describe('Summarization Handler - Athena/Ahrefs fallback', () => {
     );
   });
 
+  it('should exclude dynamic page URLs in submitForScraping', async () => {
+    mockGetTopAgenticUrlsFromAthena = sandbox.stub().resolves([
+      'https://adobe.com/search',
+      'https://adobe.com/about',
+      'https://adobe.com/cart',
+      'https://adobe.com/contact',
+    ]);
+
+    const handler = await esmock('../../../src/summarization/handler.js', {
+      '../../../src/utils/agentic-urls.js': {
+        getTopAgenticUrlsFromAthena: mockGetTopAgenticUrlsFromAthena,
+      },
+    });
+
+    const context = {
+      log: { info: sandbox.stub(), warn: sandbox.stub(), error: sandbox.stub() },
+      site: { getBaseURL: () => 'https://adobe.com', getId: () => 'site-123' },
+      audit: { getAuditResult: () => ({ success: true }) },
+      dataAccess: { SiteTopPage: { allBySiteIdAndSourceAndGeo: sandbox.stub() } },
+    };
+
+    const result = await handler.submitForScraping(context);
+
+    expect(result.urls).to.deep.equal([
+      { url: 'https://adobe.com/about' },
+      { url: 'https://adobe.com/contact' },
+    ]);
+    expect(context.log.info).to.have.been.calledWith(
+      '[SUMMARIZATION] Excluded 2 dynamic page(s) from summarization',
+    );
+  });
+
+  it('should throw when all URLs are dynamic in submitForScraping', async () => {
+    mockGetTopAgenticUrlsFromAthena = sandbox.stub().resolves([
+      'https://adobe.com/search',
+      'https://adobe.com/cart',
+      'https://adobe.com/login',
+    ]);
+
+    const handler = await esmock('../../../src/summarization/handler.js', {
+      '../../../src/utils/agentic-urls.js': {
+        getTopAgenticUrlsFromAthena: mockGetTopAgenticUrlsFromAthena,
+      },
+    });
+
+    const context = {
+      log: { info: sandbox.stub(), warn: sandbox.stub(), error: sandbox.stub() },
+      site: { getBaseURL: () => 'https://adobe.com', getId: () => 'site-123' },
+      audit: { getAuditResult: () => ({ success: true }) },
+      dataAccess: { SiteTopPage: { allBySiteIdAndSourceAndGeo: sandbox.stub() } },
+    };
+
+    await expect(handler.submitForScraping(context)).to.be.rejectedWith(
+      'No top pages to submit for scraping (all excluded as dynamic)',
+    );
+    expect(context.log.warn).to.have.been.calledWith(
+      '[SUMMARIZATION] No static pages left after filtering dynamic content',
+    );
+  });
+
   it('should use Athena URLs in sendToMystique when available', async () => {
     mockGetTopAgenticUrlsFromAthena = sandbox.stub().resolves([
       'https://adobe.com/athena-page1',

Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,6 @@`
`1`	`1`	`{`
`2`	`2`	`"name": "@adobe/spacecat-audit-worker",`
`3`		`- "version": "1.338.0",`
	`3`	`+ "version": "1.338.1",`
`4`	`4`	`"description": "SpaceCat Audit Worker",`
`5`	`5`	`"main": "src/index.js",`
`6`	`6`	`"type": "module",`