fix(config): scrape Berlin Presse from /pressemitteilungen listing, not news sub-sitemap

Movm · Movm · commit 97486fdc73f5 · 2026-04-25T03:02:31.000+02:00
The news sub-sitemap (PR #673/#674/#677) aggregates ALL Berlin LV posts: press releases, AG-Sitzung announcements, LAG meetings, and online events. Articles indexed from there polluted berlin-lv-presse with non-press content (8 AG-Sitzung, 8 LAG, 1 Termin, 2 Veranstaltung in the audit), plus 3 entries contaminated with listing-page boilerplate where TYPO3's /nachrichten/<slug> alias silently falls back to the listing page (HTTP 200 with a "Uups, kein Eintrag vorhanden" notice, no redirect) instead of returning a 404. /pressemitteilungen is TYPO3's category-filtered route and contains only real press releases. Verified that pagination works there (unlike /nachrichten, where tx_xblog_pi1[pointer] is silently ignored): pages 1, 2, 3, and 57 each return distinct article IDs, and next-page links carry per-page cHash signatures. Use paginationLinkSelector mode so the extractor follows next-links from the HTML rather than constructing pagination URLs (which can't include the required cHash). paginationPattern stays as a fallback. listSelector and contentSelectors are unchanged. The off-path filter naturally drops legacy /nachrichten/<slug> teaser links from the listing page, since those URLs don't share the /pressemitteilungen prefix — exactly the desired behavior. Going forward, scrapes index articles at their canonical /pressemitteilungen/<slug>_<id> URLs. Existing legacy /nachrichten/ entries in Qdrant should be purged separately so that dedup works against a single URL form.
diff --git a/apps/api/config/landesverbaendeConfig.ts b/apps/api/config/landesverbaendeConfig.ts
@@ -294,20 +294,27 @@ export const LANDESVERBAENDE_CONFIG: LandesverbaendeConfig = {
       maxAgeYears: 5,
       contentPaths: [
         {
+          // Scrape the dedicated /pressemitteilungen listing instead of /nachrichten
+          // or the news sub-sitemap. The news sub-sitemap aggregates ALL Berlin LV
+          // posts (press releases + AG-Sitzung announcements + LAG meetings + events),
+          // and articles indexed from there polluted Berlin Presse with non-press
+          // content. /pressemitteilungen is TYPO3's category-filtered listing route
+          // and only contains real press releases.
+          //
+          // Pagination on /pressemitteilungen actually works (unlike /nachrichten,
+          // where tx_xblog_pi1[pointer] is silently ignored) — pages 1, 2, 3 ... 57
+          // each return distinct article IDs, and the next-page links carry
+          // per-page cHash signatures. Use paginationLinkSelector so the extractor
+          // follows next-links from HTML rather than constructing URLs (which
+          // wouldn't carry the required cHash). paginationPattern stays as a fallback
+          // for the rare case where link-following cannot find a "next" anchor.
           type: 'presse',
-          path: '/nachrichten',
+          path: '/pressemitteilungen',
           listSelector: 'h2 a[href], h3 a[href]',
-          // TYPO3's tx_xblog_pi1[pointer] pagination is broken upstream (every page
-          // returns the same first ~10 items), so use the typed sub-sitemap. Filter on
-          // the canonical `/news/` prefix that gruene.berlin's TYPO3 SEO sitemap emits
-          // — do NOT rewrite to /nachrichten/. The /nachrichten/ alias only resolves
-          // for some articles; for older slugs TYPO3 silently routes /nachrichten/<slug>
-          // to the listing page with a "Uups, kein Eintrag vorhanden" notice (HTTP 200
-          // body, no redirect), and the scraper would then index the listing as if it
-          // were the article. /news/<slug>_<id> is the canonical TYPO3 URL and always
-          // resolves to the article page — it is what the sub-sitemap emits.
-          sitemapUrls: ['https://gruene.berlin/sitemap.xml'],
-          sitemapFilter: '/news/',
+          paginationLinkSelector: '.pagination a',
+          paginationPattern: '?tx_xblog_pi1[pointer]={page}',
+          paginationOffset: -1,
+          maxPages: 60,
         },
       ],
       contentSelectors: {