@@ -294,20 +294,27 @@ export const LANDESVERBAENDE_CONFIG: LandesverbaendeConfig = {
294294 maxAgeYears : 5 ,
295295 contentPaths : [
296296 {
297+ // Scrape the dedicated /pressemitteilungen listing instead of /nachrichten
298+ // or the news sub-sitemap. The news sub-sitemap aggregates ALL Berlin LV
299+ // posts (press releases + AG-Sitzung announcements + LAG meetings + events),
300+ // and articles indexed from there polluted Berlin Presse with non-press
301+ // content. /pressemitteilungen is TYPO3's category-filtered listing route
302+ // and only contains real press releases.
303+ //
304+ // Pagination on /pressemitteilungen actually works (unlike /nachrichten,
305+ // where tx_xblog_pi1[pointer] is silently ignored) — pages 1, 2, 3 ... 57
306+ // each return distinct article IDs, and the next-page links carry
307+ // per-page cHash signatures. Use paginationLinkSelector so the extractor
308+ // follows next-links from HTML rather than constructing URLs (which
309+ // wouldn't carry the required cHash). paginationPattern stays as a fallback
310+ // for the rare case where link-following can't find a "next" anchor (NOTE(review): pattern-built URLs lack the cHash — confirm the fallback actually paginates).
297311 type : 'presse' ,
298- path : '/nachrichten ' ,
312+ path : '/pressemitteilungen ' ,
299313 listSelector : 'h2 a[href], h3 a[href]' ,
300- // TYPO3's tx_xblog_pi1[pointer] pagination is broken upstream (every page
301- // returns the same first ~10 items), so use the typed sub-sitemap. Filter on
302- // the canonical `/news/` prefix that gruene.berlin's TYPO3 SEO sitemap emits
303- // — do NOT rewrite to /nachrichten/. The /nachrichten/ alias only resolves
304- // for some articles; for older slugs TYPO3 silently routes /nachrichten/<slug>
305- // to the listing page with a "Uups, kein Eintrag vorhanden" notice (HTTP 200
306- // body, no redirect), and the scraper would then index the listing as if it
307- // were the article. /news/<slug>_<id> is the canonical TYPO3 URL and always
308- // resolves to the article page — it is what the sub-sitemap emits.
309- sitemapUrls : [ 'https://gruene.berlin/sitemap.xml' ] ,
310- sitemapFilter : '/news/' ,
314+ paginationLinkSelector : '.pagination a' ,
315+ paginationPattern : '?tx_xblog_pi1[pointer]={page}' ,
316+ paginationOffset : - 1 ,
317+ maxPages : 60 ,
311318 } ,
312319 ] ,
313320 contentSelectors : {
0 commit comments