Skip to content

Commit 645bc5e

Browse files
authored
Merge pull request #679 from netzbegruenung/fix/berlin-presse-use-pressemitteilungen
fix(config): Berlin Presse via /pressemitteilungen listing, drop news sub-sitemap
2 parents b2d72ef + 97486fd commit 645bc5e

1 file changed

Lines changed: 19 additions & 12 deletions

File tree

apps/api/config/landesverbaendeConfig.ts

Lines changed: 19 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -294,20 +294,27 @@ export const LANDESVERBAENDE_CONFIG: LandesverbaendeConfig = {
294294
maxAgeYears: 5,
295295
contentPaths: [
296296
{
297+
// Scrape the dedicated /pressemitteilungen listing instead of /nachrichten
298+
// or the news sub-sitemap. The news sub-sitemap aggregates ALL Berlin LV
299+
// posts (press releases + AG-Sitzung announcements + LAG meetings + events),
300+
// and articles indexed from there polluted Berlin Presse with non-press
301+
// content. /pressemitteilungen is TYPO3's category-filtered listing route
302+
// and only contains real press releases.
303+
//
304+
// Pagination on /pressemitteilungen actually works (unlike /nachrichten,
305+
// where tx_xblog_pi1[pointer] is silently ignored) — pages 1, 2, 3 ... 57
306+
// each return distinct article IDs, and the next-page links carry
307+
// per-page cHash signatures. Use paginationLinkSelector so the extractor
308+
// follows next-links from HTML rather than constructing URLs (which
309+
// wouldn't carry the required cHash). paginationPattern stays as a fallback
310+
// for the rare case where link-following can't find a "next" anchor.
297311
type: 'presse',
298-
path: '/nachrichten',
312+
path: '/pressemitteilungen',
299313
listSelector: 'h2 a[href], h3 a[href]',
300-
// TYPO3's tx_xblog_pi1[pointer] pagination is broken upstream (every page
301-
// returns the same first ~10 items), so use the typed sub-sitemap. Filter on
302-
// the canonical `/news/` prefix that gruene.berlin's TYPO3 SEO sitemap emits
303-
// — do NOT rewrite to /nachrichten/. The /nachrichten/ alias only resolves
304-
// for some articles; for older slugs TYPO3 silently routes /nachrichten/<slug>
305-
// to the listing page with a "Uups, kein Eintrag vorhanden" notice (HTTP 200
306-
// body, no redirect), and the scraper would then index the listing as if it
307-
// were the article. /news/<slug>_<id> is the canonical TYPO3 URL and always
308-
// resolves to the article page — it is what the sub-sitemap emits.
309-
sitemapUrls: ['https://gruene.berlin/sitemap.xml'],
310-
sitemapFilter: '/news/',
314+
paginationLinkSelector: '.pagination a',
315+
paginationPattern: '?tx_xblog_pi1[pointer]={page}',
316+
paginationOffset: -1,
317+
maxPages: 60,
311318
},
312319
],
313320
contentSelectors: {

0 commit comments

Comments
 (0)