Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .changeset/wide-hairs-fail.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"parser": major
---

Add a parsing script to the parser app. Store parsed information locally. Sanitize URLs into a filesystem-compatible format for use as file names.
61 changes: 61 additions & 0 deletions .github/workflows/parser.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
name: Run Parser

on:
  workflow_dispatch:
    inputs:
      urls:
        description: 'URLs to parse (one per line)'
        required: true
        type: string
      environment:
        description: 'The environment used as target'
        type: choice
        required: true
        default: dev
        options:
          - dev
          - uat
          - prod
      parser_vector_index_name:
        description: 'Parser Vector Index Name (optional, overrides secret)'
        required: false
        type: string

permissions:
  id-token: write
  contents: read

jobs:
  run-parser:
    name: Run Parser Script (manual on ${{ inputs.environment }})
    runs-on: ubuntu-24.04
    environment: ${{ inputs.environment }}

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: '22'
          cache: 'npm'
          # The lockfile lives inside the workspace, not at the repo root;
          # without this the npm cache lookup cannot find a lock file.
          cache-dependency-path: apps/parser/package-lock.json

      - name: Install dependencies
        working-directory: apps/parser
        run: npm ci

      - name: Run Parser for each URL
        env:
          PARSER_VECTOR_INDEX_NAME: ${{ inputs.parser_vector_index_name || secrets.PARSER_VECTOR_INDEX_NAME }}
          # Pass the user-supplied URL list through an environment variable
          # instead of interpolating ${{ inputs.urls }} directly into the
          # script body, which would allow shell injection via the input.
          URLS: ${{ inputs.urls }}
        working-directory: apps/parser
        run: |
          # Loop through each URL sequentially
          while IFS= read -r url; do
            if [ -n "$url" ]; then
              echo "=== Parsing: $url ==="
              URL="$url" npx tsx src/parser.ts
              echo "=== Completed: $url ==="
              echo ""
            fi
          done <<< "$URLS"
13 changes: 13 additions & 0 deletions apps/parser/.env.default
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Default environment variables for the parser CLI
# Root URL to start parsing from
URL="https://example.com"

# Maximum recursion depth (integer)
DEPTH=2

# Name of the vector index bucket/folder where parsed artifacts are stored
PARSER_VECTOR_INDEX_NAME="parser-vector-index-name"

# Optional absolute/relative directory override. Leave empty to use
# <PARSER_VECTOR_INDEX_NAME>/parsing/<sanitized(baseUrl)> automatically.
OUTDIR=""
40 changes: 40 additions & 0 deletions apps/parser/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
## Parser utilities

This package contains the following TypeScript CLI utility:

- `parser` — recursively visits a website, extracts structured page metadata, and saves each page under `<PARSER_VECTOR_INDEX_NAME>/parsing/<sanitized(baseUrl)>/`.


### Getting started

```bash
npm install
npm run build
```

### Parse a website

```bash
URL=https://example.com DEPTH=2 npm run parse
```

Environment variables:

- `URL` (required): root page for the parse.
- `DEPTH` (optional, default `2`): max depth for recursion.
- `PARSER_VECTOR_INDEX_NAME` (optional): base directory name where parsed data will be stored as `<PARSER_VECTOR_INDEX_NAME>/parsing/<sanitized(baseUrl)>/`.
- `OUTDIR` (optional): fully override the destination directory.
- If neither `PARSER_VECTOR_INDEX_NAME` nor `OUTDIR` is provided, the output directory defaults to `output/<sanitized(baseUrl)>/`.

`<sanitized(baseUrl)>` and `<sanitized(path)>` refer to filesystem-safe versions of the URL components (illegal characters replaced with `_`), ensuring predictable, human-readable filenames.

Each visited page is stored as `<PARSER_VECTOR_INDEX_NAME>/parsing/<sanitized(baseUrl)>/<sanitized(path)>.json` (or under `OUTDIR` if specified) with normalized metadata, making it easy to diff between runs.


### Tests

```bash
npm test
```

Tests compile the project before executing Jest to ensure the CLI behaves exactly like the production build.
14 changes: 14 additions & 0 deletions apps/parser/jest.config.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import type { Config } from 'jest';

/**
 * Jest configuration for the parser package.
 * Tests live under tests/ and are transpiled on the fly with ts-jest.
 */
const config: Config = {
  // Resolve all relative Jest paths against this package directory.
  rootDir: __dirname,
  // Only pick up *.test.ts files inside the tests/ folder.
  testRegex: 'tests/.*\\.test\\.ts$',
  transform: {
    // Compile TypeScript sources with ts-jest using the package tsconfig.
    '^.+\\.ts$': ['ts-jest', { tsconfig: 'tsconfig.json' }],
  },
  testEnvironment: 'node',
  // Automatically reset mock state before every test.
  clearMocks: true,
  verbose: false,
};

export default config;
29 changes: 24 additions & 5 deletions apps/parser/package.json
Original file line number Diff line number Diff line change
@@ -1,10 +1,29 @@
{
  "name": "parser",
  "version": "1.0.0",
  "private": true,
  "scripts": {
    "clean": "shx rm -rf dist",
    "compile": "tsc --project tsconfig.json",
    "build": "npm run clean && tsc --project tsconfig.build.json",
    "parse": "npm run build && node dist/parser.js",
    "test": "npm run build && jest -i"
  },
  "dependencies": {
    "node-fetch": "^3.3.2",
    "puppeteer": "^24.37.1",
    "puppeteer-extra": "^3.3.6",
    "puppeteer-extra-plugin-stealth": "^2.11.2",
    "xml2js": "^0.6.2"
  },
  "devDependencies": {
    "@types/jest": "^29.5.1",
    "@types/node": "18.16.*",
    "@types/xml2js": "^0.4.11",
    "jest": "^29.5.0",
    "shx": "^0.3.4",
    "ts-jest": "^29.1.1",
    "tsx": "^4.7.0",
    "typescript": "5.1.6"
  }
}
37 changes: 37 additions & 0 deletions apps/parser/src/modules/config.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import path from 'node:path';
import { stripUrlDecorations } from '../utils/url';
import { sanitizeFilename } from '../utils/sanitizeFilename';

/** Resolved runtime configuration for a single parser run. */
export type EnvConfig = {
  // Root URL the crawl starts from (URL env var or DEFAULT_BASE_URL).
  readonly baseUrl: string;
  // baseUrl after stripUrlDecorations — used to derive the output directory.
  readonly sanitizedBaseUrl: string;
  // Directory where parsed page artifacts are written.
  readonly outputDirectory: string;
  // Maximum recursion depth for the crawl; never negative.
  readonly maxDepth: number;
};

// Fallback start URL when the URL env var is unset or blank.
const DEFAULT_BASE_URL = 'https://news.polymer-project.org/';
// Fallback recursion depth when DEPTH is unset or not parseable.
const DEFAULT_DEPTH = 2;

/**
 * Resolve parser configuration from environment variables.
 *
 * Reads URL, DEPTH, PARSER_VECTOR_INDEX_NAME and OUTDIR; every unset or
 * blank variable falls back to a sane default.
 *
 * @returns The fully-resolved {@link EnvConfig}.
 */
export function resolveEnv(): EnvConfig {
  // Fix: use the trimmed value itself. The original checked the trimmed
  // length but returned the untrimmed string, so stray whitespace in the
  // env var leaked into URL parsing.
  const rawUrl = process.env.URL?.trim();
  const baseUrl = rawUrl?.length ? rawUrl : DEFAULT_BASE_URL;
  const sanitizedBaseUrl = stripUrlDecorations(baseUrl);
  const parsedDepth = Number.parseInt(process.env.DEPTH ?? `${DEFAULT_DEPTH}`, 10);
  // Non-numeric DEPTH falls back to the default; negatives clamp to zero.
  const maxDepth = Number.isNaN(parsedDepth) ? DEFAULT_DEPTH : Math.max(parsedDepth, 0);
  const vectorIndexName = process.env.PARSER_VECTOR_INDEX_NAME?.trim();
  const derivedOutput = buildDefaultOutputDirectory(vectorIndexName, sanitizedBaseUrl);
  // Same trimming fix for OUTDIR: a whitespace-padded override would have
  // produced a directory name with leading/trailing spaces.
  const rawOutDir = process.env.OUTDIR?.trim();
  const outputDirectory = rawOutDir?.length ? rawOutDir : derivedOutput;
  return { baseUrl, sanitizedBaseUrl, outputDirectory, maxDepth };
}

/**
 * Derive the default directory for parsed artifacts.
 *
 * With a vector index name the layout is
 * `<vectorIndexName>/parsing/<sanitized base>`; without one, artifacts go
 * under a local `output/` folder.
 */
function buildDefaultOutputDirectory(
  vectorIndexName: string | undefined,
  sanitizedBaseUrl: string
): string {
  // Make the base-URL segment safe to use as a directory name.
  const baseSegment = sanitizeFilename(sanitizedBaseUrl, { replacement: '_' });
  return vectorIndexName
    ? path.join(vectorIndexName, 'parsing', baseSegment)
    : `output/${baseSegment}`;
}
142 changes: 142 additions & 0 deletions apps/parser/src/modules/crawler.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
import { Browser } from 'puppeteer';
import { ParseNode, ParseMetadata } from './types';
import { normalizeUrl } from '../utils/url';
import { expandInteractiveSections } from './domActions';

/**
 * Recursively crawl from `node`, parse each page, and build a link tree.
 *
 * Each page is parsed at most once (tracked in `parsedPages`, keyed by its
 * fragment-free normalized URL) and only while `depth <= maxDepth`. After a
 * page is parsed, its anchors and iframe sources are collected in-page and
 * the in-scope ones become `node.children`, which are crawled in order.
 *
 * @param browser      Shared Puppeteer browser instance.
 * @param node         Tree node for the current URL; mutated with metadata
 *                     and children.
 * @param depth        Current recursion depth (root call passes 0 or 1 —
 *                     set by the caller, not visible here).
 * @param maxDepth     Maximum depth to descend to.
 * @param parsedPages  Accumulator of visit-key -> metadata; also the
 *                     visited-set.
 * @param parsePageFn  Callback that parses one page; `null` abandons the
 *                     subtree.
 * @param baseOrigin   NOTE(review): accepted and forwarded to the recursive
 *                     call but never otherwise used in this body — confirm
 *                     whether it can be dropped.
 * @param baseScope    URL prefix that bounds the crawl (see isWithinScope).
 * @param baseHostToken Lowercase substring a URL must contain to qualify.
 */
export async function parsePages(
  browser: Browser,
  node: ParseNode,
  depth: number,
  maxDepth: number,
  parsedPages: Map<string, ParseMetadata>,
  parsePageFn: (browser: Browser, url: string) => Promise<ParseMetadata | null>,
  baseOrigin: string,
  baseScope: string,
  baseHostToken: string
): Promise<void> {
  // Skip already-parsed pages and anything beyond the depth budget.
  const visitKey = buildVisitKey(node.url);
  if (parsedPages.has(visitKey) || depth > maxDepth) {
    return;
  }

  // Enforce the crawl scope before doing any expensive work.
  const normalizedUrl = normalizeUrl(node.url);
  if (!isWithinScope(normalizedUrl, baseScope, baseHostToken)) {
    return;
  }

  // Delegate the actual parsing; a null result means the page could not be
  // parsed and the whole subtree is abandoned.
  const metadata = await parsePageFn(browser, node.url);
  if (!metadata) return;

  parsedPages.set(visitKey, metadata);
  // Copy the extracted metadata onto the tree node.
  node.title = metadata.title;
  node.bodyText = metadata.bodyText;
  node.lang = metadata.lang;
  node.keywords = metadata.keywords;
  node.datePublished = metadata.datePublished;
  node.lastModified = metadata.lastModified;

  // Open the page a second time to harvest outgoing links. Extraction is
  // best-effort: any navigation/evaluation failure yields an empty list.
  let page;
  let anchors: string[] = [];
  try {
    page = await browser.newPage();
    await page.goto(node.url, { waitUntil: 'networkidle2', timeout: 45000 });
    // Project helper — presumably clicks open accordions/menus so that
    // hidden links render; confirm in domActions.ts.
    await expandInteractiveSections(page);
    // Runs in the browser context: collect unique absolute http(s) URLs from
    // <a href> and <iframe src>, pre-filtered by the host token.
    anchors = await page.evaluate((allowedToken: string) => {
      const anchors = Array.from(document.querySelectorAll('a[href]'));
      const iframeSources = Array.from(document.querySelectorAll('iframe[src]'));
      const unique = new Set<string>();
      for (const anchor of anchors) {
        const href = (anchor as HTMLAnchorElement).href;
        if (!href || !href.startsWith('http')) continue;
        try {
          const target = new URL(href, window.location.href);
          const normalizedHref = target.href.toLowerCase();
          if (allowedToken && !normalizedHref.includes(allowedToken)) continue;
          // Ignore self-links.
          if (target.href === window.location.href) continue;
          unique.add(target.href);
        } catch (_) {}
      }

      for (const frame of iframeSources) {
        const src = (frame as HTMLIFrameElement).src;
        if (!src || !src.startsWith('http')) {
          continue;
        }
        try {
          const target = new URL(src, window.location.href);
          const normalizedSrc = target.href.toLowerCase();
          if (allowedToken && !normalizedSrc.includes(allowedToken)) continue;
          unique.add(target.href);
        } catch (_) {}
      }
      return Array.from(unique);
    }, baseHostToken) as string[];
  } catch (error) {
    // Ignore anchor extraction errors
  } finally {
    // Always release the tab, even when goto/evaluate failed.
    if (page) await page.close();
  }


  // Filter the harvested URLs down to unvisited, in-scope children.
  // `scheduled` de-duplicates within this page's own link list.
  const scheduled = new Set<string>();
  const nextChildren: ParseNode[] = [];
  for (const href of anchors) {
    const normalized = normalizeUrl(href);
    const visitCandidate = buildVisitKey(href);
    if (parsedPages.has(visitCandidate) || scheduled.has(visitCandidate)) continue;
    const lowerNormalized = normalized.toLowerCase();
    if (baseHostToken && !lowerNormalized.includes(baseHostToken)) {
      continue;
    }
    if (!isWithinScope(normalized, baseScope, baseHostToken)) {
      continue;
    }
    scheduled.add(visitCandidate);
    nextChildren.push({ url: href });
  }
  node.children = nextChildren;

  // NOTE(review): `!node.children` is always false here (nextChildren is an
  // array, and an empty array is truthy) — only the depth check is live.
  if (!node.children || depth >= maxDepth) return;
  // Crawl children sequentially (depth-first), sharing the visited map.
  for (const child of node.children) {
    await parsePages(
      browser,
      child,
      depth + 1,
      maxDepth,
      parsedPages,
      parsePageFn,
      baseOrigin,
      baseScope,
      baseHostToken
    );
  }
}

/**
 * Decide whether a URL belongs to the crawl scope.
 *
 * A URL qualifies when it contains `hostToken` (case-insensitively), or when
 * it equals `scope` exactly, or extends it at a genuine URL boundary
 * ('/', '?' or '#') so that e.g. "https://a.com" does not match
 * "https://a.comx". An empty scope with no host token accepts everything.
 */
function isWithinScope(url: string, scope: string, hostToken: string): boolean {
  const candidate = url.toLowerCase();
  // Host-token match short-circuits all prefix logic.
  if (hostToken && candidate.includes(hostToken)) return true;
  // No scope means no restriction.
  if (!scope) return true;
  const prefix = scope.toLowerCase();
  if (candidate === prefix) return true;
  if (!candidate.startsWith(prefix)) return false;
  // Accept only extensions at a path/query/fragment boundary.
  const boundary = candidate.charAt(prefix.length);
  return boundary === '/' || boundary === '?' || boundary === '#';
}

/**
 * Build the canonical de-duplication key for a URL: the normalized URL with
 * its fragment removed. Input that cannot be parsed as a URL is returned
 * verbatim so it can still serve as a (best-effort) key.
 */
export function buildVisitKey(rawUrl: string): string {
  try {
    const parsed = new URL(rawUrl);
    // Fragments never change the fetched document, so drop them.
    parsed.hash = '';
    return normalizeUrl(parsed.toString());
  } catch {
    return rawUrl;
  }
}
Loading
Loading