38 commits
- dbcf818 (anemone008, Feb 9, 2026): Add script to perform parsing. Store output locally. Sanitize urls fo…
- 4354cd5 (anemone008, Feb 9, 2026): Add changeset
- a9540a3 (anemone008, Feb 9, 2026): Fix fetch. Set headless true
- de11fe0 (anemone008, Feb 10, 2026): Replace timeout with AbortController + setTimeout
- 3daa8c9 (anemone008, Feb 10, 2026): Add filter on ariaExpanded to avoid collapse of already expanded content
- a1ea8f0 (anemone008, Feb 10, 2026): Update apps/parser/src/modules/output.ts
- 4018873 (anemone008, Feb 10, 2026): Drop the hash from the visit key to avoid redundant navigation/scrapi…
- 3feea1b (anemone008, Feb 10, 2026): Merge branch 'CAI-749-parser-url-crawler' of https://github.com/pagop…
- 26236f0 (anemone008, Feb 10, 2026): Update README.md
- 9afad68 (anemone008, Feb 10, 2026): Add test on filename sanitization
- d0b0cc6 (anemone008, Feb 10, 2026): Add short hash when filename is longer than 255 chars for URL filenam…
- 31c4bad (anemone008, Feb 10, 2026): Update apps/parser/tests/parser.error-handling.test.ts
- fca22ae (anemone008, Feb 10, 2026): Rename VECTOR_INDEX_NAME to PARSER_VECTOR_INDEX_NAME
- 748684e (anemone008, Feb 10, 2026): Refactor deriveSubPath to use base URL for path derivation
- b403294 (anemone008, Feb 11, 2026): Rename vector_index variable, remove alternative output dir, set sing…
- ccdc78a (anemone008, Feb 11, 2026): Merge remote-tracking branch 'origin/main' into CAI-749-parser-url-cr…
- eff7c31 (anemone008, Feb 11, 2026): Update README and config for improved environment variable handling a…
- e53dfa7 (anemone008, Feb 11, 2026): Update apps/parser/README.md
- f1ac135 (anemone008, Feb 11, 2026): Refactor URL handling and filename sanitization into helpers
- 1be8fc9 (anemone008, Feb 11, 2026): Move assertReachable function to module network.ts
- d2a476a (anemone008, Feb 11, 2026): Rename buildDefaultOutputDirectory to generateOutputDirectoryPath
- bb35a84 (anemone008, Feb 11, 2026): Merge remote-tracking branch 'origin/main' into CAI-749-parser-url-cr…
- f94b3a6 (anemone008, Feb 11, 2026): Add error logging for failed anchor href parsing in parsePages function
- 58ec25e (anemone008, Feb 11, 2026): Add warning logs for iframe src parsing and anchor extraction failures
- e291067 (anemone008, Feb 11, 2026): Improve error handling in UrlWithoutAnchors
- 9de4eb4 (anemone008, Feb 12, 2026): Move FILENAME_LENGTH_THRESHOLD constant at the start of the file
- 462c5e6 (anemone008, Feb 12, 2026): Update README for type-check and compile instructions
- c8b1cab (anemone008, Feb 12, 2026): Refactor code for consistency and readability; update imports, format…
- 0c19cbb (anemone008, Feb 13, 2026): Add progress log in recursive parsePages function
- 286325d (anemone008, Feb 13, 2026): Enhance URL handling and configuration:
- 985363d (anemone008, Feb 13, 2026): Add TODO in isWithinScope
- eb5e964 (anemone008, Feb 13, 2026): Rename parsePages to exploreAndParsePages. Expand sanitizeUrlAsFilena…
- 416ef96 (anemone008, Feb 13, 2026): Add warnings for missing or invalid URL replacements; rename function…
- 5d24926 (anemone008, Feb 13, 2026): Refactor parser module: clean main function, rename crawler to parser…
- cfd4ffe (anemone008, Feb 13, 2026): Formatting
- 923aa16 (anemone008, Feb 13, 2026): Merge remote-tracking branch 'origin/main' into CAI-749-parser-url-cr…
- 755fb6b (anemone008, Feb 13, 2026): Save root page metadata as index.json
- 2a1b9a0 (anemone008, Feb 13, 2026): Refactor parser configuration: update maxDepth to allow null for unli…
5 changes: 5 additions & 0 deletions .changeset/wide-hairs-fail.md
@@ -0,0 +1,5 @@
---
"parser": major
---

Add a script to the parser app that performs parsing and stores the parsed information locally. Sanitize URLs into a filesystem-compatible format for use as file names.
9 changes: 9 additions & 0 deletions apps/parser/.env.default
@@ -0,0 +1,9 @@
# Default environment variables for the parser CLI
# Root URL to start parsing from
URL="https://example.com"

# Maximum recursion depth (integer or null for unlimited)
DEPTH=null

# Name of the vector index bucket/folder where parsed artifacts are stored
CHB_INDEX_ID="parser-vector-index-name"
92 changes: 92 additions & 0 deletions apps/parser/README.md
@@ -0,0 +1,92 @@

# Parser Utilities

This package provides a TypeScript CLI tool for recursively crawling a website, extracting structured metadata from each page, and saving the results in a predictable directory structure.

## Features

- **Recursive website parsing**: Visits all reachable pages up to a configurable depth.
- **Structured output**: Saves each page's metadata as a JSON file.
- **Configurable via environment variables or .env file**.

---

## Getting Started

1. **Install dependencies:**
```bash
npm install
```
2. **Type-check & compile:**
```bash
npm run compile
```
3. **Build the project:**
```bash
npm run build
```

---

## Usage

### 1. Configure Environment Variables

You can provide configuration in two ways:

#### a) Using a `.env` file (recommended)

Create a `.env` file in the `apps/parser` directory with the following content:

```
URL=https://example.com
CHB_INDEX_ID=name_of_your_choice
# DEPTH=2 # Optional, defaults to null
```

#### b) Using command line variables

```bash
URL=https://example.com DEPTH=2 CHB_INDEX_ID=name_of_your_choice npm run parse
```

### 2. Run the Parser

```bash
npm run parse
```

---

## Environment Variables

- **`URL`** (required): The root page to start parsing from.
- **`CHB_INDEX_ID`** (required): The base directory for storing parsed data. Output will be saved as `<CHB_INDEX_ID>/parsing/<sanitized(baseUrl)>/`.
- **`DEPTH`** (optional, default: `null`): Maximum recursion depth for crawling links. Set an integer to bound the crawl; `null` (or unset) means unlimited depth.

**Note:** The parser will first look for these variables in the environment. If not found, it will load them from `.env` in the `apps/parser` directory.
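
The resolution order above can be sketched as follows. This is an illustrative, minimal version; the names `ParserConfig` and `resolveConfig` are hypothetical and the app's actual config module may differ (e.g. in how it loads the `.env` fallback).

```typescript
// Hypothetical sketch of env-variable resolution; not the app's actual module.
type ParserConfig = {
  url: string;
  indexId: string;
  maxDepth: number | null; // null means unlimited recursion depth
};

function resolveConfig(env: Record<string, string | undefined>): ParserConfig {
  const url = env.URL;
  const indexId = env.CHB_INDEX_ID;
  if (!url || !indexId) {
    throw new Error("URL and CHB_INDEX_ID are required");
  }
  // DEPTH is optional: absent or the literal string "null" means unlimited.
  const rawDepth = env.DEPTH;
  const maxDepth =
    rawDepth === undefined || rawDepth === "null" ? null : Number(rawDepth);
  if (maxDepth !== null && !Number.isInteger(maxDepth)) {
    throw new Error(`DEPTH must be an integer or null, got "${rawDepth}"`);
  }
  return { url, indexId, maxDepth };
}
```

In practice the function would be called with `process.env`, after the `.env` file (if any) has been merged in.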

---

## Output Structure

Each visited page is saved as a JSON file:

```
<CHB_INDEX_ID>/parsing/<sanitized(baseUrl)>/<sanitized(path)>.json
```

- `<sanitized(baseUrl)>` and `<sanitized(path)>` are filesystem-safe versions of the URL components (illegal characters replaced with `-`).
- This structure ensures output is predictable, easy to diff, and human-readable.
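
A minimal sketch of the sanitization rule described above (illegal characters replaced with `-`); the package's own helper may differ in which characters it allows and in how it shortens over-long names:

```typescript
// Illustrative only: replace characters that are unsafe in file names on
// common filesystems with "-", keeping letters, digits, ".", "_" and "-".
function sanitizeAsFilename(urlPart: string): string {
  return urlPart.replace(/[^a-zA-Z0-9._-]/g, "-");
}

// e.g. sanitizeAsFilename("https://example.com") -> "https---example.com"
```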

---

## Testing

Run tests with:

```bash
npm run test
```

Tests will compile the project and then execute Jest to ensure the CLI behaves as expected.
14 changes: 14 additions & 0 deletions apps/parser/jest.config.ts
@@ -0,0 +1,14 @@
import type { Config } from "jest";

const config: Config = {
  rootDir: __dirname,
  testRegex: "tests/.*\\.test\\.ts$",
  transform: {
    "^.+\\.ts$": ["ts-jest", { tsconfig: "tsconfig.json" }],
  },
  testEnvironment: "node",
  clearMocks: true,
  verbose: false,
};

export default config;
26 changes: 22 additions & 4 deletions apps/parser/package.json
@@ -1,10 +1,28 @@
 {
   "name": "parser",
-  "version": "0.1.0",
+  "version": "1.0.0",
   "private": true,
-  "scripts": {},
+  "scripts": {
+    "clean": "shx rm -rf dist",
+    "compile": "tsc --project tsconfig.json",
+    "build": "npm run clean && tsc --project tsconfig.build.json",
+    "parse": "npm run build && node dist/main.js",
+    "test": "npm run build && jest -i"
+  },
   "dependencies": {
-    "puppeteer": "^24.37.1"
+    "node-fetch": "^3.3.2",
+    "puppeteer": "^24.37.1",
+    "puppeteer-extra": "^3.3.6",
+    "puppeteer-extra-plugin-stealth": "^2.11.2",
+    "xml2js": "^0.6.2"
   },
+  "devDependencies": {
+    "@types/jest": "^29.5.1",
+    "@types/node": "18.16.*",
+    "@types/xml2js": "^0.4.11",
+    "jest": "^29.5.0",
+    "shx": "^0.3.4",
+    "ts-jest": "^29.1.1",
+    "typescript": "5.1.6"
+  }
 }

7 changes: 7 additions & 0 deletions apps/parser/src/helpers/date-format.ts
@@ -0,0 +1,7 @@
export function toIsoOrNull(value: string | null): string | null {
  if (!value) {
    return null;
  }
  const date = new Date(value);
  return Number.isNaN(date.getTime()) ? null : date.toISOString();
}
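
`toIsoOrNull` returns a normalized ISO-8601 string for any value that `Date` can parse, and `null` otherwise. A quick usage example (the function is restated so the snippet is self-contained):

```typescript
// Same helper as in date-format.ts, repeated so this example runs standalone.
function toIsoOrNull(value: string | null): string | null {
  if (!value) {
    return null;
  }
  const date = new Date(value);
  return Number.isNaN(date.getTime()) ? null : date.toISOString();
}

console.log(toIsoOrNull("2026-02-11T00:00:00Z")); // "2026-02-11T00:00:00.000Z"
console.log(toIsoOrNull("not a date")); // null
console.log(toIsoOrNull(null)); // null
```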
66 changes: 66 additions & 0 deletions apps/parser/src/helpers/metadata-handling.ts
@@ -0,0 +1,66 @@
import { ParsedMetadata } from "../modules/types";
import { toIsoOrNull } from "./date-format";

export const extractDocumentMetadata = (): ParsedMetadata => {
  const getMeta = (name: string): string | null => {
    return (
      document.querySelector(`meta[name="${name}"]`)?.getAttribute("content") ||
      document
        .querySelector(`meta[property="${name}"]`)
        ?.getAttribute("content") ||
      null
    );
  };
  const metaTitle = getMeta("og:title") || getMeta("twitter:title");
  const documentTitle = document.title?.trim();
  const normalizedTitle = documentTitle?.length
    ? documentTitle
    : metaTitle || "";
  const normalizeText = (value: string | null | undefined): string => {
    return value ? value.replace(/\s+/g, " ").trim() : "";
  };
  const mainText = normalizeText(document.querySelector("main")?.innerText);
  const iframeTexts = Array.from(document.querySelectorAll("iframe"))
    .map((frame) => {
      try {
        return normalizeText(frame.contentDocument?.body?.innerText ?? "");
      } catch (_error) {
        return "";
      }
    })
    .filter((text) => text.length > 0);
  const prioritizedTextParts = [mainText, ...iframeTexts].filter(
    (text) => text.length > 0,
  );
  const prioritizedText = prioritizedTextParts.join("\n\n").trim();
  const fallbackBody = normalizeText(document.body?.innerText ?? "");
  const bodyText =
    prioritizedText.length >= 120 ? prioritizedText : fallbackBody;
  return {
    title: normalizedTitle,
    url: window.location.href,
    bodyText,
    lang: document.documentElement.lang || getMeta("og:locale") || null,
    keywords: getMeta("keywords") || getMeta("news_keywords"),
    datePublished:
      getMeta("article:published_time") ||
      getMeta("date") ||
      getMeta("publish-date"),
    lastModified:
      document.lastModified !== "01/01/1970 00:00:00"
        ? document.lastModified
        : getMeta("article:modified_time"),
  };
};

export function serializeMetadata(raw: ParsedMetadata): ParsedMetadata {
  return {
    url: raw.url,
    title: raw.title,
    bodyText: raw.bodyText,
    lang: raw.lang,
    keywords: raw.keywords,
    datePublished: toIsoOrNull(raw.datePublished),
    lastModified: toIsoOrNull(raw.lastModified),
  };
}
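
`extractDocumentMetadata` is written to run inside the browser context (it touches `document` and `window`), while `serializeMetadata` normalizes the raw date strings afterwards. A self-contained sketch of that second step, with `ParsedMetadata` inlined to match the shape above and the two functions restated (the spread form is a shorthand, not the repository's exact code):

```typescript
// Inlined to match the shape used by the parser's modules/types.
type ParsedMetadata = {
  url: string;
  title: string;
  bodyText: string;
  lang: string | null;
  keywords: string | null;
  datePublished: string | null;
  lastModified: string | null;
};

function toIsoOrNull(value: string | null): string | null {
  if (!value) return null;
  const date = new Date(value);
  return Number.isNaN(date.getTime()) ? null : date.toISOString();
}

// Copies the raw metadata, normalizing only the two date fields.
function serializeMetadata(raw: ParsedMetadata): ParsedMetadata {
  return {
    ...raw,
    datePublished: toIsoOrNull(raw.datePublished),
    lastModified: toIsoOrNull(raw.lastModified),
  };
}

const serialized = serializeMetadata({
  url: "https://example.com/",
  title: "Example",
  bodyText: "Hello",
  lang: "en",
  keywords: null,
  datePublished: "2026-02-13",
  lastModified: "garbage",
});
// datePublished becomes "2026-02-13T00:00:00.000Z"; the unparseable
// lastModified becomes null.
```

In the real pipeline the raw object would come from something like Puppeteer's `page.evaluate(extractDocumentMetadata)` before being passed to `serializeMetadata` and written to disk.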