Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .changeset/wide-hairs-fail.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"parser": major
---

Add a parsing script to the parser app. Store parsed information locally. Sanitize URLs into a filesystem-compatible format for use as file names.
61 changes: 61 additions & 0 deletions .github/workflows/parser.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
name: Run Parser

on:
  workflow_dispatch:
    inputs:
      urls:
        description: 'URLs to parse (one per line)'
        required: true
        type: string
      environment:
        description: 'The environment used as target'
        type: choice
        required: true
        default: dev
        options:
          - dev
          - uat
          - prod
      parser_vector_index_name:
        description: 'Parser Vector Index Name (optional, overrides secret)'
        required: false
        type: string

permissions:
  id-token: write
  contents: read

jobs:
  run-parser:
    name: Run Parser Script (manual on ${{ inputs.environment }})
    runs-on: ubuntu-24.04
    environment: ${{ inputs.environment }}

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: '22'
          cache: 'npm'
          # The lockfile lives inside the workspace, not at the repo root;
          # without this the npm cache lookup cannot find a lock file.
          cache-dependency-path: apps/parser/package-lock.json

      - name: Install dependencies
        working-directory: apps/parser
        run: npm ci

      - name: Run Parser for each URL
        env:
          PARSER_VECTOR_INDEX_NAME: ${{ inputs.parser_vector_index_name || secrets.PARSER_VECTOR_INDEX_NAME }}
          # Pass the user-supplied URL list through an environment variable
          # instead of interpolating ${{ inputs.urls }} directly into the
          # script body, which would allow shell injection via the input.
          URLS: ${{ inputs.urls }}
        working-directory: apps/parser
        run: |
          # Loop through each URL sequentially
          while IFS= read -r url; do
            if [ -n "$url" ]; then
              echo "=== Parsing: $url ==="
              URL="$url" npx tsx src/parser.ts
              echo "=== Completed: $url ==="
              echo ""
            fi
          done <<< "$URLS"
13 changes: 13 additions & 0 deletions apps/parser/.env.default
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Default environment variables for the parser CLI
# Root URL to start parsing from
URL="https://example.com"

# Maximum recursion depth (integer)
DEPTH=2

# Name of the vector index bucket/folder where parsed artifacts are stored
PARSER_VECTOR_INDEX_NAME="parser-vector-index-name"

# Optional absolute/relative directory override. Leave empty to use
# <PARSER_VECTOR_INDEX_NAME>/parsing/<sanitized(baseUrl)> automatically.
OUTDIR=""
40 changes: 40 additions & 0 deletions apps/parser/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
## Parser utilities

This package contains the following TypeScript CLI utility:

- `parser` — recursively visits a website, extracts structured page metadata, and saves each page under `<PARSER_VECTOR_INDEX_NAME>/parsing/<sanitized(baseUrl)>/`.


### Getting started

```bash
npm install
npm run build
```

### Parse a website

```bash
URL=https://example.com DEPTH=2 npm run parse
```

Environment variables:

- `URL` (required): root page for the parse.
- `DEPTH` (optional, default `2`): max depth for recursion.
- `PARSER_VECTOR_INDEX_NAME` (optional): base directory name where parsed data will be stored as `<PARSER_VECTOR_INDEX_NAME>/parsing/<sanitized(baseUrl)>/`.
- `OUTDIR` (optional): fully override the destination directory.
- If neither `PARSER_VECTOR_INDEX_NAME` nor `OUTDIR` is provided, the output directory defaults to `output/<sanitized(baseUrl)>/`.

`<sanitized(baseUrl)>` and `<sanitized(path)>` refer to filesystem-safe versions of the URL components (illegal characters replaced with `_`), ensuring predictable, human-readable filenames.

Each visited page is stored as `<PARSER_VECTOR_INDEX_NAME>/parsing/<sanitized(baseUrl)>/<sanitized(path)>.json` (or under `OUTDIR` if specified) with normalized metadata, making it easy to diff between runs.


### Tests

```bash
npm test
```

Tests compile the project before executing Jest to ensure the CLI behaves exactly like the production build.
14 changes: 14 additions & 0 deletions apps/parser/jest.config.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import type { Config } from 'jest';

/**
 * Jest configuration for the parser package.
 * Tests live under tests/ and are transpiled on the fly with ts-jest.
 */
const config: Config = {
  // Resolve all relative Jest paths against this package directory.
  rootDir: __dirname,
  // Only pick up *.test.ts files inside the tests/ folder.
  testRegex: 'tests/.*\\.test\\.ts$',
  transform: {
    // Compile TypeScript sources with ts-jest using the package tsconfig.
    '^.+\\.ts$': ['ts-jest', { tsconfig: 'tsconfig.json' }],
  },
  testEnvironment: 'node',
  // Automatically reset mock state before every test.
  clearMocks: true,
  verbose: false,
};

export default config;
29 changes: 24 additions & 5 deletions apps/parser/package.json
Original file line number Diff line number Diff line change
@@ -1,10 +1,29 @@
{
  "name": "parser",
  "version": "1.0.0",
  "private": true,
  "scripts": {
    "clean": "shx rm -rf dist",
    "compile": "tsc --project tsconfig.json",
    "build": "npm run clean && tsc --project tsconfig.build.json",
    "parse": "npm run build && node dist/parser.js",
    "test": "npm run build && jest -i"
  },
  "dependencies": {
    "node-fetch": "^3.3.2",
    "puppeteer": "^24.37.1",
    "puppeteer-extra": "^3.3.6",
    "puppeteer-extra-plugin-stealth": "^2.11.2",
    "xml2js": "^0.6.2"
  },
  "devDependencies": {
    "@types/jest": "^29.5.1",
    "@types/node": "18.16.*",
    "@types/xml2js": "^0.4.11",
    "jest": "^29.5.0",
    "shx": "^0.3.4",
    "ts-jest": "^29.1.1",
    "tsx": "^4.7.0",
    "typescript": "5.1.6"
  }
}
37 changes: 37 additions & 0 deletions apps/parser/src/modules/config.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import path from 'node:path';
import { stripUrlDecorations } from '../utils/url';
import { sanitizeFilename } from '../utils/sanitizeFilename';

/** Resolved runtime configuration for a single parser run. */
export type EnvConfig = {
  // Root URL the crawl starts from (URL env var or DEFAULT_BASE_URL).
  readonly baseUrl: string;
  // baseUrl after stripUrlDecorations — used to derive the output directory.
  readonly sanitizedBaseUrl: string;
  // Directory where parsed page artifacts are written.
  readonly outputDirectory: string;
  // Maximum recursion depth for the crawl; never negative.
  readonly maxDepth: number;
};

// Fallback start URL when the URL env var is unset or blank.
const DEFAULT_BASE_URL = 'https://news.polymer-project.org/';
// Fallback recursion depth when DEPTH is unset or not parseable.
const DEFAULT_DEPTH = 2;

/**
 * Resolve parser configuration from environment variables.
 *
 * Reads URL, DEPTH, PARSER_VECTOR_INDEX_NAME and OUTDIR; every unset or
 * blank variable falls back to a sane default.
 *
 * @returns The fully-resolved {@link EnvConfig}.
 */
export function resolveEnv(): EnvConfig {
  // Fix: use the trimmed value itself. The original checked the trimmed
  // length but returned the untrimmed string, so stray whitespace in the
  // env var leaked into URL parsing.
  const rawUrl = process.env.URL?.trim();
  const baseUrl = rawUrl?.length ? rawUrl : DEFAULT_BASE_URL;
  const sanitizedBaseUrl = stripUrlDecorations(baseUrl);
  const parsedDepth = Number.parseInt(process.env.DEPTH ?? `${DEFAULT_DEPTH}`, 10);
  // Non-numeric DEPTH falls back to the default; negatives clamp to zero.
  const maxDepth = Number.isNaN(parsedDepth) ? DEFAULT_DEPTH : Math.max(parsedDepth, 0);
  const vectorIndexName = process.env.PARSER_VECTOR_INDEX_NAME?.trim();
  const derivedOutput = buildDefaultOutputDirectory(vectorIndexName, sanitizedBaseUrl);
  // Same trimming fix for OUTDIR: a whitespace-padded override would have
  // produced a directory name with leading/trailing spaces.
  const rawOutDir = process.env.OUTDIR?.trim();
  const outputDirectory = rawOutDir?.length ? rawOutDir : derivedOutput;
  return { baseUrl, sanitizedBaseUrl, outputDirectory, maxDepth };
}

/**
 * Derive the default directory for parsed artifacts.
 *
 * With a vector index name the layout is
 * `<vectorIndexName>/parsing/<sanitized base>`; without one, artifacts go
 * under a local `output/` folder.
 */
function buildDefaultOutputDirectory(
  vectorIndexName: string | undefined,
  sanitizedBaseUrl: string
): string {
  // Make the base-URL segment safe to use as a directory name.
  const baseSegment = sanitizeFilename(sanitizedBaseUrl, { replacement: '_' });
  return vectorIndexName
    ? path.join(vectorIndexName, 'parsing', baseSegment)
    : `output/${baseSegment}`;
}
142 changes: 142 additions & 0 deletions apps/parser/src/modules/crawler.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
import { Browser } from 'puppeteer';
import { ParseNode, ParseMetadata } from './types';
import { normalizeUrl } from '../utils/url';
import { expandInteractiveSections } from './domActions';

/**
 * Recursively crawl from `node`, parse each page, and build a link tree.
 *
 * Each page is parsed at most once (tracked in `parsedPages`, keyed by its
 * fragment-free normalized URL) and only while `depth <= maxDepth`. After a
 * page is parsed, its anchors and iframe sources are collected in-page and
 * the in-scope ones become `node.children`, which are crawled in order.
 *
 * @param browser      Shared Puppeteer browser instance.
 * @param node         Tree node for the current URL; mutated with metadata
 *                     and children.
 * @param depth        Current recursion depth (root call passes 0 or 1 —
 *                     set by the caller, not visible here).
 * @param maxDepth     Maximum depth to descend to.
 * @param parsedPages  Accumulator of visit-key -> metadata; also the
 *                     visited-set.
 * @param parsePageFn  Callback that parses one page; `null` abandons the
 *                     subtree.
 * @param baseOrigin   NOTE(review): accepted and forwarded to the recursive
 *                     call but never otherwise used in this body — confirm
 *                     whether it can be dropped.
 * @param baseScope    URL prefix that bounds the crawl (see isWithinScope).
 * @param baseHostToken Lowercase substring a URL must contain to qualify.
 */
export async function parsePages(
  browser: Browser,
  node: ParseNode,
  depth: number,
  maxDepth: number,
  parsedPages: Map<string, ParseMetadata>,
  parsePageFn: (browser: Browser, url: string) => Promise<ParseMetadata | null>,
  baseOrigin: string,
  baseScope: string,
  baseHostToken: string
): Promise<void> {
  // Skip already-parsed pages and anything beyond the depth budget.
  const visitKey = buildVisitKey(node.url);
  if (parsedPages.has(visitKey) || depth > maxDepth) {
    return;
  }

  // Enforce the crawl scope before doing any expensive work.
  const normalizedUrl = normalizeUrl(node.url);
  if (!isWithinScope(normalizedUrl, baseScope, baseHostToken)) {
    return;
  }

  // Delegate the actual parsing; a null result means the page could not be
  // parsed and the whole subtree is abandoned.
  const metadata = await parsePageFn(browser, node.url);
  if (!metadata) return;

  parsedPages.set(visitKey, metadata);
  // Copy the extracted metadata onto the tree node.
  node.title = metadata.title;
  node.bodyText = metadata.bodyText;
  node.lang = metadata.lang;
  node.keywords = metadata.keywords;
  node.datePublished = metadata.datePublished;
  node.lastModified = metadata.lastModified;

  // Open the page a second time to harvest outgoing links. Extraction is
  // best-effort: any navigation/evaluation failure yields an empty list.
  let page;
  let anchors: string[] = [];
  try {
    page = await browser.newPage();
    await page.goto(node.url, { waitUntil: 'networkidle2', timeout: 45000 });
    // Project helper — presumably clicks open accordions/menus so that
    // hidden links render; confirm in domActions.ts.
    await expandInteractiveSections(page);
    // Runs in the browser context: collect unique absolute http(s) URLs from
    // <a href> and <iframe src>, pre-filtered by the host token.
    anchors = await page.evaluate((allowedToken: string) => {
      const anchors = Array.from(document.querySelectorAll('a[href]'));
      const iframeSources = Array.from(document.querySelectorAll('iframe[src]'));
      const unique = new Set<string>();
      for (const anchor of anchors) {
        const href = (anchor as HTMLAnchorElement).href;
        if (!href || !href.startsWith('http')) continue;
        try {
          const target = new URL(href, window.location.href);
          const normalizedHref = target.href.toLowerCase();
          if (allowedToken && !normalizedHref.includes(allowedToken)) continue;
          // Ignore self-links.
          if (target.href === window.location.href) continue;
          unique.add(target.href);
        } catch (_) {}
      }

      for (const frame of iframeSources) {
        const src = (frame as HTMLIFrameElement).src;
        if (!src || !src.startsWith('http')) {
          continue;
        }
        try {
          const target = new URL(src, window.location.href);
          const normalizedSrc = target.href.toLowerCase();
          if (allowedToken && !normalizedSrc.includes(allowedToken)) continue;
          unique.add(target.href);
        } catch (_) {}
      }
      return Array.from(unique);
    }, baseHostToken) as string[];
  } catch (error) {
    // Ignore anchor extraction errors
  } finally {
    // Always release the tab, even when goto/evaluate failed.
    if (page) await page.close();
  }


  // Filter the harvested URLs down to unvisited, in-scope children.
  // `scheduled` de-duplicates within this page's own link list.
  const scheduled = new Set<string>();
  const nextChildren: ParseNode[] = [];
  for (const href of anchors) {
    const normalized = normalizeUrl(href);
    const visitCandidate = buildVisitKey(href);
    if (parsedPages.has(visitCandidate) || scheduled.has(visitCandidate)) continue;
    const lowerNormalized = normalized.toLowerCase();
    if (baseHostToken && !lowerNormalized.includes(baseHostToken)) {
      continue;
    }
    if (!isWithinScope(normalized, baseScope, baseHostToken)) {
      continue;
    }
    scheduled.add(visitCandidate);
    nextChildren.push({ url: href });
  }
  node.children = nextChildren;

  // NOTE(review): `!node.children` is always false here (nextChildren is an
  // array, and an empty array is truthy) — only the depth check is live.
  if (!node.children || depth >= maxDepth) return;
  // Crawl children sequentially (depth-first), sharing the visited map.
  for (const child of node.children) {
    await parsePages(
      browser,
      child,
      depth + 1,
      maxDepth,
      parsedPages,
      parsePageFn,
      baseOrigin,
      baseScope,
      baseHostToken
    );
  }
}

/**
 * Decide whether a URL belongs to the crawl scope.
 *
 * A URL qualifies when it contains `hostToken` (case-insensitively), or when
 * it equals `scope` exactly, or extends it at a genuine URL boundary
 * ('/', '?' or '#') so that e.g. "https://a.com" does not match
 * "https://a.comx". An empty scope with no host token accepts everything.
 */
function isWithinScope(url: string, scope: string, hostToken: string): boolean {
  const candidate = url.toLowerCase();
  // Host-token match short-circuits all prefix logic.
  if (hostToken && candidate.includes(hostToken)) return true;
  // No scope means no restriction.
  if (!scope) return true;
  const prefix = scope.toLowerCase();
  if (candidate === prefix) return true;
  if (!candidate.startsWith(prefix)) return false;
  // Accept only extensions at a path/query/fragment boundary.
  const boundary = candidate.charAt(prefix.length);
  return boundary === '/' || boundary === '?' || boundary === '#';
}

/**
 * Build the canonical de-duplication key for a URL: the normalized URL with
 * its fragment removed. Input that cannot be parsed as a URL is returned
 * verbatim so it can still serve as a (best-effort) key.
 */
export function buildVisitKey(rawUrl: string): string {
  try {
    const parsed = new URL(rawUrl);
    // Fragments never change the fetched document, so drop them.
    parsed.hash = '';
    return normalizeUrl(parsed.toString());
  } catch {
    return rawUrl;
  }
}
Loading
Loading