2 changes: 2 additions & 0 deletions .gitignore
Expand Up @@ -45,3 +45,5 @@ loadtest/locust/
scripts/load/hammer_endpoints_sequential.js
data/scheme-size*
notes/
archive-processor/downloaded-rdf
archive-processor/local-kms-csv
67 changes: 67 additions & 0 deletions archive-processor/README.md
@@ -0,0 +1,67 @@
"# Archive Processor Scripts

This directory contains a set of Node.js scripts designed to manage the lifecycle of KMS data, from downloading RDF files from an S3 bucket to processing them and uploading the resulting CSV files back to S3.

## Scripts

- `download-rdf-from-S3.js`: Downloads versioned RDF files from the configured S3 bucket into the local `archive-processor/downloaded-rdf` directory.
- `process-rdf.js`: Orchestrates the main processing workflow. For each downloaded version, it loads the RDF data into an RDF4J repository and then triggers a local KMS application to generate the corresponding CSV files in the `archive-processor/local-kms-csv` directory.
- `upload-csv-to-S3.js`: Uploads the generated CSV files from the `archive-processor/local-kms-csv` directory to the configured S3 bucket.

## Configuration

All scripts are configured via a single, centralized shell script:

`archive-processor/scripts/scripts-config.sh`

The settings in this file are loaded automatically when you use the `npm run` commands for these scripts.

This file allows you to set:
- The S3 bucket name and AWS region.
- Your AWS profile.
- Delays between S3 API calls to prevent rate-limiting.
- Specific lists of versions to download, upload, or process.
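
The exact contents of the file are up to you; a minimal sketch, assuming the standard `export` form and the defaults baked into the scripts, might look like this:

```shell
# Illustrative values; adjust for your environment
export S3_BUCKET_NAME="kms-rdf-backup-sit"
export AWS_REGION="us-east-1"
export AWS_PROFILE="my-profile"   # hypothetical profile name
export DOWNLOAD_DELAY_MS="100"
```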

## Recommended Workflow

The scripts are designed to be run in a specific sequence. Ensure all prerequisites are met before starting.

### Prerequisites

- **Node.js and Dependencies**: You must have Node.js installed and have run `npm install` from the project root.
- **AWS Credentials**: Your AWS credentials must be configured locally, typically via the `~/.aws/credentials` file. Ensure you have access to the target S3 bucket.
- **Running Services**: The `process-rdf` script requires running instances of **RDF4J** and **Redis** that are accessible to the script. The default URLs are `http://127.0.0.1:8081` for RDF4J and `redis://localhost:6380` for Redis.
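
Before proceeding, you can sanity-check that both services are reachable (assuming the default URLs above and that `curl` and `redis-cli` are installed):

```shell
# Any HTTP response means RDF4J is up; the exact status code depends on your deployment
curl -s -o /dev/null -w "RDF4J HTTP status: %{http_code}\n" http://127.0.0.1:8081
# Redis should answer PONG
redis-cli -p 6380 ping
```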

### Step 1: Configure Your Environment

1. Open `archive-processor/scripts/scripts-config.sh` in your editor.
2. Set the `S3_BUCKET_NAME` and `AWS_PROFILE` if they differ from the defaults.
3. Optionally, specify which versions you want to work with using the `TO_BE_DOWNLOADED_VERSIONS`, `TO_BE_PROCESSED_VERSIONS`, and `TO_BE_UPLOADED_VERSIONS` variables. If you leave them empty, the scripts will process all available versions.
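
For example, to restrict a download run to two specific versions (names follow the S3 key-prefix convention documented in the download script):

```shell
export TO_BE_DOWNLOADED_VERSIONS="10.0,KMS-654-Testing"
```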

### Step 2: Download RDF Files from S3

This step populates the `archive-processor/downloaded-rdf` directory with the master RDF files from S3.

```shell
npm run download-rdf
```
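
You can confirm the result by listing the output directory; each version is saved as a flat `<version>.rdf.xml` file:

```shell
ls archive-processor/downloaded-rdf
# e.g. 10.0.rdf.xml  KMS-654-Testing.rdf.xml
```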

### Step 3: Process RDFs and Generate CSVs

This is the main processing step. It reads the downloaded RDF files, loads them into RDF4J, and then uses a local application endpoint to generate CSV files in the `archive-processor/local-kms-csv` directory.

**Important**: Ensure your RDF4J and Redis services are running before executing this command.

```shell
npm run process-rdf
```

### Step 4: Upload Generated CSVs to S3

This final step takes the newly created CSV files from the `local-kms-csv` directory and uploads them to the correct versioned path in your S3 bucket.

```shell
npm run upload-csv
```
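
To spot-check the upload afterwards, you can list a versioned prefix with the AWS CLI (version name illustrative; the exact key layout depends on your bucket):

```shell
aws s3 ls "s3://${S3_BUCKET_NAME}/10.0/" --profile "${AWS_PROFILE}"
```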

By following these steps, you can perform a full download-process-upload cycle for your KMS data.
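
If your configuration is already in place, the full cycle can also be chained in a single line:

```shell
npm run download-rdf && npm run process-rdf && npm run upload-csv
```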
239 changes: 239 additions & 0 deletions archive-processor/schemes-rdf/schemes_published.rdf

Large diffs are not rendered by default.

302 changes: 302 additions & 0 deletions archive-processor/scripts/download-rdf-from-S3.js
@@ -0,0 +1,302 @@
import { createWriteStream, mkdirSync } from 'fs'
import { dirname, join } from 'path'
import { pipeline } from 'stream/promises'
import { fileURLToPath } from 'url'

import { GetObjectCommand, ListObjectsV2Command } from '@aws-sdk/client-s3'

import { getS3Client } from '@/shared/awsClients'

const scriptPath = fileURLToPath(import.meta.url)
const scriptDir = dirname(scriptPath)

/**
* AWS Region for the S3 bucket
* @type {string}
*/
const region = process.env.AWS_REGION || 'us-east-1'

/**
* S3 bucket name to download files from
* @type {string}
*/
const bucketName = process.env.S3_BUCKET_NAME || 'kms-rdf-backup-sit'

/**
* AWS Profile to use (optional)
* @type {string|undefined}
*/
const awsProfile = process.env.AWS_PROFILE

/**
* Output directory for downloaded files
* @type {string}
*/
const outputDir = join(scriptDir, '..', 'downloaded-rdf')

/**
* Delay in milliseconds between downloads to avoid rate limiting
* Default: 100ms (configurable via DOWNLOAD_DELAY_MS environment variable)
* Set to 0 to disable delay
* @type {number}
*/
const downloadDelayMs = parseInt(process.env.DOWNLOAD_DELAY_MS || '100', 10)

/**
* Optional comma-separated list of specific versions to download.
* If empty, the script will download all RDF files.
* These should be the S3 key prefixes (e.g., "10.0", "KMS-654-Testing").
* @type {string}
*/
const toBeDownloadedVersions = process.env.TO_BE_DOWNLOADED_VERSIONS || ''

/**
* S3 Client from shared configuration
* @type {S3Client}
*/
const s3Client = getS3Client()

/**
* Extracts the version name from an S3 object key
* Examples:
* "10.0/rdf.xml" -> "10.0"
* "draft/2026/03/16/rdf.xml" -> "draft-2026-03-16"
* "KMS-654-Testing/rdf.xml" -> "KMS-654-Testing"
*
* @param {string} key - S3 object key
* @returns {string} Version name suitable for use as a filename
*/
const extractVersionName = (key) => {
// Remove the trailing "/rdf.xml"
const versionPath = key.replace('/rdf.xml', '')

// Replace slashes with hyphens for flat file structure
return versionPath.replace(/\//g, '-')
}

/**
* Delays execution for the specified number of milliseconds
*
* @param {number} ms - Milliseconds to delay
* @returns {Promise<void>}
*/
const delay = (ms) => new Promise((resolve) => {
setTimeout(resolve, ms)
})

/**
* Downloads a single RDF file from S3
*
* @param {string} key - S3 object key
* @param {number} index - Current file index (for progress display)
* @param {number} total - Total number of files to download
* @returns {Promise<{success: boolean, key: string, outputPath?: string, error?: Error}>}
*/
const downloadRdfFile = async (key, index, total) => {
const versionName = extractVersionName(key)
const fileName = `${versionName}.rdf.xml`
const outputPath = join(outputDir, fileName)

try {
console.log(`[${index}/${total}] Downloading: ${key} -> ${fileName}`)

const command = new GetObjectCommand({
Bucket: bucketName,
Key: key
})

const response = await s3Client.send(command)

if (!response.Body) {
throw new Error('No data returned from S3')
}

// Ensure output directory exists
mkdirSync(dirname(outputPath), { recursive: true })

// Stream the file to disk
const writeStream = createWriteStream(outputPath)
await pipeline(response.Body, writeStream)

console.log(`[${index}/${total}] ✓ Downloaded: ${fileName}`)

return {
success: true,
key,
outputPath
}
} catch (error) {
console.error(`[${index}/${total}] ✗ Failed to download ${key}:`, error.message)

return {
success: false,
key,
error
}
}
}

/**
* Lists all RDF objects from S3 bucket, excluding drafts
*
* @returns {Promise<Array<string>>} Array of S3 object keys
*/
const listS3Objects = async () => {
console.log(`Listing objects from bucket: ${bucketName}...`)

const allObjectKeys = []
let continuationToken
let pageCount = 0

/* eslint-disable no-await-in-loop */
do {
pageCount += 1
console.log(`Fetching page ${pageCount}...`)

const command = new ListObjectsV2Command({
Bucket: bucketName,
ContinuationToken: continuationToken
})

const response = await s3Client.send(command)

// Extract object keys from the current page, excluding drafts
if (response.Contents) {
const keys = response.Contents
.map((obj) => obj.Key)
.filter(Boolean)
.filter((key) => !key.startsWith('draft/')) // Exclude draft objects
.filter((key) => key.endsWith('/rdf.xml')) // Only include main RDF files
allObjectKeys.push(...keys)
console.log(`Found ${keys.length} objects in page ${pageCount} (draft objects excluded)`)
}

// Check if there are more pages to fetch
continuationToken = response.NextContinuationToken
} while (continuationToken)
/* eslint-enable no-await-in-loop */

console.log(`\nTotal objects found: ${allObjectKeys.length}\n`)

return allObjectKeys
}

/**
* Downloads all RDF files from S3
*
* @returns {Promise<number>} The number of failed downloads.
*/
const downloadAllRdfFiles = async () => {
// List all objects from S3
let objects

const versionList = toBeDownloadedVersions.split(',').map((v) => v.trim()).filter(Boolean)

// If a non-empty list of versions is provided, construct the object keys
if (versionList.length > 0) {
console.log(`\nFound ${versionList.length} specific versions to download.`)
objects = versionList.map((version) => `${version}/rdf.xml`)
} else {
// Otherwise, list all objects from S3
objects = await listS3Objects()
}

const totalFiles = objects.length

console.log(`Starting download of ${totalFiles} RDF files...`)
console.log(`Output directory: ${outputDir}\n`)

const results = []

// Download files sequentially to avoid overwhelming the connection
/* eslint-disable no-await-in-loop */
for (let i = 0; i < objects.length; i += 1) {
const key = objects[i]
const result = await downloadRdfFile(key, i + 1, totalFiles)
results.push(result)

// Add delay between downloads if configured
if (downloadDelayMs > 0 && i < objects.length - 1) {
await delay(downloadDelayMs)
}
}
/* eslint-enable no-await-in-loop */

// Summary
const successful = results.filter((r) => r.success).length
const failed = results.filter((r) => !r.success).length

console.log(`\n${'='.repeat(60)}`)
console.log('Download Summary')
console.log('='.repeat(60))
console.log(`Total files: ${totalFiles}`)
console.log(`✓ Successful: ${successful}`)
console.log(`✗ Failed: ${failed}`)
console.log(`Output directory: ${outputDir}`)

if (failed > 0) {
console.log('\nFailed downloads:')
results.filter((r) => !r.success).forEach((r) => {
console.log(` - ${r.key}: ${r.error.message}`)
})
}

// Return the failure count so the caller can set the exit code
return failed
}

/**
* Main execution function
*/
const main = async () => {
console.log('AWS S3 RDF Files Downloader')
console.log('===========================\n')

// Ensure required directories exist before starting
try {
mkdirSync(outputDir, { recursive: true })
} catch (error) {
console.error('✗ Failed to create the output directory:', error.message)
process.exit(1)
}

console.log('Configuration loaded from environment variables.')
console.log('You can set these in archive-processor/scripts/scripts-config.sh and run `source archive-processor/scripts/scripts-config.sh`\n')
console.log(`Bucket: ${bucketName}`)
console.log(`Region: ${region}`)
console.log(`Output: ${outputDir}`)
console.log(`Delay between downloads: ${downloadDelayMs}ms`)

if (process.env.AWS_ENDPOINT_URL) {
console.log(`Endpoint: ${process.env.AWS_ENDPOINT_URL}`)
} else if (awsProfile) {
console.log(`Profile: ${awsProfile}`)
}

if (toBeDownloadedVersions) {
console.log(`Versions to download: ${toBeDownloadedVersions}`)
} else {
console.log('Versions to download: All versions')
}

try {
const failedCount = await downloadAllRdfFiles()

if (failedCount > 0) {
console.error(`\n✗ Download process concluded with ${failedCount} failure(s).`)
process.exit(1)
} else {
console.log('\n✓ Download completed successfully!')
}
} catch (error) {
console.error('\n✗ Failed to download RDF files:', error.message)

if (error.name === 'InvalidAccessKeyId' || error.name === 'CredentialsProviderError') {
console.error('\n⚠️ AWS Credentials Error:')
console.error('Please configure your AWS credentials. Run with:')
console.error(' AWS_PROFILE=kms-sit node archive-processor/scripts/download-rdf-from-S3.js')
}

process.exit(1)
}
}
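
// Example invocation (a sketch; assumes the configuration has been sourced first):
//   source archive-processor/scripts/scripts-config.sh
//   node archive-processor/scripts/download-rdf-from-S3.js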

// Execute the script
main()