diff --git a/.changeset/late-terms-apply.md b/.changeset/late-terms-apply.md new file mode 100644 index 00000000..1fb6190a --- /dev/null +++ b/.changeset/late-terms-apply.md @@ -0,0 +1,7 @@ +--- +'@transcend-io/cli': major +--- + +Port the multi-identifier preference upload workflow into the monorepo CLI. + +This adds `consent configure-preference-upload`, moves `upload-preferences` to the schema-backed parallel worker flow, and includes the supporting receipts state, preference upload skill, and reconcile script. diff --git a/.cursor/skills/preference-data-upload/SKILL.md b/.cursor/skills/preference-data-upload/SKILL.md new file mode 100644 index 00000000..b18dce28 --- /dev/null +++ b/.cursor/skills/preference-data-upload/SKILL.md @@ -0,0 +1,429 @@ + + + +## Table of Contents + +- [Preference Data Upload Pipeline](#preference-data-upload-pipeline) + - [Prerequisites](#prerequisites) + - [Phase 1: Receive & Transform Raw Data](#phase-1-receive--transform-raw-data) + - [1.1 Set up working directory](#11-set-up-working-directory) + - [1.2 Ask the user](#12-ask-the-user) + - [1.3 Write the transform script](#13-write-the-transform-script) + - [1.4 Run and verify counts](#14-run-and-verify-counts) + - [Phase 2: Chunk Large Files](#phase-2-chunk-large-files) + - [2.1 Verify chunk counts](#21-verify-chunk-counts) + - [Phase 3: Generate & Validate Config](#phase-3-generate--validate-config) + - [3.1 Run interactive config](#31-run-interactive-config) + - [3.2 Validate the config](#32-validate-the-config) + - [Phase 4: Test Upload](#phase-4-test-upload) + - [4.1 Ask the user](#41-ask-the-user) + - [4.2 Run test upload](#42-run-test-upload) + - [4.3 Generate verification links](#43-generate-verification-links) + - [4.4 API verification script](#44-api-verification-script) + - [Phase 5: Full Upload](#phase-5-full-upload) + - [5.1 Key flags for production uploads](#51-key-flags-for-production-uploads) + - [5.2 Copy config and run](#52-copy-config-and-run) + - [5.3 Monitor progress](#53-monitor-progress) + - [5.4 Resumability](#54-resumability) + - [Phase 6: Error Analysis & Cleanup](#phase-6-error-analysis--cleanup) + - [Phase 7: Executive Summary](#phase-7-executive-summary) + + + +--- + +name: preference-data-upload +description: End-to-end workflow for uploading preference data to Transcend via the CLI. Covers receiving raw files, running transformation scripts, chunking, interactive config generation, test uploads with verification, and full production uploads. Use when the user mentions uploading preferences, preference migration, consent data upload, airtable upload, or bulk preference import. + +--- + +# Preference Data Upload Pipeline + +End-to-end workflow for uploading preference/consent data to Transcend's preference store via the CLI. + +## Prerequisites + +- Transcend CLI built and available (`pnpm build` then `pnpm start `) +- A valid `TRANSCEND_API_KEY` with scopes: `Manage Preference Store`, `View Preference Store` +- The partition UUID for the target preference store +- The `--transcendUrl` for the org (e.g. 
`https://api.us.transcend.io` for US-backed) + +## Phase 1: Receive & Transform Raw Data + +### 1.1 Set up working directory + +``` +working// +├── raw/ # Original files from customer (never modify) +├── exclusions/ # Block/suppression lists +├── transform.py # or transform.ts — transformation script +├── output/ +│ ├── batch_a/ # Chunked files for first-pass upload +│ ├── batch_b/ # Chunked files for second-pass (if split timestamps) +│ ├── test/ # Small subset for test uploads +│ └── all_chunks/ # Symlinks to all chunks (for config scanning) +└── README.md # Document the pipeline for this project +``` + +### 1.2 Ask the user + +- **Language preference**: Python (pandas) or TypeScript for the transform script? +- **Source files**: What are the raw input files and their schemas? +- **Exclusion list**: Is there a block/suppression list to filter out? +- **Column mappings**: Which columns map to email, name, country, purposes, timestamps? +- **Timestamp splitting**: Do any records have multiple consent timestamps that need splitting across batches? +- **Duplicate handling**: How should duplicate emails within a single source be handled? + - **Keep last** (default): `df.sort_values('timestamp').drop_duplicates(subset='email_lower', keep='last')` + - **Keep first**: Keep the earliest record + - **Skip dedup**: Let the uploader handle it (it appends `___` and API resolves by timestamp — works but generates warnings) + +### 1.3 Write the transform script + +The script must: + +1. Load and deduplicate the exclusion list (lowercase + strip emails) +2. Read source files with `dtype=str` (Python) to preserve original values +3. Rename columns to the standardized schema +4. Clean placeholder values (`[none]`, whitespace) in name/country fields +5. Filter out excluded emails +6. Filter out records with no valid timestamp +7. **Deduplicate by email** within each source (sort by timestamp, keep last) +8. Handle timestamp splitting (batch_a = earliest consent, batch_b = later consent) +9. Merge overlapping records across sources (e.g. Marketo + Iterable country merge) +10. Write `output/batch_a/batch_a.csv` and `output/batch_b/batch_b.csv` +11. Run sanity checks: no exclusion leaks, print record counts + +**Target output columns** (adjust per project): + +``` +email, firstName, lastName, _country, _Subscribed, +_consent_date, timestamp +``` + +**Purpose column values**: `True`, `False`, or empty string (empty = no preference, will map to null). + +### 1.4 Run and verify counts + +```bash +cd working/ +python3 transform.py # or: npx ts-node transform.ts +``` + +Cross-reference output counts with customer-provided numbers. Account for: + +- Exclusion list removals +- Records with no timestamp (dropped) +- Duplicate emails across sources (merged, not double-counted) +- Timestamp splits creating batch_b rows + +## Phase 2: Chunk Large Files + +Files over ~10MB should be chunked for parallel upload. The `chunk-csv` command defaults to 10MB chunks. 
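Before chunking, it is worth sanity-checking the Phase 1 transform output one more time. Below is a minimal pandas sketch of the core steps from section 1.3 (comments numbered to match); the file paths and column names (`email`, `timestamp`, `firstName`, `lastName`, `country`) are assumptions based on this guide's examples, and timestamp splitting plus cross-source merging are omitted:

```python
# transform_sketch.py - minimal sketch of the Phase 1.3 core steps.
# Paths and column names are assumptions; adjust to the real source schema.
import pandas as pd

# 1) Load and deduplicate the exclusion list (lowercase + strip emails)
exclusions = set(
    pd.read_csv("exclusions/blocklist.csv", dtype=str)["email"]
    .dropna().str.strip().str.lower()
)

# 2) Read the source with dtype=str to preserve original values
df = pd.read_csv("raw/source.csv", dtype=str)

# 4) Clean placeholder values in name/country fields
for col in ("firstName", "lastName", "country"):
    if col in df.columns:
        df[col] = df[col].fillna("").str.strip().replace({"[none]": ""})

# 5) Filter out excluded emails (compare on lowercased email)
df["email_lower"] = df["email"].str.strip().str.lower()
df = df[~df["email_lower"].isin(exclusions)]

# 6) Drop records with no valid timestamp
df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")
df = df.dropna(subset=["timestamp"])

# 7) Deduplicate by email within the source: sort by timestamp, keep last
df = df.sort_values("timestamp").drop_duplicates(subset="email_lower", keep="last")

# 10) Write output
df.drop(columns=["email_lower"]).to_csv("output/batch_a/batch_a.csv", index=False)

# 11) Sanity checks: no exclusion leaks, print record counts
assert not df["email_lower"].isin(exclusions).any(), "exclusion leak!"
print(f"batch_a: {len(df)} records")
```

Once the counts line up with the customer-provided numbers, chunk each batch: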
+ +```bash +pnpm start admin chunk-csv \ + --directory ./working//output/batch_a/ \ + --chunkSizeMB 10 + +pnpm start admin chunk-csv \ + --directory ./working//output/batch_b/ \ + --chunkSizeMB 10 +``` + +After chunking, move the originals out so they don't get scanned/uploaded: + +```bash +mv output/batch_a/batch_a.csv output/batch_a_original.csv +mv output/batch_b/batch_b.csv output/batch_b_original.csv +``` + +Create symlink directory for config scanning and test subset: + +```bash +mkdir -p output/all_chunks output/test + +# Symlink all chunks +for f in output/batch_a/batch_a_chunk_*.csv; do ln -s "../$f" output/all_chunks/; done +for f in output/batch_b/batch_b_chunk_*.csv; do ln -s "../$f" output/all_chunks/; done + +# Test subset +head -101 output/batch_a/batch_a_chunk_0001.csv > output/test/test_100.csv +cp output/batch_a/batch_a_chunk_0001.csv output/test/ +``` + +### 2.1 Verify chunk counts + +**IMPORTANT**: Always verify row counts after chunking using `test_csv_count.sh`: + +```bash +bash test_csv_count.sh -H ./working//output/batch_a/ +bash test_csv_count.sh -H ./working//output/batch_b/ +``` + +Cross-reference totals against the transform script output: + +| Batch | Expected Records | Actual Records | Chunks | +| ------- | ---------------- | -------------- | ------ | +| batch_a | X | X | N | +| batch_b | X | X | N | +| Total | X | X | N | + +Do NOT proceed to upload until counts match. + +> **Known issue — auto-chunk re-chunking**: The upload command has a built-in auto-chunk +> (default 11MB) that re-processes files. Since `chunk-csv` defaults to 10MB, some chunks +> land slightly over 10MB. The upload's 11MB threshold avoids re-chunking these. If you +> still see re-chunking, pass `--chunkSizeMB 0` to the upload command to disable it. +> Re-chunking creates duplicate files with `_chunk_0001` suffix that must be cleaned up. + +## Phase 3: Generate & Validate Config + +### 3.1 Run interactive config + +```bash +pnpm start consent configure-preference-upload \ + --auth $TRANSCEND_API_KEY \ + --partition \ + --directory ./working//output/all_chunks/ \ + --transcendUrl +``` + +The interactive flow walks through 6 steps: + +1. **Identifier columns** — select email + any secondary identifiers (firstName, lastName) +2. **Identifier mapping** — map each to org identifier names, mark which are unique +3. **Timestamp column** — select the column used for "last preference update" +4. **Purpose columns** — select which columns map to purposes/preferences +5. **Value mapping** — map each unique value to opted-in/opted-out/null. Empty strings auto-map to null (no preference). `True`/`False` defaults are auto-detected. +6. **Metadata columns** — select which remaining columns to INCLUDE as metadata (unselected are ignored) + +### 3.2 Validate the config + +Read the generated `preference-upload-schema.json` and verify: + +- **columnToIdentifier**: correct names, `email` marked as unique +- **timestampColumn**: points to the right column +- **columnToPurposeName**: each purpose column maps to correct org purpose, valueMapping includes `"": null` for empty strings +- **columnToMetadata**: country columns or other useful metadata included +- **columnsToIgnore**: consent date columns or other non-upload columns excluded + +Compare against any previous upload's config if available. + +## Phase 4: Test Upload + +### 4.1 Ask the user + +- **Test size**: How many records? (default: 100) +- **Corner cases**: Any specific scenarios to verify? (e.g. 
empty purpose values, records with only one consent type, overlapping source records) +- **Verification links**: Generate Transcend dashboard links for spot-checking + +### 4.2 Run test upload + +**Important**: The upload command does NOT have a `--file` flag. It processes all CSV files in `--directory`. To upload only a specific file, ensure it's the only CSV in the directory. + +Move extra files out of the test directory before uploading: + +```bash +cp output/preference-upload-schema.json output/test/ +# If batch_a_chunk_0001.csv is also in test/, move it out temporarily +mv output/test/batch_a_chunk_0001.csv output/test/batch_a_chunk_0001.csv.bak +``` + +First do a dry run: + +```bash +pnpm start consent upload-preferences \ + --auth $TRANSCEND_API_KEY \ + --partition \ + --directory ./working//output/test/ \ + --transcendUrl \ + --concurrency 1 \ + --dryRun +``` + +Verify: `PendingSafe` should equal total rows, `PendingConflicts: 0`, `Skipped: 0`. + +Then upload for real (drop `--dryRun`): + +```bash +pnpm start consent upload-preferences \ + --auth $TRANSCEND_API_KEY \ + --partition \ + --directory ./working//output/test/ \ + --transcendUrl \ + --concurrency 1 +``` + +Restore the chunk file after test: + +```bash +mv output/test/batch_a_chunk_0001.csv.bak output/test/batch_a_chunk_0001.csv +``` + +### 4.3 Generate verification links + +Extract sample emails and build dashboard URLs. URLs must be URL-encoded: + +``` +https://app.transcend.io/preference-store/user-preferences?filters=%7B%22identifiers%22%3A%5B%7B%22name%22%3A%22email%22%2C%22value%22%3A%22%22%7D%5D%7D +``` + +Note: `@` encodes to `%40`. Do NOT use raw JSON in the URL — it won't work. + +Pick emails covering these scenarios: + +- Has both purposes opted-in +- Has one opted-in, one empty (should show no preference for the empty one) +- Has both opted-out +- Has mixed (one opted-in, one opted-out) +- Has metadata (country) populated, especially split-source metadata (different countries) +- From each data source if multiple + +### 4.4 API verification script + +Create a verification script to query the API directly and validate results programmatically: + +```bash +#!/usr/bin/env bash +SOMBRA_URL="https://multi-tenant.sombra.us.transcend.io" # adjust for EU +PARTITION="" + +query_email() { + local email="$1" label="$2" + echo "=== $label: $email ===" + curl -s "${SOMBRA_URL}/v1/preferences/${PARTITION}/query" \ + -H "Authorization: Bearer ${TRANSCEND_API_KEY}" \ + -H "Content-Type: application/json" \ + -d "{\"filter\":{\"identifiers\":[{\"name\":\"email\",\"value\":\"${email}\"}]},\"limit\":1}" \ + | python3 -m json.tool 2>/dev/null || echo "(failed to parse)" + echo "" +} + +query_email "user@example.com" "BOTH_OPTED_IN" +# ... add more scenarios +``` + +Run: `TRANSCEND_API_KEY= bash verify_upload.sh` + +Check that for each record: + +- `purposes` array has correct `enabled` values +- Empty source values result in purpose being **absent** (not `enabled: false`) +- `identifiers` include email, firstName, lastName +- `metadata` has correct country values +- `timestamp` matches source CSV +- `decryptionStatus` is `DECRYPTED`, `confirmed` is `true` + +Present results to user and ask them to verify before proceeding. + +## Phase 5: Full Upload + +### 5.1 Key flags for production uploads + +> **Important**: For large uploads (>1M records), dry-run and per-record verification +> dramatically slow uploads. For production runs of this scale, skip `--dryRun` and +> verify via sampling after completion. 
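A hedged sketch of that post-completion sampling, reusing the same `v1/preferences` query endpoint as the Phase 4.4 script (the Sombra URL mirrors that script; the sample size, env var names, and chunk path are assumptions):

```python
# verify_sample.py - spot-check randomly sampled uploaded emails via the query API.
import csv
import json
import os
import random
import urllib.request

SOMBRA_URL = "https://multi-tenant.sombra.us.transcend.io"  # adjust for EU
PARTITION = os.environ["PARTITION"]  # assumed env var
API_KEY = os.environ["TRANSCEND_API_KEY"]
SAMPLE_SIZE = 25  # assumption: tune per upload size

# Pull candidate emails from one uploaded chunk
with open("output/batch_a/batch_a_chunk_0001.csv", newline="") as f:
    emails = [row["email"] for row in csv.DictReader(f)]

for email in random.sample(emails, min(SAMPLE_SIZE, len(emails))):
    body = json.dumps(
        {"filter": {"identifiers": [{"name": "email", "value": email}]}, "limit": 1}
    ).encode()
    req = urllib.request.Request(
        f"{SOMBRA_URL}/v1/preferences/{PARTITION}/query",
        data=body,
        headers={
            "Authorization": f"Bearer {API_KEY}",
            "Content-Type": "application/json",
        },
    )
    with urllib.request.urlopen(req) as resp:
        print(email, "->", json.dumps(json.load(resp))[:200])
```

Spot-check the printed records against the source rows (purposes, metadata, timestamps) before declaring the run complete.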
+ +Critical flags: + +- **`--skipExistingRecordCheck`**: ALWAYS use for initial uploads or when the partition is empty/nearly empty. Without this, the uploader downloads existing preferences for every identifier in the file to check for conflicts — extremely slow on large datasets. +- **`--skipWorkflowTriggers`**: ALWAYS use for bulk imports. Without this, every record triggers workflows on the Transcend side, adding massive overhead and contributing to rate limiting. Only omit if workflows must fire per-record. +- **`--chunkSizeMB 0`**: Use when files are already pre-chunked to skip auto-chunking. +- **`--concurrency`**: Omit to auto-detect from CPU cores (typically 10-12). Each worker also makes `--uploadConcurrency` (default 75) parallel API requests with `--maxChunkSize` (default 25) records each. Total records in flight = `concurrency × uploadConcurrency × maxChunkSize`. + +### 5.2 Copy config and run + +```bash +cp output/preference-upload-schema.json output/batch_a/ +cp output/preference-upload-schema.json output/batch_b/ + +# Upload batch_a first (earliest timestamps) +pnpm start consent upload-preferences \ + --auth $TRANSCEND_API_KEY \ + --partition \ + --directory ./working//output/batch_a/ \ + --transcendUrl \ + --skipExistingRecordCheck \ + --skipWorkflowTriggers \ + --chunkSizeMB 0 + +# Then batch_b (later timestamps, overwrites where needed) +pnpm start consent upload-preferences \ + --auth $TRANSCEND_API_KEY \ + --partition \ + --directory ./working//output/batch_b/ \ + --transcendUrl \ + --skipExistingRecordCheck \ + --skipWorkflowTriggers \ + --chunkSizeMB 0 +``` + +### 5.3 Monitor progress + +The upload command shows a live dashboard with progress, throughput, and errors. Watch for: + +- **Rate limit retries**: Normal, the CLI handles these automatically +- **Unmapped value errors**: Indicates a value in the data not covered by the config — will hard-error in non-interactive worker mode +- **Network errors**: Transient, retried automatically up to 5 times +- **"Duplicate primary key" warnings**: Indicates duplicate emails within a single chunk file. The uploader handles these by uploading both with `___` suffix; the API resolves by timestamp. Harmless but indicates the transform script could improve dedup. + +### 5.4 Resumability + +The upload command writes receipt files. If interrupted, re-running the same command will resume from where it left off, skipping already-uploaded chunks. + +## Phase 6: Error Analysis & Cleanup + +If errors occur during upload: + +1. Check the logs directory for detailed error output: + - `/logs/worker-N.err.log` — per-worker error logs + - `/../receipts/combined-errors.log` — aggregated errors + - `/../receipts/combined-all.log` — full output with stack traces +2. Common issues: + - **Unmapped values**: Add missing mappings to `preference-upload-schema.json` and re-run + - **Unmapped columns**: Ensure all CSV columns are accounted for in the config as either identifiers, purposes, metadata, timestamp, or `columnsToIgnore` + - **Invalid identifiers**: Check for malformed emails in source data + - **Rate limits**: Reduce `--concurrency` or `--uploadConcurrency` and retry +3. 
Failed chunks can be re-uploaded by re-running the command (receipt-based resumption skips completed chunks) + +## Phase 7: Executive Summary + +After upload completes, produce a summary including: + +```markdown +# Preference Upload Summary — + +## Source Data + +- **Sources**: +- **Exclusion list**: emails filtered +- **Records dropped** (no timestamp): + +## Upload Results + +- **batch_a**: records across chunks +- **batch_b**: records across chunks +- **Total unique emails**: +- **Errors**: (detail any persistent failures) + +## Configuration + +- **Purposes mapped**: +- **Identifiers**: +- **Metadata**: + +## Verification + +- **Test upload**: records verified via dashboard +- **Sample checks**: + +## Timing + +- Transform: ~Xm +- Chunking: ~Xm +- Config generation: ~Xm +- Upload batch_a: ~Xh Xm +- Upload batch_b: ~Xh Xm +``` + +Post the summary as a comment on the associated Linear ticket. diff --git a/packages/cli/README.md b/packages/cli/README.md index 962083c4..50187c7c 100644 --- a/packages/cli/README.md +++ b/packages/cli/README.md @@ -30,6 +30,7 @@ A command line interface that allows you to programatically interact with the Tr - [`transcend request cron pull-identifiers`](#transcend-request-cron-pull-identifiers) - [`transcend request cron mark-identifiers-completed`](#transcend-request-cron-mark-identifiers-completed) - [`transcend consent build-xdi-sync-endpoint`](#transcend-consent-build-xdi-sync-endpoint) + - [`transcend consent configure-preference-upload`](#transcend-consent-configure-preference-upload) - [`transcend consent generate-access-tokens`](#transcend-consent-generate-access-tokens) - [`transcend consent pull-consent-metrics`](#transcend-consent-pull-consent-metrics) - [`transcend consent pull-consent-preferences`](#transcend-consent-pull-consent-preferences) @@ -1710,6 +1711,33 @@ transcend consent build-xdi-sync-endpoint \ --transcendUrl=https://api.us.transcend.io ``` +### `transcend consent configure-preference-upload` + +```txt +USAGE + transcend consent configure-preference-upload (--auth value) [--sombraAuth value] [--transcendUrl value] (--directory value) [--schemaFilePath value] (--partition value) + transcend consent configure-preference-upload --help + +Interactively configure the column mapping for preference CSV uploads. + +Scans ALL CSV files in the given directory to discover every column header +and every unique value per column, then walks through an interactive editor +to build the full mapping config (identifiers, ignored columns, timestamp, +purposes/preferences and their value mappings). + +The resulting config JSON is reused by 'upload-preferences' so subsequent +uploads run fully non-interactively. + +FLAGS + --auth The Transcend API key. Requires scopes: "View Preference Store Settings", "View Identity Verification Settings" + [--sombraAuth] The Sombra internal key, use for additional authentication when self-hosting Sombra + [--transcendUrl] URL of the Transcend backend. Use https://api.us.transcend.io for US hosting [default = https://api.transcend.io] + --directory Path to the directory of CSV files to scan for column headers and unique values + [--schemaFilePath] Path to the config JSON file. 
Defaults to /../preference-upload-schema.json + --partition The partition key for the preference store + -h --help Print help information and exit +``` + ### `transcend consent generate-access-tokens` ```txt @@ -2226,33 +2254,53 @@ transcend consent upload-data-flows-from-csv \ ```txt USAGE - transcend consent upload-preferences (--auth value) (--partition value) [--sombraAuth value] [--transcendUrl value] [--file value] [--directory value] [--dryRun] [--skipExistingRecordCheck] [--receiptFileDir value] [--skipWorkflowTriggers] [--forceTriggerWorkflows] [--skipConflictUpdates] [--isSilent] [--attributes value] [--receiptFilepath value] [--concurrency value] + transcend consent upload-preferences (--auth value) (--partition value) [--sombraAuth value] [--transcendUrl value] (--directory value) [--dryRun] [--skipExistingRecordCheck] [--receiptFileDir value] [--schemaFilePath value] [--skipWorkflowTriggers] [--forceTriggerWorkflows] [--skipConflictUpdates] [--isSilent] [--attributes value] [--receiptFilepath value] [--concurrency value] [--uploadConcurrency value] [--maxChunkSize value] [--rateLimitRetryDelay value] [--uploadLogInterval value] [--downloadIdentifierConcurrency value] [--maxRecordsToReceipt value] [--regenerate] [--chunkSizeMB value] [--viewerMode] transcend consent upload-preferences --help Upload preference management data to your Preference Store. -This command prompts you to map the shape of the CSV to the shape of the Transcend API. There is no requirement for the shape of the incoming CSV, as the script will handle the mapping process. +Requires a config file (generated by 'configure-preference-upload') that maps +CSV columns to identifiers, purposes, and preferences. If no config exists, +pass --regenerate to run the interactive configure flow first. -The script will also produce a JSON cache file that allows for the mappings to be preserved between runs. +Large files are automatically chunked into smaller pieces (controlled by +--chunkSizeMB) before uploading. + +Parallel preference uploader (Node 22+ ESM/TS) +----------------------------------------------------------------------------- +- Spawns a pool of child *processes* (not threads) to run uploads in parallel. +- Shows a live dashboard in the parent terminal with progress per worker. +- Creates per-worker log files and (optionally) opens OS terminals to tail them. +- Uses the same module as both parent and child; the child mode is toggled + by the presence of a CLI flag ('--as-child'). FLAGS - --auth The Transcend API key. Requires scopes: "Modify User Stored Preferences", "View Managed Consent Database Admin API", "View Preference Store Settings" - --partition The partition key to download consent preferences to - [--sombraAuth] The Sombra internal key, use for additional authentication when self-hosting Sombra - [--transcendUrl] URL of the Transcend backend. Use https://api.us.transcend.io for US hosting [default = https://api.transcend.io] - [--file] Path to the CSV file to load preferences from - [--directory] Path to the directory of CSV files to load preferences from - [--dryRun] Whether to do a dry run only - will write results to receiptFilepath without updating Transcend [default = false] - [--skipExistingRecordCheck] Whether to skip the check for existing records. 
SHOULD ONLY BE USED FOR INITIAL UPLOAD [default = false]
-  [--receiptFileDir] Directory path where the response receipts should be saved [default = ./receipts]
-  [--skipWorkflowTriggers] Whether to skip workflow triggers when uploading to preference store [default = false]
-  [--forceTriggerWorkflows] Whether to force trigger workflows for existing consent records [default = false]
-  [--skipConflictUpdates] Whether to skip uploading of any records where the preference store and file have a hard conflict [default = false]
-  [--isSilent/--noIsSilent] Whether to skip sending emails in workflows [default = true]
-  [--attributes] Attributes to add to any DSR request if created. Comma-separated list of key:value pairs. [default = Tags:transcend-cli,Source:transcend-cli]
-  [--receiptFilepath] Store resulting, continuing where left off [default = ./preference-management-upload-receipts.json]
-  [--concurrency] The concurrency to use when uploading in parallel [default = 10]
-  -h --help Print help information and exit
+  --auth The Transcend API key. Requires scopes: "Modify User Stored Preferences", "View Managed Consent Database Admin API", "View Preference Store Settings", "View Identity Verification Settings"
+  --partition The partition key to download consent preferences to
+  [--sombraAuth] The Sombra internal key, use for additional authentication when self-hosting Sombra
+  [--transcendUrl] URL of the Transcend backend. Use https://api.us.transcend.io for US hosting [default = https://api.transcend.io]
+  --directory Path to the directory of CSV files to load preferences from
+  [--dryRun] Whether to do a dry run only - will write results to receiptFilepath without updating Transcend [default = false]
+  [--skipExistingRecordCheck] Whether to skip the check for existing records. SHOULD ONLY BE USED FOR INITIAL UPLOAD [default = false]
+  [--receiptFileDir] Directory path where the response receipts should be saved. Defaults to ./receipts if a "file" is provided, or /../receipts if a "directory" is provided.
+  [--schemaFilePath] The path to where the schema for the file should be saved. If file is provided, it will default to ./-preference-upload-schema.json If directory is provided, it will default to /../preference-upload-schema.json
+  [--skipWorkflowTriggers] Whether to skip workflow triggers when uploading to preference store [default = false]
+  [--forceTriggerWorkflows] Whether to force trigger workflows for existing consent records [default = false]
+  [--skipConflictUpdates] Whether to skip uploading of any records where the preference store and file have a hard conflict [default = false]
+  [--isSilent/--noIsSilent] Whether to skip sending emails in workflows [default = true]
+  [--attributes] Attributes to add to any DSR request if created. Comma-separated list of key:value pairs. [default = Tags:transcend-cli,Source:transcend-cli]
+  [--receiptFilepath] Where the resulting receipts are stored, allowing an interrupted upload to continue where it left off [default = ./preference-management-upload-receipts.json]
+  [--concurrency] The number of concurrent processes to use to upload the files. When this is not set, it defaults to the number of CPU cores available on the machine. e.g. if there are 5 concurrent processes for 15 files, each parallel job would get 3 files to process.
+  [--uploadConcurrency] When uploading preferences to v1/preferences - this is the number of concurrent requests made at any given time by a single process. This is NOT the batch size—it's how many batch *tasks* run in parallel. 
The number of total concurrent requests is maxed out at concurrency * uploadConcurrency. [default = 75]
+  [--maxChunkSize] When uploading preferences to v1/preferences - this is the maximum number of records to put in a single request. The number of total concurrent records being put in at any one time is maxed out at maxChunkSize * concurrency * uploadConcurrency. [default = 25]
+  [--rateLimitRetryDelay] When uploading preferences to v1/preferences - this is the number of milliseconds to wait before retrying a request that was rate limited. This is only used if the request is rate limited by the Transcend API. If the request fails for any other reason, it will not be retried. [default = 3000]
+  [--uploadLogInterval] When uploading preferences to v1/preferences - this is the number of records after which to log progress. Output will be logged to console and also to the receipt file. Setting this value lower will allow you to more easily pick up where you left off. Setting this value higher can avoid excessive i/o operations slowing down the upload. The default is a good optimization for most cases. [default = 1000]
+  [--downloadIdentifierConcurrency] When downloading identifiers for the upload - this is the number of concurrent requests to make. This is only used if the records are not already cached in the preference store. [default = 30]
+  [--maxRecordsToReceipt] When writing out successful and pending records to the receipt file - this is the maximum number of records to write out. This is to avoid the receipt file getting too large for JSON.parse/stringify. [default = 10]
+  [--regenerate] Force re-generation of the schema config file before uploading. Runs the interactive configure flow even if a config already exists. [default = false]
+  [--chunkSizeMB] Auto-chunk threshold in MB. Any CSV file larger than this will be split into smaller files before uploading. Set to 0 to disable. [default = 11]
+  [--viewerMode] Run in non-interactive viewer mode (no attach UI, auto-artifacts) [default = false]
+  -h --help Print help information and exit
```

A sample CSV can be found [here](./examples/cli-upload-preferences-example.csv). In this example, `Sales` and `Marketing` are two custom Purposes, and `SalesCommunications` and `MarketingCommunications` are Preference Topics. During the interactive CLI prompt, you can map these columns to the slugs stored in Transcend!

@@ -2264,7 +2312,7 @@
```sh transcend consent upload-preferences \ --auth="$TRANSCEND_API_KEY" \ - --file=./preferences.csv \ + --directory=./examples/pm-test \ --partition=4d1c5daa-90b7-4d18-aa40-f86a43d2c726 ``` @@ -2274,7 +2322,7 @@ transcend consent upload-preferences \ transcend consent upload-preferences \ --auth="$TRANSCEND_API_KEY" \ --partition=4d1c5daa-90b7-4d18-aa40-f86a43d2c726 \ - --file=./preferences.csv \ + --directory=./examples/pm-test \ --dryRun \ --skipWorkflowTriggers \ --skipConflictUpdates \ @@ -2289,7 +2337,7 @@ transcend consent upload-preferences \ transcend consent upload-preferences \ --auth="$TRANSCEND_API_KEY" \ --partition=4d1c5daa-90b7-4d18-aa40-f86a43d2c726 \ - --file=./preferences.csv \ + --directory=./examples/pm-test \ --transcendUrl=https://api.us.transcend.io ``` diff --git a/packages/cli/src/commands/consent/configure-preference-upload/command.ts b/packages/cli/src/commands/consent/configure-preference-upload/command.ts new file mode 100644 index 00000000..adf51901 --- /dev/null +++ b/packages/cli/src/commands/consent/configure-preference-upload/command.ts @@ -0,0 +1,53 @@ +import { buildCommand } from '@stricli/core'; +import { ScopeName } from '@transcend-io/privacy-types'; + +import { + createAuthParameter, + createSombraAuthParameter, + createTranscendUrlParameter, +} from '../../../lib/cli/common-parameters.js'; + +export const configurePreferenceUploadCommand = buildCommand({ + loader: async () => { + const { configurePreferenceUpload } = await import('./impl.js'); + return configurePreferenceUpload; + }, + parameters: { + flags: { + auth: createAuthParameter({ + scopes: [ScopeName.ViewPreferenceStoreSettings, ScopeName.ViewRequestIdentitySettings], + }), + sombraAuth: createSombraAuthParameter(), + transcendUrl: createTranscendUrlParameter(), + directory: { + kind: 'parsed', + parse: String, + brief: 'Path to the directory of CSV files to scan for column headers and unique values', + }, + schemaFilePath: { + kind: 'parsed', + parse: String, + brief: + 'Path to the config JSON file. Defaults to /../preference-upload-schema.json', + optional: true, + }, + partition: { + kind: 'parsed', + parse: String, + brief: 'The partition key for the preference store', + }, + }, + }, + docs: { + brief: 'Interactively configure the column mapping for preference CSV uploads', + fullDescription: `Interactively configure the column mapping for preference CSV uploads. + +Scans ALL CSV files in the given directory to discover every column header +and every unique value per column, then walks through an interactive editor +to build the full mapping config (identifiers, ignored columns, timestamp, +purposes/preferences and their value mappings). 
+
+The resulting config JSON is reused by 'upload-preferences' so subsequent
+uploads run fully non-interactively.`,
+  },
+});
diff --git a/packages/cli/src/commands/consent/configure-preference-upload/impl.ts b/packages/cli/src/commands/consent/configure-preference-upload/impl.ts
new file mode 100644
index 00000000..0e98761b
--- /dev/null
+++ b/packages/cli/src/commands/consent/configure-preference-upload/impl.ts
@@ -0,0 +1,361 @@
+import { createReadStream } from 'node:fs';
+
+import { PersistedState } from '@transcend-io/persisted-state';
+import { buildTranscendGraphQLClient, FileFormatState, loadReferenceData } from '@transcend-io/sdk';
+import colors from 'colors';
+import { parse as csvParse } from 'csv-parse';
+import inquirer from 'inquirer';
+import * as t from 'io-ts';
+
+import type { LocalContext } from '../../../context.js';
+import { doneInputValidation } from '../../../lib/cli/done-input-validation.js';
+import { collectCsvFilesOrExit } from '../../../lib/helpers/collectCsvFilesOrExit.js';
+import {
+  parsePreferenceIdentifiersFromCsv,
+  parsePreferenceFileFormatFromCsv,
+  parsePreferenceAndPurposeValuesFromCsv,
+} from '../../../lib/preference-management/index.js';
+import { readCsv } from '../../../lib/requests/index.js';
+import { logger } from '../../../logger.js';
+import { computeSchemaFile } from '../upload-preferences/artifacts/index.js';
+
+export interface ConfigurePreferenceUploadFlags {
+  auth: string;
+  sombraAuth?: string;
+  transcendUrl: string;
+  directory: string;
+  schemaFilePath?: string;
+  partition: string;
+}
+
+/**
+ * Scan a single CSV file and collect its column headers plus all unique
+ * values per column. Uses streaming so large files don't need to be held
+ * in memory.
+ *
+ * @param file - CSV file path to scan
+ * @returns headers and uniqueValuesByColumn
+ */
+async function scanOneFile(file: string): Promise<{
+  headers: Set<string>;
+  uniqueValuesByColumn: Record<string, Set<string>>;
+}> {
+  const headers = new Set<string>();
+  const uniqueValuesByColumn: Record<string, Set<string>> = {};
+
+  await new Promise<void>((resolve, reject) => {
+    const parser = createReadStream(file).pipe(csvParse({ columns: true, skip_empty_lines: true }));
+    parser.on('data', (row: Record<string, string>) => {
+      for (const [col, val] of Object.entries(row)) {
+        headers.add(col);
+        if (!uniqueValuesByColumn[col]) {
+          uniqueValuesByColumn[col] = new Set<string>();
+        }
+        const trimmed = (val || '').trim();
+        uniqueValuesByColumn[col].add(trimmed);
+      }
+    });
+    parser.on('end', resolve);
+    parser.on('error', reject);
+  });
+
+  return { headers, uniqueValuesByColumn };
+}
+
+const SCAN_CONCURRENCY = 25;
+
+async function scanCsvFiles(files: string[]): Promise<{
+  /** Union of all column headers */
+  headers: string[];
+  /** Map of column name to its unique values (trimmed; may include the empty string) */
+  uniqueValuesByColumn: Record<string, Set<string>>;
+}> {
+  const allHeaders = new Set<string>();
+  const merged: Record<string, Set<string>> = {};
+  let completed = 0;
+
+  const queue = [...files];
+  const run = async (): Promise<void> => {
+    while (queue.length > 0) {
+      const file = queue.shift()!;
+      const result = await scanOneFile(file);
+      for (const h of result.headers) allHeaders.add(h);
+      for (const [col, vals] of Object.entries(result.uniqueValuesByColumn)) {
+        if (!merged[col]) merged[col] = new Set<string>();
+        for (const v of vals) merged[col].add(v);
+      }
+      completed += 1;
+      if (completed % 25 === 0 || completed === files.length) {
+        logger.info(colors.green(`  Scanned ${completed}/${files.length} files...`));
+      }
+    }
+  };
+
+  const workers = Array.from({ length: Math.min(SCAN_CONCURRENCY, files.length) }, () =>
run());
+  await Promise.all(workers);
+
+  return { headers: [...allHeaders], uniqueValuesByColumn: merged };
+}
+
+/**
+ * Build synthetic preference rows from the scanned unique values so
+ * the existing parse functions see every value at least once.
+ *
+ * Row count is driven only by `enumColumns` (purpose/preference columns)
+ * whose unique values actually matter for mapping. High-cardinality
+ * columns like timestamps or emails are filled with a single sample value.
+ *
+ * @param headers - all column headers
+ * @param uniqueValuesByColumn - unique values per column
+ * @param enumColumns - columns whose full unique values must be represented
+ * @returns synthetic rows covering all unique enum values
+ */
+function buildSyntheticRows(
+  headers: string[],
+  uniqueValuesByColumn: Record<string, Set<string>>,
+  enumColumns: string[] = [],
+): Record<string, string>[] {
+  const enumSet = new Set(enumColumns);
+  const maxRows = Math.max(1, ...enumColumns.map((h) => uniqueValuesByColumn[h]?.size ?? 0));
+  const rows: Record<string, string>[] = [];
+  for (let i = 0; i < maxRows; i += 1) {
+    const row: Record<string, string> = {};
+    for (const h of headers) {
+      const vals = uniqueValuesByColumn[h] ? [...uniqueValuesByColumn[h]] : [''];
+      row[h] = enumSet.has(h) ? (vals[i % vals.length] ?? '') : (vals[0] ?? '');
+    }
+    rows.push(row);
+  }
+  return rows;
+}
+
+/**
+ * Interactively configure the column mapping for preference CSV uploads.
+ *
+ * Scans ALL CSV files in a directory, discovers every header and unique value,
+ * then walks the user through mapping identifiers, timestamps,
+ * purpose/preference value mappings, and metadata columns.
+ * Saves the result as a reusable config.
+ *
+ * @param flags - CLI flags
+ */
+export async function configurePreferenceUpload(
+  this: LocalContext,
+  flags: ConfigurePreferenceUploadFlags,
+): Promise<void> {
+  const { auth, transcendUrl, directory, schemaFilePath } = flags;
+
+  const files = collectCsvFilesOrExit(directory, this);
+  doneInputValidation(this.process.exit);
+
+  logger.info(
+    colors.green(`Scanning ${files.length} CSV file(s) for headers and unique values...`),
+  );
+
+  // 1) Scan all files to discover the full column/value universe
+  const { headers, uniqueValuesByColumn } = await scanCsvFiles(files);
+  logger.info(colors.green(`Discovered ${headers.length} columns across all files.`));
+
+  // 2) Fetch org reference data
+  const client = buildTranscendGraphQLClient(transcendUrl, auth);
+  const { purposes, preferenceTopics, identifiers } = await loadReferenceData(client, { logger });
+
+  const allIdentifierNames = identifiers.map((id) => id.name);
+  logger.info(
+    colors.green(
+      `Loaded ${purposes.length} purposes, ${preferenceTopics.length} preference topics, ${identifiers.length} identifiers from org.`,
+    ),
+  );
+
+  // 3) Create or load persisted schema state
+  const schemaFile = computeSchemaFile(schemaFilePath, directory, files[0]);
+  const initial = {
+    columnToPurposeName: {},
+    lastFetchedAt: new Date().toISOString(),
+    columnToIdentifier: {},
+  } as const;
+  const schemaState = new PersistedState(schemaFile, FileFormatState, initial);
+
+  // 4) Interactive: select identifier columns
+  logger.info(colors.green('\n[Step 1/6] Identifier column selection...'));
+  const existingIdentifierCols = Object.keys(schemaState.getValue('columnToIdentifier'));
+  let identifierColumns: string[];
+  if (existingIdentifierCols.length > 0) {
+    logger.info(
+      colors.magenta(`Existing identifier columns: ${existingIdentifierCols.join(', ')}`),
+    );
+    const { reuse } = await inquirer.prompt<{ reuse: boolean }>([
{ + name: 'reuse', + type: 'confirm', + message: `Keep existing identifier column selection? (${existingIdentifierCols.join( + ', ', + )})`, + default: true, + }, + ]); + identifierColumns = reuse + ? existingIdentifierCols + : ( + await inquirer.prompt<{ cols: string[] }>([ + { + name: 'cols', + type: 'checkbox', + message: 'Select columns that are identifiers', + choices: headers, + validate: (v: string[]) => v.length > 0 || 'Select at least one identifier column', + }, + ]) + ).cols; + } else { + identifierColumns = ( + await inquirer.prompt<{ cols: string[] }>([ + { + name: 'cols', + type: 'checkbox', + message: 'Select columns that are identifiers', + choices: headers, + validate: (v: string[]) => v.length > 0 || 'Select at least one identifier column', + }, + ]) + ).cols; + } + + // 5) Map identifier columns to org identifier names + logger.info( + colors.green(`\n[Step 2/6] Identifier name mapping (validating sample: ${files[0]})...`), + ); + const sampleRows = readCsv(files[0], t.record(t.string, t.string)); + await parsePreferenceIdentifiersFromCsv(sampleRows, { + schemaState, + orgIdentifiers: identifiers, + allowedIdentifierNames: allIdentifierNames, + identifierColumns, + }); + + const identifierCols = Object.keys(schemaState.getValue('columnToIdentifier')); + + // 6) Select timestamp column (only needs column names, not full rows) + logger.info(colors.green('\n[Step 3/6] Timestamp column selection...')); + const timestampChoices = headers.filter((h) => !identifierCols.includes(h)); + await parsePreferenceFileFormatFromCsv( + [ + Object.fromEntries( + timestampChoices.map((h) => [h, [...(uniqueValuesByColumn[h] ?? [])][0] ?? '']), + ), + ], + schemaState, + ); + + // 7) Select which remaining columns map to purposes/preferences + logger.info(colors.green('\n[Step 4/6] Purpose/preference column selection...')); + const timestampCol = schemaState.getValue('timestampColumn'); + const mappedSoFar = [...identifierCols, ...(timestampCol ? 
[timestampCol] : [])];
+  const remainingColumns = headers.filter((h) => !mappedSoFar.includes(h));
+
+  const { purposeColumns } = await inquirer.prompt<{
+    purposeColumns: string[];
+  }>([
+    {
+      name: 'purposeColumns',
+      type: 'checkbox',
+      message: 'Select columns that map to purposes/preferences',
+      choices: remainingColumns,
+      validate: (v: string[]) => v.length > 0 || 'Select at least one purpose column',
+    },
+  ]);
+
+  const nonPurposeColumns = remainingColumns.filter((h) => !purposeColumns.includes(h));
+
+  // 8) Build synthetic rows driven ONLY by purpose column unique values
+  logger.info(colors.green('\n[Step 5/6] Mapping purpose values...'));
+  const syntheticRows = buildSyntheticRows(headers, uniqueValuesByColumn, purposeColumns);
+  logger.info(
+    colors.green(
+      `  Built ${syntheticRows.length} synthetic rows ` +
+        `(from ${purposeColumns.length} purpose columns).`,
+    ),
+  );
+
+  // 9) Map purpose columns to org purposes + value mappings
+  await parsePreferenceAndPurposeValuesFromCsv(syntheticRows, schemaState, {
+    purposeSlugs: purposes.map((p) => p.trackingType),
+    preferenceTopics,
+    forceTriggerWorkflows: false,
+    columnsToIgnore: nonPurposeColumns,
+  });
+
+  // 10) Metadata: select which remaining columns to INCLUDE as metadata
+  logger.info(colors.green('\n[Step 6/6] Metadata column selection...'));
+  if (nonPurposeColumns.length > 0) {
+    logger.info(
+      colors.magenta('\nRemaining unmapped columns:\n' + `  ${nonPurposeColumns.join(', ')}\n`),
+    );
+
+    const { metadataColumns } = await inquirer.prompt<{
+      metadataColumns: string[];
+    }>([
+      {
+        name: 'metadataColumns',
+        type: 'checkbox',
+        message: 'Select columns to INCLUDE as metadata ' + '(unselected columns will be ignored)',
+        choices: nonPurposeColumns,
+      },
+    ]);
+
+    const ignored = nonPurposeColumns.filter((c) => !metadataColumns.includes(c));
+
+    if (ignored.length > 0) {
+      schemaState.setValue(ignored, 'columnsToIgnore');
+    }
+
+    if (metadataColumns.length > 0) {
+      const columnToMetadata: Record<string, { key: string }> = {};
+      for (const col of metadataColumns) {
+        columnToMetadata[col] = { key: col };
+      }
+      schemaState.setValue(columnToMetadata, 'columnToMetadata');
+    }
+
+    logger.info(
+      colors.green(
+        `  Metadata: ${metadataColumns.length > 0 ? metadataColumns.join(', ') : '(none)'}`,
+      ),
+    );
+    logger.info(colors.green(`  Ignored: ${ignored.length > 0 ? ignored.join(', ') : '(none)'}`));
+  }
+
+  // 11) Validate completeness
+  const purposeCols = Object.keys(schemaState.getValue('columnToPurposeName'));
+  const ignoredCols = schemaState.getValue('columnsToIgnore') ?? [];
+  const metadataCols = Object.keys(schemaState.getValue('columnToMetadata') ?? {});
+  const allMapped = new Set([
+    ...identifierCols,
+    ...purposeCols,
+    ...ignoredCols,
+    ...metadataCols,
+    ...(timestampCol ? [timestampCol] : []),
+  ]);
+  const unmapped = headers.filter((h) => !allMapped.has(h));
+  if (unmapped.length > 0) {
+    logger.warn(
+      colors.yellow(
+        `Warning: the following columns are not mapped: ${unmapped.join(', ')}. ` +
+          'They will cause errors during upload. 
Re-run this command to fix.',
+      ),
+    );
+  }
+
+  schemaState.setValue(new Date().toISOString(), 'lastFetchedAt');
+
+  logger.info(colors.green(`\nConfiguration saved to: ${schemaFile}`));
+  logger.info(
+    colors.green(
+      `  Identifiers: ${identifierCols.join(', ')}\n` +
+        `  Timestamp: ${timestampCol || '(none)'}\n` +
+        `  Purpose columns: ${purposeCols.join(', ')}\n` +
+        `  Metadata: ${metadataCols.join(', ') || '(none)'}\n` +
+        `  Ignored: ${ignoredCols.join(', ') || '(none)'}`,
+    ),
+  );
+}
diff --git a/packages/cli/src/commands/consent/routes.ts b/packages/cli/src/commands/consent/routes.ts
index bd24e3d4..b3a8d816 100644
--- a/packages/cli/src/commands/consent/routes.ts
+++ b/packages/cli/src/commands/consent/routes.ts
@@ -1,6 +1,7 @@
 import { buildRouteMap } from '@stricli/core';
 
 import { buildXdiSyncEndpointCommand } from './build-xdi-sync-endpoint/command.js';
+import { configurePreferenceUploadCommand } from './configure-preference-upload/command.js';
 import { deletePreferenceRecordsCommand } from './delete-preference-records/command.js';
 import { generateAccessTokensCommand } from './generate-access-tokens/command.js';
 import { pullConsentMetricsCommand } from './pull-consent-metrics/command.js';
@@ -14,6 +15,7 @@ import { uploadPreferencesCommand } from './upload-preferences/command.js';
 export const consentRoutes = buildRouteMap({
   routes: {
     'build-xdi-sync-endpoint': buildXdiSyncEndpointCommand,
+    'configure-preference-upload': configurePreferenceUploadCommand,
     'generate-access-tokens': generateAccessTokensCommand,
     'pull-consent-metrics': pullConsentMetricsCommand,
     'pull-consent-preferences': pullConsentPreferencesCommand,
diff --git a/packages/cli/src/commands/consent/upload-preferences/artifacts/receipts/index.ts b/packages/cli/src/commands/consent/upload-preferences/artifacts/receipts/index.ts
index bf2ebba3..c51a1158 100644
--- a/packages/cli/src/commands/consent/upload-preferences/artifacts/receipts/index.ts
+++ b/packages/cli/src/commands/consent/upload-preferences/artifacts/receipts/index.ts
@@ -1,4 +1,5 @@
 export * from './readFailingUpdatesFromReceipt.js';
 export * from './summarizeReceipt.js';
+export * from './receiptsState.js';
 export * from './resolveReceiptPath.js';
 export * from './applyReceiptSummary.js';
diff --git a/packages/cli/src/commands/consent/upload-preferences/artifacts/receipts/receiptsState.ts b/packages/cli/src/commands/consent/upload-preferences/artifacts/receipts/receiptsState.ts
new file mode 100644
index 00000000..aa024555
--- /dev/null
+++ b/packages/cli/src/commands/consent/upload-preferences/artifacts/receipts/receiptsState.ts
@@ -0,0 +1,146 @@
+import { PersistedState } from '@transcend-io/persisted-state';
+import {
+  RequestUploadReceipts,
+  type FailingPreferenceUpdates,
+  type PendingSafePreferenceUpdates,
+  type PendingWithConflictPreferenceUpdates,
+  type PreferenceUpdateMap,
+  type SkippedPreferenceUpdates,
+} from '@transcend-io/sdk';
+import { retrySamePromise, type RetryPolicy } from '@transcend-io/utils';
+
+export type PreferenceReceiptsInterface = {
+  /** Path to file */
+  receiptsFilepath: string;
+  /**
+   * Get the successfully updated records
+   */
+  getSuccessful(): PreferenceUpdateMap;
+  /**
+   * Get the records pending upload
+   */
+  getPending(): PreferenceUpdateMap;
+  /**
+   * Get the failing to upload records
+   */
+  getFailing(): FailingPreferenceUpdates;
+  /**
+   * Set the new map of successful records
+   */
+  setSuccessful(next: PreferenceUpdateMap): Promise<void>;
+  /**
+   * Set the new map of pending records
+   */
+  setPending(next: 
PreferenceUpdateMap): Promise<void>;
+  /**
+   * Set the new map of safe to upload records
+   */
+  setPendingSafe(next: PendingSafePreferenceUpdates): Promise<void>;
+  /**
+   * Set the skipped records
+   */
+  setSkipped(next: SkippedPreferenceUpdates): Promise<void>;
+  /**
+   * Set the new map of conflict upload records
+   */
+  setPendingConflict(next: PendingWithConflictPreferenceUpdates): Promise<void>;
+  /**
+   * Set the new map of failing records
+   */
+  setFailing(next: FailingPreferenceUpdates): Promise<void>;
+  /**
+   * Reset the pending records
+   */
+  resetPending(): Promise<void>;
+};
+
+/**
+ * Build a receipts state adapter for the given file path.
+ *
+ * Retries creation of the underlying PersistedState with **exponential backoff**
+ * when the receipts file cannot be parsed due to a transient write (e.g., empty
+ * or partially written file) indicated by "Unexpected end of JSON input".
+ *
+ * @param filepath - Where to persist/read upload receipts
+ * @returns Receipt state port with strongly-named methods
+ */
+export async function makeReceiptsState(filepath: string): Promise<PreferenceReceiptsInterface> {
+  // Initial shape if file does not exist or is empty.
+  const initial = {
+    failingUpdates: {},
+    pendingConflictUpdates: {},
+    skippedUpdates: {},
+    pendingSafeUpdates: {},
+    successfulUpdates: {},
+    pendingUpdates: {},
+    lastFetchedAt: new Date().toISOString(),
+  } as const;
+
+  // Retry policy: only retry on the specific JSON truncation message.
+  const policy: RetryPolicy = {
+    maxAttempts: 10,
+    delayMs: 500, // start small and backoff
+    shouldRetry: (_status, message) =>
+      typeof message === 'string' && /Unexpected end of JSON input/i.test(message ?? ''),
+  };
+
+  // Exponential backoff cap to avoid unbounded waits.
+  const MAX_DELAY_MS = 5_000;
+
+  try {
+    const s = await retrySamePromise(
+      async () => {
+        // Wrap constructor in a Promise so thrown sync errors reject properly.
+        const result = await Promise.resolve(
+          new PersistedState(filepath, RequestUploadReceipts, initial),
+        );
+        return result;
+      },
+      policy,
+      // eslint-disable-next-line @typescript-eslint/no-unused-vars
+      (_note) => {
+        // Double the delay on each backoff (cap at MAX_DELAY_MS)
+        policy.delayMs = Math.min(MAX_DELAY_MS, Math.max(1, policy.delayMs * 2));
+        // Optional local diagnostics:
+        // process.stderr.write(`[receiptsState] ${_note}; next delay=${policy.delayMs}ms\n`);
+      },
+    );
+
+    return {
+      receiptsFilepath: filepath,
+      getSuccessful: () => s.getValue('successfulUpdates'),
+      getPending: () => s.getValue('pendingUpdates'),
+      getFailing: () => s.getValue('failingUpdates'),
+      async setSuccessful(v: PreferenceUpdateMap) {
+        await s.setValue(v, 'successfulUpdates');
+      },
+      async setSkipped(v: SkippedPreferenceUpdates) {
+        await s.setValue(v, 'skippedUpdates');
+      },
+      async setPending(v: PreferenceUpdateMap) {
+        await s.setValue(v, 'pendingUpdates');
+      },
+      async setPendingSafe(v: PendingSafePreferenceUpdates) {
+        await s.setValue(v, 'pendingSafeUpdates');
+      },
+      async setPendingConflict(v: PendingWithConflictPreferenceUpdates) {
+        await s.setValue(v, 'pendingConflictUpdates');
+      },
+      async setFailing(v: FailingPreferenceUpdates) {
+        await s.setValue(v, 'failingUpdates');
+      },
+      async resetPending() {
+        await s.setValue({}, 'pendingUpdates');
+        await s.setValue({}, 'pendingSafeUpdates');
+        await s.setValue({}, 'skippedUpdates');
+        await s.setValue({}, 'pendingConflictUpdates');
+      },
+    };
+  } catch (error) {
+    throw new Error(
+      `Failed to create receipts state for ${filepath}: ${
+        error instanceof Error ? 
error.message : String(error)
+      }`,
+    );
+  }
+}
diff --git a/packages/cli/src/commands/consent/upload-preferences/artifacts/receipts/tests/receiptsState.test.ts b/packages/cli/src/commands/consent/upload-preferences/artifacts/receipts/tests/receiptsState.test.ts
new file mode 100644
index 00000000..1dbd8209
--- /dev/null
+++ b/packages/cli/src/commands/consent/upload-preferences/artifacts/receipts/tests/receiptsState.test.ts
@@ -0,0 +1,53 @@
+import { mkdtemp } from 'node:fs/promises';
+import { tmpdir } from 'node:os';
+import { join } from 'node:path';
+
+import { describe, expect, it } from 'vitest';
+
+import { makeReceiptsState } from '../receiptsState.js';
+
+describe('makeReceiptsState', () => {
+  it('creates an empty receipts store and persists updates', async () => {
+    const tempDir = await mkdtemp(join(tmpdir(), 'cli-receipts-state-'));
+    const receiptsFilepath = join(tempDir, 'receipts.json');
+
+    const receipts = await makeReceiptsState(receiptsFilepath);
+
+    expect(receipts.receiptsFilepath).toBe(receiptsFilepath);
+    expect(receipts.getPending()).toEqual({});
+    expect(receipts.getSuccessful()).toEqual({});
+    expect(receipts.getFailing()).toEqual({});
+
+    await receipts.setPending({
+      row1: true,
+    });
+    await receipts.setSuccessful({
+      row2: true,
+    });
+    await receipts.setFailing({
+      row3: {
+        uploadedAt: '2025-08-07T00:00:00.000Z',
+        error: 'boom',
+        update: {
+          partition: 'partition-1',
+          timestamp: '2025-08-07T00:00:00.000Z',
+          identifiers: [{ name: 'email', value: 'test@example.com' }],
+        },
+      },
+    });
+
+    expect(receipts.getPending()).toEqual({ row1: true });
+    expect(receipts.getSuccessful()).toEqual({ row2: true });
+    expect(receipts.getFailing()).toEqual({
+      row3: {
+        uploadedAt: '2025-08-07T00:00:00.000Z',
+        error: 'boom',
+        update: {
+          partition: 'partition-1',
+          timestamp: '2025-08-07T00:00:00.000Z',
+          identifiers: [{ name: 'email', value: 'test@example.com' }],
+        },
+      },
+    });
+  });
+});
diff --git a/packages/cli/src/commands/consent/upload-preferences/buildTaskOptions.ts b/packages/cli/src/commands/consent/upload-preferences/buildTaskOptions.ts
new file mode 100644
index 00000000..c7d96050
--- /dev/null
+++ b/packages/cli/src/commands/consent/upload-preferences/buildTaskOptions.ts
@@ -0,0 +1,86 @@
+// buildTaskOptions.ts
+import type { UploadPreferencesCommandFlags } from './impl.js';
+
+/** Common options shared by upload tasks */
+export type TaskCommonOpts = Pick<
+  UploadPreferencesCommandFlags,
+  | 'auth'
+  | 'partition'
+  | 'sombraAuth'
+  | 'directory'
+  | 'transcendUrl'
+  | 'skipConflictUpdates'
+  | 'uploadConcurrency'
+  | 'uploadLogInterval'
+  | 'maxChunkSize'
+  | 'downloadIdentifierConcurrency'
+  | 'rateLimitRetryDelay'
+  | 'maxRecordsToReceipt'
+  | 'skipWorkflowTriggers'
+  | 'skipExistingRecordCheck'
+  | 'isSilent'
+  | 'dryRun'
+  | 'attributes'
+  | 'forceTriggerWorkflows'
+> & {
+  schemaFile: string;
+  receiptsFolder: string;
+};
+
+/**
+ * Copy the options from the main command over to the spawned tasks
+ *
+ * @param flags - All flags
+ * @param schemaFile - Schema file
+ * @param receiptsFolder - Receipts folder
+ * @returns Common task options
+ */
+export function buildCommonOpts(
+  flags: UploadPreferencesCommandFlags,
+  schemaFile: string,
+  receiptsFolder: string,
+): TaskCommonOpts {
+  const {
+    auth,
+    directory,
+    sombraAuth,
+    partition,
+    transcendUrl,
+    downloadIdentifierConcurrency,
+    skipConflictUpdates,
+    skipWorkflowTriggers,
+    skipExistingRecordCheck,
+    isSilent,
+    dryRun,
+    attributes,
+    forceTriggerWorkflows,
+    uploadConcurrency,
+    maxChunkSize,
+    rateLimitRetryDelay,
+    maxRecordsToReceipt,
+    uploadLogInterval,
+  } = flags;
+
+  return {
+    schemaFile,
+    receiptsFolder,
+    auth,
+    directory,
+    downloadIdentifierConcurrency,
+    sombraAuth,
+    partition,
+    transcendUrl,
+    skipConflictUpdates,
+    skipWorkflowTriggers,
+    skipExistingRecordCheck,
+    isSilent,
+    dryRun,
+    attributes,
+    forceTriggerWorkflows,
+    uploadConcurrency,
+    maxChunkSize,
+    rateLimitRetryDelay,
+    maxRecordsToReceipt,
+    uploadLogInterval,
+  };
+}
diff --git a/packages/cli/src/commands/consent/upload-preferences/command.ts b/packages/cli/src/commands/consent/upload-preferences/command.ts
index b2c83395..43ebab15 100644
--- a/packages/cli/src/commands/consent/upload-preferences/command.ts
+++ b/packages/cli/src/commands/consent/upload-preferences/command.ts
@@ -19,6 +19,7 @@ export const uploadPreferencesCommand = buildCommand({
         ScopeName.ManageStoredPreferences,
         ScopeName.ViewManagedConsentDatabaseAdminApi,
         ScopeName.ViewPreferenceStoreSettings,
+        ScopeName.ViewRequestIdentitySettings,
       ],
     }),
     partition: {
@@ -28,17 +29,10 @@
      },
      sombraAuth: createSombraAuthParameter(),
      transcendUrl: createTranscendUrlParameter(),
-      file: {
-        kind: 'parsed',
-        parse: String,
-        brief: 'Path to the CSV file to load preferences from',
-        optional: true,
-      },
       directory: {
         kind: 'parsed',
         parse: String,
         brief: 'Path to the directory of CSV files to load preferences from',
-        optional: true,
       },
       dryRun: {
         kind: 'boolean',
@@ -55,8 +49,17 @@
       receiptFileDir: {
         kind: 'parsed',
         parse: String,
-        brief: 'Directory path where the response receipts should be saved',
-        default: './receipts',
+        brief:
+          'Directory path where the response receipts should be saved. Defaults to ./receipts if a "file" is provided, or /../receipts if a "directory" is provided.',
+        optional: true,
+      },
+      schemaFilePath: {
+        kind: 'parsed',
+        parse: String,
+        brief:
+          'The path to where the schema for the file should be saved. If file is provided, it will default to ./-preference-upload-schema.json ' +
+          'If directory is provided, it will default to /../preference-upload-schema.json',
+        optional: true,
       },
       skipWorkflowTriggers: {
         kind: 'boolean',
@@ -95,17 +98,104 @@
       concurrency: {
         kind: 'parsed',
         parse: numberParser,
-        brief: 'The concurrency to use when uploading in parallel',
+        brief:
+          'The number of concurrent processes to use to upload the files. When this is not set, it defaults ' +
+          'to the number of CPU cores available on the machine. ' +
+          'e.g. if there are 5 concurrent processes for 15 files, each parallel job would get 3 files to process. ',
+        optional: true,
+      },
+      uploadConcurrency: {
+        kind: 'parsed',
+        parse: numberParser,
+        brief:
+          'When uploading preferences to v1/preferences - this is the number of concurrent requests made at any given time by a single process. ' +
+          "This is NOT the batch size—it's how many batch *tasks* run in parallel. " +
+          'The number of total concurrent requests is maxed out at concurrency * uploadConcurrency.',
+        default: '75', // FIXME 25
+      },
+      maxChunkSize: {
+        kind: 'parsed',
+        parse: numberParser,
+        brief:
+          'When uploading preferences to v1/preferences - this is the maximum number of records to put in a single request. ' +
+ + 'The number of total concurrent records being put in at any one time is maxed out at maxChunkSize * concurrency * uploadConcurrency.', + default: '25', + }, + rateLimitRetryDelay: { + kind: 'parsed', + parse: numberParser, + brief: + 'When uploading preferences to v1/preferences - this is the number of milliseconds to wait before retrying a request that was rate limited. ' + + 'This is only used if the request is rate limited by the Transcend API. ' + + 'If the request fails for any other reason, it will not be retried. ', + default: '3000', + }, + uploadLogInterval: { + kind: 'parsed', + parse: numberParser, + brief: + 'When uploading preferences to v1/preferences - this is the number of records after which to log progress. ' + + 'Output will be logged to console and also to the receipt file. ' + + 'Setting this value lower will allow you to more easily pick up where you left off. ' + + 'Setting this value higher can avoid excessive i/o operations slowing down the upload. ' + + 'Default is a good optimization for most cases.', + default: '1000', + }, + downloadIdentifierConcurrency: { + kind: 'parsed', + parse: numberParser, + brief: + 'When downloading identifiers for the upload - this is the number of concurrent requests to make. ' + + 'This is only used if the records are not already cached in the preference store. ', + default: '30', + }, + maxRecordsToReceipt: { + kind: 'parsed', + parse: numberParser, + brief: + 'When writing out successful and pending records to the receipt file - this is the maximum number of records to write out. ' + + 'This is to avoid the receipt file getting too large for JSON.parse/stringify.', default: '10', }, + regenerate: { + kind: 'boolean', + brief: + 'Force re-generation of the schema config file before uploading. ' + + 'Runs the interactive configure flow even if a config already exists.', + default: false, + }, + chunkSizeMB: { + kind: 'parsed', + parse: numberParser, + brief: + 'Auto-chunk threshold in MB. Any CSV file larger than this will be ' + + 'split into smaller files before uploading. Set to 0 to disable.', + default: '11', + }, + viewerMode: { + kind: 'boolean', + brief: 'Run in non-interactive viewer mode (no attach UI, auto-artifacts)', + default: false, + }, }, }, docs: { brief: 'Upload preference management data to your Preference Store', fullDescription: `Upload preference management data to your Preference Store. -This command prompts you to map the shape of the CSV to the shape of the Transcend API. There is no requirement for the shape of the incoming CSV, as the script will handle the mapping process. +Requires a config file (generated by 'configure-preference-upload') that maps +CSV columns to identifiers, purposes, and preferences. If no config exists, +pass --regenerate to run the interactive configure flow first. + +Large files are automatically chunked into smaller pieces (controlled by +--chunkSizeMB) before uploading. -The script will also produce a JSON cache file that allows for the mappings to be preserved between runs.`, +Parallel preference uploader (Node 22+ ESM/TS) +----------------------------------------------------------------------------- +- Spawns a pool of child *processes* (not threads) to run uploads in parallel. +- Shows a live dashboard in the parent terminal with progress per worker. +- Creates per-worker log files and (optionally) opens OS terminals to tail them.
+- Uses the same module as both parent and child; the child mode is toggled + by the presence of a CLI flag ('--as-child').`, }, }); diff --git a/packages/cli/src/commands/consent/upload-preferences/impl.ts b/packages/cli/src/commands/consent/upload-preferences/impl.ts index 9505153a..7c268204 100644 --- a/packages/cli/src/commands/consent/upload-preferences/impl.ts +++ b/packages/cli/src/commands/consent/upload-preferences/impl.ts @@ -1,103 +1,186 @@ -import { readdirSync } from 'node:fs'; -import { basename, join } from 'node:path'; +import { statSync, existsSync } from 'node:fs'; +import { join } from 'node:path'; -import { map } from '@transcend-io/utils'; -import { splitCsvToList } from '@transcend-io/utils'; +import { + chunkOneCsvFile, + CHILD_FLAG, + type PoolHooks, + runPool, + buildExportStatus, + computePoolSize, + resolveWorkerPath, + PoolCancelledError, +} from '@transcend-io/utils'; import colors from 'colors'; import type { LocalContext } from '../../../context.js'; import { doneInputValidation } from '../../../lib/cli/done-input-validation.js'; -import { uploadPreferenceManagementPreferencesInteractive } from '../../../lib/preference-management/index.js'; +import { collectCsvFilesOrExit } from '../../../lib/helpers/collectCsvFilesOrExit.js'; +import { + dashboardPlugin, + createExtraKeyHandler, + installInteractiveSwitcher, +} from '../../../lib/pooling/index.js'; import { logger } from '../../../logger.js'; +import { + computeReceiptsFolder, + computeSchemaFile, + ExportManager, + writeFailingUpdatesCsv, + type FailingUpdateRow, +} from './artifacts/index.js'; +import { applyReceiptSummary } from './artifacts/receipts/index.js'; +import { buildCommonOpts } from './buildTaskOptions.js'; +import { + AnyTotals, + isUploadModeTotals, + isCheckModeTotals, + uploadPreferencesPlugin, +} from './ui/index.js'; + +/** + * A unit of work: instructs a worker to upload (or check) a single CSV file. + */ +export type UploadPreferencesTask = { + /** Absolute path of the CSV file to process. */ + filePath: string; + /** Command/worker options shared across tasks (built from CLI flags). */ + options: ReturnType; +}; + +/** + * Per-worker progress snapshot emitted by the worker. + * This mirrors the previous IPC progress payload for this command. + */ +export type UploadPreferencesProgress = { + /** File currently being processed. */ + filePath: string; + /** New successes since the last progress message (used to compute rates). */ + successDelta?: number; + /** Cumulative successes so far for the current file. */ + successTotal?: number; + /** Optional total row count for the file (if known). */ + fileTotal?: number; +}; + +/** + * Final result for a single file. + */ +export type UploadPreferencesResult = { + /** Success flag for the file. */ + ok: boolean; + /** File this result pertains to. */ + filePath: string; + /** Optional path to the worker-generated receipt file. */ + receiptFilepath?: string; + /** Optional error string when `ok === false`. */ + error?: string; +}; + +/** + * Aggregate totals shown in the dashboard. + * This command supports two modes: + * - upload mode totals + * - check mode totals + * + * The union is already defined in `./ui` as `AnyTotals`. 
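The "same module as both parent and child" pattern described in the command docs is easier to see in miniature. A minimal sketch, assuming only Node's built-in `child_process`; the flag name comes from the docs above, everything else is illustrative:

```ts
import { fork } from 'node:child_process';
import { fileURLToPath } from 'node:url';

const CHILD_FLAG = '--as-child';

if (process.argv.includes(CHILD_FLAG)) {
  // Child mode: do the work and report back to the parent over IPC.
  process.send?.({ type: 'ready' });
} else {
  // Parent mode: fork this same module as a worker process.
  const self = fileURLToPath(import.meta.url);
  const worker = fork(self, [CHILD_FLAG]);
  worker.on('message', (msg) => {
    // A real runner would dispatch tasks here; this just logs.
    console.log('from child:', msg);
  });
}
```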
+ */ +type Totals = AnyTotals; export interface UploadPreferencesCommandFlags { auth: string; partition: string; sombraAuth?: string; transcendUrl: string; - file?: string; - directory?: string; + directory: string; dryRun: boolean; skipExistingRecordCheck: boolean; - receiptFileDir: string; + receiptFileDir?: string; + schemaFilePath?: string; skipWorkflowTriggers: boolean; forceTriggerWorkflows: boolean; skipConflictUpdates: boolean; isSilent: boolean; attributes: string; receiptFilepath: string; - concurrency: number; + concurrency?: number; + uploadConcurrency: number; + maxChunkSize: number; + rateLimitRetryDelay: number; + uploadLogInterval: number; + downloadIdentifierConcurrency: number; + maxRecordsToReceipt: number; + regenerate: boolean; + chunkSizeMB: number; + viewerMode: boolean; } +/** + * Parent entrypoint for uploading/checking many preference CSVs in parallel. + * + * Flow: + * 1) Validate inputs & discover CSV files (exit if none). + * 2) Compute pool size from `--concurrency` or CPU heuristic. + * 3) Build `common` worker options and task queue (one task per file). + * 4) Define `PoolHooks` for task scheduling, progress, and results aggregation. + * 5) Launch the pool with `runPool`, rendering via `dashboardPlugin(uploadPreferencesPlugin)`. + * + * All log exporting / artifact work that used to be done in “viewer mode” can be handled + * in `postProcess` using the new log context from the runner. + * + * @param flags - CLI options for the run. + * @returns Promise that resolves when the pool completes. + */ export async function uploadPreferences( this: LocalContext, - { + flags: UploadPreferencesCommandFlags, +): Promise { + const { auth, partition, sombraAuth, transcendUrl, - file = '', directory, - dryRun, skipExistingRecordCheck, receiptFileDir, - skipWorkflowTriggers, - forceTriggerWorkflows, - skipConflictUpdates, - isSilent, - attributes, + schemaFilePath, concurrency, - }: UploadPreferencesCommandFlags, -): Promise { - if (!!directory && !!file) { - logger.error( - colors.red('Cannot provide both a directory and a file. Please provide only one.'), - ); - this.process.exit(1); - } - - if (!file && !directory) { - logger.error( - colors.red( - 'A file or directory must be provided. 
Please provide one using --file=./preferences.csv or --directory=./preferences', - ), - ); - this.process.exit(1); - } + regenerate, + chunkSizeMB, + viewerMode, + } = flags; + /* 1) Validate & find inputs */ + let files = collectCsvFilesOrExit(directory, this); doneInputValidation(this.process.exit); - const files: string[] = []; - - if (directory) { - try { - const filesInDirectory = readdirSync(directory); - const csvFiles = filesInDirectory.filter((file) => file.endsWith('.csv')); - - if (csvFiles.length === 0) { - logger.error(colors.red(`No CSV files found in directory: ${directory}`)); - this.process.exit(1); + /* 1b) Auto-chunk oversized files */ + if (chunkSizeMB > 0) { + const chunkThreshold = chunkSizeMB * 1024 * 1024; + const oversized = files.filter((f) => { + try { + return statSync(f).size > chunkThreshold; + } catch { + return false; } - - // Add full paths for each CSV file - files.push(...csvFiles.map((file) => join(directory, file))); - } catch (err) { - logger.error(colors.red(`Failed to read directory: ${directory}`)); - logger.error(colors.red((err as Error).message)); - this.process.exit(1); - } - } else { - try { - // Verify file exists and is a CSV - if (!file.endsWith('.csv')) { - logger.error(colors.red('File must be a CSV file')); - this.process.exit(1); + }); + if (oversized.length > 0) { + logger.info( + colors.yellow(`Auto-chunking ${oversized.length} file(s) exceeding ${chunkSizeMB}MB...`), + ); + for (const file of oversized) { + await chunkOneCsvFile({ + filePath: file, + outputDir: directory, + clearOutputDir: false, + chunkSizeMB, + logger, + // eslint-disable-next-line @typescript-eslint/no-empty-function + onProgress: () => {}, + }); } - files.push(file); - } catch (err) { - logger.error(colors.red(`Failed to access file: ${file}`)); - logger.error(colors.red((err as Error).message)); - this.process.exit(1); + // Re-collect after chunking (new chunk files will be in the directory) + files = collectCsvFilesOrExit(directory, this); } } @@ -106,32 +189,223 @@ export async function uploadPreferences( `Processing ${files.length} consent preferences files for partition: ${partition}`, ), ); - logger.debug(`Files to process: ${files.join(', ')}`); + logger.debug( + `Files to process:\n${files.slice(0, 10).join('\n')}\n${ + files.length > 10 ? `... 
and ${files.length - 10} more` : '' + }`, + ); if (skipExistingRecordCheck) { - logger.info(colors.bgYellow(`Skipping existing record check: ${skipExistingRecordCheck}`)); + logger.info(colors.bgYellow('Skipping existing record check: true')); } - await map( - files, - async (filePath) => { - const fileName = basename(filePath).replace('.csv', ''); - await uploadPreferenceManagementPreferencesInteractive({ - receiptFilepath: join(receiptFileDir, `${fileName}-receipts.json`), + const receiptsFolder = computeReceiptsFolder(receiptFileDir, directory); + const schemaFile = computeSchemaFile(schemaFilePath, directory, files[0]); + + /* 1c) Auto-configure if needed */ + const configExists = existsSync(schemaFile); + if (!configExists || regenerate) { + if (!configExists && !regenerate) { + logger.error( + colors.red( + `No config file found at: ${schemaFile}\n` + + "Run 'transcend consent configure-preference-upload' to create one, " + + 'or pass --regenerate to run the interactive setup now.', + ), + ); + this.process.exit(1); + } + if (regenerate) { + logger.info(colors.yellow('Running interactive config generation...')); + const { configurePreferenceUpload } = await import('../configure-preference-upload/impl.js'); + await configurePreferenceUpload.call(this, { auth, sombraAuth, - file: filePath, - partition, transcendUrl, - skipConflictUpdates, - skipWorkflowTriggers, - skipExistingRecordCheck, - isSilent, - dryRun, - attributes: splitCsvToList(attributes), - forceTriggerWorkflows, + directory, + schemaFilePath, + partition, + }); + } + } + + /* 2) Pool size */ + const { poolSize, cpuCount } = computePoolSize(concurrency, files.length); + + /* 3) Build shared worker options and queue */ + const common = buildCommonOpts(flags, schemaFile, receiptsFolder); + + // FIFO queue: one task per file + const queue = files.map((filePath) => ({ + filePath, + options: common, + })); + + // Dashboard artifacts/export status (shown during renders) + // inside uploadPreferences() before runPool call: + const exportMgr = new ExportManager(receiptsFolder); + const exportStatus = buildExportStatus(receiptsFolder); + const failingUpdatesMem: FailingUpdateRow[] = []; + + /* 4) Hooks */ + const hooks: PoolHooks< + UploadPreferencesTask, + UploadPreferencesProgress, + UploadPreferencesResult, + Totals + > = { + nextTask: () => queue.shift(), + taskLabel: (t) => t.filePath, + initTotals: () => + !common.dryRun + ? ({ + mode: 'upload', + success: 0, + skipped: 0, + error: 0, + errors: {}, + } as Totals) + : ({ + mode: 'check', + totalPending: 0, + pendingConflicts: 0, + pendingSafe: 0, + skipped: 0, + } as Totals), + initSlotProgress: () => undefined, + onProgress: (totals) => totals, + onResult: (totals, res) => { + applyReceiptSummary({ + receiptsFolder: common.receiptsFolder, + filePath: res.filePath, + receiptFilepath: res.receiptFilepath, + agg: totals, + dryRun: common.dryRun, + failingUpdatesMem, }); + return { totals, ok: !!res.ok }; }, - { concurrency }, - ); + exportStatus: () => exportStatus, + /** + * Finalization after all workers exit. + * With the new runner you also receive: + * - logDir + * - logsBySlot (Map) + * - startedAt / finishedAt + * - getLogPathsForSlot(id) + * - viewerMode (boolean) + * + * @param options - Options with logDir, logsBySlot, startedAt, finishedAt, etc. + */ + postProcess: async ({ totals, logsBySlot }) => { + try { + // Persist failing updates CSV next to receipts/logDir. 
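The scheduling contract behind the "FIFO queue: one task per file" comment above is worth isolating. A stripped-down sketch with simplified types; the real `PoolHooks` also carries progress snapshots and totals:

```ts
type Task = { filePath: string };

// One task per file, consumed FIFO: whichever worker asks first gets the
// next file; returning undefined tells that worker it can exit.
const files = ['a.csv', 'b.csv', 'c.csv']; // illustrative
const queue: Task[] = files.map((filePath) => ({ filePath }));
const nextTask = (): Task | undefined => queue.shift();
```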
+ const fPath = join(receiptsFolder, 'failing-updates.csv'); + await writeFailingUpdatesCsv(failingUpdatesMem, fPath); + exportStatus.failuresCsv = { + path: fPath, + savedAt: Date.now(), + exported: true, + }; + + // Save logs + await Promise.all([ + exportMgr.exportCombinedLogs(logsBySlot, 'error'), + exportMgr.exportCombinedLogs(logsBySlot, 'warn'), + exportMgr.exportCombinedLogs(logsBySlot, 'info'), + exportMgr.exportCombinedLogs(logsBySlot, 'all'), + ]); + + // Summarize totals to stdout (parity with the old implementation) + if (isUploadModeTotals(totals)) { + logger.info( + colors.green( + `All done. Success:${totals.success.toLocaleString()} ` + + `Skipped:${totals.skipped.toLocaleString()} ` + + `Error:${totals.error.toLocaleString()}`, + ), + ); + } else if (isCheckModeTotals(totals)) { + logger.info( + colors.green( + `All done. Pending:${totals.totalPending.toLocaleString()} ` + + `PendingConflicts:${totals.pendingConflicts.toLocaleString()} ` + + `PendingSafe:${totals.pendingSafe.toLocaleString()} ` + + `Skipped:${totals.skipped.toLocaleString()}`, + ), + ); + } + } catch (err: unknown) { + logger.error(colors.red(`Failed to export artifacts: ${String(err)}`)); + } + }, + }; + + /* 5) Launch the pool runner with our hooks and dashboard plugin. */ + await runPool({ + title: `Upload Preferences - ${directory}`, + baseDir: directory || receiptsFolder || process.cwd(), + childFlag: CHILD_FLAG, + childModulePath: resolveWorkerPath( + import.meta.url, + 'commands/consent/upload-preferences/worker.mjs', + ), + poolSize, + cpuCount, + filesTotal: files.length, + hooks, + viewerMode, + render: (input) => dashboardPlugin(input, uploadPreferencesPlugin), + installInteractiveSwitcher: viewerMode + ? undefined + : ({ + workers, + onCtrlC, + getLogPaths, + replayBytes: rb, + replayWhich: rw, + setPaused, + repaint: rp, + }) => + installInteractiveSwitcher({ + workers, + onCtrlC, + getLogPaths, + replayBytes: rb, + replayWhich: rw, + onAttach: () => setPaused(true), + onDetach: () => { + setPaused(false); + rp(); + }, + onEnterAttachScreen: (id) => { + setPaused(true); + process.stdout.write('\x1b[2J\x1b[H'); + process.stdout.write( + `Attached to worker ${id}. 
(Esc/Ctrl+] detach \u2022 Ctrl+D EOF \u2022 Ctrl+C SIGINT)\n`, + ); + }, + }), + extraKeyHandler: ({ logsBySlot, repaint, setPaused }) => + createExtraKeyHandler({ + logsBySlot, + repaint, + setPaused, + exportMgr, + exportStatus, + custom: { + F: async ({ noteExport, say }) => { + const fPath = join(receiptsFolder, 'failing-updates.csv'); + await writeFailingUpdatesCsv(failingUpdatesMem, fPath); + say(`\nWrote failing updates CSV to: ${fPath}`); + noteExport('failuresCsv', fPath); + }, + }, + }), + }).catch((err) => { + if (err instanceof PoolCancelledError) { + process.exit(130); + } + throw err; + }); } diff --git a/packages/cli/src/commands/consent/upload-preferences/readme.ts b/packages/cli/src/commands/consent/upload-preferences/readme.ts index 4a7d2097..9efb5ac7 100644 --- a/packages/cli/src/commands/consent/upload-preferences/readme.ts +++ b/packages/cli/src/commands/consent/upload-preferences/readme.ts @@ -9,7 +9,7 @@ const examples = buildExamples( 'Upload consent preferences to partition key `4d1c5daa-90b7-4d18-aa40-f86a43d2c726`', flags: { auth: '$TRANSCEND_API_KEY', - file: './preferences.csv', + directory: './examples/pm-test', partition: '4d1c5daa-90b7-4d18-aa40-f86a43d2c726', }, }, @@ -18,7 +18,7 @@ const examples = buildExamples( flags: { auth: '$TRANSCEND_API_KEY', partition: '4d1c5daa-90b7-4d18-aa40-f86a43d2c726', - file: './preferences.csv', + directory: './examples/pm-test', dryRun: true, skipWorkflowTriggers: true, skipConflictUpdates: true, @@ -32,7 +32,7 @@ const examples = buildExamples( flags: { auth: '$TRANSCEND_API_KEY', partition: '4d1c5daa-90b7-4d18-aa40-f86a43d2c726', - file: './preferences.csv', + directory: './examples/pm-test', transcendUrl: 'https://api.us.transcend.io', }, }, diff --git a/packages/cli/src/commands/consent/upload-preferences/schemaState.ts b/packages/cli/src/commands/consent/upload-preferences/schemaState.ts new file mode 100644 index 00000000..5380aca8 --- /dev/null +++ b/packages/cli/src/commands/consent/upload-preferences/schemaState.ts @@ -0,0 +1,88 @@ +import { PersistedState } from '@transcend-io/persisted-state'; +import { + FileFormatState, + type ColumnIdentifierMap, + type ColumnMetadataMap, + type ColumnPurposeMap, +} from '@transcend-io/sdk'; +import { retrySamePromise, type RetryPolicy } from '@transcend-io/utils'; + +export interface PreferenceSchemaInterface { + /** Name of the column used as timestamp, if any */ + getTimestampColumn(): string | undefined; + /** CSV column name -> Purpose/Preference mapping */ + getColumnToPurposeName(): ColumnPurposeMap; + /** CSV column name -> Identifier mapping */ + getColumnToIdentifier(): ColumnIdentifierMap; + /** CSV column name -> Metadata key mapping */ + getColumnToMetadata(): ColumnMetadataMap | undefined; + /** CSV columns to ignore during upload */ + getColumnsToIgnore(): string[]; + /** The persisted cache */ // FIXME remove this + state: PersistedState; +} + +/** + * Build a schema state adapter holding CSV→purpose/identifier mappings. + * + * Retries creation of the underlying PersistedState with **exponential backoff** + * when the cache file cannot be parsed due to a transient write (e.g., empty or + * partially written file) indicated by "Unexpected end of JSON input". + * + * @param filepath - Path to the schema cache file + * @returns Schema state port with strongly-named methods + */ +export async function makeSchemaState(filepath: string): Promise { + // Initial state used if file does not exist or is empty. 
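Before reading `makeSchemaState` below, it helps to see what its retry policy works out to in practice. A minimal sketch of the doubling-with-cap schedule described in the JSDoc, using the constants from this file (50ms start, 2s cap, 5 attempts):

```ts
// Reproduces the backoff schedule: start at 50ms, double each retry, cap at 2s.
let delayMs = 50;
const MAX_DELAY_MS = 2_000;
const schedule: number[] = [];
for (let attempt = 1; attempt <= 5; attempt += 1) {
  schedule.push(delayMs);
  delayMs = Math.min(MAX_DELAY_MS, delayMs * 2);
}
// schedule => [50, 100, 200, 400, 800]
```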
+ const initial = { + columnToPurposeName: {}, + lastFetchedAt: new Date().toISOString(), + columnToIdentifier: {}, + } as const; + + // Retry policy: only retry on the specific JSON truncation message. + const policy: RetryPolicy = { + maxAttempts: 5, + delayMs: 50, // start small + shouldRetry: (_status, message) => + typeof message === 'string' && /Unexpected end of JSON input/i.test(message ?? ''), + }; + + // Exponential backoff with a reasonable cap. + const MAX_DELAY_MS = 2_000; + + try { + const state = await retrySamePromise( + async () => { + // Wrap constructor in a Promise so thrown sync errors reject properly. + const result = await Promise.resolve( + new PersistedState(filepath, FileFormatState, initial), + ); + return result; + }, + policy, + // eslint-disable-next-line @typescript-eslint/no-unused-vars + (note) => { + // Double the delay on each backoff (cap at MAX_DELAY_MS) + policy.delayMs = Math.min(MAX_DELAY_MS, Math.max(1, policy.delayMs * 2)); + // Optional: uncomment for local diagnostics + // process.stderr.write(`[schemaState] ${note}; next delay=${policy.delayMs}ms\n`); + }, + ); + + return { + state, + getTimestampColumn: (): string | undefined => state.getValue('timestampColumn'), + getColumnToPurposeName: (): ColumnPurposeMap => state.getValue('columnToPurposeName'), + getColumnToIdentifier: (): ColumnIdentifierMap => state.getValue('columnToIdentifier'), + getColumnToMetadata: (): ColumnMetadataMap | undefined => state.getValue('columnToMetadata'), + getColumnsToIgnore: (): string[] => state.getValue('columnsToIgnore') ?? [], + }; + } catch (err) { + throw new Error( + `Failed to create schema state from ${filepath}: ${ + err instanceof Error ? err.message : String(err) + }`, + ); + } +} diff --git a/packages/cli/src/commands/consent/upload-preferences/upload/buildInteractiveUploadPlan.ts b/packages/cli/src/commands/consent/upload-preferences/upload/buildInteractiveUploadPlan.ts new file mode 100644 index 00000000..a345254f --- /dev/null +++ b/packages/cli/src/commands/consent/upload-preferences/upload/buildInteractiveUploadPlan.ts @@ -0,0 +1,183 @@ +import { + loadReferenceData, + type FileFormatState, + type PendingSafePreferenceUpdates, + type PendingWithConflictPreferenceUpdates, + type PreferenceUploadProgress, + type PreferenceUploadReferenceData, + type SkippedPreferenceUpdates, +} from '@transcend-io/sdk'; +import { limitRecords } from '@transcend-io/utils'; +import colors from 'colors'; +import type { Got } from 'got'; +import type { GraphQLClient } from 'graphql-request'; +import * as t from 'io-ts'; + +import type { FormattedAttribute } from '../../../../lib/graphql/formatAttributeValues.js'; +import { parsePreferenceManagementCsvWithCache } from '../../../../lib/preference-management/index.js'; +import { parseAttributesFromString, readCsv } from '../../../../lib/requests/index.js'; +import { logger } from '../../../../logger.js'; +import { type PreferenceReceiptsInterface } from '../artifacts/receipts/receiptsState.js'; +import { type PreferenceSchemaInterface } from '../schemaState.js'; + +export interface InteractiveUploadPreferencePlan { + /** CSV file path to load preference records from */ + file: string; + /** Partition key used throughout the upload */ + partition: string; + + /** Parsed "workflow attributes" (Key:Value pairs) */ + parsedAttributes: FormattedAttribute[]; + /** Reference data for transforming rows → PreferenceUpdateItem payloads */ + references: PreferenceUploadReferenceData; + /** Result sets derived entirely from 
validation/pre-processing */ + result: { + pendingSafeUpdates: PendingSafePreferenceUpdates; + pendingConflictUpdates: PendingWithConflictPreferenceUpdates; + skippedUpdates: SkippedPreferenceUpdates; + }; + + /** Snapshot of schema mappings to use during payload building */ + schema: Omit; +} + +/** + * Build an InteractiveUploadPreferencePlan by performing *validation-only* work. + * + * This performs *all pre-processing and validation* up front: + * - Reads the CSV + * - Validates timestamp column and identifier mappings (schema cache) + * - Maps columns to purposes/preferences + * - Loads current consent records (unless skipExistingRecordCheck=true) + * - Computes: pendingSafeUpdates / pendingConflictUpdates / skippedUpdates + * - Seeds the receipts file with snapshots of the pending sets + * + * The returned plan can be passed to `interactivePreferenceUploaderFromPlan` + * to perform the actual upload, keeping responsibilities cleanly separated. + * + * @param opts - Input options required to parse & validate the CSV + * @returns A fully-resolved plan ready to pass to the uploader + */ +export async function buildInteractiveUploadPreferencePlan({ + sombra, + client, + file, + partition, + receipts, + schema, + skipExistingRecordCheck = false, + forceTriggerWorkflows = false, + allowedIdentifierNames, + downloadIdentifierConcurrency = 30, + identifierDownloadLogInterval = 10000, + maxRecordsToReceipt = 50, + identifierColumns, + columnsToIgnore = [], + attributes = [], + nonInteractive = false, + onProgress, +}: { + /** Transcend GraphQL client */ + client: GraphQLClient; + /** Sombra instance to make requests to */ + sombra: Got; + /** CSV file to process */ + file: string; + /** Partition used to scope reads/writes */ + partition: string; + /** Receipts snapshots */ + receipts: PreferenceReceiptsInterface; + /** Schema information */ + schema: PreferenceSchemaInterface; + /** Skip the preflight existing-record check for speed (initial loads only) */ + skipExistingRecordCheck?: boolean; + /** Force workflow triggers; requires existing consent records for all rows */ + forceTriggerWorkflows?: boolean; + /** Concurrency for downloading identifiers */ + downloadIdentifierConcurrency?: number; + /** Allowed identifier names configured for the org/run */ + allowedIdentifierNames: string[]; + /** CSV columns that correspond to identifiers */ + identifierColumns: string[]; + /** CSV columns to ignore entirely */ + columnsToIgnore?: string[]; + /** Extra workflow attributes (pre-parsed Key:Value strings) */ + attributes?: string[]; + /** Interval to log when downloading identifiers */ + identifierDownloadLogInterval?: number; + /** Maximum records to write out to the receipt file */ + maxRecordsToReceipt?: number; + /** When true, throw instead of prompting (for worker processes) */ + nonInteractive?: boolean; + /** on progress callback */ + onProgress?: (info: PreferenceUploadProgress) => void; +}): Promise { + const parsedAttributes = parseAttributesFromString(attributes); + + // Informative status about prior runs (resume/diagnostics) + const failing = receipts.getFailing(); + const pending = receipts.getPending(); + logger.info( + colors.magenta( + 'Restored cache:\n' + + `${Object.values(failing).length} failing requests queued for retry\n` + + `${Object.values(pending).length} pending requests to process\n` + + `Processing file: ${file}\n`, + ), + ); + + // Build clients + reference data (purposes/topics/identifiers) + const references = await loadReferenceData(client, { logger }); + + 
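The plan/execute split that `buildInteractiveUploadPreferencePlan` implements is the core design choice here: phase one validates and classifies, phase two only uploads. A generic sketch of the same separation, with illustrative types rather than the actual CLI interfaces:

```ts
type UploadPlan = { safe: string[]; skipped: string[] };

// Phase 1: validation only. Classify rows; never touch the network.
function buildPlan(rows: string[]): UploadPlan {
  return {
    safe: rows.filter((row) => row.includes('@')),
    skipped: rows.filter((row) => !row.includes('@')),
  };
}

// Phase 2: upload only. Consumes a pre-validated plan; no parsing here.
async function executePlan(
  plan: UploadPlan,
  put: (row: string) => Promise<void>,
): Promise<void> {
  for (const row of plan.safe) {
    await put(row);
  }
}
```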
// Read in the file + logger.info(colors.magenta(`Reading in file: "${file}"`)); + const preferences = readCsv(file, t.record(t.string, t.string)); + logger.info(colors.magenta(`Read in ${preferences.length} rows`)); + + // Parse & validate CSV → derive safe/conflict/skipped sets (no uploading) + const parsed = await parsePreferenceManagementCsvWithCache( + preferences, + { + file, + purposeSlugs: references.purposes.map((x) => x.trackingType), + preferenceTopics: references.preferenceTopics, + sombra, + partitionKey: partition, + skipExistingRecordCheck, + forceTriggerWorkflows, + orgIdentifiers: references.identifiers, + allowedIdentifierNames, + downloadIdentifierConcurrency, + identifierColumns, + identifierDownloadLogInterval, + columnsToIgnore, + onProgress, + nonInteractive, + }, + schema.state, + ); + + // Persist small snapshots of the pending sets into receipts for resumability. + await receipts.setPendingSafe(limitRecords(parsed.pendingSafeUpdates, maxRecordsToReceipt)); + await receipts.setSkipped(parsed.skippedUpdates); + await receipts.setPendingConflict(parsed.pendingConflictUpdates); + + // Return a compact, self-contained plan for the upload stage. + return { + file, + partition, + parsedAttributes, + references, + result: { + pendingSafeUpdates: parsed.pendingSafeUpdates, + pendingConflictUpdates: parsed.pendingConflictUpdates, + skippedUpdates: parsed.skippedUpdates, + }, + schema: { + timestampColumn: schema.getTimestampColumn(), + columnToPurposeName: schema.getColumnToPurposeName(), + columnToIdentifier: schema.getColumnToIdentifier(), + columnToMetadata: schema.getColumnToMetadata(), + }, + }; +} diff --git a/packages/cli/src/commands/consent/upload-preferences/upload/index.ts b/packages/cli/src/commands/consent/upload-preferences/upload/index.ts new file mode 100644 index 00000000..1d484547 --- /dev/null +++ b/packages/cli/src/commands/consent/upload-preferences/upload/index.ts @@ -0,0 +1,2 @@ +export * from './buildInteractiveUploadPlan.js'; +export * from './interactivePreferenceUploaderFromPlan.js'; diff --git a/packages/cli/src/commands/consent/upload-preferences/upload/interactivePreferenceUploaderFromPlan.ts b/packages/cli/src/commands/consent/upload-preferences/upload/interactivePreferenceUploaderFromPlan.ts new file mode 100644 index 00000000..fca4ead4 --- /dev/null +++ b/packages/cli/src/commands/consent/upload-preferences/upload/interactivePreferenceUploaderFromPlan.ts @@ -0,0 +1,346 @@ +import type { PreferenceUpdateItem } from '@transcend-io/privacy-types'; +import { + uploadChunkWithSplit, + buildPendingUpdates, + type PreferenceUploadProgress, +} from '@transcend-io/sdk'; +import { extractErrorMessage, limitRecords } from '@transcend-io/utils'; +import Bluebird from 'bluebird'; +/* eslint-disable no-param-reassign */ +import colors from 'colors'; +import type { Got } from 'got'; +import { chunk, groupBy } from 'lodash-es'; + +import { RETRYABLE_BATCH_STATUSES } from '../../../../constants.js'; +import { logger } from '../../../../logger.js'; +import type { PreferenceReceiptsInterface } from '../artifacts/receipts/index.js'; +import type { InteractiveUploadPreferencePlan } from './buildInteractiveUploadPlan.js'; + +const { map: pMap } = Bluebird; + +/** + * Execute the upload using a pre-built InteractiveUploadPlan. + * + * This function performs *no CSV parsing or validation*. 
It: + * - Converts pre-validated safe/conflict sets into PreferenceUpdateItem payloads + * - Batches + uploads with retry/split semantics + * - Writes progress snapshots to receipts + * + * @param plan - Output of `buildInteractiveUploadPlan` + * @param options - Upload-only options (batch size, concurrency, etc.) + */ +export async function interactivePreferenceUploaderFromPlan( + { + partition, + parsedAttributes, + references: { purposes, preferenceTopics }, + result: { pendingSafeUpdates, pendingConflictUpdates }, + schema, + }: InteractiveUploadPreferencePlan, + { + receipts, + sombra, + dryRun = false, + isSilent = true, + skipWorkflowTriggers = false, + skipConflictUpdates = false, + forceTriggerWorkflows = false, + uploadLogInterval = 1_000, + maxChunkSize = 25, + uploadConcurrency = 20, + maxRecordsToReceipt = 50, + onProgress, + }: { + /** Receipts interface */ + receipts: PreferenceReceiptsInterface; + /** Sombra got instance */ + sombra: Got; + /** Compute-only mode: do not PUT; still writes receipts snapshots */ + dryRun?: boolean; + /** Avoid downstream visible notifications */ + isSilent?: boolean; + /** Skip workflow triggers for each update */ + skipWorkflowTriggers?: boolean; + /** Only upload safe updates (ignore conflicts entirely) */ + skipConflictUpdates?: boolean; + /** Force triggering workflows for each update (use sparingly) */ + forceTriggerWorkflows?: boolean; + /** Log/persist cadence for progress updates */ + uploadLogInterval?: number; + /** Max records in a single batch PUT to v1/preferences */ + maxChunkSize?: number; + /** Max concurrent batch tasks at once */ + uploadConcurrency?: number; + /** Maximum records to write out to the receipt file */ + maxRecordsToReceipt?: number; + /** on progress callback */ + onProgress?: (info: PreferenceUploadProgress) => void; + }, +): Promise { + // Build final payloads (pure transform; no network) + const pendingUpdates: Record = buildPendingUpdates({ + safe: pendingSafeUpdates, + conflicts: pendingConflictUpdates, + skipConflictUpdates, + timestampColumn: schema.timestampColumn, + columnToPurposeName: schema.columnToPurposeName, + columnToIdentifier: schema.columnToIdentifier, + columnToMetadata: schema.columnToMetadata, + preferenceTopics, + purposes, + partition, + workflowAttrs: parsedAttributes, + isSilent, + skipWorkflowTriggers, + forceTriggerWorkflows, + }); + + // Seed pending uploads into receipts (first 10 expanded to keep file size small) + await receipts.setPending(limitRecords(pendingUpdates, maxRecordsToReceipt)); + + // Dry-run exits before any network calls + if (dryRun) { + logger.info( + colors.green( + `Dry run complete — ${Object.values(pendingUpdates).length} pending updates. ` + + `See receipts file: ${receipts.receiptsFilepath}`, + ), + ); + return; + } + + logger.info( + colors.magenta( + `Uploading ${ + Object.values(pendingUpdates).length + } preferences to partition: ${partition}. 
Concurrency: ${uploadConcurrency}, Max Chunk Size: ${maxChunkSize}` + + `, Max Records to Receipt: ${maxRecordsToReceipt}`, + ), + ); + + const t0 = Date.now(); + let uploadedCount = 0; + + // reset failing + await receipts.setFailing({}); + + // Get successful and filtered entries + const successful = receipts.getSuccessful(); + const allEntries = Object.entries(pendingUpdates) as Array<[string, PreferenceUpdateItem]>; + const filtered = allEntries.filter(([userId]) => !successful[userId]); + const fileTotal = filtered.length; + + onProgress?.({ + successDelta: 0, + successTotal: 0, + fileTotal, + }); + + if (filtered.length === 0) { + logger.warn( + colors.yellow( + `No pending updates to upload (all ${allEntries.length} are already marked successful).`, + ), + ); + await receipts.resetPending(); + return; + } + + if (filtered.length < allEntries.length) { + logger.warn( + colors.yellow( + `Filtered ${allEntries.length - filtered.length} already-successful updates. ` + + `${filtered.length} remain to upload.`, + ), + ); + } + + // Retry policy for "retry in place" statuses + const retryPolicy = { + maxAttempts: 5, + delayMs: 10_000, + shouldRetry: (status?: number) => + // eslint-disable-next-line @typescript-eslint/no-explicit-any + !!status && RETRYABLE_BATCH_STATUSES.has(status as any), + }; + + /** + * Mark a batch as successfully uploaded. Persists progress periodically based on + * `uploadLogInterval` to throttle IO and keep receipts compact. + * + * @param entries - Entries to mark as successful + */ + const markSuccessFor = async (entries: Array<[string, PreferenceUpdateItem]>): Promise => { + const successfulUpdates = receipts.getSuccessful(); + + for (const [userId] of entries) { + successfulUpdates[userId] = true; + delete pendingUpdates[userId]; + // Also keep the safe/conflict mirrors in sync in case of resume + delete pendingSafeUpdates[userId]; + delete pendingConflictUpdates[userId]; + } + uploadedCount += entries.length; + onProgress?.({ + successDelta: entries.length, + successTotal: uploadedCount, + fileTotal, + }); + + const shouldLog = + uploadedCount % uploadLogInterval === 0 || + Math.floor((uploadedCount - entries.length) / uploadLogInterval) < + Math.floor(uploadedCount / uploadLogInterval); + + if (shouldLog) { + logger.info( + colors.green( + `Uploaded ${uploadedCount}/${filtered.length} user preferences to partition ${partition}`, + ), + ); + await receipts.setSuccessful(successfulUpdates); + + await receipts.setPending(limitRecords(pendingUpdates, maxRecordsToReceipt)); + await receipts.setPendingSafe(limitRecords(pendingSafeUpdates, maxRecordsToReceipt)); + await receipts.setPendingConflict(pendingConflictUpdates); + } + }; + + /** + * Mark a single record failure with a concise, actionable error message. + * Mirrors are kept in sync to avoid reprocessing this row on resume. + * + * @param userId - User ID to mark as failed + * @param update - The update item that failed + * @param err - The error that occurred + */ + const markFailureForSingle = async ( + userId: string, + update: PreferenceUpdateItem, + err: unknown, + ): Promise => { + const msg = extractErrorMessage(err); + logger.error( + colors.red(`Failed to upload preferences for ${userId} (partition=${partition}): ${msg}`), + ); + const failing = receipts.getFailing(); + failing[userId] = { + uploadedAt: new Date().toISOString(), + update, + error: msg.includes('Identifier email did not pass validation') + ? 
'Identifier email did not pass validation' + : msg, + }; + + delete pendingUpdates[userId]; + delete pendingSafeUpdates[userId]; + delete pendingConflictUpdates[userId]; + + await receipts.setFailing(failing); + }; + + /** + * Mark an entire batch as failed (used when we exhaust in-place retries for + * retryable statuses). Delegates to the single-failure handler per entry. + * + * @param entries - Entries to mark as failed + * @param err - The error that occurred + */ + const markFailureForBatch = async ( + entries: Array<[string, PreferenceUpdateItem]>, + err: unknown, + ): Promise => { + for (const [userId, update] of entries) { + await markFailureForSingle(userId, update, err); + } + }; + + const { + valid = [], + invalidAt = [], + invalidSlash = [], + } = groupBy(filtered, ([, update]) => + !update.identifiers + ? 'valid' + : update.identifiers.some((id) => id.name === 'email' && !id.value.includes('@')) + ? 'invalidAt' + : update.identifiers.some((id) => id.name === 'email' && id.value.includes('/')) + ? 'invalidSlash' + : 'valid', + ); + + if (invalidAt.length > 0) { + await markFailureForBatch(invalidAt, new Error('Invalid email format - missing @')); + } + if (invalidSlash.length > 0) { + await markFailureForBatch( + invalidSlash, + new Error('Invalid email format - email contains a slash (/)'), + ); + } + + if (valid.length === 0) { + logger.warn(colors.yellow('No updates to upload after validating emails.')); + await receipts.resetPending(); + return; + } + + // Kick off uploads in chunks; each chunk may be recursively split on errors + const chunks = chunk(valid, maxChunkSize); + await pMap( + chunks, + async (currentChunk) => { + await uploadChunkWithSplit( + currentChunk, + { + // Minimal transport surface for the uploader + putBatch: async (updates, opts) => { + await sombra + .put('v1/preferences', { + json: { + records: updates, + skipWorkflowTriggers: opts.skipWorkflowTriggers, + }, + }) + .json(); + }, + retryPolicy, + options: { skipWorkflowTriggers }, + isRetryableStatus: (s) => + // eslint-disable-next-line @typescript-eslint/no-explicit-any + !!s && RETRYABLE_BATCH_STATUSES.has(s as any), + logger, + }, + { + onSuccess: markSuccessFor, + onFailureSingle: ([userId, update], err) => markFailureForSingle(userId, update, err), + onFailureBatch: markFailureForBatch, + }, + ); + }, + { concurrency: uploadConcurrency }, + ); + + // Finalize receipts: persist success map and clear pending mirrors + await receipts.setSuccessful(receipts.getSuccessful()); + await receipts.resetPending(); + + const elapsedSec = (Date.now() - t0) / 1000; + logger.info( + colors.green( + `Successfully uploaded ${Object.keys(receipts.getSuccessful()).length} user preferences ` + + `to partition ${partition} in "${elapsedSec}" seconds!`, + ), + ); + + const remainingFailures = Object.values(receipts.getFailing()).length; + if (remainingFailures > 0) { + logger.error( + colors.red( + `There are ${remainingFailures} requests that failed to upload. 
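The three-way email triage above reads more clearly in isolation. A self-contained sketch of the same `groupBy` trick applied to bare strings (illustrative data):

```ts
import { groupBy } from 'lodash-es';

const emails = ['a@example.com', 'missing-at-sign', 'bad/slash@example.com'];

const {
  valid = [],
  invalidAt = [],
  invalidSlash = [],
} = groupBy(emails, (email) =>
  !email.includes('@') ? 'invalidAt' : email.includes('/') ? 'invalidSlash' : 'valid',
);
// valid => ['a@example.com']
// invalidAt => ['missing-at-sign']
// invalidSlash => ['bad/slash@example.com']
```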
` + + `Please check the receipts file for details: ${receipts.receiptsFilepath}`, + ), + ); + } +} +/* eslint-enable no-param-reassign */ diff --git a/packages/cli/src/commands/consent/upload-preferences/upload/loadReferenceData.ts b/packages/cli/src/commands/consent/upload-preferences/upload/loadReferenceData.ts deleted file mode 100644 index 13f68e46..00000000 --- a/packages/cli/src/commands/consent/upload-preferences/upload/loadReferenceData.ts +++ /dev/null @@ -1,49 +0,0 @@ -import { - buildTranscendGraphQLClient, - fetchAllIdentifiers, - fetchAllPurposes, - fetchAllPreferenceTopics, - type Identifier, - type PreferenceTopic, - type Purpose, -} from '@transcend-io/sdk'; -import type { GraphQLClient } from 'graphql-request'; - -import { logger } from '../../../../logger.js'; - -export type PreferenceUploadReferenceData = { - /** - * List of purposes in the organization - */ - purposes: Purpose[]; - /** - * List of preference topics in the organization - */ - preferenceTopics: PreferenceTopic[]; - /** - * List of identifiers in the organization - */ - identifiers: Identifier[]; -}; - -/** - * Load all required reference data for an upload run. - * - * @param client - GraphQL client - * @returns GraphQL client and reference data arrays - */ -export async function loadReferenceData(client: GraphQLClient): Promise< - { - /** - * GraphQL client to use for making requests - */ - client: ReturnType; - } & PreferenceUploadReferenceData -> { - const [purposes, preferenceTopics, identifiers] = await Promise.all([ - fetchAllPurposes(client, { logger }), - fetchAllPreferenceTopics(client, { logger }), - fetchAllIdentifiers(client, { logger }), - ]); - return { client, purposes, preferenceTopics, identifiers }; -} diff --git a/packages/cli/src/commands/consent/upload-preferences/upload/tests/batchUploader.test.ts b/packages/cli/src/commands/consent/upload-preferences/upload/tests/batchUploader.test.ts index 441c797b..73fc23ba 100644 --- a/packages/cli/src/commands/consent/upload-preferences/upload/tests/batchUploader.test.ts +++ b/packages/cli/src/commands/consent/upload-preferences/upload/tests/batchUploader.test.ts @@ -1,4 +1,6 @@ import type { PreferenceUpdateItem } from '@transcend-io/privacy-types'; +// --- Import SUT & mocked symbols --- +import { uploadChunkWithSplit, type BatchUploaderDeps } from '@transcend-io/sdk'; import { getErrorStatus, extractErrorMessage, @@ -8,8 +10,6 @@ import { import { describe, it, expect, vi, beforeEach, type Mock } from 'vitest'; import { logger } from '../../../../../logger.js'; -// --- Import SUT & mocked symbols --- -import { uploadChunkWithSplit, type BatchUploaderDeps } from '../batchUploader.js'; // --- Mocks (declare BEFORE importing the SUT) --- vi.mock('@transcend-io/utils', () => ({ @@ -89,6 +89,7 @@ describe('uploadChunkWithSplit', () => { retryPolicy: { maxAttempts: 3, delayMs: 10, shouldRetry: () => false }, options: { skipWorkflowTriggers: false }, isRetryableStatus: vi.fn(() => false), + logger, }; const onSuccess = vi.fn().mockResolvedValue(undefined); @@ -135,6 +136,7 @@ describe('uploadChunkWithSplit', () => { }, options: { skipWorkflowTriggers: false }, isRetryableStatus: vi.fn((s?: number) => s === 503), + logger, }; const onSuccess = vi.fn().mockResolvedValue(undefined); @@ -176,6 +178,7 @@ describe('uploadChunkWithSplit', () => { retryPolicy: { maxAttempts: 2, delayMs: 1, shouldRetry: () => true }, options: { skipWorkflowTriggers: false }, isRetryableStatus: vi.fn((s?: number) => s === 429), + logger, }; const onFailureBatch = 
vi.fn().mockResolvedValue(undefined); @@ -225,6 +228,7 @@ describe('uploadChunkWithSplit', () => { retryPolicy: { maxAttempts: 1, delayMs: 1, shouldRetry: () => false }, options: { skipWorkflowTriggers: false }, isRetryableStatus: vi.fn(() => false), + logger, }; const onSuccess = vi.fn().mockResolvedValue(undefined); @@ -265,6 +269,7 @@ describe('uploadChunkWithSplit', () => { retryPolicy: { maxAttempts: 1, delayMs: 1, shouldRetry: () => false }, options: { skipWorkflowTriggers: false }, isRetryableStatus: vi.fn(() => false), + logger, }; const onFailureSingle = vi.fn().mockResolvedValue(undefined); @@ -297,6 +302,7 @@ describe('uploadChunkWithSplit', () => { retryPolicy: { maxAttempts: 2, delayMs: 1, shouldRetry: () => true }, options: { skipWorkflowTriggers: false }, isRetryableStatus: vi.fn(() => false), // not retryable by status, but soft-rate-limit triggers retry anyway + logger, }; const onSuccess = vi.fn().mockResolvedValue(undefined); @@ -333,6 +339,7 @@ describe('uploadChunkWithSplit', () => { retryPolicy: { maxAttempts: 2, delayMs: 1, shouldRetry: () => true }, options: { skipWorkflowTriggers: false }, isRetryableStatus: vi.fn(() => false), + logger, }; const onSuccess = vi.fn().mockResolvedValue(undefined); diff --git a/packages/cli/src/commands/consent/upload-preferences/upload/tests/loadReferenceData.test.ts b/packages/cli/src/commands/consent/upload-preferences/upload/tests/loadReferenceData.test.ts deleted file mode 100644 index 09edceb3..00000000 --- a/packages/cli/src/commands/consent/upload-preferences/upload/tests/loadReferenceData.test.ts +++ /dev/null @@ -1,97 +0,0 @@ -import type { Identifier, PreferenceTopic, Purpose } from '@transcend-io/sdk'; -import type { GraphQLClient } from 'graphql-request'; -import { describe, it, expect, vi, beforeEach } from 'vitest'; - -// Shared mocks (we’ll reset them each test) -const mFetchAllPurposes = vi.fn(); -const mFetchAllPreferenceTopics = vi.fn(); -const mFetchAllIdentifiers = vi.fn(); - -// Helper: after resetting modules, install the mocks, then import SUT fresh -async function importSut(): Promise<{ - loadReferenceData: (typeof import('../loadReferenceData.js'))['loadReferenceData']; -}> { - // Mock BEFORE importing the SUT - vi.mock('@transcend-io/sdk', () => ({ - fetchAllPurposes: mFetchAllPurposes, - fetchAllPreferenceTopics: mFetchAllPreferenceTopics, - fetchAllIdentifiers: mFetchAllIdentifiers, - })); - - const mod = await import('../loadReferenceData.js'); - return { - loadReferenceData: - mod.loadReferenceData as (typeof import('../loadReferenceData.js'))['loadReferenceData'], - }; -} - -describe('loadReferenceData', () => { - let client: GraphQLClient; - - beforeEach(() => { - vi.resetModules(); // ensure a clean module graph so mocks stick - - // IMPORTANT: reset implementations + once-queues between tests - mFetchAllPurposes.mockReset(); - mFetchAllPreferenceTopics.mockReset(); - mFetchAllIdentifiers.mockReset(); - - // Minimal safe stub - client = { - request: vi.fn().mockResolvedValue({}), - } as unknown as GraphQLClient; - }); - - it('loads purposes, topics, and identifiers', async () => { - const { loadReferenceData } = await importSut(); - - const purposes = [{ id: 'p1' }, { id: 'p2' }] as Purpose[]; - const preferenceTopics = [{ id: 't1' }] as PreferenceTopic[]; - const identifiers = [{ id: 'i1' }, { id: 'i2' }] as Identifier[]; - - mFetchAllPurposes.mockResolvedValueOnce(purposes); - mFetchAllPreferenceTopics.mockResolvedValueOnce(preferenceTopics); - 
mFetchAllIdentifiers.mockResolvedValueOnce(identifiers); - - const result = await loadReferenceData(client); - - expect(result.client).toBe(client); - expect(result.purposes).toEqual(purposes); - expect(result.preferenceTopics).toEqual(preferenceTopics); - expect(result.identifiers).toEqual(identifiers); - - expect(mFetchAllPurposes).toHaveBeenCalledTimes(1); - expect(mFetchAllPurposes).toHaveBeenCalledWith( - client, - expect.objectContaining({ logger: expect.anything() }), - ); - - expect(mFetchAllPreferenceTopics).toHaveBeenCalledTimes(1); - expect(mFetchAllPreferenceTopics).toHaveBeenCalledWith( - client, - expect.objectContaining({ logger: expect.anything() }), - ); - - expect(mFetchAllIdentifiers).toHaveBeenCalledTimes(1); - expect(mFetchAllIdentifiers).toHaveBeenCalledWith( - client, - expect.objectContaining({ logger: expect.anything() }), - ); - }); - - it('propagates errors (e.g., identifiers fetch fails)', async () => { - const { loadReferenceData } = await importSut(); - - const err = new Error('boom'); - - mFetchAllPurposes.mockResolvedValueOnce([{ id: 'p' }] as Purpose[]); - mFetchAllPreferenceTopics.mockResolvedValueOnce([{ id: 't' }] as PreferenceTopic[]); - mFetchAllIdentifiers.mockRejectedValueOnce(err); - - await expect(loadReferenceData(client)).rejects.toBe(err); - - expect(mFetchAllPurposes).toHaveBeenCalledTimes(1); - expect(mFetchAllPreferenceTopics).toHaveBeenCalledTimes(1); - expect(mFetchAllIdentifiers).toHaveBeenCalledTimes(1); - }); -}); diff --git a/packages/cli/src/commands/consent/upload-preferences/upload/transform/index.ts b/packages/cli/src/commands/consent/upload-preferences/upload/transform/index.ts new file mode 100644 index 00000000..424257e3 --- /dev/null +++ b/packages/cli/src/commands/consent/upload-preferences/upload/transform/index.ts @@ -0,0 +1 @@ +export * from './transformCsv.js'; diff --git a/packages/cli/src/commands/consent/upload-preferences/upload/transform/transformCsv.ts b/packages/cli/src/commands/consent/upload-preferences/upload/transform/transformCsv.ts new file mode 100644 index 00000000..910966e1 --- /dev/null +++ b/packages/cli/src/commands/consent/upload-preferences/upload/transform/transformCsv.ts @@ -0,0 +1,82 @@ +// FIXME +import colors from 'colors'; + +import { logger } from '../../../../../logger.js'; + +/** + * Add Transcend ID to preferences if email_id is present + * + * FIXME remove + * + * @param preferences - List of preferences + * @returns The updated preferences with Transcend ID added + */ +export function transformCsv(preferences: Record[]): Record[] { + // Add a transcendent ID to each preference if it doesn't already exist + const disallowedEmails = (process.env.EMAIL_LIST || '') + .split(',') + .map((email) => email.trim().toLowerCase()); + + const keys = Object.keys(preferences[0]); + const isUdp = + keys.includes('email_address') && + keys.includes('person_id') && + keys.includes('member_id') && + keys.includes('birth_dt'); + if (isUdp) { + logger.info( + colors.yellow('Detected UDP format. Transforming preferences to include Transcend ID.'), + ); + + return preferences.map((pref) => { + const email = (pref.email_address || '').toLowerCase().trim(); + const emailAddress = !email || disallowedEmails.includes(email) ? 
'' : pref.email_address; + const birthDate = new Date(pref.birth_dt); + if (!pref.birth_dt || Number.isNaN(birthDate.getTime())) { + logger.warn(colors.yellow(`No birth date for record: ${pref.email_address}`)); + } + return { + ...pref, + Minor: + !pref.birth_dt || Number.isNaN(birthDate.getTime()) + ? '' + : Date.now() - birthDate.getTime() < 1000 * 60 * 60 * 24 * 365 * 18 + ? 'True' + : 'False', + email_address: emailAddress, + // prefer email address over transcendID + transcendID: emailAddress + ? '' + : pref.person_id && pref.person_id !== '-2' + ? pref.person_id + : pref.member_id, + }; + }); + } + + const isAdobe = + keys.includes('hashedCostcoID') && keys.includes('address') && keys.includes('lastUpdatedDate'); + if (isAdobe) { + logger.info(colors.green('Pre-processing as Adobe')); + return preferences.map((pref) => { + if (!pref.lastUpdatedDate) { + logger.warn( + colors.yellow( + `Record missing lastUpdatedDate - setting to default date - ${JSON.stringify(pref)}`, + ), + ); + } + return { + ...pref, + lastUpdatedDate: pref.lastUpdatedDate + ? pref.lastUpdatedDate + : new Date('08/24/2025').toISOString(), + }; + }); + } + + logger.info(colors.green('No special transformations applied.')); + + // FIXME skip the emails + return preferences; +} diff --git a/packages/cli/src/commands/consent/upload-preferences/worker.ts b/packages/cli/src/commands/consent/upload-preferences/worker.ts new file mode 100644 index 00000000..deada4eb --- /dev/null +++ b/packages/cli/src/commands/consent/upload-preferences/worker.ts @@ -0,0 +1,198 @@ +import { mkdirSync, createWriteStream } from 'node:fs'; +import { join, dirname } from 'node:path'; + +import { buildTranscendGraphQLClient, createSombraGotInstance } from '@transcend-io/sdk'; +import { CHILD_FLAG, splitCsvToList } from '@transcend-io/utils'; +import type { ToWorker } from '@transcend-io/utils'; + +import { logger } from '../../../logger.js'; +import { getFilePrefix } from './artifacts/index.js'; +import { makeReceiptsState } from './artifacts/receipts/receiptsState.js'; +import type { TaskCommonOpts } from './buildTaskOptions.js'; +import { makeSchemaState } from './schemaState.js'; +import { + interactivePreferenceUploaderFromPlan, + buildInteractiveUploadPreferencePlan, +} from './upload/index.js'; + +/** + * Run the child process for handling upload preferences.
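For orientation, here is the parent/worker message protocol that `runChild` below implements, reconstructed as standalone types. The shapes are assumptions inferred from the handlers in this diff, not exported definitions:

```ts
// Messages the worker emits to the pool parent (assumed shapes).
type FromWorker =
  | { type: 'ready' }
  | { type: 'progress'; payload: { filePath: string; processed?: number; total?: number } }
  | {
      type: 'result';
      payload: { ok: boolean; filePath: string; receiptFilepath?: string; error?: string };
    };

// Messages the parent sends to the worker (assumed shapes).
type ToWorkerMessage =
  | { type: 'task'; payload: { filePath: string } }
  | { type: 'shutdown' };
```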
+ * This runs in a separate CPU if possible + */ +export async function runChild(): Promise { + // Get worker ID from environment or default to 0 + const workerId = Number(process.env.WORKER_ID || '0'); + + // Determine log file path from environment or default location + const logFile = process.env.WORKER_LOG || join(process.cwd(), `logs/worker-${workerId}.log`); + mkdirSync(dirname(logFile), { recursive: true }); + + // Create a writable stream for logging + const logStream = createWriteStream(logFile, { flags: 'a' }); + + // Helper function to write logs with timestamp and worker ID + const log = (...args: unknown[]): void => { + const line = `[w${workerId}] ${new Date().toISOString()} ${args + .map((a) => String(a)) + .join(' ')}\n`; + logStream.write(line); + }; + + // Log that the worker is ready and send a ready message to parent + logger.info(`[w${workerId}] ready pid=${process.pid}`); + process.send?.({ type: 'ready' }); + + // Listen for messages from the parent process + process.on( + 'message', + async ( + msg: ToWorker<{ + /** File path */ + filePath: string; + /** Options */ + options: TaskCommonOpts; + }>, + ) => { + if (!msg || typeof msg !== 'object') return; + + // Handle 'task' messages to process a file + if (msg.type === 'task') { + const { filePath, options } = msg.payload; + // Compute the path for receipts file + const receiptFilepath = join( + options.receiptsFolder, + `${getFilePrefix(filePath)}-receipts.json`, + ); + try { + // Ensure receipts directory exists + mkdirSync(dirname(receiptFilepath), { recursive: true }); + logger.info(`[w${workerId}] START ${filePath}`); + log(`START ${filePath}`); + + // Construct common state objects for the task + const receipts = await makeReceiptsState(receiptFilepath); + const schema = await makeSchemaState(options.schemaFile); + const client = buildTranscendGraphQLClient(options.transcendUrl, options.auth); + const sombra = await createSombraGotInstance(options.transcendUrl, options.auth, { + logger, + sombraApiKey: options.sombraAuth, + }); + + // Derive identifierColumns and columnsToIgnore from config + const columnToIdentifier = schema.getColumnToIdentifier(); + const identifierColumns = Object.keys(columnToIdentifier); + const allowedIdentifierNames = [ + ...new Set(Object.values(columnToIdentifier).map((v) => v.name)), + ]; + const columnsToIgnore = schema.state.getValue('columnsToIgnore') ?? 
[]; + + // Step 1: Build the upload plan (validation-only, non-interactive) + const plan = await buildInteractiveUploadPreferencePlan({ + sombra, + client, + file: filePath, + partition: options.partition, + receipts, + schema, + identifierDownloadLogInterval: options.uploadLogInterval * 10, + downloadIdentifierConcurrency: options.downloadIdentifierConcurrency, + skipExistingRecordCheck: options.skipExistingRecordCheck, + forceTriggerWorkflows: options.forceTriggerWorkflows, + allowedIdentifierNames, + maxRecordsToReceipt: options.maxRecordsToReceipt, + identifierColumns, + columnsToIgnore, + attributes: splitCsvToList(options.attributes), + nonInteractive: true, + // Report progress to parent process + onProgress: ({ successTotal, fileTotal }) => { + process.send?.({ + type: 'progress', + payload: { + filePath, + processed: successTotal, + total: fileTotal, + }, + }); + }, + }); + + // Step 2: Execute the upload using the plan + await interactivePreferenceUploaderFromPlan(plan, { + receipts, + sombra, + dryRun: options.dryRun, + isSilent: options.isSilent, + skipWorkflowTriggers: options.skipWorkflowTriggers, + skipConflictUpdates: options.skipConflictUpdates, + forceTriggerWorkflows: options.forceTriggerWorkflows, + uploadLogInterval: options.uploadLogInterval, + maxChunkSize: options.maxChunkSize, + uploadConcurrency: options.uploadConcurrency, + maxRecordsToReceipt: options.maxRecordsToReceipt, + // Report progress to parent process + onProgress: ({ successTotal, fileTotal }) => { + process.send?.({ + type: 'progress', + payload: { + filePath, + processed: successTotal, + total: fileTotal, + }, + }); + }, + }); + + // Log completion and send result to parent + logger.info(`[w${workerId}] DONE ${filePath}`); + log(`SUCCESS ${filePath}`); + + process.send?.({ + type: 'result', + payload: { ok: true, filePath, receiptFilepath }, + }); + // eslint-disable-next-line @typescript-eslint/no-explicit-any + } catch (err: any) { + // Handle errors, log them, and send failure result to parent + const e = err?.stack || err?.message || String(err); + logger.error(`[w${workerId}] ERROR ${filePath}: ${err?.message || err}\n\n${e}`); + log(`FAIL ${filePath}\n${e}`); + process.send?.({ + type: 'result', + payload: { ok: false, filePath, error: e, receiptFilepath }, + }); + } + } else if (msg.type === 'shutdown') { + // Handle shutdown message: log and exit gracefully + logger.info(`[w${workerId}] shutdown`); + log('Shutting down.'); + logStream.end(() => process.exit(0)); + } + }, + ); + + // Handle uncaught exceptions: log and exit + process.on('uncaughtException', (err) => { + logger.error(`[w${workerId}] uncaughtException: ${err?.stack || err}`); + log(`uncaughtException\n${err?.stack || err}`); + logStream.end(() => process.exit(1)); + }); + // Handle unhandled promise rejections: log and exit + process.on('unhandledRejection', (reason) => { + logger.error(`[w${workerId}] unhandledRejection: ${String(reason)}`); + log(`unhandledRejection\n${String(reason)}`); + logStream.end(() => process.exit(1)); + }); + + // Keep the process alive indefinitely + await new Promise(() => { + // Keep the process alive + }); +} + +if (process.argv.includes(CHILD_FLAG)) { + runChild().catch((err) => { + logger.error(err); + process.exit(1); + }); +} diff --git a/packages/cli/src/lib/graphql/gqls/RequestDataSilo.ts b/packages/cli/src/lib/graphql/gqls/RequestDataSilo.ts index 50bb1136..9186013b 100644 --- a/packages/cli/src/lib/graphql/gqls/RequestDataSilo.ts +++ b/packages/cli/src/lib/graphql/gqls/RequestDataSilo.ts 
diff --git a/packages/cli/src/lib/graphql/gqls/RequestDataSilo.ts b/packages/cli/src/lib/graphql/gqls/RequestDataSilo.ts
index 50bb1136..9186013b 100644
--- a/packages/cli/src/lib/graphql/gqls/RequestDataSilo.ts
+++ b/packages/cli/src/lib/graphql/gqls/RequestDataSilo.ts
@@ -20,6 +20,7 @@ export const REQUEST_DATA_SILOS = gql`
     ) {
       nodes {
         id
+        status
         request {
           type
         }
diff --git a/packages/cli/src/lib/preference-management/index.ts b/packages/cli/src/lib/preference-management/index.ts
index 7485749d..d65a4028 100644
--- a/packages/cli/src/lib/preference-management/index.ts
+++ b/packages/cli/src/lib/preference-management/index.ts
@@ -1,6 +1,5 @@
-export * from './uploadPreferenceManagementPreferencesInteractive.js';
 export * from './parsePreferenceManagementCsv.js';
 export * from './parsePreferenceIdentifiersFromCsv.js';
-export * from './parsePreferenceTimestampsFromCsv.js';
+export * from './parsePreferenceFileFormatFromCsv.js';
 export * from './parsePreferenceAndPurposeValuesFromCsv.js';
 export * from './bulkDeletePreferenceRecords.js';
diff --git a/packages/cli/src/lib/preference-management/parsePreferenceAndPurposeValuesFromCsv.ts b/packages/cli/src/lib/preference-management/parsePreferenceAndPurposeValuesFromCsv.ts
index e1b8148d..1234b405 100644
--- a/packages/cli/src/lib/preference-management/parsePreferenceAndPurposeValuesFromCsv.ts
+++ b/packages/cli/src/lib/preference-management/parsePreferenceAndPurposeValuesFromCsv.ts
@@ -1,5 +1,7 @@
+import type { PersistedState } from '@transcend-io/persisted-state';
 import { PreferenceTopicType } from '@transcend-io/privacy-types';
-import { FileMetadataState, type PreferenceTopic } from '@transcend-io/sdk';
+import type { PreferenceTopic } from '@transcend-io/sdk';
+import { FileFormatState } from '@transcend-io/sdk';
 import { mapSeries } from '@transcend-io/utils';
 import { splitCsvToList } from '@transcend-io/utils';
 import colors from 'colors';
@@ -8,23 +10,58 @@ import { uniq, difference } from 'lodash-es';

 import { logger } from '../../logger.js';

-/* eslint-disable no-param-reassign */
+/** Values that clearly mean "no preference recorded" and should map to null. */
+const NULL_VALUES = new Set(['', 'undefined', 'null', 'none', 'n/a', 'na']);
+
+const FALSY_VALUES = new Set([
+  'false',
+  '0',
+  'no',
+  'n',
+  'off',
+  'opt-out',
+  'optout',
+  'opt_out',
+  'unsubscribed',
+]);
+
+/**
+ * Check whether a raw CSV value represents "no data" and should map to null.
+ *
+ * @param value - raw CSV cell value
+ * @returns true when the value should be treated as null (no preference)
+ */
+function looksNull(value: string): boolean {
+  return NULL_VALUES.has(value.trim().toLowerCase());
+}
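To make the classification concrete, here is a condensed, self-contained restatement of the two helpers (this one plus `looksOptedIn`, which follows) and the three-way default they feed into the prompts further down. Illustration only; `defaultFor` is not part of the diff.

```ts
// Condensed mirror of the classifiers above, for illustration.
const NULLISH = new Set(['', 'undefined', 'null', 'none', 'n/a', 'na']);
const FALSY = new Set([
  'false', '0', 'no', 'n', 'off', 'opt-out', 'optout', 'opt_out', 'unsubscribed',
]);

/** Three-way prompt default derived from a raw CSV cell. */
function defaultFor(raw: string): 'null' | 'true' | 'false' {
  const v = raw.trim().toLowerCase();
  if (NULLISH.has(v)) return 'null'; // no preference recorded
  return FALSY.has(v) ? 'false' : 'true'; // anything else reads as opted in
}

// defaultFor('')             === 'null'
// defaultFor('N/A')          === 'null'
// defaultFor('Unsubscribed') === 'false'
// defaultFor('yes')          === 'true'
```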
+/**
+ * Infer a sensible Y/n default for a purpose/preference value prompt.
+ *
+ * @param value - raw CSV cell value
+ * @returns true when the value looks like "opted-in"
+ */
+function looksOptedIn(value: string): boolean {
+  return !FALSY_VALUES.has(value.trim().toLowerCase()) && !looksNull(value);
+}

 /**
  * Parse out the purpose.enabled and preference values from a CSV file
  *
  * @param preferences - List of preferences
- * @param currentState - The current file metadata state for parsing this list
+ * @param schemaState - The schema state to use for parsing the file
  * @param options - Options
  * @returns The updated file metadata state
  */
 export async function parsePreferenceAndPurposeValuesFromCsv(
   preferences: Record<string, string>[],
-  currentState: FileMetadataState,
+  schemaState: PersistedState<typeof FileFormatState>,
   {
     purposeSlugs,
     preferenceTopics,
     forceTriggerWorkflows,
+    columnsToIgnore,
+    nonInteractive = false,
   }: {
     /** The purpose slugs that are allowed to be updated */
     purposeSlugs: string[];
@@ -32,19 +69,26 @@ export async function parsePreferenceAndPurposeValuesFromCsv(
     preferenceTopics: PreferenceTopic[];
     /** Force workflow triggers */
     forceTriggerWorkflows: boolean;
+    /** Columns to ignore in the CSV file */
+    columnsToIgnore: string[];
+    /** When true, throw instead of prompting (for worker processes) */
+    nonInteractive?: boolean;
   },
-): Promise<FileMetadataState> {
+): Promise<PersistedState<typeof FileFormatState>> {
   // Determine columns to map
   const columnNames = uniq(preferences.map((x) => Object.keys(x)).flat());

   // Determine the columns that could potentially be used for identifier
+  const timestampCol = schemaState.getValue('timestampColumn');
   const otherColumns = difference(columnNames, [
-    ...(currentState.identifierColumn ? [currentState.identifierColumn] : []),
-    ...(currentState.timestampColum ? [currentState.timestampColum] : []),
+    ...Object.keys(schemaState.getValue('columnToIdentifier')),
+    ...(timestampCol ? [timestampCol] : []),
+    ...columnsToIgnore,
+    ...Object.keys(schemaState.getValue('columnToMetadata') ?? {}),
   ]);
   if (otherColumns.length === 0) {
     if (forceTriggerWorkflows) {
-      return currentState;
+      return schemaState;
     }
     throw new Error('No other columns to process');
   }
@@ -57,16 +101,24 @@ export async function parsePreferenceAndPurposeValuesFromCsv(

   // Ensure all columns are accounted for
   await mapSeries(otherColumns, async (col) => {
-    // Determine the unique values to map in this column
-    const uniqueValues = uniq(preferences.map((x) => x[col]));
+    // Determine the unique values to map in this column (including empty strings)
+    const uniqueValues = uniq(preferences.map((x) => x[col] ?? ''));

     // Map the column to a purpose
-    let purposeMapping = currentState.columnToPurposeName[col];
+    const currentPurposeMapping = schemaState.getValue('columnToPurposeName');
+    let purposeMapping = currentPurposeMapping[col];
     if (purposeMapping) {
       logger.info(
         colors.magenta(`Column "${col}" is associated with purpose "${purposeMapping.purpose}"`),
       );
     } else {
+      if (nonInteractive) {
+        throw new Error(
+          `Column "${col}" has no purpose mapping in the config. 
` + + "Run 'transcend consent configure-preference-upload' to update the config.", + ); + } + const { purposeName } = await inquirer.prompt<{ /** purpose name */ purposeName: string; @@ -97,20 +149,44 @@ export async function parsePreferenceAndPurposeValuesFromCsv( ); return; } + + if (looksNull(value)) { + logger.info( + colors.magenta( + `Value "${value || '(empty)'}" for column "${col}" → null (no preference)`, + ), + ); + purposeMapping.valueMapping[value] = null as unknown as boolean; + return; + } + + if (nonInteractive) { + throw new Error( + `Value "${value}" for column "${col}" has no mapping in the config. ` + + "Run 'transcend consent configure-preference-upload' to update the config.", + ); + } + // if preference is null, this column is just for the purpose if (purposeMapping.preference === null) { const { purposeValue } = await inquirer.prompt<{ - /** purpose value */ - purposeValue: boolean; + /** The mapped purpose value chosen by the user */ + purposeValue: string; }>([ { name: 'purposeValue', - message: `Choose the purpose value for value "${value}" associated with purpose "${purposeMapping.purpose}"`, - type: 'confirm', - default: value !== 'false', + message: `Map value "${value}" for purpose "${purposeMapping.purpose}"`, + type: 'list', + choices: [ + { name: 'true (opted in)', value: 'true' }, + { name: 'false (opted out)', value: 'false' }, + { name: 'null (skip / no preference)', value: 'null' }, + ], + default: looksOptedIn(value) ? 'true' : 'false', }, ]); - purposeMapping.valueMapping[value] = purposeValue; + purposeMapping.valueMapping[value] = + purposeValue === 'null' ? (null as unknown as boolean) : purposeValue === 'true'; } // if preference is not null, this column is for a specific preference @@ -124,37 +200,47 @@ export async function parsePreferenceAndPurposeValuesFromCsv( if (preferenceTopic.type === PreferenceTopicType.Boolean) { const { preferenceValue } = await inquirer.prompt<{ - /** purpose value */ - preferenceValue: boolean; + /** The mapped boolean preference value chosen by the user */ + preferenceValue: string; }>([ { name: 'preferenceValue', - message: - // eslint-disable-next-line max-len - `Choose the preference value for "${preferenceTopic.slug}" value "${value}" associated with purpose "${purposeMapping.purpose}"`, - type: 'confirm', - default: value !== 'false', + message: `Map value "${value}" for preference "${preferenceTopic.slug}" (${purposeMapping.purpose})`, + type: 'list', + choices: [ + { name: 'true (opted in)', value: 'true' }, + { name: 'false (opted out)', value: 'false' }, + { name: 'null (skip / no preference)', value: 'null' }, + ], + default: looksOptedIn(value) ? 'true' : 'false', }, ]); - purposeMapping.valueMapping[value] = preferenceValue; + purposeMapping.valueMapping[value] = + preferenceValue === 'null' ? 
(null as unknown as boolean) : preferenceValue === 'true'; return; } if (preferenceTopic.type === PreferenceTopicType.Select) { + const choices = [ + ...preferenceOptions.map((o) => ({ name: o, value: o })), + { name: '(null — skip / no preference)', value: '__null__' }, + ]; const { preferenceValue } = await inquirer.prompt<{ - /** purpose value */ - preferenceValue: boolean; + /** The mapped select preference value chosen by the user */ + preferenceValue: string; }>([ { name: 'preferenceValue', - // eslint-disable-next-line max-len - message: `Choose the preference value for "${preferenceTopic.slug}" value "${value}" associated with purpose "${purposeMapping.purpose}"`, + message: `Map value "${value}" for preference "${preferenceTopic.slug}" (${purposeMapping.purpose})`, type: 'list', - choices: preferenceOptions, + choices, default: preferenceOptions.find((x) => x === value), }, ]); - purposeMapping.valueMapping[value] = preferenceValue; + purposeMapping.valueMapping[value] = + preferenceValue === '__null__' + ? (null as unknown as boolean) + : (preferenceValue as unknown as boolean); return; } @@ -166,20 +252,29 @@ export async function parsePreferenceAndPurposeValuesFromCsv( if (purposeMapping.valueMapping[parsedValue] !== undefined) { return; } + const msChoices = [ + ...preferenceOptions.map((o) => ({ name: o, value: o })), + { + name: '(null — skip / no preference)', + value: '__null__', + }, + ]; const { preferenceValue } = await inquirer.prompt<{ - /** purpose value */ - preferenceValue: boolean; + /** The mapped multi-select preference value chosen by the user */ + preferenceValue: string; }>([ { name: 'preferenceValue', - // eslint-disable-next-line max-len - message: `Choose the preference value for "${preferenceTopic.slug}" value "${parsedValue}" associated with purpose "${purposeMapping.purpose}"`, + message: `Map token "${parsedValue}" for preference "${preferenceTopic.slug}" (${purposeMapping.purpose})`, type: 'list', - choices: preferenceOptions, + choices: msChoices, default: preferenceOptions.find((x) => x === parsedValue), }, ]); - purposeMapping.valueMapping[parsedValue] = preferenceValue; + purposeMapping.valueMapping[parsedValue] = + preferenceValue === '__null__' + ? 
(null as unknown as boolean)
+              : (preferenceValue as unknown as boolean);
+          });
+          return;
+        }
@@ -187,10 +282,9 @@ export async function parsePreferenceAndPurposeValuesFromCsv(
         throw new Error(`Unknown preference topic type: ${preferenceTopic.type}`);
       }
     });
-
-    currentState.columnToPurposeName[col] = purposeMapping;
+    currentPurposeMapping[col] = purposeMapping;
+    schemaState.setValue(currentPurposeMapping, 'columnToPurposeName');
   });

-  return currentState;
+  return schemaState;
 }
-/* eslint-enable no-param-reassign */
diff --git a/packages/cli/src/lib/preference-management/parsePreferenceTimestampsFromCsv.ts b/packages/cli/src/lib/preference-management/parsePreferenceFileFormatFromCsv.ts
similarity index 54%
rename from packages/cli/src/lib/preference-management/parsePreferenceTimestampsFromCsv.ts
rename to packages/cli/src/lib/preference-management/parsePreferenceFileFormatFromCsv.ts
index 8c35653d..8a9dfb5a 100644
--- a/packages/cli/src/lib/preference-management/parsePreferenceTimestampsFromCsv.ts
+++ b/packages/cli/src/lib/preference-management/parsePreferenceFileFormatFromCsv.ts
@@ -1,4 +1,5 @@
-import { FileMetadataState } from '@transcend-io/sdk';
+import type { PersistedState } from '@transcend-io/persisted-state';
+import { FileFormatState } from '@transcend-io/sdk';
 import colors from 'colors';
 import inquirer from 'inquirer';
 import { uniq, difference } from 'lodash-es';
@@ -7,10 +8,8 @@ import { logger } from '../../logger.js';

 export const NONE_PREFERENCE_MAP = '[NONE]';

-/* eslint-disable no-param-reassign */
-
 /**
- * Parse timestamps from a CSV list of preferences
+ * Parse timestamps and other file format mappings from a CSV list of preferences
 *
 * When timestamp is requested, this script
 * ensures that all rows have a valid timestamp.
@@ -19,23 +18,36 @@ export const NONE_PREFERENCE_MAP = '[NONE]';
 *
 * @param preferences - List of preferences
 * @param currentState - The current file metadata state for parsing this list
+ * @param options - Options
 * @returns The updated file metadata state
 */
-export async function parsePreferenceTimestampsFromCsv(
+export async function parsePreferenceFileFormatFromCsv(
   preferences: Record<string, string>[],
-  currentState: FileMetadataState,
-): Promise<FileMetadataState> {
+  currentState: PersistedState<typeof FileFormatState>,
+  {
+    nonInteractive = false,
+  }: {
+    /** When true, throw instead of prompting */
+    nonInteractive?: boolean;
+  } = {},
+): Promise<PersistedState<typeof FileFormatState>> {
   // Determine columns to map
   const columnNames = uniq(preferences.map((x) => Object.keys(x)).flat());

   // Determine the columns that could potentially be used for timestamp
   const remainingColumnsForTimestamp = difference(columnNames, [
-    ...(currentState.identifierColumn ? [currentState.identifierColumn] : []),
-    ...Object.keys(currentState.columnToPurposeName),
+    ...Object.keys(currentState.getValue('columnToIdentifier')),
+    ...Object.keys(currentState.getValue('columnToPurposeName')),
   ]);

   // Determine the timestamp column to work off of
-  if (!currentState.timestampColum) {
+  if (!currentState.getValue('timestampColumn')) {
+    if (nonInteractive) {
+      throw new Error(
+        'No timestamp column configured. ' +
+          "Run 'transcend consent configure-preference-upload' to set it.",
+      );
+    }
+
     const { timestampName } = await inquirer.prompt<{
       /** timestamp name */
       timestampName: string;
     }>([
@@ -51,29 +63,31 @@ export async function parsePreferenceFileFormatFromCsv(
         choices: [...remainingColumnsForTimestamp, NONE_PREFERENCE_MAP],
       },
     ]);
-    currentState.timestampColum = timestampName;
+
+    currentState.setValue(timestampName, 'timestampColumn');
   }
-  logger.info(colors.magenta(`Using timestamp column "${currentState.timestampColum}"`));
+  logger.info(
+    colors.magenta(`Using timestamp column "${currentState.getValue('timestampColumn')}"`),
+  );

   // Validate that all rows have valid timestamp
-  if (currentState.timestampColum !== NONE_PREFERENCE_MAP) {
+  if (currentState.getValue('timestampColumn') !== NONE_PREFERENCE_MAP) {
     const timestampColumnsMissing = preferences
-      .map((pref, ind) => (pref[currentState.timestampColum!] ? null : [ind]))
+      .map((pref, ind) => (pref[currentState.getValue('timestampColumn')!] ? null : [ind]))
       .filter((x): x is number[] => !!x)
       .flat();
     if (timestampColumnsMissing.length > 0) {
       throw new Error(
-        `The timestamp column "${
-          currentState.timestampColum
-        }" is missing a value for the following rows: ${timestampColumnsMissing.join('\n')}`,
+        `The timestamp column "${currentState.getValue(
+          'timestampColumn',
+        )}" is missing a value for the following rows: ${timestampColumnsMissing.join('\n')}`,
       );
     }
     logger.info(
       colors.magenta(
-        `The timestamp column "${currentState.timestampColum}" is present for all row`,
+        `The timestamp column "${currentState.getValue('timestampColumn')}" is present for all rows`,
       ),
     );
   }
   return currentState;
 }
-/* eslint-enable no-param-reassign */
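For orientation between these two parsers: here is a sketch of how the `[NONE]` sentinel chosen above is consumed downstream, mirroring the timestamp fallback in the uploader (the `timestampForRow` helper name is illustrative, not from the diff).

```ts
// Sketch of the '[NONE]' sentinel's downstream effect: rows carry their own
// timestamp only when a real column was mapped; otherwise the upload time is used.
const NONE_PREFERENCE_MAP = '[NONE]';

function timestampForRow(
  row: Record<string, string>,
  timestampColumn: string,
): Date {
  return timestampColumn === NONE_PREFERENCE_MAP
    ? new Date() // no column mapped: stamp with "now"
    : new Date(row[timestampColumn]);
}
```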
diff --git a/packages/cli/src/lib/preference-management/parsePreferenceIdentifiersFromCsv.ts b/packages/cli/src/lib/preference-management/parsePreferenceIdentifiersFromCsv.ts
index f99d5448..0bc77c57 100644
--- a/packages/cli/src/lib/preference-management/parsePreferenceIdentifiersFromCsv.ts
+++ b/packages/cli/src/lib/preference-management/parsePreferenceIdentifiersFromCsv.ts
@@ -1,11 +1,18 @@
-import { FileMetadataState } from '@transcend-io/sdk';
+import type { PersistedState } from '@transcend-io/persisted-state';
+import type { PreferenceStoreIdentifier } from '@transcend-io/privacy-types';
+import type { Identifier } from '@transcend-io/sdk';
+import type { FileFormatState, IdentifierMetadataForPreference } from '@transcend-io/sdk';
+import Bluebird from 'bluebird';
 import colors from 'colors';
 import inquirer from 'inquirer';
-import { uniq, groupBy, difference } from 'lodash-es';
+import { uniq, keyBy } from 'lodash-es';

 import { logger } from '../../logger.js';
 import { inquirerConfirmBoolean } from '../helpers/index.js';

+const { mapSeries } = Bluebird;
+
 /* eslint-disable no-param-reassign */

 /**
@@ -15,62 +22,129 @@ import { inquirerConfirmBoolean } from '../helpers/index.js';
 * and that all identifiers are unique.
 *
 * @param preferences - List of preferences
- * @param currentState - The current file metadata state for parsing this list
+ * @param options - Options
 * @returns The updated file metadata state
 */
 export async function parsePreferenceIdentifiersFromCsv(
   preferences: Record<string, string>[],
-  currentState: FileMetadataState,
+  {
+    schemaState,
+    orgIdentifiers,
+    allowedIdentifierNames,
+    identifierColumns,
+    nonInteractive = false,
+  }: {
+    /** The current state of the schema metadata */
+    schemaState: PersistedState<typeof FileFormatState>;
+    /** The list of identifiers configured for the org */
+    orgIdentifiers: Identifier[];
+    /** The list of identifier names that are allowed for this upload */
+    allowedIdentifierNames: string[];
+    /** The columns in the CSV that should be used as identifiers */
+    identifierColumns: string[];
+    /** When true, throw instead of prompting (for worker processes) */
+    nonInteractive?: boolean;
+  },
 ): Promise<{
   /** The updated state */
-  currentState: FileMetadataState;
+  schemaState: PersistedState<typeof FileFormatState>;
   /** The updated preferences */
   preferences: Record<string, string>[];
 }> {
+  const columnNames = uniq(preferences.map((x) => Object.keys(x)).flat()).filter((col) =>
+    identifierColumns.includes(col),
+  );
   // Determine columns to map
-  const columnNames = uniq(preferences.map((x) => Object.keys(x)).flat());
+  const orgIdentifiersByName = keyBy(orgIdentifiers, 'name');
+  const filteredOrgIdentifiers = allowedIdentifierNames
+    .map((name) => orgIdentifiersByName[name])
+    .filter(Boolean);
+  if (filteredOrgIdentifiers.length !== allowedIdentifierNames.length) {
+    const missingIdentifiers = allowedIdentifierNames.filter((name) => !orgIdentifiersByName[name]);
+    throw new Error(`No identifier configuration found for "${missingIdentifiers.join('","')}"`);
+  }
+  if (columnNames.length !== identifierColumns.length) {
+    const missingColumns = identifierColumns.filter((col) => !columnNames.includes(col));
+    throw new Error(
+      `The following identifier columns are missing from the CSV: "${missingColumns.join('","')}"`,
+    );
+  }
+
+  if (
+    filteredOrgIdentifiers.filter((identifier) => identifier.isUniqueOnPreferenceStore).length === 0
+  ) {
+    throw new Error(
+      'No unique identifier was provided. Please ensure that at least one ' +
+        'of the allowed identifiers is configured as unique on the preference store.',
+    );
+  }
+
+  // Determine the columns that could potentially be used for identifiers
+  const currentColumnToIdentifier = schemaState.getValue('columnToIdentifier');
+  await mapSeries(identifierColumns, async (col) => {
+    // Map the column to an identifier
+    const identifierMapping = currentColumnToIdentifier[col];
+    if (identifierMapping) {
+      logger.info(
+        colors.magenta(`Column "${col}" is associated with identifier "${identifierMapping.name}"`),
+      );
+      return;
+    }

-  // Determine the columns that could potentially be used for identifier
-  const remainingColumnsForIdentifier = difference(columnNames, [
-    ...(currentState.identifierColumn ? [currentState.identifierColumn] : []),
-    ...Object.keys(currentState.columnToPurposeName),
-  ]);
+    if (nonInteractive) {
+      throw new Error(
+        `Column "${col}" has no identifier mapping in the config. ` +
+          "Run 'transcend consent configure-preference-upload' to update the config.",
+      );
+    }
-  // Determine the identifier column to work off of
-  if (!currentState.identifierColumn) {
+    // If the column is not mapped, ask the user to map it
     const { identifierName } = await inquirer.prompt<{
       /** Identifier name */
       identifierName: string;
     }>([
       {
         name: 'identifierName',
-        message:
-          'Choose the column that will be used as the identifier to upload consent preferences by',
+        message: `Choose the identifier name for column "${col}"`,
         type: 'list',
-        default:
-          remainingColumnsForIdentifier.find((col) => col.toLowerCase().includes('email')) ||
-          remainingColumnsForIdentifier[0],
-        choices: remainingColumnsForIdentifier,
+        // Default to an allowed identifier whose name matches the column
+        default: allowedIdentifierNames.find((x) => x.startsWith(col)),
+        choices: allowedIdentifierNames,
       },
     ]);
-    currentState.identifierColumn = identifierName;
-  }
-  logger.info(colors.magenta(`Using identifier column "${currentState.identifierColumn}"`));
+    currentColumnToIdentifier[col] = {
+      name: identifierName,
+      isUniqueOnPreferenceStore: orgIdentifiersByName[identifierName].isUniqueOnPreferenceStore,
+    };
+  });
+  schemaState.setValue(currentColumnToIdentifier, 'columnToIdentifier');
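The validation that follows boils down to two predicates; a condensed sketch (the function and type names are illustrative, mirroring the checks in the surrounding code):

```ts
interface IdentifierMapping {
  /** Identifier name in Transcend */
  name: string;
  /** Whether this identifier uniquely keys a preference store record */
  isUniqueOnPreferenceStore: boolean;
}

/** The mapping is usable only if some mapped identifier is unique on the store. */
function mappingHasUniqueIdentifier(
  columnToIdentifier: Record<string, IdentifierMapping>,
): boolean {
  return Object.values(columnToIdentifier).some((m) => m.isUniqueOnPreferenceStore);
}

/** A row is uploadable only if it populates at least one unique identifier column. */
function rowHasUniqueIdentifier(
  row: Record<string, string>,
  columnToIdentifier: Record<string, IdentifierMapping>,
): boolean {
  return Object.entries(columnToIdentifier).some(
    ([col, m]) => m.isUniqueOnPreferenceStore && !!row[col],
  );
}
```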
-  // Validate that the identifier column is present for all rows and unique
-  const identifierColumnsMissing = preferences
-    .map((pref, ind) => (pref[currentState.identifierColumn!] ? null : [ind]))
+  const uniqueIdentifierColumns = Object.entries(currentColumnToIdentifier)
+    .filter(([, identifierMapping]) => identifierMapping.isUniqueOnPreferenceStore)
+    .map(([col]) => col);
+
+  // Validate that at least one unique identifier column is present
+  const uniqueIdentifierMissingIndexes = preferences
+    .map((pref, ind) => (uniqueIdentifierColumns.some((col) => !!pref[col]) ? null : [ind]))
     .filter((x): x is number[] => !!x)
     .flat();
-  if (identifierColumnsMissing.length > 0) {
-    const msg = `The identifier column "${
-      currentState.identifierColumn
-    }" is missing a value for the following rows: ${identifierColumnsMissing.join(', ')}`;
+
+  if (uniqueIdentifierMissingIndexes.length > 0) {
+    const msg = `The following rows ${uniqueIdentifierMissingIndexes.join(
+      ', ',
+    )} do not have any unique identifier values for the columns "${uniqueIdentifierColumns.join(
+      '", "',
+    )}".`;
     logger.warn(colors.yellow(msg));
+    if (nonInteractive) {
+      throw new Error(msg);
+    }
+
     // Ask user if they would like to skip rows missing an identifier
     const skip = await inquirerConfirmBoolean({
-      message: 'Would you like to skip rows missing an identifier?',
+      message: 'Would you like to skip rows missing unique identifiers?',
     });
     if (!skip) {
       throw new Error(msg);
@@ -78,49 +152,89 @@ export async function parsePreferenceIdentifiersFromCsv(

     // Filter out rows missing an identifier
     const previous = preferences.length;
-    preferences = preferences.filter((pref) => pref[currentState.identifierColumn!]);
+    preferences = preferences.filter(
+      (pref, index) => !uniqueIdentifierMissingIndexes.includes(index),
+    );
     logger.info(
-      colors.yellow(`Skipped ${previous - preferences.length} rows missing an identifier`),
+      colors.yellow(`Skipped ${previous - preferences.length} rows missing unique identifiers`),
     );
   }
   logger.info(
     colors.magenta(
-      `The identifier column "${currentState.identifierColumn}" is present for all rows`,
+      `At least one unique identifier column is present for all ${preferences.length} rows.`,
     ),
   );

-  // Validate that all identifiers are unique
-  const rowsByUserId = groupBy(preferences, currentState.identifierColumn);
-  const duplicateIdentifiers = Object.entries(rowsByUserId).filter(([, rows]) => rows.length > 1);
-  if (duplicateIdentifiers.length > 0) {
-    const msg = `The identifier column "${
-      currentState.identifierColumn
-    }" has duplicate values for the following rows: ${duplicateIdentifiers
-      .slice(0, 10)
-      .map(([userId, rows]) => `${userId} (${rows.length})`)
-      .join('\n')}`;
-    logger.warn(colors.yellow(msg));
+  return { schemaState, preferences };
+}
+/* eslint-enable no-param-reassign */

-    // Ask user if they would like to take the most recent update
-    // for each duplicate identifier
-    const skip = await inquirerConfirmBoolean({
-      message: 'Would you like to automatically take the latest update?',
-    });
-    if (!skip) {
-      throw new Error(msg);
-    }
-    preferences = Object.entries(rowsByUserId)
-      .map(([, rows]) => {
-        const sorted = rows.sort(
-          (a, b) =>
-            new Date(b[currentState.timestampColum!]).getTime() -
-            new Date(a[currentState.timestampColum!]).getTime(),
-        );
-        return sorted[0];
-      })
-      .filter((x) => x);
-  }

+/**
+ * Helper function to get the identifiers payload from a row
+ *
+ * @param options - Options
+ * @param options.row - The current row from CSV file
+ * @param options.columnToIdentifier - The column to identifier mapping metadata
+ * @returns The updated preferences with identifiers payload
+ */
+export function getPreferenceIdentifiersFromRow({
+  row,
+  columnToIdentifier,
+}: {
+  /** The current row from CSV file */
+  row: Record<string, string>;
+  /** The current file metadata state */
+  columnToIdentifier: FileFormatState['columnToIdentifier'];
+}): PreferenceStoreIdentifier[] {
+  const identifiers = Object.entries(columnToIdentifier)
+    .filter(([col]) => !!row[col])
+    .map(([col, identifierMapping]) => ({
+      name: identifierMapping.name,
+      value: row[col],
+    }));
+  // put email first if it exists
+  // TODO: https://linear.app/transcend/issue/PIK-285/set-precedence-of-unique-identifiers - remove email logic
+  return identifiers.sort(
+    (a, b) =>
+      (a.name === 'email' ? -1 : 0) - (b.name === 'email' ? -1 : 0) ||
+      a.name.localeCompare(b.name, undefined, { sensitivity: 'base' }),
+  );
+}

-  return { currentState, preferences };
+/**
+ * Helper function to get the unique identifier names present in a row
+ *
+ * @param options - Options
+ * @param options.row - The current row from CSV file
+ * @param options.columnToIdentifier - The column to identifier mapping metadata
+ * @returns The unique identifier names present in the row
+ */
+export function getUniquePreferenceIdentifierNamesFromRow({
+  row,
+  columnToIdentifier,
+}: {
+  /** The current row from CSV file */
+  row: Record<string, string>;
+  /** The current file metadata state */
+  columnToIdentifier: FileFormatState['columnToIdentifier'];
+}): (IdentifierMetadataForPreference & {
+  /** Column name */
+  columnName: string;
+  /** Value of the identifier in the row */
+  value: string;
+})[] {
+  // TODO: https://linear.app/transcend/issue/PIK-285/set-precedence-of-unique-identifiers - remove email logic
+  // sort email to the front
+  return Object.entries(columnToIdentifier)
+    .sort(
+      ([, a], [, b]) =>
+        (a.name === 'email' ? -1 : 0) - (b.name === 'email' ? -1 : 0) ||
+        a.name.localeCompare(b.name, undefined, { sensitivity: 'base' }),
+    )
+    .filter(([col]) => row[col] && columnToIdentifier[col].isUniqueOnPreferenceStore)
+    .map(([col, identifier]) => ({
+      ...identifier,
+      columnName: col,
+      value: row[col],
+    }));
+}
-/* eslint-enable no-param-reassign */
diff --git a/packages/cli/src/lib/preference-management/parsePreferenceManagementCsv.ts b/packages/cli/src/lib/preference-management/parsePreferenceManagementCsv.ts
index 421655ef..0e6ee066 100644
--- a/packages/cli/src/lib/preference-management/parsePreferenceManagementCsv.ts
+++ b/packages/cli/src/lib/preference-management/parsePreferenceManagementCsv.ts
@@ -1,34 +1,43 @@
 import { PersistedState } from '@transcend-io/persisted-state';
+import type { PreferenceQueryResponseItem } from '@transcend-io/privacy-types';
 import {
   checkIfPendingPreferenceUpdatesAreNoOp,
   checkIfPendingPreferenceUpdatesCauseConflict,
-  FileMetadataState,
   getPreferencesForIdentifiers,
   getPreferenceUpdatesFromRow,
-  PreferenceState,
+  type FileFormatState,
+  type Identifier,
+  type PendingSafePreferenceUpdates,
+  type PendingWithConflictPreferenceUpdates,
   type PreferenceTopic,
+  type PreferenceUploadProgress,
+  type RequestUploadReceipts,
+  type SkippedPreferenceUpdates,
 } from '@transcend-io/sdk';
-import cliProgress from 'cli-progress';
+import type { ObjByString } from '@transcend-io/type-utils';
 import colors from 'colors';
 import type { Got } from 'got';
-import * as t from 'io-ts';
 import { keyBy } from 'lodash-es';

 import { logger } from '../../logger.js';
-import { readCsv } from '../requests/index.js';
 import { parsePreferenceAndPurposeValuesFromCsv } from './parsePreferenceAndPurposeValuesFromCsv.js';
-import { parsePreferenceIdentifiersFromCsv } from './parsePreferenceIdentifiersFromCsv.js';
-import { parsePreferenceTimestampsFromCsv } from './parsePreferenceTimestampsFromCsv.js';
+import { parsePreferenceFileFormatFromCsv } from './parsePreferenceFileFormatFromCsv.js';
+import {
+  getUniquePreferenceIdentifierNamesFromRow,
+  parsePreferenceIdentifiersFromCsv,
+} from './parsePreferenceIdentifiersFromCsv.js';

 /**
 * Parse a file into the cache
 *
+ * @param rawPreferences - The preferences to parse
 * @param options - Options
- * @param cache - The cache to store the parsed file in
+ * @param schemaState - The schema state to use for parsing the file
 * @returns The cache with the parsed file
 */
 export async function parsePreferenceManagementCsvWithCache(
+  rawPreferences: Record<string, string>[],
   {
     file,
     sombra,
@@ -37,6 +46,14 @@ export async function parsePreferenceManagementCsvWithCache(
     partitionKey,
     skipExistingRecordCheck,
     forceTriggerWorkflows,
+    orgIdentifiers,
+    allowedIdentifierNames,
+    identifierColumns,
+    downloadIdentifierConcurrency,
+    identifierDownloadLogInterval,
+    columnsToIgnore,
+    onProgress,
+    nonInteractive = false,
   }: {
     /** File to parse */
     file: string;
@@ -52,92 +69,163 @@ export async function parsePreferenceManagementCsvWithCache(
     skipExistingRecordCheck: boolean;
     /** Whether to force workflow triggers */
     forceTriggerWorkflows: boolean;
+    /** Identifiers configured for the org */
+    orgIdentifiers: Identifier[];
+    /** Allowed identifier names */
+    allowedIdentifierNames: string[];
+    /** Identifier columns on the CSV file */
+    identifierColumns: string[];
+    /** Columns to ignore in the CSV file */
+    columnsToIgnore: string[];
+    /** The interval to log upload progress */
+    identifierDownloadLogInterval: number;
+    /** Concurrency for downloading identifiers */
+    downloadIdentifierConcurrency: number;
+    /** On progress callback */
+    onProgress?: (info: PreferenceUploadProgress) => void;
+    /** When true, throw instead of prompting (for worker processes) */
+    nonInteractive?: boolean;
   },
-  cache: PersistedState<typeof PreferenceState>,
-): Promise<void> {
+  schemaState: PersistedState<typeof FileFormatState>,
+): Promise<{
+  /** Pending safe updates */
+  pendingSafeUpdates: PendingSafePreferenceUpdates;
+  /** Pending conflict updates */
+  pendingConflictUpdates: PendingWithConflictPreferenceUpdates;
+  /** Skipped updates */
+  skippedUpdates: SkippedPreferenceUpdates;
+}> {
   // Start the timer
   const t0 = new Date().getTime();

-  // Get the current metadata
-  const fileMetadata = cache.getValue('fileMetadata');
-
-  // Read in the file
-  logger.info(colors.magenta(`Reading in file: "${file}"`));
-  let preferences = readCsv(file, t.record(t.string, t.string));
-
-  // start building the cache, can use previous cache as well
-  let currentState: FileMetadataState = {
-    columnToPurposeName: {},
-    pendingSafeUpdates: {},
-    pendingConflictUpdates: {},
-    skippedUpdates: {},
-    // Load in the last fetched time
-    ...((fileMetadata[file] || {}) as Partial<FileMetadataState>),
-    lastFetchedAt: new Date().toISOString(),
-  };
-  // Validate that all timestamps are present in the file
-  currentState = await parsePreferenceTimestampsFromCsv(preferences, currentState);
-  fileMetadata[file] = currentState;
-  await cache.setValue(fileMetadata, 'fileMetadata');
+  await parsePreferenceFileFormatFromCsv(rawPreferences, schemaState, {
+    nonInteractive,
+  });

   // Validate that all identifiers are present and unique
-  const result = await parsePreferenceIdentifiersFromCsv(preferences, currentState);
-  currentState = result.currentState;
-  preferences = result.preferences;
-  fileMetadata[file] = currentState;
-  await cache.setValue(fileMetadata, 'fileMetadata');
-
-  // Ensure all other columns are mapped to purpose and preference
-  // slug values
-  currentState = await parsePreferenceAndPurposeValuesFromCsv(preferences, currentState, {
+  const result = await parsePreferenceIdentifiersFromCsv(rawPreferences, {
+    schemaState,
+    orgIdentifiers,
+    allowedIdentifierNames,
+    identifierColumns,
+    nonInteractive,
+  });
+  const { preferences } = result;
+
+  // Ensure all other columns are mapped to purpose and preference slug values
+  await parsePreferenceAndPurposeValuesFromCsv(preferences, schemaState, {
     preferenceTopics,
     purposeSlugs,
     forceTriggerWorkflows,
+    columnsToIgnore,
+    nonInteractive,
   });
-  fileMetadata[file] = currentState;
-  await cache.setValue(fileMetadata, 'fileMetadata');

   // Grab existing preference store records
-  const identifiers = preferences.map((pref) => pref[currentState.identifierColumn!]);
-  const progressBar = new cliProgress.SingleBar({}, cliProgress.Presets.shades_classic);
-  if (!skipExistingRecordCheck) {
-    progressBar.start(identifiers.length, 0);
-  }
+  const currentColumnToIdentifierMap = schemaState.getValue('columnToIdentifier');
+  const currentColumnToPurposeName = schemaState.getValue('columnToPurposeName');
+  const identifiers = preferences.flatMap((pref) =>
+    getUniquePreferenceIdentifierNamesFromRow({
+      row: pref,
+      columnToIdentifier: currentColumnToIdentifierMap,
+    }),
+  );
+
   const existingConsentRecords = skipExistingRecordCheck
     ? []
     : await getPreferencesForIdentifiers(sombra, {
-        identifiers: identifiers.map((x) => ({ value: x })),
+        identifiers,
+        logInterval: identifierDownloadLogInterval,
         partitionKey,
+        concurrency: downloadIdentifierConcurrency,
         logger,
-        onProgress: (completed, total) => progressBar.update(completed, { total }),
+        onProgress,
       });
-  progressBar.stop();
-  const consentRecordByIdentifier = keyBy(existingConsentRecords, 'userId');
+
+  // Create a map of all unique identifiers to consent records
+  const uniqueIdentifiers = Object.values(currentColumnToIdentifierMap)
+    .filter((x) => x.isUniqueOnPreferenceStore)
+    .map((x) => x.name);
+  const consentRecordByUniqueIdentifiers = uniqueIdentifiers.reduce(
+    (acc, identifier) => {
+      const recordsWithIdentifier = existingConsentRecords.filter((record) =>
+        (record.identifiers || []).some((id) => id.name === identifier && id.value),
+      );
+      acc[identifier] = keyBy(
+        recordsWithIdentifier,
+        (record) => (record.identifiers || []).find((id) => id.name === identifier)?.value || '',
+      );
+      return acc;
+    },
+    {} as Record<string, Record<string, PreferenceQueryResponseItem>>,
+  );

   // Clear out previous updates
-  currentState.pendingConflictUpdates = {};
-  currentState.pendingSafeUpdates = {};
-  currentState.skippedUpdates = {};
+  const pendingConflictUpdates: RequestUploadReceipts['pendingConflictUpdates'] = {};
+  const pendingSafeUpdates: Record<string, Record<string, string>> = {};
+  const skippedUpdates: RequestUploadReceipts['skippedUpdates'] = {};

   // Process each row
-  preferences.forEach((pref) => {
-    // Grab unique Id for the user
-    const userId = pref[currentState.identifierColumn!];
+  const seenAlready: Record<string, Record<string, string>> = {};
+  logger.log(
+    colors.green(
+      `Processing ${preferences.length} preferences with ${
+        Object.keys(currentColumnToIdentifierMap).length
+      } identifiers and ${Object.keys(currentColumnToPurposeName).length} purposes.`,
+    ),
+  );
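The loop that follows enforces one receipt key per row. Condensed as a pure function for illustration (the `dedupKey` name and `DedupResult` type are not from the diff; the `${key}___${index}` suffix mirrors the logic below):

```ts
/** Outcome of primary-key dedup for one row (mirrors the loop below). */
type DedupResult =
  | { kind: 'process'; key: string }
  | { kind: 'skip-identical'; key: string };

function dedupKey(
  seen: Record<string, Record<string, string>>,
  key: string,
  row: Record<string, string>,
  index: number,
): DedupResult {
  const prior = seen[key];
  if (!prior) return { kind: 'process', key };
  const identical = Object.entries(row).every(([k, v]) => prior[k] === v);
  // Identical repeats are recorded as skipped; conflicting repeats get a
  // distinct `${key}___${index}` receipt key so neither row is lost.
  return identical
    ? { kind: 'skip-identical', key: `${key}___${index}` }
    : { kind: 'process', key: `${key}___${index}` };
}
```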
+  preferences.forEach((pref, ind) => {
+    // Get the userIds that could be the primary key of the consent record
+    const uniqueIdentifiers = getUniquePreferenceIdentifierNamesFromRow({
+      row: pref,
+      columnToIdentifier: currentColumnToIdentifierMap,
+    });

     // determine updates for user
     const pendingUpdates = getPreferenceUpdatesFromRow({
       row: pref,
-      columnToPurposeName: currentState.columnToPurposeName,
+      columnToPurposeName: currentColumnToPurposeName,
       preferenceTopics,
       purposeSlugs,
     });

     // Grab current state of the update
-    const currentConsentRecord = consentRecordByIdentifier[userId];
+    const primaryKeyMetadata = uniqueIdentifiers[0];
+    const currentConsentRecord =
+      consentRecordByUniqueIdentifiers[primaryKeyMetadata.name][primaryKeyMetadata.value];
+    // If consent record is found use it, otherwise use the first unique identifier
+    let primaryKey = primaryKeyMetadata.value;
+    // Ensure this is unique
+    if (seenAlready[primaryKey]) {
+      if (!Object.entries(pref).every(([key, value]) => seenAlready[primaryKey][key] === value)) {
+        // Show a diff of what's changed between the duplicate rows
+        const previous = seenAlready[primaryKey];
+        const diffs = Object.entries(pref)
+          .filter(([key, value]) => previous[key] !== value)
+          .map(([key]) => key)
+          .join(', ');
+        logger.warn(
+          colors.yellow(`Duplicate primary key "${primaryKey}" at index ${ind}. Diff: ${diffs}`),
+        );
+        primaryKey = `${primaryKey}___${ind}`;
+      } else {
+        skippedUpdates[`${primaryKey}___${ind}`] = pref;
+        logger.warn(
+          colors.yellow(
+            `Duplicate primary key found: "${primaryKey}" at index: "${ind}" but rows are identical.`,
+          ),
+        );
+        return;
+      }
+    }
+    seenAlready[primaryKey] = pref;
+
     if (forceTriggerWorkflows && !currentConsentRecord) {
       throw new Error(
-        `No existing consent record found for user with id: ${userId}.
+        `No existing consent record found for user with ids: ${uniqueIdentifiers
+          .map((x) => x.value)
+          .join(', ')}.
         When 'forceTriggerWorkflows' is set all the user identifiers should contain a consent record`,
       );
     }
@@ -153,7 +241,7 @@ export async function parsePreferenceManagementCsvWithCache(
       }) &&
       !forceTriggerWorkflows
     ) {
-      currentState.skippedUpdates[userId] = pref;
+      skippedUpdates[primaryKey] = pref;
       return;
     }
@@ -166,7 +254,7 @@ export async function parsePreferenceManagementCsvWithCache(
         preferenceTopics,
       })
     ) {
-      currentState.pendingConflictUpdates[userId] = {
+      pendingConflictUpdates[primaryKey] = {
         row: pref,
         record: currentConsentRecord,
       };
@@ -174,12 +262,15 @@ export async function parsePreferenceManagementCsvWithCache(
     }

     // Add to pending updates
-    currentState.pendingSafeUpdates[userId] = pref;
+    pendingSafeUpdates[primaryKey] = pref;
   });

-  // Read in the file
-  fileMetadata[file] = currentState;
-  await cache.setValue(fileMetadata, 'fileMetadata');
   const t1 = new Date().getTime();
   logger.info(colors.green(`Successfully pre-processed file: "${file}" in ${(t1 - t0) / 1000}s`));
+
+  return {
+    pendingSafeUpdates,
+    pendingConflictUpdates,
+    skippedUpdates,
+  };
 }
diff --git a/packages/cli/src/lib/preference-management/tests/fetchConsentPreferencesChunked.test.ts b/packages/cli/src/lib/preference-management/tests/fetchConsentPreferencesChunked.test.ts
index e8c1d778..36c28d18 100644
--- a/packages/cli/src/lib/preference-management/tests/fetchConsentPreferencesChunked.test.ts
+++ b/packages/cli/src/lib/preference-management/tests/fetchConsentPreferencesChunked.test.ts
@@ -39,7 +39,7 @@ const H = vi.hoisted(() => ({
   // Each call to iterateConsentPages will shift one generator from here:
   iterators: [] as Array<AsyncGenerator<PreferenceQueryResponseItem[]>>,
   makeIter: (pages: PreferenceQueryResponseItem[][]) =>
-    // eslint-disable-next-line wrap-iife
+    // eslint-disable-next-line wrap-iife, func-names
     (async function* () {
       for (const p of pages) yield p;
     })(),
diff --git a/packages/cli/src/lib/preference-management/tests/getPreferencesForIdentifiers.test.ts b/packages/cli/src/lib/preference-management/tests/getPreferencesForIdentifiers.test.ts
index 6f65e6a5..4459e1d6 100644
--- a/packages/cli/src/lib/preference-management/tests/getPreferencesForIdentifiers.test.ts
+++ b/packages/cli/src/lib/preference-management/tests/getPreferencesForIdentifiers.test.ts
@@ -9,11 +9,15 @@ const H = vi.hoisted(() => ({
loggerSpies: { info: vi.fn(), warn: vi.fn(), - error: vi.fn(), - debug: vi.fn(), }, // Capture map options for assertions mapOpts: { current: undefined as unknown }, + // Fake progress bar instance methods + progressBar: { + start: vi.fn(), + update: vi.fn(), + stop: vi.fn(), + }, // Decode result stub makeDecodeResult: (nodes: PreferenceQueryResponseItem[]) => ({ nodes }), })); @@ -23,6 +27,16 @@ vi.mock('../../../logger.js', () => ({ logger: H.loggerSpies, })); +// Return a default export that has SingleBar and Presets +vi.mock('cli-progress', () => ({ + default: { + SingleBar: vi.fn(function MockSingleBar() { + return H.progressBar; + }), + Presets: { shades_classic: {} }, + }, +})); + // Keep colors stable vi.mock('colors', () => ({ default: { @@ -51,23 +65,12 @@ vi.mock('@transcend-io/utils', async (importOriginal) => ({ // decodeCodec should just return what we expect to consume vi.mock('@transcend-io/type-utils', async (importOriginal) => { const actual = await importOriginal(); - return { ...actual, decodeCodec: vi.fn((_codec, raw) => raw), }; }); -// withPreferenceRetry should invoke the provided fn and return its result, -// but we still want to see that it's being called. -const withRetrySpy = vi.fn(async (name: string, fn: () => Promise, _opts?: any) => fn()); - -vi.mock('../../../../../sdk/src/preference-management/withPreferenceRetry.js', () => ({ - withPreferenceRetry: (name: string, fn: unknown, opts?: unknown) => - // @ts-expect-error test-only - withRetrySpy(name, fn, opts), -})); - describe('getPreferencesForIdentifiers', () => { beforeEach(() => { vi.clearAllMocks(); @@ -79,11 +82,12 @@ describe('getPreferencesForIdentifiers', () => { it( 'chunks identifiers into groups of 100, calls the API per group, ' + - 'aggregates nodes, and reports progress via onProgress (skipLogging=true skips completion info log)', + 'aggregates nodes, and reports progress (skipLogging=true avoids start/info logs)', async () => { // Build 250 identifiers -> 3 groups: 100, 100, 50 const identifiers = Array.from({ length: 250 }, (_, i) => ({ value: `user-${i + 1}@ex.com`, + name: 'email', })); // Fake Got client with post().json() chain that returns a result based on the requested group @@ -105,8 +109,6 @@ describe('getPreferencesForIdentifiers', () => { name: string; }[]; }; - /** Limit */ - limit: number; }; }, ) => { @@ -130,15 +132,13 @@ describe('getPreferencesForIdentifiers', () => { ); const sombra = { post: postMock } as unknown as Got; - const onProgress = vi.fn(); const out = await getPreferencesForIdentifiers(sombra, { identifiers, partitionKey: 'p0', - skipLogging: true, // avoid completion logger.info + skipLogging: true, concurrency: 7, - logger: H.loggerSpies, - onProgress, + logger: H.loggerSpies as any, }); // Expect 3 calls (100 + 100 + 50) @@ -149,17 +149,14 @@ describe('getPreferencesForIdentifiers', () => { const call2Json = postMock.mock.calls[1][1].json; const call3Json = postMock.mock.calls[2][1].json; - expect(call1Json.limit).toBe(100); expect(call1Json.filter.identifiers).toHaveLength(100); expect(call1Json.filter.identifiers[0].value).toBe('user-1@ex.com'); expect(call1Json.filter.identifiers[99].value).toBe('user-100@ex.com'); - expect(call2Json.limit).toBe(100); expect(call2Json.filter.identifiers).toHaveLength(100); expect(call2Json.filter.identifiers[0].value).toBe('user-101@ex.com'); expect(call2Json.filter.identifiers[99].value).toBe('user-200@ex.com'); - expect(call3Json.limit).toBe(50); expect(call3Json.filter.identifiers).toHaveLength(50); 
expect(call3Json.filter.identifiers[0].value).toBe('user-201@ex.com'); expect(call3Json.filter.identifiers[49].value).toBe('user-250@ex.com'); @@ -168,13 +165,10 @@ describe('getPreferencesForIdentifiers', () => { expect(out).toHaveLength(250); expect(out).toHaveLength(250); - // onProgress after each group (SDK reports completed count vs total identifiers) - expect(onProgress).toHaveBeenCalledTimes(3); - expect(onProgress.mock.calls).toEqual([ - [100, 250], - [200, 250], - [250, 250], - ]); + // Progress bar is not used by the current implementation. + expect(H.progressBar.start).not.toHaveBeenCalled(); + expect(H.progressBar.update).not.toHaveBeenCalled(); + expect(H.progressBar.stop).not.toHaveBeenCalled(); // Logger.info only at the end when !skipLogging, so not in this test expect(H.loggerSpies.info).not.toHaveBeenCalled(); @@ -183,14 +177,15 @@ describe('getPreferencesForIdentifiers', () => { // @ts-expect-error test-only capture expect(H.mapOpts.current?.concurrency).toBe(7); - // Ensure wrapper was used for each group - expect(withRetrySpy).toHaveBeenCalledTimes(3); + // withPreferenceRetry is used internally by the SDK — verified via sombra.post calls + expect(sombra.post).toHaveBeenCalledTimes(3); }, ); - it('logs completion when skipLogging=false and invokes onProgress per group', async () => { + it('logs progress start and completion when skipLogging=false', async () => { const identifiers = Array.from({ length: 5 }, (_, i) => ({ value: `u${i + 1}`, + name: 'test-id', })); const postMock = vi.fn( @@ -211,8 +206,6 @@ describe('getPreferencesForIdentifiers', () => { name: string; }[]; }; - /** Limit */ - limit: number; }; }, ) => { @@ -234,19 +227,17 @@ describe('getPreferencesForIdentifiers', () => { ); const sombra = { post: postMock } as unknown as Got; - const onProgress = vi.fn(); const out = await getPreferencesForIdentifiers(sombra, { identifiers, partitionKey: 'pA', skipLogging: false, concurrency: 2, - logger: H.loggerSpies, - onProgress, + logger: H.loggerSpies as any, }); expect(out).toHaveLength(5); - expect(onProgress).toHaveBeenCalledTimes(1); - expect(onProgress).toHaveBeenCalledWith(5, 5); + expect(H.progressBar.start).not.toHaveBeenCalled(); + expect(H.progressBar.stop).not.toHaveBeenCalled(); // Completion info log called once expect(H.loggerSpies.info).toHaveBeenCalledTimes(1); diff --git a/packages/cli/src/lib/preference-management/uploadPreferenceManagementPreferencesInteractive.ts b/packages/cli/src/lib/preference-management/uploadPreferenceManagementPreferencesInteractive.ts deleted file mode 100644 index b0b0aedf..00000000 --- a/packages/cli/src/lib/preference-management/uploadPreferenceManagementPreferencesInteractive.ts +++ /dev/null @@ -1,271 +0,0 @@ -import { PersistedState } from '@transcend-io/persisted-state'; -import { PreferenceUpdateItem } from '@transcend-io/privacy-types'; -import { - buildTranscendGraphQLClient, - createSombraGotInstance, - fetchAllPurposes, - fetchAllPreferenceTopics, - getPreferenceUpdatesFromRow, - PreferenceState, -} from '@transcend-io/sdk'; -import { apply } from '@transcend-io/type-utils'; -import { map } from '@transcend-io/utils'; -import cliProgress from 'cli-progress'; -import colors from 'colors'; -import { chunk } from 'lodash-es'; - -import { logger } from '../../logger.js'; -import { parseAttributesFromString } from '../requests/index.js'; -import { parsePreferenceManagementCsvWithCache } from './parsePreferenceManagementCsv.js'; -import { NONE_PREFERENCE_MAP } from './parsePreferenceTimestampsFromCsv.js'; - 
-/** - * Upload a set of consent preferences - * - * @param options - Options - */ -export async function uploadPreferenceManagementPreferencesInteractive({ - auth, - sombraAuth, - receiptFilepath, - file, - partition, - isSilent = true, - dryRun = false, - skipWorkflowTriggers = false, - skipConflictUpdates = false, - skipExistingRecordCheck = false, - attributes = [], - transcendUrl, - forceTriggerWorkflows = false, -}: { - /** The Transcend API key */ - auth: string; - /** Sombra API key authentication */ - sombraAuth?: string; - /** Partition key */ - partition: string; - /** File where to store receipt and continue from where left off */ - receiptFilepath: string; - /** The file to process */ - file: string; - /** API URL for Transcend backend */ - transcendUrl: string; - /** Whether to do a dry run */ - dryRun?: boolean; - /** Whether to upload as isSilent */ - isSilent?: boolean; - /** Attributes string pre-parse. In format Key:Value */ - attributes?: string[]; - /** Skip workflow triggers */ - skipWorkflowTriggers?: boolean; - /** - * When true, only update preferences that do not conflict with existing - * preferences. When false, update all preferences in CSV based on timestamp. - */ - skipConflictUpdates?: boolean; - /** Whether to skip the check for existing records. SHOULD ONLY BE USED FOR INITIAL UPLOAD */ - skipExistingRecordCheck?: boolean; - /** Whether to force trigger workflows */ - forceTriggerWorkflows?: boolean; -}): Promise { - // Parse out the extra attributes to apply to all requests uploaded - const parsedAttributes = parseAttributesFromString(attributes); - - // Create a new state file to store the requests from this run - const preferenceState = new PersistedState(receiptFilepath, PreferenceState, { - fileMetadata: {}, - failingUpdates: {}, - pendingUpdates: {}, - }); - const failingRequests = preferenceState.getValue('failingUpdates'); - const pendingRequests = preferenceState.getValue('pendingUpdates'); - let fileMetadata = preferenceState.getValue('fileMetadata'); - - logger.info( - colors.magenta( - 'Restored cache, there are: \n' + - `${Object.values(failingRequests).length} failing requests to be retried\n` + - `${Object.values(pendingRequests).length} pending requests to be processed\n` + - `The following files are stored in cache and will be used:\n${Object.keys(fileMetadata) - .map((x) => x) - .join('\n')}\n` + - `The following file will be processed: ${file}\n`, - ), - ); - - // Create GraphQL client to connect to Transcend backend - const client = buildTranscendGraphQLClient(transcendUrl, auth); - - const [sombra, purposes, preferenceTopics] = await Promise.all([ - // Create sombra instance to communicate with - createSombraGotInstance(transcendUrl, auth, { - logger, - sombraApiKey: sombraAuth, - sombraUrl: process.env.SOMBRA_URL, - }), - // get all purposes and topics - fetchAllPurposes(client, { logger }), - fetchAllPreferenceTopics(client, { logger }), - ]); - - // Process the file - await parsePreferenceManagementCsvWithCache( - { - file, - purposeSlugs: purposes.map((x) => x.trackingType), - preferenceTopics, - sombra, - partitionKey: partition, - skipExistingRecordCheck, - forceTriggerWorkflows, - }, - preferenceState, - ); - - // Construct the pending updates - const pendingUpdates: Record = {}; - fileMetadata = preferenceState.getValue('fileMetadata'); - const metadata = fileMetadata[file]; - - logger.info( - colors.magenta( - `Found ${Object.entries(metadata.pendingSafeUpdates).length} safe updates in ${file}`, - ), - ); - logger.info( - 
colors.magenta( - `Found ${Object.entries(metadata.pendingConflictUpdates).length} conflict updates in ${file}`, - ), - ); - logger.info( - colors.magenta( - `Found ${Object.entries(metadata.skippedUpdates).length} skipped updates in ${file}`, - ), - ); - - // Update either safe updates only or safe + conflict - Object.entries({ - ...metadata.pendingSafeUpdates, - ...(skipConflictUpdates ? {} : apply(metadata.pendingConflictUpdates, ({ row }) => row)), - }).forEach(([userId, update]) => { - // Determine timestamp - const timestamp = - metadata.timestampColum === NONE_PREFERENCE_MAP - ? new Date() - : new Date(update[metadata.timestampColum!]); - - // Determine updates - const updates = getPreferenceUpdatesFromRow({ - row: update, - columnToPurposeName: metadata.columnToPurposeName, - preferenceTopics, - purposeSlugs: purposes.map((x) => x.trackingType), - }); - pendingUpdates[userId] = { - userId, - partition, - timestamp: timestamp.toISOString(), - purposes: Object.entries(updates).map(([purpose, value]) => ({ - ...value, - purpose, - workflowSettings: { - attributes: parsedAttributes, - isSilent, - skipWorkflowTrigger: skipWorkflowTriggers, - ...(forceTriggerWorkflows ? { forceTriggerWorkflow: forceTriggerWorkflows } : {}), - }, - })), - }; - }); - await preferenceState.setValue(pendingUpdates, 'pendingUpdates'); - await preferenceState.setValue({}, 'failingUpdates'); - - // Exist early if dry run - if (dryRun) { - logger.info( - colors.green( - `Dry run complete, exiting. ${ - Object.values(pendingUpdates).length - } pending updates. Check file: ${receiptFilepath}`, - ), - ); - return; - } - - logger.info( - colors.magenta( - `Uploading ${Object.values(pendingUpdates).length} preferences to partition: ${partition}`, - ), - ); - - // Time duration - const t0 = new Date().getTime(); - - // create a new progress bar instance and use shades_classic theme - const progressBar = new cliProgress.SingleBar({}, cliProgress.Presets.shades_classic); - - // Build a GraphQL client - let total = 0; - const updatesToRun = Object.entries(pendingUpdates); - const chunkedUpdates = chunk(updatesToRun, skipWorkflowTriggers ? 
100 : 10); - progressBar.start(updatesToRun.length, 0); - await map( - chunkedUpdates, - async (currentChunk) => { - // Make the request - try { - await sombra - .put('v1/preferences', { - json: { - records: currentChunk.map(([, update]) => update), - skipWorkflowTriggers, - }, - }) - .json(); - } catch (err) { - try { - const parsed = JSON.parse(err?.response?.body || '{}'); - if (parsed.error) { - logger.error(colors.red(`Error: ${parsed.error}`)); - } - } catch { - // continue - } - logger.error( - colors.red( - `Failed to upload ${currentChunk.length} user preferences to partition ${partition}: ${ - err?.response?.body || err?.message - }`, - ), - ); - const failingUpdates = preferenceState.getValue('failingUpdates'); - currentChunk.forEach(([userId, update]) => { - failingUpdates[userId] = { - uploadedAt: new Date().toISOString(), - update, - error: err?.response?.body || err?.message || 'Unknown error', - }; - }); - await preferenceState.setValue(failingUpdates, 'failingUpdates'); - } - - total += currentChunk.length; - progressBar.update(total); - }, - { - concurrency: 40, - }, - ); - - progressBar.stop(); - const t1 = new Date().getTime(); - const totalTime = t1 - t0; - logger.info( - colors.green( - `Successfully uploaded ${ - updatesToRun.length - } user preferences to partition ${partition} in "${totalTime / 1000}" seconds!`, - ), - ); -} diff --git a/packages/cli/src/lib/tests/codebase.test.ts b/packages/cli/src/lib/tests/codebase.test.ts index 43084f08..d412a5ea 100644 --- a/packages/cli/src/lib/tests/codebase.test.ts +++ b/packages/cli/src/lib/tests/codebase.test.ts @@ -204,7 +204,15 @@ describe('CLI Command Structure', () => { test('No unexpected files in command directories', () => { // Required + optional files in leaf command dirs const requiredFiles = ['command.ts', 'impl.ts']; - const optionalFiles = ['readme.ts', 'helpers.ts', 'types.ts', 'worker.ts', 'constants.ts']; + const optionalFiles = [ + 'readme.ts', + 'helpers.ts', + 'types.ts', + 'worker.ts', + 'constants.ts', + 'buildTaskOptions.ts', + 'schemaState.ts', + ]; // Allowed subdirectories in leaf command dirs const allowedDirs = ['artifacts', 'ui', 'upload', 'tests', '__mocks__', '__snapshots__']; diff --git a/packages/cli/tsdown.config.ts b/packages/cli/tsdown.config.ts index b593afe8..fa4919af 100644 --- a/packages/cli/tsdown.config.ts +++ b/packages/cli/tsdown.config.ts @@ -11,6 +11,7 @@ export default defineConfig({ 'src/index.ts', 'src/commands/admin/chunk-csv/worker.ts', 'src/commands/admin/parquet-to-csv/worker.ts', + 'src/commands/consent/upload-preferences/worker.ts', ], minify: true, splitting: true, diff --git a/packages/sdk/src/index.ts b/packages/sdk/src/index.ts index 7c5c1431..e8ded619 100644 --- a/packages/sdk/src/index.ts +++ b/packages/sdk/src/index.ts @@ -22,3 +22,4 @@ export function createMonorepoPackageDefinition( export * from './api/index.js'; export * from './data-inventory/index.js'; export * from './preference-management/index.js'; +export * from './preference-upload/index.js'; diff --git a/packages/sdk/src/preference-management/codecs.ts b/packages/sdk/src/preference-management/codecs.ts index 2a61a491..955457c3 100644 --- a/packages/sdk/src/preference-management/codecs.ts +++ b/packages/sdk/src/preference-management/codecs.ts @@ -87,47 +87,30 @@ export const ColumnMetadataMap = t.record(t.string, MetadataMapping); /** Override type */ export type ColumnMetadataMap = t.TypeOf; -export const FileMetadataState = t.intersection([ +export const FileFormatState = t.intersection([ t.type({ 
     /**
      * Definition of how to map each column in the CSV to
      * the relevant purpose and preference definitions in transcend
      */
-    columnToPurposeName: t.record(t.string, PurposeRowMapping),
+    columnToPurposeName: ColumnPurposeMap,
     /** Last time the file was last parsed at */
     lastFetchedAt: t.string,
-    /**
-     * Mapping of userId to the rows in the file that need to be uploaded
-     * These uploads are overwriting non-existent preferences and are safe
-     */
-    pendingSafeUpdates: t.record(t.string, t.record(t.string, t.string)),
-    /**
-     * Mapping of userId to the rows in the file that need to be uploaded
-     * these records have conflicts with existing consent preferences
-     */
-    pendingConflictUpdates: t.record(
-      t.string,
-      t.type({
-        record: PreferenceQueryResponseItem,
-        row: t.record(t.string, t.string),
-      }),
-    ),
-    /**
-     * Mapping of userId to the rows in the file that can be skipped because
-     * their preferences are already in the store
-     */
-    skippedUpdates: t.record(t.string, t.record(t.string, t.string)),
+    /** The column name that maps to the identifier */
+    columnToIdentifier: ColumnIdentifierMap,
   }),
   t.partial({
-    /** Determine which column name in file maps to consent record identifier to upload on */
-    identifierColumn: t.string,
     /** Determine which column name in file maps to the timestamp */
-    timestampColum: t.string,
+    timestampColumn: t.string,
+    /** Mapping of CSV column names to metadata keys */
+    columnToMetadata: ColumnMetadataMap,
+    /** CSV columns that should be ignored during upload */
+    columnsToIgnore: t.array(t.string),
   }),
 ]);

 /** Override type */
-export type FileMetadataState = t.TypeOf<typeof FileMetadataState>;
+export type FileFormatState = t.TypeOf<typeof FileFormatState>;

 /**
 * This is the type of the receipts that are stored in the file
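For reference, a value shaped to satisfy the new `FileFormatState` codec above might look like the following. All field contents are invented for illustration, and `columnToPurposeName` is left as a placeholder since `ColumnPurposeMap`'s row-mapping shape is defined elsewhere in the codecs.

```ts
// Hypothetical FileFormatState value (illustration only; not from the diff).
const exampleFormat = {
  lastFetchedAt: '2024-05-01T12:00:00.000Z',
  columnToPurposeName: {
    // e.g. a marketing opt-in column mapped per ColumnPurposeMap
  },
  columnToIdentifier: {
    // Entry shape mirrors what parsePreferenceIdentifiersFromCsv writes
    email: { name: 'email', isUniqueOnPreferenceStore: true },
  },
  timestampColumn: 'timestamp',
  columnsToIgnore: ['firstName', 'lastName'],
};
```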
diff --git a/packages/sdk/src/preference-management/getPreferenceIdentifiersFromRow.ts b/packages/sdk/src/preference-management/getPreferenceIdentifiersFromRow.ts
new file mode 100644
index 00000000..21286f01
--- /dev/null
+++ b/packages/sdk/src/preference-management/getPreferenceIdentifiersFromRow.ts
@@ -0,0 +1,34 @@
+import type { PreferenceStoreIdentifier } from '@transcend-io/privacy-types';
+
+import type { FileFormatState } from './codecs.js';
+
+/**
+ * Extract preference store identifiers from a CSV row based on the column-to-identifier mapping.
+ *
+ * @param options - Options
+ * @returns Array of identifiers for the preference store API
+ */
+export function getPreferenceIdentifiersFromRow({
+  row,
+  columnToIdentifier,
+}: {
+  /** The current row from the CSV file */
+  row: Record<string, string>;
+  /** Column-to-identifier mapping from the file format state */
+  columnToIdentifier: FileFormatState['columnToIdentifier'];
+}): PreferenceStoreIdentifier[] {
+  const identifiers = Object.entries(columnToIdentifier)
+    .filter(([col]) => !!row[col])
+    .map(([col, identifierMapping]) => ({
+      name: identifierMapping.name,
+      value: row[col]!,
+    }));
+  return identifiers.sort(
+    (a, b) =>
+      (a.name === 'email' ? -1 : 0) - (b.name === 'email' ? -1 : 0) ||
+      a.name.localeCompare(b.name, undefined, { sensitivity: 'base' }),
+  );
+}
+
+/** Sentinel value indicating no timestamp/format column was selected */
+export const NONE_PREFERENCE_MAP = '[NONE]';
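A hedged usage sketch of the email-first ordering, assuming `ColumnIdentifierMap` entries carry at least a `name`; the row values and column names are invented:

```ts
import { getPreferenceIdentifiersFromRow } from './getPreferenceIdentifiersFromRow.js';

const identifiers = getPreferenceIdentifiersFromRow({
  row: { Email: 'jane@example.com', Phone: '+15551234567', Empty: '' },
  columnToIdentifier: {
    Email: { name: 'email' },
    Phone: { name: 'phoneNumber' },
    Empty: { name: 'unused' }, // dropped: blank values are filtered out
  },
});
// => [{ name: 'email', value: 'jane@example.com' },
//     { name: 'phoneNumber', value: '+15551234567' }]
```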
diff --git a/packages/sdk/src/preference-management/getPreferencesForIdentifiers.ts b/packages/sdk/src/preference-management/getPreferencesForIdentifiers.ts
index bad9201d..e8fa7431 100644
--- a/packages/sdk/src/preference-management/getPreferencesForIdentifiers.ts
+++ b/packages/sdk/src/preference-management/getPreferencesForIdentifiers.ts
@@ -1,14 +1,18 @@
 import { PreferenceQueryResponseItem } from '@transcend-io/privacy-types';
 import { decodeCodec } from '@transcend-io/type-utils';
-import { map, type Logger } from '@transcend-io/utils';
+import { extractErrorMessage, map, splitInHalf, type Logger } from '@transcend-io/utils';
 import type { Got } from 'got';
 import { chunk } from 'lodash-es';
 
-import { ConsentPreferenceResponse } from './types.js';
+import { ConsentPreferenceResponse, type PreferenceUploadProgress } from './types.js';
 import { withPreferenceRetry } from './withPreferenceRetry.js';
 
 /**
- * Grab the current consent preference values for a list of identifiers
+ * Grab the current consent preference values for a list of identifiers.
+ *
+ * Uses recursive split-on-validation: if a group fails with
+ * "did not pass validation", it is halved and retried. Singletons
+ * that still fail are skipped.
  *
  * @param sombra - Backend to make API call to
  * @param options - Options
@@ -19,26 +23,31 @@ export async function getPreferencesForIdentifiers(
   {
     identifiers,
     partitionKey,
+    onProgress,
+    logInterval = 10000,
     skipLogging = false,
     concurrency = 40,
     logger,
-    onProgress,
   }: {
     /** The list of identifiers to look up */
     identifiers: {
       /** The value of the identifier */
       value: string;
+      /** The name of the identifier */
+      name: string;
     }[];
     /** The partition key to look up */
     partitionKey: string;
     /** Whether to skip logging */
     skipLogging?: boolean;
-    /** Concurrency for requests (default 40) */
+    /** The interval to log upload progress */
+    logInterval?: number;
+    /** Concurrency for fetching identifiers */
     concurrency?: number;
     /** Logger */
     logger: Logger;
-    /** Optional progress callback (completed count, total identifiers) */
-    onProgress?: (completed: number, total: number) => void;
+    /** Progress callback */
+    onProgress?: (info: PreferenceUploadProgress) => void;
   },
 ): Promise<PreferenceQueryResponseItem[]> {
   const results: PreferenceQueryResponseItem[] = [];
@@ -47,38 +56,101 @@ export async function getPreferencesForIdentifiers(
   const t0 = new Date().getTime();
   let total = 0;
 
-  await map(
-    groupedIdentifiers,
-    async (group) => {
-      const rawResult = await withPreferenceRetry(
-        'Preference Query',
-        () =>
-          sombra
-            .post(`v1/preferences/${partitionKey}/query`, {
-              json: {
-                filter: { identifiers: group },
-                limit: group.length,
-              },
-            })
-            .json(),
-        {
-          logger,
-          onRetry: (attempt, _err, msg) => {
-            logger.warn(
-              `[RETRY] group size=${group.length} partition=${partitionKey} attempt=${attempt}: ${msg}`,
-            );
-          },
-        },
+  onProgress?.({
+    successDelta: 0,
+    successTotal: 0,
+    fileTotal: identifiers.length,
+  });
+
+  const maybeLogProgress = (delta: number): void => {
+    onProgress?.({
+      successDelta: delta,
+      successTotal: total,
+      fileTotal: identifiers.length,
+    });
+
+    if (skipLogging) return;
+    const shouldLog =
+      total % logInterval === 0 ||
+      Math.floor((total - delta) / logInterval) < Math.floor(total / logInterval);
+    if (shouldLog) {
+      logger.info(
+        `Fetched ${total}/${identifiers.length} user preferences from partition ${partitionKey}`,
       );
+    }
+  };
+
+  const postGroupWithRetries = async (
+    group: { value: string; name: string }[],
+  ): Promise<PreferenceQueryResponseItem[]> => {
+    const rawResult = await withPreferenceRetry(
+      'Preference Query',
+      () =>
+        sombra
+          .post(`v1/preferences/${partitionKey}/query`, {
+            json: {
+              filter: { identifiers: group },
+            },
+          })
+          .json(),
+      {
+        logger,
+        onRetry: (attempt, _err, msg) => {
+          logger.warn(
+            `[RETRY v1/preferences/${partitionKey}/query] ` +
+              `group size=${group.length} partition=${partitionKey} attempt=${attempt}: ${msg}`,
+          );
+        },
+      },
+    );
+
+    const result = decodeCodec(ConsentPreferenceResponse, rawResult);
+    return result.nodes;
+  };
 
-      const result = decodeCodec(ConsentPreferenceResponse, rawResult);
-      results.push(...result.nodes);
+  /**
+   * Recursively process a group:
+   * - Try to fetch in one go.
+   * - If it fails with "did not pass validation", split into halves and recurse.
+   * - If the group is a singleton and still fails validation, skip it.
+   */
+  const processGroup = async (group: { value: string; name: string }[]): Promise<void> => {
+    try {
+      const nodes = await postGroupWithRetries(group);
+      results.push(...nodes);
       total += group.length;
-      onProgress?.(total, identifiers.length);
-    },
-    {
-      concurrency,
+      maybeLogProgress(group.length);
+    } catch (err) {
+      const msg = extractErrorMessage(err);
+
+      if (/did not pass validation/i.test(msg)) {
+        if (group.length === 1) {
+          const only = group[0]!;
+          logger.warn(`Skipping identifier "${only.value}" (${only.name}): ${msg}`);
+          total += 1;
+          maybeLogProgress(1);
+          return;
+        }
+
+        const [left, right] = splitInHalf(group);
+        logger.warn(
+          `Group of ${group.length} did not pass validation. Splitting into ${left.length} and ${right.length}.`,
+        );
+        await processGroup(left);
+        await processGroup(right);
+        return;
+      }
+
+      throw err;
+    }
+  };
+
+  await map(
+    groupedIdentifiers,
+    async (group) => {
+      await processGroup(group);
     },
+    { concurrency },
   );
 
   const t1 = new Date().getTime();
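A usage sketch under stated assumptions: `sombra` is an authenticated `Got` instance and the partition UUID is a placeholder. Passing `console` as the `Logger` mirrors the tests below:

```ts
import type { Got } from 'got';

import { getPreferencesForIdentifiers } from './getPreferencesForIdentifiers.js';

async function fetchExisting(sombra: Got): Promise<void> {
  const nodes = await getPreferencesForIdentifiers(sombra, {
    identifiers: [{ name: 'email', value: 'jane@example.com' }],
    partitionKey: '00000000-0000-0000-0000-000000000000',
    logger: console,
    onProgress: ({ successTotal, fileTotal }) => {
      // e.g. drive a progress bar with successTotal of fileTotal fetched
      console.error(`fetched ${successTotal}/${fileTotal}`);
    },
  });
  console.log(`found ${nodes.length} existing preference records`);
}
```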
diff --git a/packages/sdk/src/preference-management/index.ts b/packages/sdk/src/preference-management/index.ts
index e8ab78f7..b9dffd9a 100644
--- a/packages/sdk/src/preference-management/index.ts
+++ b/packages/sdk/src/preference-management/index.ts
@@ -5,6 +5,7 @@ export * from './createPreferenceAccessTokens.js';
 export * from './types.js';
 export * from './codecs.js';
 export * from './getPreferenceMetadataFromRow.js';
+export * from './getPreferenceIdentifiersFromRow.js';
 export * from './getPreferenceUpdatesFromRow.js';
 export * from './checkIfPendingPreferenceUpdatesAreNoOp.js';
 export * from './checkIfPendingPreferenceUpdatesCauseConflict.js';
diff --git a/packages/sdk/src/preference-management/types.ts b/packages/sdk/src/preference-management/types.ts
index af6e718d..62356271 100644
--- a/packages/sdk/src/preference-management/types.ts
+++ b/packages/sdk/src/preference-management/types.ts
@@ -46,3 +46,13 @@ export type PreferencesQueryFilter = {
 
 /** Which dimension we chunk on */
 export type ChunkMode = 'timestamp' | 'updated';
+
+/** Progress info emitted during preference upload/fetch operations */
+export interface PreferenceUploadProgress {
+  /** How many records just succeeded */
+  successDelta: number;
+  /** Cumulative successes in this file */
+  successTotal: number;
+  /** Total records that will be uploaded in this file */
+  fileTotal: number;
+}
diff --git a/packages/cli/src/commands/consent/upload-preferences/upload/batchUploader.ts b/packages/sdk/src/preference-upload/batchUploader.ts
similarity index 70%
rename from packages/cli/src/commands/consent/upload-preferences/upload/batchUploader.ts
rename to packages/sdk/src/preference-upload/batchUploader.ts
index 4b8ec063..cca8c39e 100644
--- a/packages/cli/src/commands/consent/upload-preferences/upload/batchUploader.ts
+++ b/packages/sdk/src/preference-upload/batchUploader.ts
@@ -4,11 +4,9 @@ import {
   getErrorStatus,
   retrySamePromise,
   splitInHalf,
+  type Logger,
   type RetryPolicy,
 } from '@transcend-io/utils';
-import colors from 'colors';
-
-import { logger } from '../../../../logger.js';
 
 type Entry = [string, PreferenceUpdateItem];
 
@@ -19,18 +17,15 @@ export interface BatchUploadPreferenceOptions {
 
 export interface BatchUploaderDeps {
   /** Network transport used for PUT uploads */
-  putBatch: (
-    /** The set of updates to put */
-    updates: PreferenceUpdateItem[],
-    /** The global options for each update */
-    opts: BatchUploadPreferenceOptions,
-  ) => Promise<void>;
+  putBatch: (updates: PreferenceUpdateItem[], opts: BatchUploadPreferenceOptions) => Promise<void>;
   /** Retry policy for retryable statuses */
   retryPolicy: RetryPolicy;
   /** Endpoint behavior flags */
   options: BatchUploadPreferenceOptions;
   /** Decide if a status is retryable *in place* (no splitting) */
   isRetryableStatus: (status?: number) => boolean;
+  /** Logger */
+  logger: Logger;
 }
 
 /**
@@ -57,7 +52,8 @@ export async function uploadChunkWithSplit(
     onFailureBatch: (entries: Entry[], err: unknown) => Promise<void>;
   },
 ): Promise<void> {
-  // Run the batch job
+  const { logger } = deps;
+
   const putAll = (): Promise<void> =>
     deps.putBatch(
       entries.map(([, u]) => u),
@@ -65,7 +61,6 @@ export async function uploadChunkWithSplit(
     );
 
   try {
-    // 1) Try the whole batch once.
     await putAll();
     await callbacks.onSuccess(entries);
   } catch (errRaw) {
@@ -73,53 +68,41 @@ export async function uploadChunkWithSplit(
     const status = getErrorStatus(err);
     const msg = extractErrorMessage(err);
 
-    // 2) For retryable statuses, attempt in-place retries without splitting.
     const isSoftRateLimit =
       status === 400 &&
       /slow down|please try again shortly|Throughput exceeds the current/i.test(msg);
 
     if (deps.isRetryableStatus(status) || isSoftRateLimit) {
       try {
-        await retrySamePromise(putAll, deps.retryPolicy, (note) =>
-          logger.warn(colors.yellow(note)),
-        );
+        await retrySamePromise(putAll, deps.retryPolicy, (note) => logger.warn(note));
         await callbacks.onSuccess(entries);
         return;
       } catch (err2) {
-        // If we *still* have a retryable status after exhausting attempts,
-        // mark the entire batch as failed (do NOT split).
         if (deps.isRetryableStatus(getErrorStatus(err2))) {
           logger.error(
-            colors.red(
-              `Exhausted retries for batch of ${entries.length}. Marking entire batch as failed.`,
-            ),
+            `Exhausted retries for batch of ${entries.length}. Marking entire batch as failed.`,
           );
           await callbacks.onFailureBatch(entries, err2);
           return;
         }
-        // Otherwise, fall through to split behavior with the new error.
         err = err2;
       }
     }
 
-    // 3) Non-retryable path: split the batch and recurse down to singletons.
     if (entries.length === 1) {
-      // Terminal case: one record left and it still fails → mark failure.
       try {
         await putAll();
         await callbacks.onSuccess(entries);
       } catch (singleErr) {
-        await callbacks.onFailureSingle(entries[0], singleErr);
+        await callbacks.onFailureSingle(entries[0]!, singleErr);
       }
       return;
     }
 
     const [left, right] = splitInHalf(entries);
     logger.warn(
-      colors.yellow(
-        `Non-retryable failure for batch of ${entries.length} (status=${status}): ${msg}. ` +
-          `Splitting into ${left.length} and ${right.length}.`,
-      ),
+      `Non-retryable failure for batch of ${entries.length} (status=${status}): ${msg}. ` +
+        `Splitting into ${left.length} and ${right.length}.`,
     );
 
     await uploadChunkWithSplit(left, deps, callbacks);
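A wiring sketch for `uploadChunkWithSplit`. The `PUT v1/preferences` body mirrors the deleted CLI implementation above; the `retryPolicy` values are hypothetical since `RetryPolicy`'s shape is not shown here, `BatchUploadPreferenceOptions` is assumed to expose the `skipWorkflowTriggers` flag the old code used, and receipt bookkeeping is reduced to console logging:

```ts
import type { PreferenceUpdateItem } from '@transcend-io/privacy-types';
import type { Got } from 'got';

import { uploadChunkWithSplit } from './batchUploader.js';

async function uploadBatch(
  sombra: Got,
  entries: [string, PreferenceUpdateItem][],
): Promise<void> {
  await uploadChunkWithSplit(
    entries,
    {
      // Spread opts so the sketch stays agnostic to the exact flag names
      putBatch: async (records, opts) => {
        await sombra.put('v1/preferences', { json: { records, ...opts } }).json();
      },
      retryPolicy: { maxAttempts: 5, baseDelayMs: 1000 }, // hypothetical shape
      options: { skipWorkflowTriggers: true },
      isRetryableStatus: (status) => status === 429 || (status ?? 0) >= 500,
      logger: console,
    },
    {
      onSuccess: async (ok) => console.log(`uploaded ${ok.length}`),
      onFailureSingle: async ([key], err) => console.error(`failed ${key}`, err),
      onFailureBatch: async (failed, err) =>
        console.error(`failed batch of ${failed.length}`, err),
    },
  );
}
```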
diff --git a/packages/sdk/src/preference-upload/buildPendingUpdates.ts b/packages/sdk/src/preference-upload/buildPendingUpdates.ts
new file mode 100644
index 00000000..b1cb27af
--- /dev/null
+++ b/packages/sdk/src/preference-upload/buildPendingUpdates.ts
@@ -0,0 +1,138 @@
+import type { PreferenceUpdateItem } from '@transcend-io/privacy-types';
+
+import type {
+  ColumnIdentifierMap,
+  ColumnMetadataMap,
+  ColumnPurposeMap,
+  PendingSafePreferenceUpdates,
+  PendingWithConflictPreferenceUpdates,
+} from '../preference-management/codecs.js';
+import type { PreferenceTopic } from '../preference-management/fetchAllPreferenceTopics.js';
+import type { Purpose } from '../preference-management/fetchAllPurposes.js';
+import {
+  getPreferenceIdentifiersFromRow,
+  NONE_PREFERENCE_MAP,
+} from '../preference-management/getPreferenceIdentifiersFromRow.js';
+import { getPreferenceMetadataFromRow } from '../preference-management/getPreferenceMetadataFromRow.js';
+import { getPreferenceUpdatesFromRow } from '../preference-management/getPreferenceUpdatesFromRow.js';
+
+/** Attribute key-value pair for workflow settings */
+export interface FormattedAttribute {
+  /** Attribute key */
+  key: string;
+  /** Attribute values */
+  values: string[];
+}
+
+export interface BuildPendingParams {
+  /** Safe updates keyed by user/primaryKey */
+  safe: PendingSafePreferenceUpdates;
+  /** Conflict updates keyed by user/primaryKey (value.row contains row data) */
+  conflicts: PendingWithConflictPreferenceUpdates;
+  /** Only upload safe updates (ignore conflicts entirely) */
+  skipConflictUpdates: boolean;
+  /** Name of the column to use as the preference timestamp (if available) */
+  timestampColumn?: string;
+  /** CSV column -> purpose/preference mapping */
+  columnToPurposeName: ColumnPurposeMap;
+  /** CSV column -> identifier mapping */
+  columnToIdentifier: ColumnIdentifierMap;
+  /** CSV column -> metadata key mapping (optional) */
+  columnToMetadata?: ColumnMetadataMap;
+  /** Full set of preference topics for resolving row -> preference values */
+  preferenceTopics: PreferenceTopic[];
+  /** Full set of purposes for resolving slugs/trackingTypes */
+  purposes: Purpose[];
+  /** Partition to attribute to every record */
+  partition: string;
+  /** Static attributes injected into workflow settings */
+  workflowAttrs: FormattedAttribute[];
+  /** If true, downstream should avoid user-visible notifications */
+  isSilent: boolean;
+  /** If true, skip triggering workflows downstream */
+  skipWorkflowTriggers: boolean;
+  /** If true, force trigger workflows even if preferences haven't changed */
+  forceTriggerWorkflows: boolean;
+}
+
+/**
+ * Convert parsed CSV rows into a map of PreferenceUpdateItem payloads.
+ *
+ * This function is pure (no IO, logging, or state writes).
+ *
+ * @param params - Transformation inputs
+ * @returns Map of primaryKey -> PreferenceUpdateItem
+ */
+export function buildPendingUpdates(
+  params: BuildPendingParams,
+): Record<string, PreferenceUpdateItem> {
+  const {
+    safe,
+    conflicts,
+    skipConflictUpdates,
+    timestampColumn,
+    columnToPurposeName,
+    columnToIdentifier,
+    columnToMetadata,
+    preferenceTopics,
+    purposes,
+    partition,
+    workflowAttrs,
+    isSilent,
+    skipWorkflowTriggers,
+    forceTriggerWorkflows,
+  } = params;
+
+  // eslint-disable-next-line @typescript-eslint/no-explicit-any
+  const merged: Record<string, any> = skipConflictUpdates
+    ? { ...safe }
+    : {
+        ...safe,
+        ...Object.fromEntries(Object.entries(conflicts).map(([id, v]) => [id, v.row])),
+      };
+
+  const purposeSlugs = purposes.map((x) => x.trackingType);
+  const out: Record<string, PreferenceUpdateItem> = {};
+
+  for (const [userId, row] of Object.entries(merged)) {
+    const ts =
+      timestampColumn === NONE_PREFERENCE_MAP || !timestampColumn
+        ? new Date()
+        : new Date(row[timestampColumn]);
+
+    const updates = getPreferenceUpdatesFromRow({
+      row,
+      columnToPurposeName,
+      preferenceTopics,
+      purposeSlugs,
+    });
+
+    const identifiers = getPreferenceIdentifiersFromRow({
+      row,
+      columnToIdentifier,
+    });
+
+    const metadata = columnToMetadata
+      ? getPreferenceMetadataFromRow({ row, columnToMetadata })
+      : undefined;
+
+    out[userId] = {
+      identifiers,
+      partition,
+      timestamp: ts.toISOString(),
+      purposes: Object.entries(updates).map(([purpose, value]) => ({
+        ...value,
+        purpose,
+        workflowSettings: {
+          attributes: workflowAttrs,
+          isSilent,
+          skipWorkflowTrigger: skipWorkflowTriggers,
+          forceTriggerWorkflow: forceTriggerWorkflows,
+        },
+      })),
+      ...(metadata && metadata.length > 0 ? { metadata } : {}),
+    };
+  }
+
+  return out;
+}
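A hedged sketch of calling `buildPendingUpdates` with empty purpose/topic reference data, so the resulting update carries identifiers and a timestamp but no purposes. The row shape, identifier mapping, and partition UUID are invented:

```ts
import { buildPendingUpdates } from './buildPendingUpdates.js';

const pending = buildPendingUpdates({
  safe: {
    'jane@example.com': {
      Email: 'jane@example.com',
      timestamp: '2024-01-01T00:00:00.000Z',
    },
  },
  conflicts: {},
  skipConflictUpdates: false,
  timestampColumn: 'timestamp',
  columnToPurposeName: {},
  columnToIdentifier: { Email: { name: 'email' } },
  preferenceTopics: [],
  purposes: [],
  partition: '00000000-0000-0000-0000-000000000000',
  workflowAttrs: [],
  isSilent: true,
  skipWorkflowTriggers: true,
  forceTriggerWorkflows: false,
});
// pending['jane@example.com'].identifiers
// => [{ name: 'email', value: 'jane@example.com' }]
```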
diff --git a/packages/sdk/src/preference-upload/index.ts b/packages/sdk/src/preference-upload/index.ts
new file mode 100644
index 00000000..8e6f18c6
--- /dev/null
+++ b/packages/sdk/src/preference-upload/index.ts
@@ -0,0 +1,4 @@
+export * from './progress.js';
+export * from './batchUploader.js';
+export * from './loadReferenceData.js';
+export * from './buildPendingUpdates.js';
diff --git a/packages/sdk/src/preference-upload/loadReferenceData.test.ts b/packages/sdk/src/preference-upload/loadReferenceData.test.ts
new file mode 100644
index 00000000..2265d224
--- /dev/null
+++ b/packages/sdk/src/preference-upload/loadReferenceData.test.ts
@@ -0,0 +1,65 @@
+import type { GraphQLClient } from 'graphql-request';
+import { describe, it, expect, vi, beforeEach } from 'vitest';
+
+import type { Identifier } from '../data-inventory/fetchAllIdentifiers.js';
+import type { PreferenceTopic } from '../preference-management/fetchAllPreferenceTopics.js';
+import type { Purpose } from '../preference-management/fetchAllPurposes.js';
+
+const H = vi.hoisted(() => ({
+  fetchAllPurposes: vi.fn(),
+  fetchAllPreferenceTopics: vi.fn(),
+  fetchAllIdentifiers: vi.fn(),
+}));
+
+vi.mock('../preference-management/fetchAllPurposes.js', () => ({
+  fetchAllPurposes: H.fetchAllPurposes,
+}));
+vi.mock('../preference-management/fetchAllPreferenceTopics.js', () => ({
+  fetchAllPreferenceTopics: H.fetchAllPreferenceTopics,
+}));
+vi.mock('../data-inventory/fetchAllIdentifiers.js', () => ({
+  fetchAllIdentifiers: H.fetchAllIdentifiers,
+}));
+
+import { loadReferenceData } from './loadReferenceData.js';
+
+describe('loadReferenceData', () => {
+  let client: GraphQLClient;
+
+  beforeEach(() => {
+    vi.clearAllMocks();
+    client = {
+      request: vi.fn().mockResolvedValue({}),
+    } as unknown as GraphQLClient;
+  });
+
+  it('loads purposes, topics, and identifiers', async () => {
+    const purposes = [{ id: 'p1' }, { id: 'p2' }] as Purpose[];
+    const preferenceTopics = [{ id: 't1' }] as PreferenceTopic[];
+    const identifiers = [{ id: 'i1' }, { id: 'i2' }] as Identifier[];
+
+    H.fetchAllPurposes.mockResolvedValueOnce(purposes);
+    H.fetchAllPreferenceTopics.mockResolvedValueOnce(preferenceTopics);
+    H.fetchAllIdentifiers.mockResolvedValueOnce(identifiers);
+
+    const result = await loadReferenceData(client, { logger: console });
+
+    expect(result.purposes).toEqual(purposes);
+    expect(result.preferenceTopics).toEqual(preferenceTopics);
+    expect(result.identifiers).toEqual(identifiers);
+
+    expect(H.fetchAllPurposes).toHaveBeenCalledTimes(1);
+    expect(H.fetchAllPreferenceTopics).toHaveBeenCalledTimes(1);
+    expect(H.fetchAllIdentifiers).toHaveBeenCalledTimes(1);
+  });
+
+  it('propagates errors (e.g., identifiers fetch fails)', async () => {
+    const err = new Error('boom');
+
+    H.fetchAllPurposes.mockResolvedValueOnce([{ id: 'p' }] as Purpose[]);
+    H.fetchAllPreferenceTopics.mockResolvedValueOnce([{ id: 't' }] as PreferenceTopic[]);
+    H.fetchAllIdentifiers.mockRejectedValueOnce(err);
+
+    await expect(loadReferenceData(client, { logger: console })).rejects.toBe(err);
+  });
+});
diff --git a/packages/sdk/src/preference-upload/loadReferenceData.ts b/packages/sdk/src/preference-upload/loadReferenceData.ts
new file mode 100644
index 00000000..bdd46fb3
--- /dev/null
+++ b/packages/sdk/src/preference-upload/loadReferenceData.ts
@@ -0,0 +1,37 @@
+import type { Logger } from '@transcend-io/utils';
+import type { GraphQLClient } from 'graphql-request';
+
+import { fetchAllIdentifiers, type Identifier } from '../data-inventory/fetchAllIdentifiers.js';
+import {
+  fetchAllPreferenceTopics,
+  type PreferenceTopic,
+} from '../preference-management/fetchAllPreferenceTopics.js';
+import { fetchAllPurposes, type Purpose } from '../preference-management/fetchAllPurposes.js';
+
+export interface PreferenceUploadReferenceData {
+  /** List of purposes in the organization */
+  purposes: Purpose[];
+  /** List of preference topics in the organization */
+  preferenceTopics: PreferenceTopic[];
+  /** List of identifiers in the organization */
+  identifiers: Identifier[];
+}
+
+/**
+ * Load all required reference data for an upload run.
+ *
+ * @param client - GraphQL client
+ * @param options - Options
+ * @returns Reference data arrays
+ */
+export async function loadReferenceData(
+  client: GraphQLClient,
+  { logger }: { logger: Logger },
+): Promise<PreferenceUploadReferenceData> {
+  const [purposes, preferenceTopics, identifiers] = await Promise.all([
+    fetchAllPurposes(client, { logger }),
+    fetchAllPreferenceTopics(client, { logger }),
+    fetchAllIdentifiers(client, { logger }),
+  ]);
+  return { purposes, preferenceTopics, identifiers };
+}
diff --git a/packages/sdk/src/preference-upload/progress.ts b/packages/sdk/src/preference-upload/progress.ts
new file mode 100644
index 00000000..31761a7b
--- /dev/null
+++ b/packages/sdk/src/preference-upload/progress.ts
@@ -0,0 +1,56 @@
+import type { PreferenceUploadProgress } from '../preference-management/types.js';
+
+/** Per-file summary emitted when a file finishes processing */
+export interface FileProgressInfo {
+  /** File path or identifier */
+  file: string;
+  /** Number of safe (non-conflicting) records uploaded */
+  safeCount: number;
+  /** Number of conflicting records uploaded */
+  conflictCount: number;
+  /** Number of records skipped (already in sync) */
+  skippedCount: number;
+  /** Number of records that failed to upload */
+  failedCount: number;
+  /** Total records in the file */
+  totalRecords: number;
+}
+
+/**
+ * Structured progress reporting interface for preference uploads.
+ *
+ * Consumers implement this to receive machine-readable progress updates:
+ * - CLI: writes receipts + updates terminal dashboard
+ * - Container: writes progress.json for Retool, posts to Transcend API
+ * - Agent: emits structured events
+ */
+export interface UploadProgressSink {
+  /** Called when a file starts processing */
+  onFileStart(file: string, totalRecords: number): void;
+  /** Called periodically as records are uploaded */
+  onFileProgress(file: string, progress: PreferenceUploadProgress): void;
+  /** Called when a file finishes (success or partial) */
+  onFileComplete(file: string, info: FileProgressInfo): void;
+  /** Called on non-fatal errors (e.g. skipped identifier) */
+  onError(file: string, error: string): void;
+  /** Called when the entire job finishes */
+  onJobComplete(summary: {
+    /** Total files processed */
+    totalFiles: number;
+    /** Total records across all files */
+    totalRecords: number;
+    /** Wall-clock time in milliseconds */
+    elapsedMs: number;
+    /** Per-file summaries */
+    filesCompleted: FileProgressInfo[];
+  }): void;
+}
+
+/** No-op sink for when progress reporting isn't needed */
+export const noopProgressSink: UploadProgressSink = {
+  onFileStart: () => {},
+  onFileProgress: () => {},
+  onFileComplete: () => {},
+  onError: () => {},
+  onJobComplete: () => {},
+};
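To close, a sketch of a concrete `UploadProgressSink` in the spirit of the container use case named in the docblock (mirroring progress to a JSON file). The file path and JSON shape are illustrative, not prescribed:

```ts
import { writeFileSync } from 'node:fs';

import type { UploadProgressSink } from './progress.js';

// Hypothetical container-style sink: every callback snapshots state to disk
// so an external dashboard can poll the file.
export function createJsonFileSink(path: string): UploadProgressSink {
  const files: Record<string, unknown> = {};
  const state: {
    files: Record<string, unknown>;
    lastError?: { file: string; error: string };
    summary?: unknown;
  } = { files };
  const flush = (): void => writeFileSync(path, JSON.stringify(state, null, 2));
  return {
    onFileStart: (file, totalRecords) => {
      files[file] = { totalRecords, successTotal: 0 };
      flush();
    },
    onFileProgress: (file, progress) => {
      files[file] = { ...progress };
      flush();
    },
    onFileComplete: (file, info) => {
      files[file] = info;
      flush();
    },
    onError: (file, error) => {
      state.lastError = { file, error };
      flush();
    },
    onJobComplete: (summary) => {
      state.summary = summary;
      flush();
    },
  };
}
```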