1- // VersionID format:
1+ // Hex VersionID format:
22// timestamp sequential_position rep_group_id other_information
33// where:
44// - timestamp 14 bytes epoch in ms (good until 5138)
55// - sequential_position 06 bytes position in the ms slot (1B ops)
66// - rep_group_id 07 bytes replication group identifier
77// - other_information arbitrary user input, such as a unique string
8+ //
9+ // Legacy Base62 VersionID:
10+ // timestamp sequential_position rep_group_id
11+ // where:
12+ // - timestamp 14 bytes epoch in ms
13+ // - sequential_position 06 bytes position in the ms slot
14+ // - rep_group_id 07 bytes replication group identifier
15+ //
16+ // Base62 VersionID:
17+ // timestamp sequential_position rep_group_id instance_id version_id_format
18+ // where:
19+ // - timestamp 14 bytes epoch in ms
20+ // - sequential_position 06 bytes position in the ms slot
21+ // - rep_group_id 07 bytes replication group identifier
22+ // - instance_id 06 bytes unique instance identifier
23+ // - version_id_format 02 bytes version ID format marker + version
824
925import base62Integer from 'base62' ;
1026import baseX from 'base-x' ;
27+ import assert from 'assert' ;
28+ import { VersioningConstants } from './constants' ;
1129const BASE62 = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' ;
1230const base62String = baseX ( BASE62 ) ;
1331
1432// the lengths of the components in bytes
1533const LENGTH_TS = 14 ; // timestamp: epoch in ms
1634const LENGTH_SEQ = 6 ; // position in ms slot
1735const LENGTH_RG = 7 ; // replication group id
36+ const LENGTH_ID = 6 ; // instance id
37+ const LENGTH_FT = 2 ; // version ID format, 1 byte + separator
1838
1939// empty string template for the variables in a versionId
2040const TEMPLATE_TS = new Array ( LENGTH_TS + 1 ) . join ( '0' ) ;
2141const TEMPLATE_SEQ = new Array ( LENGTH_SEQ + 1 ) . join ( '0' ) ;
2242const TEMPLATE_RG = new Array ( LENGTH_RG + 1 ) . join ( ' ' ) ;
43+ const TEMPLATE_ID = new Array ( LENGTH_ID + 1 ) . join ( '0' ) ;
2344
2445export const S3_VERSION_ID_ENCODING_TYPE = process . env . S3_VERSION_ID_ENCODING_TYPE ;
2546
26- // Counter that is increased after each call to generateUniqueVersionId
27- export let uidCounter = 0 ;
28- export const versionIdSeed = getVersionIdSeed ( ) ;
47+ // Flag to enable the new version ID (35 characters) over legacy shortID format (27 characters).
48+ // When enabled and S3_VERSION_ID_ENCODING_TYPE is 'base62':
49+ // - Uses new format: timestamp + sequential_position + rep_group_id + instance_id + version_id_format
50+ // - Includes instance_id field to differentiate version IDs across multiple instances in the same k8s cluster
51+ // - Appends format marker and version identifier for format detection
52+ // When disabled and S3_VERSION_ID_ENCODING_TYPE is 'base62':
53+ // - Uses old format: timestamp + sequential_position + rep_group_id (legacy 27-char format)
54+ // Falls back to hex encoding if S3_VERSION_ID_ENCODING_TYPE is 'hex' or unset
55+ export const ENABLE_FORMATTED_VERSION_ID =
56+ process . env . ENABLE_FORMATTED_VERSION_ID === 'true' ||
57+ process . env . ENABLE_FORMATTED_VERSION_ID === '1' ;
58+
59+ // version ID format added to the end of the version ID
60+ const VERSION_ID_FORMAT_VERSION = '1' ;
61+ const VERSION_ID_FORMAT_SUFFIX = `${ VersioningConstants . VersionId . FormatMarker } ${ VERSION_ID_FORMAT_VERSION } ` ;
62+ assert ( VERSION_ID_FORMAT_SUFFIX . length === LENGTH_FT , `versionID format must be ${ LENGTH_FT } bytes` ) ;
63+
64+ const LEGACY_BASE62_DECODED_LENGTH = 27 ;
65+ const BASE62_DECODED_LENGTH = 35 ;
66+ const BASE62_ENCODED_LENGTH = 32 ;
2967
3068/**
3169 * Left-pad a string representation of a value with a given template.
@@ -91,23 +129,6 @@ function wait(span: number) {
91129 }
92130}
93131
94- export function getVersionIdSeed ( ) : string {
95- // The HOSTNAME environment variable is set by default by Kubernetes
96- // and populated with the pod name, containing a suffix with a unique id
97- // as a string.
98- // By default, we rely on the pid, to account for multiple workers in
99- // cluster mode. As a result, the unique id is either <pod-suffix>.<pid>
100- // or <pid>.
101- // If unique vID are needed in a multi cluster mode architecture (i.e.,
102- // multiple server instances, each with multiple workers), the
103- // HOSTNAME environment variable can be set.
104- return `${ process . env . HOSTNAME ?. split ( '-' ) . pop ( ) || '' } ${ process . pid } ` ;
105- }
106-
107- export function generateUniqueVersionId ( replicationGroupId : string ) : string {
108- return generateVersionId ( `${ versionIdSeed } .${ uidCounter ++ } ` , replicationGroupId ) ;
109- }
110-
111132/**
112133 * This function returns a "versionId" string indicating the current time as a
113134 * combination of the current time in millisecond, the position of the request
@@ -124,6 +145,20 @@ export function generateVersionId(info: string, replicationGroupId: string): str
124145 // replication group ID, like PARIS; will be trimmed if exceed LENGTH_RG
125146 const repGroupId = padRight ( replicationGroupId , TEMPLATE_RG ) ;
126147
148+ let otherInfo = '' ;
149+ let instanceIdPadded = '' ;
150+ let formatSuffix = '' ;
151+
152+ if ( ! S3_VERSION_ID_ENCODING_TYPE || S3_VERSION_ID_ENCODING_TYPE === 'hex' ) {
153+ // In HEX encoding, the full info data is used.
154+ otherInfo = info ;
155+ } else if ( ENABLE_FORMATTED_VERSION_ID ) {
156+ // In base62, info is for the instance ID and is trimmed/padded.
157+ instanceIdPadded = padRight ( info , TEMPLATE_ID ) ;
158+ // Add the version ID format marker and version.
159+ formatSuffix = VERSION_ID_FORMAT_SUFFIX ;
160+ }
161+
127162 // Need to wait for the millisecond slot got "flushed". We wait for
128163 // only a single millisecond when the module is restarted, which is
129164 // necessary for the correctness of the system. This is therefore cheap.
@@ -143,13 +178,6 @@ export function generateVersionId(info: string, replicationGroupId: string): str
143178 lastSeq = lastTimestamp === ts ? lastSeq + 1 : 0 ;
144179 lastTimestamp = ts ;
145180
146- // if S3_VERSION_ID_ENCODING_TYPE is "hex", info is used.
147- if ( S3_VERSION_ID_ENCODING_TYPE === 'hex' || ! S3_VERSION_ID_ENCODING_TYPE ) {
148- // info field stays as is
149- } else {
150- info = '' ;
151- }
152-
153181 // In the default cases, we reverse the chronological order of the
154182 // timestamps so that all versions of an object can be retrieved in the
155183 // reversed chronological order---newest versions first. This is because of
@@ -158,7 +186,9 @@ export function generateVersionId(info: string, replicationGroupId: string): str
158186 padLeft ( MAX_TS - lastTimestamp , TEMPLATE_TS ) +
159187 padLeft ( MAX_SEQ - lastSeq , TEMPLATE_SEQ ) +
160188 repGroupId +
161- info
189+ otherInfo +
190+ instanceIdPadded +
191+ formatSuffix
162192 ) ;
163193}
164194
@@ -271,6 +301,30 @@ export function base62Decode(str: string): string | Error {
271301export const ENC_TYPE_HEX = 0 ; // legacy (large) encoding
272302export const ENC_TYPE_BASE62 = 1 ; // new (tiny) encoding
273303
304+ /**
305+ * Checks if the given versionId string contains the specified format version.
306+ *
307+ * @param versionId - The versionId string to check.
308+ * @param version - The expected format version.
309+ * @returns true if the versionId contains the format marker and version, false otherwise.
310+ */
311+ function hasVersionIDFormat ( versionId : string , version : string ) : boolean {
312+ // Format marker can only exist after the required versionId sections.
313+ // This check removes the risk of looking for the format marker in the
314+ // replication group ID, which can technically contain any character as
315+ // it's set by the end user.
316+ if ( versionId . length < LENGTH_TS + LENGTH_SEQ + LENGTH_RG + LENGTH_FT ) {
317+ return false ; // Not enough characters for format marker
318+ }
319+ // For constant time lookup, we always assume that the format marker is
320+ // at the end of the versionId.
321+ const formatMarkerIdx = versionId . length - LENGTH_FT ;
322+ if ( versionId . charAt ( formatMarkerIdx ) !== VersioningConstants . VersionId . FormatMarker ) {
323+ return false ; // no format marker
324+ }
325+ return versionId . substring ( formatMarkerIdx + 1 ) === version ; // check if the version matches
326+ }
327+
274328/**
275329 * Encode a versionId to obscure internal information contained
276330 * in a version ID.
@@ -279,8 +333,9 @@ export const ENC_TYPE_BASE62 = 1; // new (tiny) encoding
279333 * @return - the encoded versionId
280334 */
281335export function encode ( str : string ) : string {
282- // default format without 'info' field will always be 27 characters
283- if ( str . length === 27 ) {
336+ // Legacy base62 version IDs (without 'info' field) are always 27 characters long.
337+ // The new base62 format is 35 characters and includes the format marker at the end.
338+ if ( str . length === LEGACY_BASE62_DECODED_LENGTH || hasVersionIDFormat ( str , VERSION_ID_FORMAT_VERSION ) ) {
284339 return base62Encode ( str ) ;
285340 } // legacy format
286341 return hexEncode ( str ) ;
@@ -296,15 +351,20 @@ export function encode(str: string): string {
296351 */
297352export function decode ( str : string ) : string | Error {
298353 // default format is exactly 32 characters when encoded
299- if ( str . length === 32 ) {
354+ if ( str . length === BASE62_ENCODED_LENGTH ) {
300355 const decoded : string | Error = base62Decode ( str ) ;
301- if ( typeof decoded === 'string' && decoded . length !== 27 ) {
302- return new Error ( `decoded ${ str } is not length 27` ) ;
356+ // Legacy base62 version IDs (without 'info' field) are always 27 characters long.
357+ // The new base62 format is always 35 characters long.
358+ if ( typeof decoded === 'string' &&
359+ decoded . length !== LEGACY_BASE62_DECODED_LENGTH &&
360+ decoded . length !== BASE62_DECODED_LENGTH ) {
361+ return new Error ( `decoded ${ str } is not length ` +
362+ `${ LEGACY_BASE62_DECODED_LENGTH } or ${ BASE62_DECODED_LENGTH } ` ) ;
303363 }
304364 return decoded ;
305365 }
306366 // legacy format
307- if ( str . length > 32 ) {
367+ if ( str . length > BASE62_ENCODED_LENGTH ) {
308368 return hexDecode ( str ) ;
309369 }
310370 return new Error ( `cannot decode str ${ str . length } ` ) ;
0 commit comments