#115 - Add inline regex support to match capital letters and lowercase letters explicitly

SebastianMC · SebastianMC · commit 88327f6314f7 · 2023-11-24T18:52:26.000+01:00
diff --git a/src/custom-sort/custom-sort.ts b/src/custom-sort/custom-sort.ts
@@ -372,13 +372,15 @@ export const determineSortingGroup = function (entry: TFile | TFolder, spec: Cus
 		switch (group.type) {
 			case CustomSortGroupType.ExactPrefix:
 				if (group.exactPrefix) {
+					console.log(`Exact prefix check`) // NOTE(review): unconditional debug output on the plain-prefix path — looks like a leftover, confirm before merging
 					if (nameForMatching.startsWith(group.exactPrefix)) {
 						determined = true;
 					}
 				} else { // regexp is involved
 					const [matched, matchedGroup] = matchGroupRegex(group.regexPrefix!, nameForMatching)
 					determined = matched
 					derivedText = matchedGroup ?? derivedText
+					console.log(`Exact regexp prefix check ${group.regexPrefix?.regex?.toString()} vs. ${nameForMatching} = ${matched}`) // NOTE(review): logs the regex, the name and the result on every regexp-prefix check — presumably temporary diagnostics, confirm
 				}
 				break;
 			case CustomSortGroupType.ExactSuffix:
diff --git a/src/custom-sort/sorting-spec-processor.spec.ts b/src/custom-sort/sorting-spec-processor.spec.ts
@@ -2957,17 +2957,23 @@ describe('convertPlainStringWithNumericSortingSymbolToRegex', () => {
 	it('should correctly include regex token for string end', () => {
 		const input1 = 'Part\\-D+:'
 		const input2 = ' \\[0-9]\\-D+'
+		const input3 = ' \\l\\[0-9]\\-D+' // same as input2, with the lowercase-letter inline symbol '\l' inserted after the leading space
 		const result1 = convertPlainStringToRegex(input1, RegexpUsedAs.Suffix)
 		const result2 = convertPlainStringToRegex(input2, RegexpUsedAs.Suffix)
+		const result3 = convertPlainStringToRegex(input3, RegexpUsedAs.Suffix)
 		expect(result1?.regexpSpec.regex).toEqual(/Part *(\d+(?:-\d+)*):$/i)
 		expect(result2?.regexpSpec.regex).toEqual(/ [0-9] *(\d+(?:-\d+)*)$/i)
+		expect(result3?.regexpSpec.regex).toEqual(/ \p{Ll}[0-9] *(\d+(?:-\d+)*)$/u) // '\l' expands to \p{Ll}; expected flags are 'u' only (no 'i'), unlike result1/result2
 	})
 	it('should correctly include regex token for string begin and end', () => {
 		const input1 = 'Part\\.D+:'
 		const input2 = ' \\d \\[0-9] '
+		const input3 = ' \\d \\[0-9] \\C' // same as input2, with the capital-letter inline symbol '\C' appended
 		const result1 = convertPlainStringToRegex(input1, RegexpUsedAs.FullMatch)
 		const result2 = convertPlainStringToRegex(input2, RegexpUsedAs.FullMatch)
+		const result3 = convertPlainStringToRegex(input3, RegexpUsedAs.FullMatch)
 		expect(result1?.regexpSpec.regex).toEqual(/^Part *(\d+(?:\.\d+)*):$/i)
 		expect(result2?.regexpSpec.regex).toEqual(/^ \d [0-9] $/i)
+		expect(result3?.regexpSpec.regex).toEqual(/^ \d [0-9] [\p{Lu}\p{Lt}]$/u) // '\C' expands to [\p{Lu}\p{Lt}]; expected flags are 'u' only (no 'i')
 	})
 })
diff --git a/src/custom-sort/sorting-spec-processor.ts b/src/custom-sort/sorting-spec-processor.ts
@@ -325,6 +325,9 @@ const InlineRegexSymbol_Digit1: string = '\\d'
 const InlineRegexSymbol_Digit2: string = '\\[0-9]'
 const InlineRegexSymbol_0_to_3: string = '\\[0-3]'
 
+const InlineRegexSymbol_CapitalLetter: string = '\\C' // source lexeme '\C' (backslash + C): inline symbol for a capital letter
+const InlineRegexSymbol_LowercaseLetter: string = '\\l' // source lexeme '\l' (backslash + l): inline symbol for a lowercase letter
+
 const UnsafeRegexCharsRegex: RegExp = /[\^$.\-+\[\]{}()|*?=!\\]/g
 
 export const escapeRegexUnsafeCharacters = (s: string): string => {
@@ -347,14 +350,24 @@ const sortingSymbolsRegex = new RegExp(sortingSymbolsArr.join('|'), 'gi')
 const inlineRegexSymbolsArrEscapedForRegex: Array<string> = [
 	escapeRegexUnsafeCharacters(InlineRegexSymbol_Digit1),
 	escapeRegexUnsafeCharacters(InlineRegexSymbol_Digit2),
-	escapeRegexUnsafeCharacters(InlineRegexSymbol_0_to_3)
+	escapeRegexUnsafeCharacters(InlineRegexSymbol_0_to_3),
+	escapeRegexUnsafeCharacters(InlineRegexSymbol_CapitalLetter), // NOTE(review): the detection regex joined from this array is created with the 'i' flag, so '\c' may be detected as well — confirm this is intended
+	escapeRegexUnsafeCharacters(InlineRegexSymbol_LowercaseLetter) // NOTE(review): same 'i'-flag caveat — '\L' may be detected as well, confirm
 ]
 
+interface RegexExpr { // regex fragment an inline regex symbol expands to, plus the flag requirements it imposes on the final regex
+	regexExpr: string // regex source text pushed as-is into the generated pattern
+	isUnicode?: boolean // when true, the resulting regex is built with the 'u' flag
+	isCaseSensitive?: boolean // when true, the resulting regex is built without the 'i' flag
+}
+
 // Don't be confused if the source lexeme is equal to the resulting regex piece, logically these two distinct spaces
-const inlineRegexSymbolsToRegexExpressionsArr: { [key: string]: string} = {
-	[InlineRegexSymbol_Digit1]: '\\d',
-	[InlineRegexSymbol_Digit2]: '[0-9]',
-	[InlineRegexSymbol_0_to_3]: '[0-3]',
+const inlineRegexSymbolsToRegexExpressionsArr: { [key: string]: RegexExpr} = { // maps each inline symbol lexeme to its regex fragment and required flags
+	[InlineRegexSymbol_Digit1]: {regexExpr: '\\d'}, // no flag requirements: the regex keeps the default 'i' flag
+	[InlineRegexSymbol_Digit2]: {regexExpr: '[0-9]'},
+	[InlineRegexSymbol_0_to_3]: {regexExpr: '[0-3]'},
+	[InlineRegexSymbol_CapitalLetter]: {regexExpr: '[\\p{Lu}\\p{Lt}]', isUnicode: true, isCaseSensitive: true}, // Unicode uppercase (Lu) or titlecase (Lt) letter; needs 'u' and must not use 'i'
+	[InlineRegexSymbol_LowercaseLetter]: {regexExpr: '\\p{Ll}', isUnicode: true, isCaseSensitive: true} // Unicode lowercase (Ll) letter; needs 'u' and must not use 'i'
 }
 
 const inlineRegexSymbolsDetectionRegex = new RegExp(inlineRegexSymbolsArrEscapedForRegex.join('|'), 'gi')
@@ -500,12 +513,14 @@ export const convertPlainStringToRegex = (s: string, actAs: RegexpUsedAs): Regex
 		const [extractedPrefix, extractedSuffix] = s!.split(detectedSymbol)
 		const regexPrefix: string = regexMatchesStart ? '^' : ''
 		const regexSuffix: string = regexMatchesEnding ? '$' : ''
-		const escapedProcessedPrefix: string = convertInlineRegexSymbolsAndEscapeTheRest(extractedPrefix)
-		const escapedProcessedSuffix: string = convertInlineRegexSymbolsAndEscapeTheRest(extractedSuffix)
-		const regexFlags: string = replacement.unicodeRegex ? 'ui' : 'i'
+		const escapedProcessedPrefix: RegexAsString = convertInlineRegexSymbolsAndEscapeTheRest(extractedPrefix)
+		const escapedProcessedSuffix: RegexAsString = convertInlineRegexSymbolsAndEscapeTheRest(extractedSuffix)
+		const regexUnicode: boolean = !!replacement.unicodeRegex || !!escapedProcessedPrefix.isUnicodeRegex || !!escapedProcessedSuffix.isUnicodeRegex
+		const regexCaseSensitive: boolean = !!escapedProcessedPrefix.isCaseSensitiveRegex || !!escapedProcessedSuffix.isCaseSensitiveRegex
+		const regexFlags: string = `${regexUnicode?'u':''}${regexCaseSensitive?'':'i'}`
 		return {
 			regexpSpec: {
-				regex: new RegExp(`${regexPrefix}${escapedProcessedPrefix}${replacement.regexpStr}${escapedProcessedSuffix}${regexSuffix}`, regexFlags),
+				regex: new RegExp(`${regexPrefix}${escapedProcessedPrefix.s}${replacement.regexpStr}${escapedProcessedSuffix.s}${regexSuffix}`, regexFlags),
 				normalizerFn: replacement.normalizerFn
 			},
 			prefix: extractedPrefix,
@@ -516,9 +531,10 @@ export const convertPlainStringToRegex = (s: string, actAs: RegexpUsedAs): Regex
 		const replacement: RegexAsString = convertInlineRegexSymbolsAndEscapeTheRest(s)!
 		const regexPrefix: string = regexMatchesStart ? '^' : ''
 		const regexSuffix: string = regexMatchesEnding ? '$' : ''
+		const regexFlags: string = `${replacement.isUnicodeRegex?'u':''}${replacement.isCaseSensitiveRegex?'':'i'}`
 		return {
 			regexpSpec: {
-				regex: new RegExp(`${regexPrefix}${replacement}${regexSuffix}`, 'i')
+				regex: new RegExp(`${regexPrefix}${replacement.s}${regexSuffix}`, regexFlags)
 			},
 			prefix: '', // shouldn't be used anyway because of the below containsAdvancedRegex: false
 			suffix: '', // ---- // ----
@@ -529,14 +545,22 @@ export const convertPlainStringToRegex = (s: string, actAs: RegexpUsedAs): Regex
 	}
 }
 
-type RegexAsString = string
+export interface RegexAsString { // result of convertInlineRegexSymbolsAndEscapeTheRest: regex source text plus the flags it requires
+	s: string // regex source text, interpolated by callers into the RegExp pattern
+	isUnicodeRegex?: boolean // when true, callers add the 'u' flag to the RegExp
+	isCaseSensitiveRegex?: boolean // when true, callers omit the 'i' flag from the RegExp
+}
 
 export const convertInlineRegexSymbolsAndEscapeTheRest = (s: string): RegexAsString => {
 	if (s === '') {
-		return s
+		return {
+			s: s
+		}
 	}
 
 	let regexAsString: Array<string> = []
+	let isUnicode: boolean = false
+    let isCaseSensitive: boolean = false
 
 	while (s!.length > 0) {
 		// detect the first inline regex
@@ -562,15 +586,22 @@ export const convertInlineRegexSymbolsAndEscapeTheRest = (s: string): RegexAsStr
 				regexAsString.push(escapeRegexUnsafeCharacters(charsBeforeRegexSymbol))
 				s = s!.substring(earliestRegexSymbolIdx)
 			}
-			regexAsString.push(inlineRegexSymbolsToRegexExpressionsArr[earliestRegexSymbol!])
+			const expr = inlineRegexSymbolsToRegexExpressionsArr[earliestRegexSymbol!]
+			regexAsString.push(expr.regexExpr)
+			isUnicode ||= !!expr.isUnicode
+			isCaseSensitive ||= !!expr.isCaseSensitive
 			s = s!.substring(earliestRegexSymbol!.length)
 		} else {
 			regexAsString.push(escapeRegexUnsafeCharacters(s))
 			s = ''
 		}
 	}
 
-	return regexAsString.join('')
+	return {
+		s: regexAsString.join(''),
+		isUnicodeRegex: isUnicode,
+		isCaseSensitiveRegex: isCaseSensitive
+	}
 }
 
 export const MatchFolderNameLexeme: string = 'name:'