Skip to content

Commit c200c2e

Browse files
authored
Merge pull request #190 from SebastianMC/178-week-numbers-date-regex-patterns
#178 - week-number based date extraction patterns for titles, incl. Www, Www- and Www+ specs; #191 - explicit support for the common date formats of `yyyy-mm-dd` and `yyyy-dd-mm`
2 parents f9c9c0b + 6e7b2e1 commit c200c2e

File tree

8 files changed

+621
-33
lines changed

8 files changed

+621
-33
lines changed

src/custom-sort/matchers.ts

Lines changed: 89 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
import {
2+
getDateForWeekOfYear
3+
} from "../utils/week-of-year";
4+
15
// Regex source strings (not RegExp objects) for Roman numbers:
// optional leading spaces, then the number itself in the only capturing group
export const RomanNumberRegexStr: string = ' *([MDCLXVI]+)'; // Roman number
26
export const CompoundRomanNumberDotRegexStr: string = ' *([MDCLXVI]+(?:\\.[MDCLXVI]+)*)';// Compound Roman number with dot as separator
37
export const CompoundRomanNumberDashRegexStr: string = ' *([MDCLXVI]+(?:-[MDCLXVI]+)*)'; // Compound Roman number with dash as separator
@@ -6,15 +10,26 @@ export const NumberRegexStr: string = ' *(\\d+)'; // Plain number
610
export const CompoundNumberDotRegexStr: string = ' *(\\d+(?:\\.\\d+)*)'; // Compound number with dot as separator
711
export const CompoundNumberDashRegexStr: string = ' *(\\d+(?:-\\d+)*)'; // Compound number with dash as separator
812

13+
// Date like 2020-01-31: a four-digit year followed by two dash-separated numeric components
export const Date_yyyy_mm_dd_RegexStr: string = ' *(\\d{4}-[0-3]*[0-9]-[0-3]*[0-9])'
14+
// Textually identical to yyyy-mm-dd; which component is the day and which the month
// is decided by the normalizer (getNormalizedDate_yyyy_dd_mm_NormalizerFn)
export const Date_yyyy_dd_mm_RegexStr: string = Date_yyyy_mm_dd_RegexStr
15+
916
export const Date_dd_Mmm_yyyy_RegexStr: string = ' *([0-3]*[0-9]-(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)-\\d{4})'; // Date like 01-Jan-2020
1017
export const Date_Mmm_dd_yyyy_RegexStr: string = ' *((?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)-[0-3]*[0-9]-\\d{4})'; // Date like Jan-01-2020
1118

12-
export const DOT_SEPARATOR = '.'
19+
// Date like 2025-W07 (02-10): year, week number and an explicit month-day pair in parentheses
export const Date_yyyy_Www_mm_dd_RegexStr: string = ' *(\\d{4}-W[0-5]*[0-9] \\([0-3]*[0-9]-[0-3]*[0-9]\\))'
20+
// Week spec like 2025-W07, optionally followed by a single '-' or '+'
export const Date_yyyy_WwwISO_RegexStr: string = ' *(\\d{4}-W[0-5]*[0-9][-+]?)'
21+
// Same textual pattern as the ISO variant; the two differ only in the normalizer paired with them
export const Date_yyyy_Www_RegexStr: string = Date_yyyy_WwwISO_RegexStr
22+
23+
export const DOT_SEPARATOR = '.' // ASCII 46
1324
export const DASH_SEPARATOR = '-'
1425

15-
const SLASH_SEPARATOR = '/' // ASCII 47
26+
const SLASH_SEPARATOR = '/' // ASCII 47, right before ASCII 48 = '0'
27+
const GT_SEPARATOR = '>' // ASCII 62, alphabetical sorting in Collator puts it after /
1628
const PIPE_SEPARATOR = '|' // ASCII 124
1729

30+
// Substitutes for the first '/' of the '//' suffix of a normalized week-based date:
// the yyyy-Www- spec gets '.' and the yyyy-Www+ spec gets '>', so that equal dates
// still have a defined relative order against the plain '//' suffix
const EARLIER_THAN_SLASH_SEPARATOR = DOT_SEPARATOR
31+
const LATER_THAN_SLASH_SEPARATOR = GT_SEPARATOR
32+
1833
export const DEFAULT_NORMALIZATION_PLACES = 8; // Fixed width of a normalized number (with leading zeros)
1934

2035
// Property escapes:
@@ -51,9 +66,9 @@ export function getNormalizedNumber(s: string = '', separator?: string, places?:
5166
// guarantees correct order (/ = ASCII 47, | = ASCII 124)
5267
if (separator) {
5368
const components: Array<string> = s.split(separator).filter(s => s)
54-
return `${components.map((c) => prependWithZeros(c, places ?? DEFAULT_NORMALIZATION_PLACES)).join(PIPE_SEPARATOR)}//`
69+
return `${components.map((c) => prependWithZeros(c, places ?? DEFAULT_NORMALIZATION_PLACES)).join(PIPE_SEPARATOR)}${SLASH_SEPARATOR}${SLASH_SEPARATOR}`
5570
} else {
56-
return `${prependWithZeros(s, places ?? DEFAULT_NORMALIZATION_PLACES)}//`
71+
return `${prependWithZeros(s, places ?? DEFAULT_NORMALIZATION_PLACES)}${SLASH_SEPARATOR}${SLASH_SEPARATOR}`
5772
}
5873
}
5974

@@ -97,9 +112,9 @@ export function getNormalizedRomanNumber(s: string, separator?: string, places?:
97112
// guarantees correct order (/ = ASCII 47, | = ASCII 124)
98113
if (separator) {
99114
const components: Array<string> = s.split(separator).filter(s => s)
100-
return `${components.map((c) => prependWithZeros(romanToIntStr(c), places ?? DEFAULT_NORMALIZATION_PLACES)).join(PIPE_SEPARATOR)}//`
115+
return `${components.map((c) => prependWithZeros(romanToIntStr(c), places ?? DEFAULT_NORMALIZATION_PLACES)).join(PIPE_SEPARATOR)}${SLASH_SEPARATOR}${SLASH_SEPARATOR}`
101116
} else {
102-
return `${prependWithZeros(romanToIntStr(s), places ?? DEFAULT_NORMALIZATION_PLACES)}//`
117+
return `${prependWithZeros(romanToIntStr(s), places ?? DEFAULT_NORMALIZATION_PLACES)}${SLASH_SEPARATOR}${SLASH_SEPARATOR}`
103118
}
104119
}
105120

@@ -117,9 +132,76 @@ export function getNormalizedDate_NormalizerFn_for(separator: string, dayIdx: nu
117132
const monthValue = months ? `${1 + MONTHS.indexOf(components[monthIdx])}` : components[monthIdx]
118133
const month = prependWithZeros(monthValue, MONTH_POSITIONS)
119134
const year = prependWithZeros(components[yearIdx], YEAR_POSITIONS)
120-
return `${year}-${month}-${day}//`
135+
return `${year}-${month}-${day}${SLASH_SEPARATOR}${SLASH_SEPARATOR}`
121136
}
122137
}
123138

139+
// Ready-to-use date normalizers for the supported dash-separated formats.
// NOTE(review): the numeric arguments look like the positions of day, month and year
// (in that order) among the dash-separated components - confirm against the full
// signature of getNormalizedDate_NormalizerFn_for. Passing MONTHS makes the month
// be looked up by its name instead of being read as a number.
export const getNormalizedDate_yyyy_mm_dd_NormalizerFn = getNormalizedDate_NormalizerFn_for('-', 2, 1, 0)
140+
export const getNormalizedDate_yyyy_dd_mm_NormalizerFn = getNormalizedDate_NormalizerFn_for('-', 1, 2, 0)
124141
export const getNormalizedDate_dd_Mmm_yyyy_NormalizerFn = getNormalizedDate_NormalizerFn_for('-', 0, 1, 2, MONTHS)
125142
export const getNormalizedDate_Mmm_dd_yyyy_NormalizerFn = getNormalizedDate_NormalizerFn_for('-', 1, 0, 2, MONTHS)
143+
144+
// Optional one-character suffixes of a yyyy-Www spec (yyyy-Www- and yyyy-Www+)
const DateExtractor_orderModifier_earlier_than = '-'
const DateExtractor_orderModifier_later_than = '+'

// Extractors of the individual date components from a text which was already matched by
// Date_yyyy_Www_mm_dd_RegexStr or by Date_yyyy_WwwISO_RegexStr / Date_yyyy_Www_RegexStr.
// Month and day accept one or two digits, because the matching pattern ([0-3]*[0-9])
// also accepts a single digit, as in '2025-W1 (1-5)'
const DateExtractor_yyyy_Www_mm_dd_Regex = /(\d{4})-W(\d{1,2}) \((\d{1,2})-(\d{1,2})\)/
const DateExtractor_yyyy_Www_Regex = /(\d{4})-W(\d{1,2})([-+]?)/

// Matching groups
const YEAR_IDX = 1
const WEEK_IDX = 2
const MONTH_IDX = 3
const DAY_IDX = 4
const RELATIVE_ORDER_IDX = 3 // For the yyyy-Www only: yyyy-Www- or yyyy-Www+

const DECEMBER = 12
const JANUARY = 1

/**
 * Creates a normalizer for week-based dates found in titles.
 *
 * The returned function produces `yyyy-mm-dd` (zero-padded) followed by a two-character
 * suffix. The suffix is `//` by default; for a week followed by `-` or `+` its first
 * character is replaced (EARLIER_THAN_SLASH_SEPARATOR / LATER_THAN_SLASH_SEPARATOR),
 * which gives equal dates a defined relative order.
 *
 * @param consumeWeek when true, the date is derived from the week number via
 *   getDateForWeekOfYear (yyyy-Www, yyyy-Www-, yyyy-Www+); when false, the week number
 *   is ignored and the explicit month and day from `yyyy-Www (mm-dd)` are used
 * @param weeksISO passed through to getDateForWeekOfYear; only relevant when consumeWeek is true
 * @returns a function returning the normalized date, or null when the date components
 *   cannot be extracted from its input
 */
export function getNormalizedDate_NormalizerFn_yyyy_Www_mm_dd(consumeWeek: boolean, weeksISO?: boolean) {
    // Neither regex has the g flag, so exec() keeps no state between calls
    const extractor: RegExp = consumeWeek ? DateExtractor_yyyy_Www_Regex : DateExtractor_yyyy_Www_mm_dd_Regex

    return (s: string): string | null => {
        const matches = extractor.exec(s)
        if (matches === null) {
            // The input is expected to be pre-matched by the corresponding ...RegexStr pattern,
            // yet a mismatch must not end in a TypeError
            return null
        }

        let yearNumber = Number.parseInt(matches[YEAR_IDX], 10)
        let monthNumber: number
        let dayNumber: number
        let separator: string = SLASH_SEPARATOR // different values enforce relative > < order of same dates

        if (consumeWeek) {
            const weekNumber = Number.parseInt(matches[WEEK_IDX], 10)
            const orderModifier: string | undefined = matches[RELATIVE_ORDER_IDX]
            let useLastDayOfWeek = false
            if (orderModifier === DateExtractor_orderModifier_earlier_than) {
                separator = EARLIER_THAN_SLASH_SEPARATOR
            } else if (orderModifier === DateExtractor_orderModifier_later_than) {
                separator = LATER_THAN_SLASH_SEPARATOR
                useLastDayOfWeek = true // the 'later than' variant is anchored at the last day of the week
            }

            const dateForWeek = getDateForWeekOfYear(yearNumber, weekNumber, weeksISO, useLastDayOfWeek)
            monthNumber = dateForWeek.getMonth() + 1 // 1 - 12
            dayNumber = dateForWeek.getDate() // 1 - 31

            // Be careful with edge dates, which can belong to previous or next year
            if (weekNumber === 1 && monthNumber === DECEMBER) {
                yearNumber--
            }
            if (weekNumber >= 50 && monthNumber === JANUARY) {
                yearNumber++
            }
        } else { // ignore week
            monthNumber = Number.parseInt(matches[MONTH_IDX], 10)
            dayNumber = Number.parseInt(matches[DAY_IDX], 10)
        }

        return `${prependWithZeros(`${yearNumber}`, YEAR_POSITIONS)}` +
            `-${prependWithZeros(`${monthNumber}`, MONTH_POSITIONS)}` +
            `-${prependWithZeros(`${dayNumber}`, DAY_POSITIONS)}` +
            `${separator}${SLASH_SEPARATOR}`
    }
}
204+
205+
// Takes month and day from the explicit (mm-dd) part; the week number is ignored
export const getNormalizedDate_yyyy_Www_mm_dd_NormalizerFn = getNormalizedDate_NormalizerFn_yyyy_Www_mm_dd(false)
206+
// Derives the date from the week number, with weeksISO = true
export const getNormalizedDate_yyyy_WwwISO_NormalizerFn = getNormalizedDate_NormalizerFn_yyyy_Www_mm_dd(true, true)
207+
// Derives the date from the week number, with weeksISO = false
export const getNormalizedDate_yyyy_Www_NormalizerFn = getNormalizedDate_NormalizerFn_yyyy_Www_mm_dd(true, false)

src/custom-sort/sorting-spec-processor.ts

Lines changed: 58 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,19 @@ import {
1919
DASH_SEPARATOR,
2020
Date_dd_Mmm_yyyy_RegexStr,
2121
Date_Mmm_dd_yyyy_RegexStr,
22+
Date_yyyy_dd_mm_RegexStr,
23+
Date_yyyy_mm_dd_RegexStr,
24+
Date_yyyy_Www_mm_dd_RegexStr,
25+
Date_yyyy_Www_RegexStr,
26+
Date_yyyy_WwwISO_RegexStr,
2227
DOT_SEPARATOR,
2328
getNormalizedDate_dd_Mmm_yyyy_NormalizerFn,
2429
getNormalizedDate_Mmm_dd_yyyy_NormalizerFn,
30+
getNormalizedDate_yyyy_dd_mm_NormalizerFn,
31+
getNormalizedDate_yyyy_mm_dd_NormalizerFn,
32+
getNormalizedDate_yyyy_Www_mm_dd_NormalizerFn,
33+
getNormalizedDate_yyyy_Www_NormalizerFn,
34+
getNormalizedDate_yyyy_WwwISO_NormalizerFn,
2535
getNormalizedNumber,
2636
getNormalizedRomanNumber,
2737
NumberRegexStr,
@@ -36,10 +46,7 @@ import {
3646
MATCH_CHILDREN_2_SUFFIX,
3747
NO_PRIORITY
3848
} from "./folder-matching-rules"
39-
import {
40-
MDataExtractor,
41-
tryParseAsMDataExtractorSpec
42-
} from "./mdata-extractors";
49+
import {MDataExtractor, tryParseAsMDataExtractorSpec} from "./mdata-extractors";
4350

4451
interface ProcessingContext {
4552
folderPath: string
@@ -352,8 +359,13 @@ const InlineRegexSymbol_Digit1: string = '\\d'
352359
const InlineRegexSymbol_Digit2: string = '\\[0-9]'
353360
const InlineRegexSymbol_0_to_3: string = '\\[0-3]'
354361

362+
// Date-related symbols; each one is a key of sortingSymbolToRegexpStr, which pairs it
// with the regex string and the normalizer function to use
const Date_yyyy_mm_dd_RegexSymbol: string = '\\[yyyy-mm-dd]'
363+
const Date_yyyy_dd_mm_RegexSymbol: string = '\\[yyyy-dd-mm]'
355364
const Date_dd_Mmm_yyyy_RegexSymbol: string = '\\[dd-Mmm-yyyy]'
356365
const Date_Mmm_dd_yyyy_RegexSymbol: string = '\\[Mmm-dd-yyyy]'
366+
const Date_yyyy_Www_mm_dd_RegexSymbol: string = '\\[yyyy-Www (mm-dd)]'
367+
const Date_yyyy_Www_RegexSymbol: string = '\\[yyyy-Www]'
368+
const Date_yyyy_WwwISO_RegexSymbol: string = '\\[yyyy-WwwISO]'
357369

358370
const InlineRegexSymbol_CapitalLetter: string = '\\C'
359371
const InlineRegexSymbol_LowercaseLetter: string = '\\l'
@@ -373,8 +385,13 @@ const sortingSymbolsArr: Array<string> = [
373385
escapeRegexUnsafeCharacters(CompoundRomanNumberDashRegexSymbol),
374386
escapeRegexUnsafeCharacters(WordInASCIIRegexSymbol),
375387
escapeRegexUnsafeCharacters(WordInAnyLanguageRegexSymbol),
388+
escapeRegexUnsafeCharacters(Date_yyyy_mm_dd_RegexSymbol),
389+
escapeRegexUnsafeCharacters(Date_yyyy_dd_mm_RegexSymbol),
376390
escapeRegexUnsafeCharacters(Date_dd_Mmm_yyyy_RegexSymbol),
377-
escapeRegexUnsafeCharacters(Date_Mmm_dd_yyyy_RegexSymbol)
391+
escapeRegexUnsafeCharacters(Date_Mmm_dd_yyyy_RegexSymbol),
392+
escapeRegexUnsafeCharacters(Date_yyyy_Www_mm_dd_RegexSymbol),
393+
escapeRegexUnsafeCharacters(Date_yyyy_WwwISO_RegexSymbol),
394+
escapeRegexUnsafeCharacters(Date_yyyy_Www_RegexSymbol),
378395
]
379396

380397
// One regex matching any entry of sortingSymbolsArr (joined as alternatives), with the global and ignore-case flags
const sortingSymbolsRegex = new RegExp(sortingSymbolsArr.join('|'), 'gi')
@@ -442,8 +459,13 @@ export const CompoundDashRomanNumberNormalizerFn: NormalizerFn = (s: string) =>
442459
// Adapters exposing the normalizers imported from matchers under the NormalizerFn type
export const NumberNormalizerFn: NormalizerFn = (s: string) => getNormalizedNumber(s)
443460
export const CompoundDotNumberNormalizerFn: NormalizerFn = (s: string) => getNormalizedNumber(s, DOT_SEPARATOR)
444461
export const CompoundDashNumberNormalizerFn: NormalizerFn = (s: string) => getNormalizedNumber(s, DASH_SEPARATOR)
462+
export const Date_yyyy_mm_dd_NormalizerFn: NormalizerFn = (s: string) => getNormalizedDate_yyyy_mm_dd_NormalizerFn(s)
463+
export const Date_yyyy_dd_mm_NormalizerFn: NormalizerFn = (s: string) => getNormalizedDate_yyyy_dd_mm_NormalizerFn(s)
445464
export const Date_dd_Mmm_yyyy_NormalizerFn: NormalizerFn = (s: string) => getNormalizedDate_dd_Mmm_yyyy_NormalizerFn(s)
446465
export const Date_Mmm_dd_yyyy_NormalizerFn: NormalizerFn = (s: string) => getNormalizedDate_Mmm_dd_yyyy_NormalizerFn(s)
466+
// Week-based variants: explicit (mm-dd), ISO weeks and non-ISO weeks
export const Date_yyyy_Www_mm_dd_NormalizerFn: NormalizerFn = (s: string) => getNormalizedDate_yyyy_Www_mm_dd_NormalizerFn(s)
467+
export const Date_yyyy_WwwISO_NormalizerFn: NormalizerFn = (s: string) => getNormalizedDate_yyyy_WwwISO_NormalizerFn(s)
468+
export const Date_yyyy_Www_NormalizerFn: NormalizerFn = (s: string) => getNormalizedDate_yyyy_Www_NormalizerFn(s)
447469

448470
export enum AdvancedRegexType {
449471
None, // to allow if (advancedRegex)
@@ -455,8 +477,13 @@ export enum AdvancedRegexType {
455477
CompoundDashRomanNumber,
456478
WordInASCII,
457479
WordInAnyLanguage,
480+
Date_yyyy_mm_dd,
481+
Date_yyyy_dd_mm,
458482
Date_dd_Mmm_yyyy,
459-
Date_Mmm_dd_yyyy
483+
Date_Mmm_dd_yyyy,
484+
Date_yyyy_Www_mm_dd_yyyy,
485+
Date_yyyy_WwwISO,
486+
Date_yyyy_Www
460487
}
461488

462489
const sortingSymbolToRegexpStr: { [key: string]: RegExpSpecStr } = {
@@ -501,6 +528,16 @@ const sortingSymbolToRegexpStr: { [key: string]: RegExpSpecStr } = {
501528
advancedRegexType: AdvancedRegexType.WordInAnyLanguage,
502529
unicodeRegex: true
503530
},
531+
[Date_yyyy_mm_dd_RegexSymbol]: { // Intentionally retain character case
532+
regexpStr: Date_yyyy_mm_dd_RegexStr,
533+
normalizerFn: Date_yyyy_mm_dd_NormalizerFn,
534+
advancedRegexType: AdvancedRegexType.Date_yyyy_mm_dd
535+
},
536+
[Date_yyyy_dd_mm_RegexSymbol]: { // Intentionally retain character case
537+
regexpStr: Date_yyyy_dd_mm_RegexStr,
538+
normalizerFn: Date_yyyy_dd_mm_NormalizerFn,
539+
advancedRegexType: AdvancedRegexType.Date_yyyy_dd_mm
540+
},
504541
[Date_dd_Mmm_yyyy_RegexSymbol]: { // Intentionally retain character case
505542
regexpStr: Date_dd_Mmm_yyyy_RegexStr,
506543
normalizerFn: Date_dd_Mmm_yyyy_NormalizerFn,
@@ -510,6 +547,21 @@ const sortingSymbolToRegexpStr: { [key: string]: RegExpSpecStr } = {
510547
regexpStr: Date_Mmm_dd_yyyy_RegexStr,
511548
normalizerFn: Date_Mmm_dd_yyyy_NormalizerFn,
512549
advancedRegexType: AdvancedRegexType.Date_Mmm_dd_yyyy
550+
},
551+
[Date_yyyy_Www_mm_dd_RegexSymbol]: { // Intentionally retain character case
552+
regexpStr: Date_yyyy_Www_mm_dd_RegexStr,
553+
normalizerFn: Date_yyyy_Www_mm_dd_NormalizerFn,
554+
advancedRegexType: AdvancedRegexType.Date_yyyy_Www_mm_dd_yyyy
555+
},
556+
[Date_yyyy_WwwISO_RegexSymbol]: { // Intentionally retain character case
557+
regexpStr: Date_yyyy_WwwISO_RegexStr,
558+
normalizerFn: Date_yyyy_WwwISO_NormalizerFn,
559+
advancedRegexType: AdvancedRegexType.Date_yyyy_WwwISO
560+
},
561+
[Date_yyyy_Www_RegexSymbol]: { // Intentionally retain character case
562+
regexpStr: Date_yyyy_Www_RegexStr,
563+
normalizerFn: Date_yyyy_Www_NormalizerFn,
564+
advancedRegexType: AdvancedRegexType.Date_yyyy_Www
513565
}
514566
}
515567

0 commit comments

Comments
 (0)