// From
// https://github.com/lodash/lodash/blob/master/deburr.js

/**
 * Used to match Latin Unicode letters (excluding mathematical operators):
 * the ranges skip \xd7 (multiplication sign) and \xf7 (division sign).
 */
const reLatin = /[\xc0-\xd6\xd8-\xf6\xf8-\xff\u0100-\u017f]/g

/**
 * Used to compose unicode character classes. Each value is a regex
 * character-class fragment (escaped so it survives string interpolation).
 */
const rsComboMarksRange = '\\u0300-\\u036f'
const rsComboHalfMarksRange = '\\ufe20-\\ufe2f'
const rsComboSymbolsRange = '\\u20d0-\\u20ff'
const rsComboMarksExtendedRange = '\\u1ab0-\\u1aff'
const rsComboMarksSupplementRange = '\\u1dc0-\\u1dff'
const rsComboRange = [
  rsComboMarksRange,
  rsComboHalfMarksRange,
  rsComboSymbolsRange,
  rsComboMarksExtendedRange,
  rsComboMarksSupplementRange
].join('')

/** Used to compose unicode capture groups. */
const rsCombo = `[${rsComboRange}]`

/**
 * Used to match [combining diacritical marks](https://en.wikipedia.org/wiki/Combining_Diacritical_Marks) and
 * [combining diacritical marks for symbols](https://en.wikipedia.org/wiki/Combining_Diacritical_Marks_for_Symbols).
 */
const reComboMark = RegExp(rsCombo, 'g')

/** Used to map Latin Unicode letters to basic Latin letters. */
const deburredLetters = {
  // Latin-1 Supplement block.
  '\xc0': 'A', '\xc1': 'A', '\xc2': 'A', '\xc3': 'A', '\xc4': 'A', '\xc5': 'A',
  '\xe0': 'a', '\xe1': 'a', '\xe2': 'a', '\xe3': 'a', '\xe4': 'a', '\xe5': 'a',
  '\xc7': 'C', '\xe7': 'c',
  '\xd0': 'D', '\xf0': 'd',
  '\xc8': 'E', '\xc9': 'E', '\xca': 'E', '\xcb': 'E',
  '\xe8': 'e', '\xe9': 'e', '\xea': 'e', '\xeb': 'e',
  '\xcc': 'I', '\xcd': 'I', '\xce': 'I', '\xcf': 'I',
  '\xec': 'i', '\xed': 'i', '\xee': 'i', '\xef': 'i',
  '\xd1': 'N', '\xf1': 'n',
  '\xd2': 'O', '\xd3': 'O', '\xd4': 'O', '\xd5': 'O', '\xd6': 'O', '\xd8': 'O',
  '\xf2': 'o', '\xf3': 'o', '\xf4': 'o', '\xf5': 'o', '\xf6': 'o', '\xf8': 'o',
  '\xd9': 'U', '\xda': 'U', '\xdb': 'U', '\xdc': 'U',
  '\xf9': 'u', '\xfa': 'u', '\xfb': 'u', '\xfc': 'u',
  '\xdd': 'Y', '\xfd': 'y', '\xff': 'y',
  '\xc6': 'Ae', '\xe6': 'ae',
  '\xde': 'Th', '\xfe': 'th',
  '\xdf': 'ss',
  // Latin Extended-A block.
  '\u0100': 'A', '\u0102': 'A', '\u0104': 'A',
  '\u0101': 'a', '\u0103': 'a', '\u0105': 'a',
  '\u0106': 'C', '\u0108': 'C', '\u010a': 'C', '\u010c': 'C',
  '\u0107': 'c', '\u0109': 'c', '\u010b': 'c', '\u010d': 'c',
  '\u010e': 'D', '\u0110': 'D', '\u010f': 'd', '\u0111': 'd',
  '\u0112': 'E', '\u0114': 'E', '\u0116': 'E', '\u0118': 'E', '\u011a': 'E',
  '\u0113': 'e', '\u0115': 'e', '\u0117': 'e', '\u0119': 'e', '\u011b': 'e',
  '\u011c': 'G', '\u011e': 'G', '\u0120': 'G', '\u0122': 'G',
  '\u011d': 'g', '\u011f': 'g', '\u0121': 'g', '\u0123': 'g',
  '\u0124': 'H', '\u0126': 'H', '\u0125': 'h', '\u0127': 'h',
  '\u0128': 'I', '\u012a': 'I', '\u012c': 'I', '\u012e': 'I', '\u0130': 'I',
  '\u0129': 'i', '\u012b': 'i', '\u012d': 'i', '\u012f': 'i', '\u0131': 'i',
  '\u0134': 'J', '\u0135': 'j',
  '\u0136': 'K', '\u0137': 'k', '\u0138': 'k',
  '\u0139': 'L', '\u013b': 'L', '\u013d': 'L', '\u013f': 'L', '\u0141': 'L',
  '\u013a': 'l', '\u013c': 'l', '\u013e': 'l', '\u0140': 'l', '\u0142': 'l',
  '\u0143': 'N', '\u0145': 'N', '\u0147': 'N', '\u014a': 'N',
  '\u0144': 'n', '\u0146': 'n', '\u0148': 'n', '\u014b': 'n',
  '\u014c': 'O', '\u014e': 'O', '\u0150': 'O',
  '\u014d': 'o', '\u014f': 'o', '\u0151': 'o',
  '\u0154': 'R', '\u0156': 'R', '\u0158': 'R',
  '\u0155': 'r', '\u0157': 'r', '\u0159': 'r',
  '\u015a': 'S', '\u015c': 'S', '\u015e': 'S', '\u0160': 'S',
  '\u015b': 's', '\u015d': 's', '\u015f': 's', '\u0161': 's',
  '\u0162': 'T', '\u0164': 'T', '\u0166': 'T',
  '\u0163': 't', '\u0165': 't', '\u0167': 't',
  '\u0168': 'U', '\u016a': 'U', '\u016c': 'U', '\u016e': 'U', '\u0170': 'U', '\u0172': 'U',
  '\u0169': 'u', '\u016b': 'u', '\u016d': 'u', '\u016f': 'u', '\u0171': 'u', '\u0173': 'u',
  '\u0174': 'W', '\u0175': 'w',
  '\u0176': 'Y', '\u0177': 'y', '\u0178': 'Y',
  '\u0179': 'Z', '\u017b': 'Z', '\u017d': 'Z',
  '\u017a': 'z', '\u017c': 'z', '\u017e': 'z',
  '\u0132': 'IJ', '\u0133': 'ij',
  '\u0152': 'Oe', '\u0153': 'oe',
  '\u0149': "'n", '\u017f': 's'
}

/**
 * Replacer callback for `String.prototype.replace`: maps one matched Latin
 * letter to its basic Latin replacement.
 *
 * @param {string} letter The matched character.
 * @returns {string} The replacement from `deburredLetters`, or `letter`
 *   itself when the table has no entry for it.
 */
const deburrLetter = (letter) => deburredLetters[letter] || letter

/**
 * Deburrs `str` by converting Latin-1 Supplement and Latin Extended-A letters
 * to basic Latin letters and removing combining diacritical marks.
 *
 * @param {string} str The string to deburr. When truthy it must provide a
 *   string-style `replace` method; other truthy values throw a `TypeError`.
 * @returns {string} The deburred string. Falsy input (`''`, `null`,
 *   `undefined`, ...) is returned unchanged.
 */
function deburr(str) {
  // Falsy input is handed back as-is rather than being coerced to a string.
  if (!str) {
    return str
  }
  // First swap precomposed letters via the lookup table, then drop any
  // remaining standalone combining marks (e.g. 'e' + U+0301 -> 'e').
  return str.replace(reLatin, deburrLetter).replace(reComboMark, '')
}
// CommonJS export: `deburr` is this module's sole export.
module.exports = deburr