|
import re
import string
from unicodedata import combining, decomposition, normalize
| 3 | + |
| 4 | +class normalizeUnicode: |
| 5 | + |
| 6 | + # Hand-made table from PloneTool.py |
| 7 | + mapping_custom_1 = { |
| 8 | + 138: 's', 142: 'z', 154: 's', 158: 'z', 159: 'Y' } |
| 9 | + |
| 10 | + # UnicodeData.txt does not contain normalization of Greek letters. |
| 11 | + mapping_greek = { |
| 12 | + 912: 'i', 913: 'A', 914: 'B', 915: 'G', 916: 'D', 917: 'E', 918: 'Z', |
| 13 | + 919: 'I', 920: 'TH', 921: 'I', 922: 'K', 923: 'L', 924: 'M', 925: 'N', |
| 14 | + 926: 'KS', 927: 'O', 928: 'P', 929: 'R', 931: 'S', 932: 'T', 933: 'Y', |
| 15 | + 934: 'F', 936: 'PS', 937: 'O', 938: 'I', 939: 'Y', 940: 'a', 941: 'e', |
| 16 | + 943: 'i', 944: 'y', 945: 'a', 946: 'b', 947: 'g', 948: 'd', 949: 'e', |
| 17 | + 950: 'z', 951: 'i', 952: 'th', 953: 'i', 954: 'k', 955: 'l', 956: 'm', |
| 18 | + 957: 'n', 958: 'ks', 959: 'o', 960: 'p', 961: 'r', 962: 's', 963: 's', |
| 19 | + 964: 't', 965: 'y', 966: 'f', 968: 'ps', 969: 'o', 970: 'i', 971: 'y', |
| 20 | + 972: 'o', 973: 'y' } |
| 21 | + |
| 22 | + # This may be specific to German... |
| 23 | + mapping_two_chars = { |
| 24 | + 140 : 'OE', 156: 'oe', 196: 'Ae', 246: 'oe', 252: 'ue', 214: 'Oe', |
| 25 | + 228 : 'ae', 220: 'Ue', 223: 'ss', 230: 'e', 198: 'E' } |
| 26 | + #140 : 'O', 156: 'o', 196: 'A', 246: 'o', 252: 'u', 214: 'O', |
| 27 | + #228 : 'a', 220: 'U', 223: 's', 230: 'e', 198: 'E' } |
| 28 | + |
| 29 | + mapping_latin_chars = { |
| 30 | + 192 : 'A', 193 : 'A', 194 : 'A', 195 : 'a', 197 : 'A', 199 : 'C', 200 : 'E', |
| 31 | + 201 : 'E', 202 : 'E', 203 : 'E', 204 : 'I', 205 : 'I', 206 : 'I', 207 : 'I', |
| 32 | + 208 : 'D', 209 : 'N', 210 : 'O', 211 : 'O', 212 : 'O', 213 : 'O', 215 : 'x', |
| 33 | + 216 : 'O', 217 : 'U', 218 : 'U', 219 : 'U', 221 : 'Y', 224 : 'a', 225 : 'a', |
| 34 | + 226 : 'a', 227 : 'a', 229 : 'a', 231 : 'c', 232 : 'e', 233 : 'e', 234 : 'e', |
| 35 | + 235 : 'e', 236 : 'i', 237 : 'i', 238 : 'i', 239 : 'i', 240 : 'd', 241 : 'n', |
| 36 | + 242 : 'o', 243 : 'o', 244 : 'o', 245 : 'o', 248 : 'o', 249 : 'u', 250 : 'u', |
| 37 | + 251 : 'u', 253 : 'y', 255 : 'y' } |
| 38 | + |
| 39 | + # Feel free to add new user-defined mapping. Don't forget to update mapping dict |
| 40 | + # with your dict. |
| 41 | + |
| 42 | + mapping = {} |
| 43 | + mapping.update(mapping_custom_1) |
| 44 | + mapping.update(mapping_greek) |
| 45 | + mapping.update(mapping_two_chars) |
| 46 | + mapping.update(mapping_latin_chars) |
| 47 | + |
| 48 | + # On OpenBSD string.whitespace has a non-standard implementation |
| 49 | + # See http://plone.org/collector/4704 for details |
| 50 | + whitespace = ''.join([c for c in string.whitespace if ord(c) < 128]) |
| 51 | + allowed = string.ascii_letters + string.digits + string.punctuation + whitespace |
| 52 | + allowedid = string.ascii_letters + string.digits + '-' |
| 53 | + |
| 54 | + encoding = 'humanascii' |
| 55 | + |
| 56 | + def __init__ (self, encoding='humanascii'): |
| 57 | + self.encoding = encoding |
| 58 | + |
| 59 | + |
| 60 | + def code (self, text): |
| 61 | + """ |
| 62 | + This method is used for normalization of unicode characters to the base ASCII |
| 63 | + letters. Output is ASCII encoded string (or char) with only ASCII letters, |
| 64 | + digits, punctuation and whitespace characters. Case is preserved. |
| 65 | + """ |
| 66 | + if text == "": |
| 67 | + return "" |
| 68 | + |
| 69 | + unicodeinput = True |
| 70 | + if not isinstance(text, str): |
| 71 | + text = unicode(text, 'utf-8') |
| 72 | + unicodeinput = False |
| 73 | + |
| 74 | + res = '' |
| 75 | + if self.encoding == 'humanascii' or self.encoding == 'identifier': |
| 76 | + enc = 'ascii' |
| 77 | + else: |
| 78 | + enc = self.encoding |
| 79 | + for ch in text: |
| 80 | + if (self.encoding == 'humanascii') and (ch in self.allowed): |
| 81 | + # ASCII chars, digits etc. stay untouched |
| 82 | + res += ch |
| 83 | + continue |
| 84 | + if (self.encoding == 'identifier') and (ch in self.allowedid): |
| 85 | + # ASCII chars, digits etc. stay untouched |
| 86 | + res += ch |
| 87 | + continue |
| 88 | + else: |
| 89 | + try: |
| 90 | + ch.encode(enc,'strict') |
| 91 | + if self.encoding == 'identifier': |
| 92 | + res += '-' |
| 93 | + else: |
| 94 | + res += ch |
| 95 | + except UnicodeEncodeError: |
| 96 | + ordinal = ord(ch) |
| 97 | + if ordinal in self.mapping: |
| 98 | + # try to apply custom mappings |
| 99 | + res += self.mapping.get(ordinal) |
| 100 | + elif decomposition(ch) or len(normalize('NFKD',ch)) > 1: |
| 101 | + normalized = filter(lambda i: not combining(i), normalize('NFKD', ch)) #.strip() |
| 102 | + # normalized string may contain non-letter chars too. Remove them |
| 103 | + # normalized string may result to more than one char |
| 104 | + if self.encoding == 'identifier': |
| 105 | + res += ''.join([c for c in normalized if c in self.allowedid]) |
| 106 | + else: |
| 107 | + res += ''.join([c for c in normalized if c in self.allowed]) |
| 108 | + else: |
| 109 | + # hex string instead of unknown char |
| 110 | + res += "%x" % ordinal |
| 111 | + if self.encoding == 'identifier': |
| 112 | + res = res.strip('-').replace('-----','-').replace('----','-').replace('---','-').replace('--','-') |
| 113 | + if not res.strip('-')[0] in string.ascii_letters: |
| 114 | + res = '-' + res |
| 115 | + if unicodeinput: |
| 116 | + return res |
| 117 | + else: |
| 118 | + return res.encode('utf-8') |
0 commit comments