Skip to content

Commit b21f58d

Browse files
committed
NormalizeUnicode for filenames and url (fixes #28)
1 parent 2a9875b commit b21f58d

File tree

3 files changed

+122
-4
lines changed

3 files changed

+122
-4
lines changed

app/jekylledit/controllers/base.py

+1
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from flask import Flask, request, url_for
88
from flask.ext.babel import Babel
99
from ..ext.mailgun import Mailgun
10+
from ..ext.normalizeUnicode import normalizeUnicode
1011

1112

1213
app = Flask('jekylledit')

app/jekylledit/controllers/site.py

+3-4
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55

66
import frontmatter
77
import hmac
8-
from unicodedata import normalize
98

109
from flask import abort, json, jsonify, request, render_template
1110
from flask.ext.cors import cross_origin
@@ -14,7 +13,7 @@
1413
from pid import PidFile, PidFileAlreadyLockedError
1514

1615
from ..model import Repository, Roles, Sites
17-
from .base import app, mailgun
16+
from .base import app, mailgun, normalizeUnicode
1817
from .auth import authorization_required
1918

2019

@@ -84,8 +83,8 @@ def site_file(site_id, file_id):
8483
postData = data['post']
8584
postIsDraft = False
8685
title = postData[languages[0]]['metadata']['title']
87-
normtitle = normalize('NFKD', title).encode('ascii', 'ignore').decode()
88-
slugtitle = normtitle.replace(' ', '-').lower()
86+
nu = normalizeUnicode('identifier')
87+
slugtitle = nu.code(title).lower()
8988
for i, language in enumerate(languages):
9089
langdata = postData[language]
9190
if not 'permalink' in langdata['metadata']:
+118
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
from unicodedata import normalize, decomposition, combining
2+
import string
3+
4+
class normalizeUnicode:
    """Transliterate Unicode text to plain ASCII.

    Two special ``encoding`` modes are supported:

    * ``'humanascii'`` (default) -- keep ASCII letters, digits, punctuation
      and whitespace; transliterate everything else.
    * ``'identifier'`` -- keep only ASCII letters, digits and ``-``; every
      other ASCII character becomes ``-``, runs of dashes are collapsed and
      leading/trailing dashes are removed.  Suitable for file names and URL
      slugs.

    Any other value is treated as a codec name: characters that the codec
    can encode are kept, the rest are transliterated.
    """

    # Hand-made table from PloneTool.py
    # NOTE(review): these ordinals look like Windows-1252 byte values
    # (S/Z with caron, Y with diaeresis) -- confirm against the Plone source.
    mapping_custom_1 = {
        138: 's', 142: 'z', 154: 's', 158: 'z', 159: 'Y'}

    # UnicodeData.txt does not contain normalization of Greek letters.
    mapping_greek = {
        912: 'i', 913: 'A', 914: 'B', 915: 'G', 916: 'D', 917: 'E', 918: 'Z',
        919: 'I', 920: 'TH', 921: 'I', 922: 'K', 923: 'L', 924: 'M', 925: 'N',
        926: 'KS', 927: 'O', 928: 'P', 929: 'R', 931: 'S', 932: 'T', 933: 'Y',
        934: 'F', 936: 'PS', 937: 'O', 938: 'I', 939: 'Y', 940: 'a', 941: 'e',
        943: 'i', 944: 'y', 945: 'a', 946: 'b', 947: 'g', 948: 'd', 949: 'e',
        950: 'z', 951: 'i', 952: 'th', 953: 'i', 954: 'k', 955: 'l', 956: 'm',
        957: 'n', 958: 'ks', 959: 'o', 960: 'p', 961: 'r', 962: 's', 963: 's',
        964: 't', 965: 'y', 966: 'f', 968: 'ps', 969: 'o', 970: 'i', 971: 'y',
        972: 'o', 973: 'y'}

    # This may be specific to German...
    # (A single-letter variant -- 196: 'A', 246: 'o', 252: 'u', 223: 's', ... --
    # can be substituted here if two-letter expansions are not wanted.)
    mapping_two_chars = {
        140: 'OE', 156: 'oe', 196: 'Ae', 246: 'oe', 252: 'ue', 214: 'Oe',
        228: 'ae', 220: 'Ue', 223: 'ss', 230: 'e', 198: 'E'}

    # 195 (capital A with tilde) maps to upper-case 'A' like its neighbours.
    mapping_latin_chars = {
        192: 'A', 193: 'A', 194: 'A', 195: 'A', 197: 'A', 199: 'C', 200: 'E',
        201: 'E', 202: 'E', 203: 'E', 204: 'I', 205: 'I', 206: 'I', 207: 'I',
        208: 'D', 209: 'N', 210: 'O', 211: 'O', 212: 'O', 213: 'O', 215: 'x',
        216: 'O', 217: 'U', 218: 'U', 219: 'U', 221: 'Y', 224: 'a', 225: 'a',
        226: 'a', 227: 'a', 229: 'a', 231: 'c', 232: 'e', 233: 'e', 234: 'e',
        235: 'e', 236: 'i', 237: 'i', 238: 'i', 239: 'i', 240: 'd', 241: 'n',
        242: 'o', 243: 'o', 244: 'o', 245: 'o', 248: 'o', 249: 'u', 250: 'u',
        251: 'u', 253: 'y', 255: 'y'}

    # Feel free to add new user-defined mapping. Don't forget to update mapping dict
    # with your dict.  Later updates win when the same ordinal appears twice.
    mapping = {}
    mapping.update(mapping_custom_1)
    mapping.update(mapping_greek)
    mapping.update(mapping_two_chars)
    mapping.update(mapping_latin_chars)

    # On OpenBSD string.whitespace has a non-standard implementation
    # See http://plone.org/collector/4704 for details
    whitespace = ''.join([c for c in string.whitespace if ord(c) < 128])
    # Characters passed through untouched in 'humanascii' mode.
    allowed = string.ascii_letters + string.digits + string.punctuation + whitespace
    # Characters passed through untouched in 'identifier' mode.
    allowedid = string.ascii_letters + string.digits + '-'

    encoding = 'humanascii'

    def __init__(self, encoding='humanascii'):
        """Store the target mode: 'humanascii', 'identifier' or a codec name."""
        self.encoding = encoding

    def _transliterate(self, ch, allowed):
        """Return an ASCII replacement for a character the target codec rejects.

        Lookup order: the custom ``mapping`` table, then the NFKD
        decomposition with combining marks removed and filtered to
        ``allowed``, and finally the code point as a lower-case hex string.
        """
        ordinal = ord(ch)
        if ordinal in self.mapping:
            return self.mapping[ordinal]
        decomposed = normalize('NFKD', ch)
        if decomposition(ch) or len(decomposed) > 1:
            # The decomposition may yield several characters and may contain
            # characters outside the allowed set; keep only the allowed ones.
            return ''.join(
                c for c in decomposed if not combining(c) and c in allowed)
        # hex string instead of unknown char
        return "%x" % ordinal

    def code(self, text):
        """Normalize ``text`` to base ASCII letters.

        ``text`` may be ``str`` or UTF-8 encoded bytes; the return value has
        the same kind as the input (bytes are returned UTF-8 encoded).
        In 'humanascii' mode the output contains only ASCII letters, digits,
        punctuation and whitespace.  In 'identifier' mode it contains only
        ASCII letters, digits and single dashes, and is prefixed with ``-``
        when it does not start with a letter.  Case is preserved.

        An input that normalizes to nothing yields an empty result.
        """
        if text == "":
            return ""

        unicodeinput = isinstance(text, str)
        if not unicodeinput:
            # Bytes-like input: decode once here, re-encode on the way out.
            text = str(text, 'utf-8')

        human = self.encoding == 'humanascii'
        identifier = self.encoding == 'identifier'
        enc = 'ascii' if (human or identifier) else self.encoding
        allowed = self.allowedid if identifier else self.allowed

        parts = []
        for ch in text:
            if (human or identifier) and ch in allowed:
                # ASCII chars, digits etc. stay untouched
                parts.append(ch)
                continue
            try:
                ch.encode(enc, 'strict')
            except UnicodeEncodeError:
                parts.append(self._transliterate(ch, allowed))
            else:
                # Encodable but not allowed: becomes a separator in
                # identifier mode, otherwise it is kept as is.
                parts.append('-' if identifier else ch)

        res = ''.join(parts)
        if identifier:
            # Dropping the empty pieces of the split collapses dash runs of
            # any length and trims leading/trailing dashes in one pass.
            res = '-'.join(piece for piece in res.split('-') if piece)
            # Guard against an empty result before inspecting the first char.
            if res and res[0] not in string.ascii_letters:
                res = '-' + res

        if unicodeinput:
            return res
        return res.encode('utf-8')

0 commit comments

Comments
 (0)