forked from scipy/scipy
-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathunicode-check.py
executable file
·88 lines (78 loc) · 3.43 KB
/
unicode-check.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#!/usr/bin/env python
import re
from itertools import chain
from glob import iglob
import sys
import argparse
# The set of Unicode code points greater than 127 that are allowed to appear
# in the source code (consumed by ``unicode_check`` below).
# Note that the lines enclosed by the marker lines BEGIN_INCLUDE_RST/END_INCLUDE_RST
# are included in the file `doc/source/dev/missing-bits.rst` to be rendered in the user
# documentation, so no comments are added inside the marked region.
#
# NOTE(review): `greek_letters` appears to contain the ASCII letter 'o'
# between ξ and π where the Greek omicron 'ο' (U+03BF) would be expected.
# ASCII 'o' is below code point 128, so listing it here has no effect —
# confirm whether omicron was intended.
#
# BEGIN_INCLUDE_RST (do not change this line!)
latin1_letters = set(chr(cp) for cp in range(192, 256))
greek_letters = set('αβγδεζηθικλμνξoπρστυϕχψω' + 'ΓΔΘΛΞΠΣϒΦΨΩ')
box_drawing_chars = set(chr(cp) for cp in range(0x2500, 0x2580))
extra_symbols = set('®ő∫≠≥≤±∞²³·→√')
allowed = latin1_letters | greek_letters | box_drawing_chars | extra_symbols
# END_INCLUDE_RST (do not change this line!)
def unicode_check(showall=False):
    """
    Scan SciPy source files for disallowed non-ASCII characters.

    Walks every ``.py``, ``.pyx``, ``.pxd`` and ``.pxi`` file under the
    ``scipy`` directory (relative to the current working directory) and
    reports non-ASCII characters that are not in the module-level
    ``allowed`` set.

    Parameters
    ----------
    showall : bool, optional
        If True, every non-ASCII character is reported, including those
        in the ``allowed`` set.

    Returns
    -------
    int
        The number of files with at least one reported character (or
        that could not be decoded at all).
    """
    # File encoding regular expression from PEP-263.
    encoding_pat = re.compile(r"^[ \t\f]*#.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)")
    nbad = 0
    for name in chain(iglob('scipy/**/*.py', recursive=True),
                      iglob('scipy/**/*.pyx', recursive=True),
                      iglob('scipy/**/*.px[di]', recursive=True)):
        # Read the file as bytes; max() over the raw bytes cheaply rules
        # out files that are pure ASCII (all byte values <= 127).
        with open(name, 'rb') as f:
            content = f.read()
        if not content:
            continue
        if max(content) > 127:
            # There is at least one non-ASCII character in the file.
            # PEP-263 allows an encoding declaration only in the first
            # two lines, so check just those.
            match = None
            for line in content.splitlines()[:2]:
                # latin-1 maps every byte value, so this decode cannot fail.
                match = encoding_pat.match(line.decode(encoding='latin-1'))
                if match:
                    break
            # If an explicit encoding was given in a comment, use
            # that to decode the contents.  Otherwise use UTF-8.
            if match:
                encoding = match[1]
                file_enc_msg = f"(explicit encoding '{encoding}')"
            else:
                encoding = 'utf-8'
                file_enc_msg = "(no explicit encoding; utf-8 assumed)"
            try:
                text = content.decode(encoding=encoding)
            except UnicodeDecodeError as exc:
                # Report an undecodable file as bad instead of crashing
                # the whole check with a traceback.
                nbad += 1
                print(f"{name} {file_enc_msg}")
                print(f"... cannot decode file: {exc}")
                continue
            out = []
            for lineno, line in enumerate(text.splitlines()):
                for pos, char in enumerate(line):
                    cp = ord(char)
                    if cp <= 127:
                        continue
                    msg = (f"... line {lineno+1}, position {pos+1}: "
                           f"character '{char}', code point U+{cp:04X}")
                    # With showall, report everything; otherwise only
                    # characters outside the allowed set.
                    if showall or char not in allowed:
                        out.append(msg)
            if out:
                nbad += 1
                print(f"{name} {file_enc_msg}")
                for msg in out:
                    print(msg)
    return nbad
if __name__ == "__main__":
descr = ('Check for disallowed Unicode characters in the SciPy Python and '
' Cython source code.')
parser = argparse.ArgumentParser(description=descr)
parser.add_argument('--showall', action='store_true',
help=('Show non-ASCII Unicode characters from all '
'files.'))
args = parser.parse_args()
sys.exit(unicode_check(args.showall) > 0)