diff --git a/reldi_tokeniser/tokeniser.py b/reldi_tokeniser/tokeniser.py
index ca1f3ce..113c93e 100755
--- a/reldi_tokeniser/tokeniser.py
+++ b/reldi_tokeniser/tokeniser.py
@@ -13,11 +13,12 @@ reldir=os.path.dirname(os.path.abspath(__file__))
 
 
 def read_abbrevs(file):
-    abbrevs={'B':[],'N':[],'S':[]}
-    for line in open(os.path.join(reldir,file),encoding='utf8'):
-        if not line.startswith('#'):
-            abbrev,type=line.strip().split('\t')[:2]
-            abbrevs[type].append(abbrev)
+    abbrevs={'B':[], 'N':[], 'S':[]}
+    with open(os.path.join(reldir, file), encoding='utf8') as file:
+        for line in file:
+            if not line.startswith('#'):
+                abbrev, type = line.strip().split('\t')[:2]
+                abbrevs[type].append(abbrev)
     return abbrevs
 
 abbrevs={
@@ -34,7 +35,8 @@ def read_abbrevs(file):
 url=r'https?://[-\w/%]+(?:[.#?=&@;][-\w/%]+)+|\b[\w-]+\.(?:[\w-]+\.)?(?:com|org|net|gov|edu|int|io|eu|si|hr|rs|ba|me|mk|it|at|hu|bg|ro|al|de|ch|be|dk|se|no|es|pt|ie|fr|fi|cl|co|bo|br|gr|ru|uk|us|by|cz|sk|pl|lt|lv|lu|ca|in|tr|il|iq|ir|hk|cn|jp|au|nz)/?\b'
 word=r'(?:[*]{2,})?\w+(?:[@­\'-]\w+|[*]+\w+)*(?:[*]{2,})?'
 #open('punct','w').write(''.join([chr(i) for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith('P')]))
-punct = open(os.path.join(reldir,'punct'), encoding="utf-8").read()
+with open(os.path.join(reldir,'punct'), encoding="utf-8") as punct_file:
+    punct = punct_file.read()
 
 langs={
 'hr':{