-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtruecasing.py
More file actions
31 lines (25 loc) · 922 Bytes
/
truecasing.py
File metadata and controls
31 lines (25 loc) · 922 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import json
from typing import List
import stanza
# Fetch the stanza resources for language code 'it'. This is a top-level
# statement, so it runs every time the module is imported or executed.
stanza.download('it')
# Shared module-level pipeline used by truecase(): restricted to the tokenize,
# mwt (multi-word token) and pos processors, with GPU use switched off.
analyze = stanza.Pipeline('it', processors='tokenize, mwt, pos', use_gpu=False)
def truecase(tweet, *, pipeline=None):
    """Restore capitalization in *tweet* based on part-of-speech tags.

    The first character of a token is upper-cased when the token is the
    first one of its sentence, or when any of the token's words carries the
    UPOS tag ``PROPN`` or ``X``. All other characters are left untouched.

    Args:
        tweet: The text to truecase.
        pipeline: Optional callable that takes the text and returns a
            document exposing ``sentences``, each with ``tokens`` that have
            a ``start_char`` offset and ``words`` with a ``upos`` tag.
            When omitted, the module-level ``analyze`` pipeline is used.

    Returns:
        The re-cased text with a single newline character appended.
    """
    tagger = analyze if pipeline is None else pipeline
    doc = tagger(tweet)

    # Strings are immutable, so edit a list of characters instead; the token
    # offsets are used as indices into the input text.
    chars = list(tweet)
    capitalized_tags = ('PROPN', 'X')

    for sentence in doc.sentences:
        for position, token in enumerate(sentence.tokens):
            starts_sentence = position == 0
            if starts_sentence or any(
                word.upos in capitalized_tags for word in token.words
            ):
                start = token.start_char
                chars[start] = chars[start].upper()

    return ''.join(chars) + '\n'
if __name__ == '__main__':
    # Interactive spot-check: for every JSON record in data/test.json, lower-case
    # the reference 'output' text, truecase it again, and show reference and
    # prediction with '^' under each position where they differ.
    ref_label = 'ref > '
    pred_label = 'pred> '

    # Explicit UTF-8 so accented characters decode the same on every platform.
    with open('data/test.json', encoding='utf-8') as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue

            ref = json.loads(line)['output']
            pred = truecase(ref.lower())
            # truecase() appends a newline; drop it so the caret row is printed
            # directly beneath the prediction instead of after an empty line.
            if pred.endswith('\n'):
                pred = pred[:-1]

            print(f'{ref_label}{ref}')
            print(f'{pred_label}{pred}')

            # Pad by the label width so each caret sits under the character
            # it refers to.
            markers = ''.join(
                '^' if c1 != c2 else ' ' for c1, c2 in zip(ref, pred)
            )
            print(' ' * len(pred_label) + markers)

            # Wait for Enter before showing the next example.
            input()