-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathexplore_corpus.py
More file actions
92 lines (79 loc) · 3.58 KB
/
Copy pathexplore_corpus.py
File metadata and controls
92 lines (79 loc) · 3.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
"""Explore the training corpus to find morphologically related forms."""
from collections import Counter
# Read one token per line, dropping surrounding whitespace and blank lines.
# The encoding is pinned so decoding does not depend on the platform's locale
# default (the corpus is presumably ASCII Klattbet, which UTF-8 reads as-is).
with open('data/words_train.txt', encoding='utf-8') as f:
    words = [token for token in (line.strip() for line in f) if token]

# Map each distinct word type to its number of occurrences in the corpus.
counts = Counter(words)
print(f'Total tokens: {len(words)}')
print(f'Unique types: {len(counts)}')
# ---------------------------------------------------------
# Strategy: find paradigm clusters.
# A stem S has a paradigm if we can find S, S+"z"/S+"s", S+"t"/S+"d", etc.
# In Klattbet: z=z, s=s, t=t, d=d, I=ɪ, Z=ʒ
#
# Past tense allomorphs: -t, -d, -ɪd (Klattbet: -t, -d, -Id)
# Plural/3sg allomorphs: -z, -s, -ɪz (Klattbet: -z, -s, -Iz)
# ---------------------------------------------------------
# Find words ending in each target suffix
def strip_suffix(word, suffix):
    """Return ``word`` with a trailing ``suffix`` removed, or ``None``.

    ``None`` is returned when ``word`` does not end with ``suffix``, or when
    ``word`` is no longer than ``suffix`` (nothing would be left as a stem).

    >>> strip_suffix('wOkt', 't')
    'wOk'
    >>> strip_suffix('t', 't') is None
    True
    """
    if not word.endswith(suffix) or len(word) <= len(suffix):
        return None
    # Slice by explicit length: ``word[:-len(suffix)]`` would degenerate to
    # ``word[:0]`` (the empty string) when ``suffix`` is empty.
    return word[:len(word) - len(suffix)]
# Suffix allomorphs (Klattbet spelling) for each inflection class.
past_suffixes = ['t', 'd', 'Id']  # walked, played, wanted
plural_suffixes = ['z', 's', 'Iz']  # dogs, cats, buses


def _collect_paradigms(suffixes):
    """Return ``(stem, suffix, word, count, others)`` for each stem+suffix hit.

    A word in ``counts`` produces an entry for every suffix in ``suffixes``
    that it ends with, provided the remaining stem has at least two
    characters and is itself a type in ``counts``.  ``others`` lists the
    remaining suffixes whose ``stem + suffix`` form is also attested.

    Entries are ordered by descending count of the suffixed word; ties keep
    the iteration order of ``counts`` because the sort is stable.
    """
    entries = []
    for word, cnt in counts.items():
        for suf in suffixes:
            stem = strip_suffix(word, suf)
            if stem and len(stem) >= 2 and stem in counts:
                others = [s for s in suffixes if s != suf and stem + s in counts]
                entries.append((stem, suf, word, cnt, others))
    # Index 3 is the count of the suffixed word.
    entries.sort(key=lambda entry: entry[3], reverse=True)
    return entries


def _print_paradigms(entries, suffixes, limit=40):
    """Print one line per distinct stem found in the first ``limit`` entries.

    Each line lists the bare stem followed by every attested ``stem + suffix``
    form, each annotated with its corpus count.  Duplicate stems are dropped
    *after* the ``limit`` cut, so fewer than ``limit`` lines may be printed.
    """
    seen = set()
    for stem, _suf, _word, _cnt, _others in entries[:limit]:
        if stem in seen:
            continue
        seen.add(stem)
        variants = [stem] + [stem + s for s in suffixes if stem + s in counts]
        variant_str = ', '.join(f'{v}({counts[v]})' for v in variants)
        print(f' {variant_str}')


# For each word ending in a past suffix, check if the stem also appears.
print("\n--- Looking for past-tense paradigms (stem + {t, d, Id}) ---")
paradigm_stems_past = _collect_paradigms(past_suffixes)
_print_paradigms(paradigm_stems_past, past_suffixes)

# Same search for the plural / 3rd-person-singular suffixes.
print("\n--- Looking for plural/3sg paradigms (stem + {z, s, Iz}) ---")
paradigm_stems_plur = _collect_paradigms(plural_suffixes)
_print_paradigms(paradigm_stems_plur, plural_suffixes)
def _show_types_ending_in(ending, limit):
    """Print the ``limit`` most frequent types in ``counts`` ending in ``ending``.

    Only types longer than one character are considered.  The complete
    ``(type, count)`` list, sorted by descending count, is returned; ties keep
    the iteration order of ``counts`` because the sort is stable.
    """
    print(f"\n--- All types ending in {ending} (sorted by freq) ---")
    matches = [(w, c) for w, c in counts.items() if w.endswith(ending) and len(w) > 1]
    matches.sort(key=lambda pair: pair[1], reverse=True)
    for w, c in matches[:limit]:
        print(f' {w!r}: {c}')
    return matches


# Show the most frequent types ending in t/d/z/s.  Each list holds every
# matching type; only the top ``limit`` rows of each are printed.
t_words = _show_types_ending_in('t', 30)
d_words = _show_types_ending_in('d', 30)
z_words = _show_types_ending_in('z', 30)
s_words = _show_types_ending_in('s', 20)