-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathextract.py
More file actions
50 lines (42 loc) · 1.7 KB
/
extract.py
File metadata and controls
50 lines (42 loc) · 1.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import json
def extract_ngrams(tokens, n):
    """
    Extract all contiguous n-grams from a sequence of tokens.

    A window of width ``n`` is slid over ``tokens`` one position at a time,
    and each window is returned as a tuple, in order of appearance.

    Args:
        tokens: Sliceable sequence of tokens (e.g. a list of strings).
        n: Size of each n-gram; must be at least 1.

    Returns:
        A list of ``n``-tuples. The list is empty when ``tokens`` holds
        fewer than ``n`` items.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    # Without this guard n == 0 produces len(tokens) + 1 empty tuples and a
    # negative n produces arbitrary slices, neither of which is an n-gram.
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")
    # range() is empty when the sequence is shorter than n, so short inputs
    # naturally yield an empty list.
    return [tuple(tokens[start:start + n])
            for start in range(len(tokens) - n + 1)]
def build_ngram_table(tokenized_file, n=3):
    """
    Build a next-token frequency table from a tokenized text file.

    The file is read as UTF-8, one sentence per line, with tokens separated
    by whitespace. Every n-gram in a line contributes one count: its first
    n-1 tokens, joined by single spaces, form the outer key, and its last
    token is the inner key whose integer count is incremented. Lines with
    fewer than n tokens contribute nothing.

    Returns a dict mapping each context string to a dict of
    {next_token: count}.
    """
    table = {}
    with open(tokenized_file, 'r', encoding='utf-8') as source:
        for raw_line in source:
            words = raw_line.strip().split()
            # Only lines long enough to hold at least one full n-gram count.
            if len(words) >= n:
                for gram in extract_ngrams(words, n):
                    context, follower = " ".join(gram[:-1]), gram[-1]
                    # Fetch (or create on first sight) the follower counts
                    # for this context, then bump the follower's tally.
                    followers = table.setdefault(context, {})
                    followers[follower] = followers.get(follower, 0) + 1
    return table
if __name__ == "__main__":
    # Input: tokenized corpus. Output: JSON file holding the trigram table.
    tokenized_file = "./corpus/en_US.twitter.tok.txt"
    output_file = "./corpus/en_US.twitter.trigrams.json"

    # The table is built completely before the output file is opened.
    ngram_table = build_ngram_table(tokenized_file, 3)

    # ensure_ascii=False keeps non-ASCII tokens as literal UTF-8 text
    # instead of \uXXXX escapes.
    with open(output_file, mode="w", encoding="utf-8") as f_out:
        json.dump(ngram_table, f_out, ensure_ascii=False, indent=2)

    print(f"N-gram table written to {output_file}")