-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathload-sents-hyphen.php
More file actions
executable file
·133 lines (116 loc) · 2.79 KB
/
load-sents-hyphen.php
File metadata and controls
executable file
·133 lines (116 loc) · 2.79 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#!/usr/bin/env php
<?php
declare(strict_types=1);
require_once __DIR__.'/vendor/autoload.php';
require_once '/home/tino/oqaa/ipa-ks/php/hyphenation.php';
// Raise PHP's memory ceiling for this run; $uniq and $cache below are held in memory
// for the whole import.
ini_set('memory_limit', '24G');

// Output database: first command-line argument, or a default file name.
$db_file = $argv[1] ?? 'hyphenated-ne.sqlite';
$db = new \TDC\PDO\SQLite($db_file);

// Connection setup, executed strictly in the listed order.
// NOTE(review): the journal_mode/page_size/VACUUM prefix presumably exists so the
// page size is applied before the bulk-load settings — confirm against SQLite docs.
$setup = [
    'PRAGMA journal_mode = delete',
    'PRAGMA page_size = 65536',
    'VACUUM',
    'PRAGMA auto_vacuum = INCREMENTAL',
    'PRAGMA case_sensitive_like = ON',
    'PRAGMA foreign_keys = OFF',
    'PRAGMA ignore_check_constraints = ON',
    'PRAGMA journal_mode = MEMORY',
    'PRAGMA locking_mode = EXCLUSIVE',
    'PRAGMA synchronous = OFF',
    'PRAGMA threads = 4',
    'PRAGMA trusted_schema = OFF',
];
foreach ($setup as $statement) {
    $db->exec($statement);
}

// One row per unique sentence: sequential id, token count, hyphenated text.
$db->exec("CREATE TABLE sents (
s_id INTEGER NOT NULL,
s_tokens INTEGER NOT NULL,
s_text TEXT NOT NULL,
PRIMARY KEY (s_id)
) WITHOUT ROWID");

// Open the first transaction, then prepare the insert that save_sent() reuses.
$db->beginTransaction();
$ins = $db->prepare("INSERT INTO sents (s_id, s_tokens, s_text) VALUES (?, ?, ?)");

// Shared state, accessed by handle_cohort() and save_sent() through `global`.
$i = 0;           // last s_id handed out
$uniq = [];       // sha1 of sentence text => s_id, used to skip duplicates
$cache = [];      // word => hyphenated form, so each distinct word is hyphenated once
$in_par = false;  // id of the currently open <sN> block, or false when outside one
$cohort = '';     // raw lines of the cohort currently being collected
$sent = '';       // hyphenated text of the sentence being built
$tokens = 0;      // token counter for the sentence being built
/**
 * Consume the cohort buffered in the global $cohort and append its word form to $sent.
 *
 * The word form is taken from the leading `"<...>"` part of the cohort. It is
 * lower-cased unless the cohort text contains ' Prop' or ' ?'. The form is then split
 * on spaces; each piece is hyphenated via kal_hyphenate() (memoised in $cache), with
 * soft hyphens (U+00AD) turned into spaces, and appended to $sent followed by a space.
 * $tokens is incremented once per successfully parsed cohort.
 *
 * A cohort that does not start with `"<...>"` is reported on stdout and dropped.
 * In every case $cohort is empty when the function returns.
 */
function handle_cohort() {
    global $tokens, $sent, $cohort, $cache;

    // Nothing buffered: just normalise the buffer and leave.
    if (empty($cohort)) {
        $cohort = '';
        return;
    }

    $matched = preg_match('~^"<(.+?)>"~u', $cohort, $parts);
    if (!$matched) {
        echo "BAD COHORT: $cohort\n";
        $cohort = '';
        return;
    }

    $surface = $parts[1];
    // Case is preserved only when the cohort carries one of these two markers.
    $keep_case = strpos($cohort, ' Prop') !== false || strpos($cohort, ' ?') !== false;
    if (!$keep_case) {
        $surface = mb_strtolower($surface);
    }

    foreach (explode(' ', $surface) as $piece) {
        if (!array_key_exists($piece, $cache)) {
            $hyphenated = kal_hyphenate($piece);
            $cache[$piece] = str_replace("\u{00ad}", ' ', $hyphenated);
        }
        $sent .= $cache[$piece] . ' ';
    }

    ++$tokens;
    $cohort = '';
}
/**
 * Finish the sentence being built and store it if it has not been seen before.
 *
 * Flushes any buffered cohort via handle_cohort(), trims the accumulated text and,
 * when it is non-empty and its SHA-1 is not yet in $uniq, assigns the next id and
 * inserts (id, token count, text) through the prepared statement $ins. After every
 * 10000th inserted row the running id is echoed and the transaction is committed
 * and reopened.
 *
 * The global accumulators $sent and $tokens are reset on EVERY path, including the
 * empty-text and duplicate paths, so nothing carries over into the next sentence.
 */
function save_sent() {
    global $db, $ins, $uniq, $tokens, $sent, $i;

    handle_cohort();

    // Snapshot and reset up front so an early return cannot leak state into the
    // next sentence (previously an empty sentence left $tokens untouched).
    $text = trim($sent);
    $count = $tokens;
    $sent = '';
    $tokens = 0;

    // Strict comparison: empty() would also reject the legitimate sentence "0".
    if ($text === '') {
        return;
    }

    // SHA-1 is used purely as a compact deduplication key.
    $hash = sha1($text);
    if (array_key_exists($hash, $uniq)) {
        return;
    }

    $uniq[$hash] = ++$i;
    $ins->execute([$i, $count, $text]);

    // Periodic progress output and commit to keep the open transaction bounded.
    if ($i % 10000 === 0) {
        echo "$i\r";
        $db->commit();
        $db->beginTransaction();
    }
}
// Main driver: read the input from STDIN one line at a time and turn it into rows.
// Recognised line shapes:
//   <sN>            opens block N
//   </sN>           closes block N and flushes the current sentence
//   "<...            starts a new cohort (word form)
//   whitespace + "  continuation line belonging to the current cohort
//   anything else   inside a block: treated as a sentence break
// NOTE(review): this looks like a constraint-grammar style stream — confirm with the producer.
//
// Compare against false explicitly so a final unterminated line containing just "0"
// does not end the loop early.
while (($line = fgets(STDIN)) !== false) {
    if (preg_match('~^<s(\d+)>~', $line, $m)) {
        $in_par = intval($m[1]);
    }
    else if (preg_match('~^</s(\d+)>~', $line, $m)) {
        if (intval($m[1]) !== $in_par) {
            // Closing id does not match the open block: discard everything collected,
            // including any half-read cohort, so it cannot bleed into the next block.
            echo "MISMATCH: {$m[1]} != {$in_par}\n";
            $in_par = false;
            $tokens = 0;
            $sent = '';
            $cohort = '';
            continue;
        }
        save_sent();
        $in_par = false;
        $tokens = 0;
        $sent = '';
    }
    // Strict test: block id 0 is a valid open block and must not be read as "outside".
    else if ($in_par !== false) {
        if (preg_match('~^"<~u', $line)) {
            // Finish the previous cohort before buffering the new one. The token is
            // counted inside handle_cohort() when the cohort is processed; counting
            // it here as well would double s_tokens.
            handle_cohort();
            $cohort = $line;
        }
        else if (preg_match('~^\s+"~u', $line)) {
            $cohort .= $line;
        }
        else {
            // Any other line (typically an empty one) means sentence break
            save_sent();
        }
    }
}

// Flush whatever is still pending at end of input, report the final row count,
// and commit the last open transaction.
save_sent();
echo "$i\n";
$db->commit();

// Restore the two settings that were relaxed for the bulk load.
$db->exec("PRAGMA ignore_check_constraints = OFF");
$db->exec("PRAGMA locking_mode = NORMAL");