Skip to content

Commit f6edd1e

Browse files
committed
Deprecate Text::match_pregexes, call methods directly
And stop caching most deaccented versions.
1 parent f773c1c commit f6edd1e

File tree

15 files changed

+113
-120
lines changed

15 files changed

+113
-120
lines changed

lib/text.php

Lines changed: 34 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ function __construct($raw, $utf8) {
1919

2020
/** @return TextPregexes */
2121
static function make_empty() {
22-
return new TextPregexes('(?!)', '(?!)');
22+
return new TextPregexes(null, '(?!)');
2323
}
2424

2525
/** @return bool */
@@ -28,23 +28,44 @@ function is_empty() {
2828
}
2929

3030
/** @param string $text
31-
* @param ?string $deaccented_text
3231
* @return bool */
33-
function match($text, $deaccented_text) {
32+
function match_raw($text) {
3433
if ($this->preg_raw === null) {
3534
return !!preg_match("{{$this->preg_utf8}}ui", $text);
36-
} else if ((string) $deaccented_text !== "" && $deaccented_text !== $text) {
37-
return !!preg_match("{{$this->preg_utf8}}ui", $deaccented_text);
38-
} else {
39-
return !!preg_match("{{$this->preg_raw}}i", $text);
4035
}
36+
return !!preg_match("{{$this->preg_raw}}i", $text);
37+
}
38+
39+
/** @param string $text
40+
* @return bool */
41+
function match($text) {
42+
if ($this->preg_raw === null) {
43+
return !!preg_match("{{$this->preg_utf8}}ui", $text);
44+
} else if (($text_da = UnicodeHelper::maybe_deaccent($text)) !== null) {
45+
return !!preg_match("{{$this->preg_utf8}}ui", $text_da);
46+
}
47+
return !!preg_match("{{$this->preg_raw}}i", $text);
48+
}
49+
50+
/** @param string $text
51+
* @param ?string $text_da
52+
* @return bool */
53+
function match_da($text, $text_da) {
54+
if ($this->preg_raw === null) {
55+
return !!preg_match("{{$this->preg_utf8}}ui", $text);
56+
} else if ((string) $text_da !== "" && $text_da !== $text) {
57+
return !!preg_match("{{$this->preg_utf8}}ui", $text_da);
58+
}
59+
return !!preg_match("{{$this->preg_raw}}i", $text);
4160
}
4261

4362
function add_matches(TextPregexes $r) {
44-
if ($this->is_empty()) {
63+
if ($r->is_empty()) {
64+
// do nothing
65+
} else if ($this->is_empty()) {
4566
$this->preg_utf8 = $r->preg_utf8;
4667
$this->preg_raw = $r->preg_raw;
47-
} else if (!$r->is_empty()) {
68+
} else {
4869
$this->preg_utf8 .= "|{$r->preg_utf8}";
4970
if ($r->preg_raw === null) {
5071
$this->preg_raw = null;
@@ -338,9 +359,8 @@ static function utf8_word_regex($word, $literal = false) {
338359
return ($aw ? self::UTF8_INITIAL_NONLETTERDIGIT : '')
339360
. str_replace(" ", $sp, preg_quote($word))
340361
. ($zw ? self::UTF8_FINAL_NONLETTERDIGIT : '');
341-
} else {
342-
return self::utf8_word_regex(convert_to_utf8($word));
343362
}
363+
return self::utf8_word_regex(convert_to_utf8($word));
344364
}
345365

346366
/** @param string $word
@@ -373,9 +393,10 @@ static function star_text_pregexes($word, $literal = false) {
373393
/** @param ?TextPregexes $reg
374394
* @param string $text
375395
* @param ?string $deaccented_text
376-
* @return bool */
396+
* @return bool
397+
* @deprecated */
377398
static function match_pregexes($reg, $text, $deaccented_text) {
378-
return $reg && $reg->match($text, $deaccented_text);
399+
return $reg && $reg->match_da($text, $deaccented_text);
379400
}
380401

381402

lib/unicodehelper.php

Lines changed: 35 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -49,34 +49,43 @@ private static function deaccent_result($di) {
4949
}
5050
}
5151

52-
/** @param string $x
53-
* @return string */
54-
static function deaccent($x) {
55-
if (preg_match_all("/[\xC0-\xFF]/", $x, $m, PREG_OFFSET_CAPTURE)) {
56-
if (self::$deaccent_map === null) {
57-
self::make_deaccent_map();
58-
}
59-
$first = 0;
60-
$out = "";
61-
foreach ($m[0] as $mx) {
62-
$i = $mx[1];
63-
$l = ord($mx[0]) < 0xE0 ? 2 : 3;
64-
$ch = substr($x, $i, $l);
65-
if (($di = self::$deaccent_map[$ch] ?? null) !== null) {
66-
$out .= substr($x, $first, $i - $first) . self::deaccent_result($di);
67-
$first = $i + $l;
68-
}
52+
/** @param string $s
53+
* @return ?string */
54+
static function maybe_deaccent($s) {
55+
if (!preg_match_all("/[\xC0-\xFF]/", $s, $m, PREG_OFFSET_CAPTURE)) {
56+
return null;
57+
}
58+
if (self::$deaccent_map === null) {
59+
self::make_deaccent_map();
60+
}
61+
$first = 0;
62+
$out = "";
63+
foreach ($m[0] as $mx) {
64+
$i = $mx[1];
65+
$l = ord($mx[0]) < 0xE0 ? 2 : 3;
66+
$ch = substr($s, $i, $l);
67+
if (($di = self::$deaccent_map[$ch] ?? null) !== null) {
68+
$out .= substr($s, $first, $i - $first) . self::deaccent_result($di);
69+
$first = $i + $l;
6970
}
70-
$x = $out . substr($x, $first);
7171
}
72-
return $x;
72+
if ($first === 0) {
73+
return null;
74+
}
75+
return $out . substr($s, $first);
76+
}
77+
78+
/** @param string $s
79+
* @return string */
80+
static function deaccent($s) {
81+
return self::maybe_deaccent($s) ?? $s;
7382
}
7483

75-
/** @param string $x
84+
/** @param string $s
7685
* @return array{string,list<int>} */
77-
static function deaccent_offsets($x) {
86+
static function deaccent_offsets($s) {
7887
$offsetmap = [0, 0];
79-
if (preg_match_all("/[\xC0-\xFF]/", $x, $m, PREG_OFFSET_CAPTURE)) {
88+
if (preg_match_all("/[\xC0-\xFF]/", $s, $m, PREG_OFFSET_CAPTURE)) {
8089
if (self::$deaccent_map === null) {
8190
self::make_deaccent_map();
8291
}
@@ -85,17 +94,17 @@ static function deaccent_offsets($x) {
8594
foreach ($m[0] as $mx) {
8695
$i = $mx[1];
8796
$l = ord($mx[0]) < 0xE0 ? 2 : 3;
88-
$ch = substr($x, $i, $l);
97+
$ch = substr($s, $i, $l);
8998
if (($di = self::$deaccent_map[$ch] ?? null) !== null) {
90-
$out .= substr($x, $first, $i - $first) . self::deaccent_result($di);
99+
$out .= substr($s, $first, $i - $first) . self::deaccent_result($di);
91100
$first = $i + $l;
92101
$offsetmap[] = strlen($out);
93102
$offsetmap[] = $first;
94103
}
95104
}
96-
$x = $out . substr($x, $first);
105+
$s = $out . substr($s, $first);
97106
}
98-
return [$x, $offsetmap];
107+
return [$s, $offsetmap];
99108
}
100109

101110
/** @param list<int> $offsetmap

src/authormatcher.php

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -262,10 +262,10 @@ function test($au, $prefer_name = false) {
262262
&& $au->lastName !== ""
263263
&& ($this->lastName_simple
264264
? $this->lastName_simple === $au->deaccent(1)
265-
: Text::match_pregexes($this->lastName_matcher, $au->lastName, $au->deaccent(1)))
265+
: $this->lastName_matcher->match_da($au->lastName, $au->deaccent(1)))
266266
&& ($au->firstName === ""
267267
|| !$this->firstName_matcher
268-
|| Text::match_pregexes($this->firstName_matcher, $au->firstName, $au->deaccent(0)))) {
268+
|| $this->firstName_matcher->match_da($au->firstName, $au->deaccent(0)))) {
269269
return self::MATCH_NAME;
270270
}
271271
if ($this->affiliation_matcher

src/contactsearch.php

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -254,11 +254,8 @@ private function check_user() {
254254
break;
255255
}
256256
$ids[] = $id;
257-
} else if ($nreg) {
258-
$n = $acct->searchable_name();
259-
if (Text::match_pregexes($nreg, $n, UnicodeHelper::deaccent($n))) {
260-
$ids[] = $id;
261-
}
257+
} else if ($nreg && $nreg->match($acct->searchable_name())) {
258+
$ids[] = $id;
262259
}
263260
}
264261

src/formulas/f_author.php

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,8 +68,7 @@ static function count_matches(PaperInfo $prow, $matchidx) {
6868
}
6969
} else {
7070
foreach ($prow->author_list() as $au) {
71-
$text = $au->name(NAME_E|NAME_A);
72-
if (Text::match_pregexes($mf, $text, UnicodeHelper::deaccent($text)))
71+
if ($mf->match($au->name(NAME_E|NAME_A)))
7372
++$n;
7473
}
7574
}

src/paperinfo.php

Lines changed: 5 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -747,8 +747,6 @@ class PaperInfo {
747747
private $_flags = 0;
748748
/** @var ?SubmissionRound */
749749
private $_submission_round;
750-
/** @var ?array<string,string> */
751-
private $_deaccents;
752750
/** @var ?list<Author> */
753751
private $_author_array;
754752
/** @var ?list<PaperConflictInfo> */
@@ -1227,7 +1225,6 @@ function set_prop($prop, $v) {
12271225
}
12281226
$this->$prop = $v;
12291227
// clear caches, sometimes conservatively
1230-
$this->_deaccents = null;
12311228
if ($prop === "authorInformation") {
12321229
$this->_author_array = $this->_ctype_list = null;
12331230
} else if ($prop === "collaborators") {
@@ -1605,7 +1602,7 @@ private function _potential_conflict(Contact $user) {
16051602
}
16061603
$auproblems = 0;
16071604
$pcs = [];
1608-
if ($this->field_match_pregexes($user->aucollab_general_pregexes(), "authorInformation")) {
1605+
if ($user->aucollab_general_pregexes()->match($this->authorInformation)) {
16091606
foreach ($this->author_list() as $au) {
16101607
foreach ($user->aucollab_matchers() as $userm) {
16111608
if (($why = $userm->test($au, $userm->is_nonauthor()))) {
@@ -1617,7 +1614,7 @@ private function _potential_conflict(Contact $user) {
16171614
}
16181615
$userm = $user->full_matcher();
16191616
$collab = $this->full_collaborators();
1620-
if (Text::match_pregexes($userm->general_pregexes(), $collab, UnicodeHelper::deaccent($collab))) {
1617+
if ($userm->general_pregexes()->match($collab)) {
16211618
foreach ($this->collaborator_list() as $co) {
16221619
if (($co->lastName !== ""
16231620
|| ($auproblems & AuthorMatcher::MATCH_AFFILIATION) === 0)
@@ -1715,26 +1712,12 @@ function unparse_pseudonym($viewer, $cid) {
17151712
}
17161713

17171714

1718-
/** @param 'title'|'abstract'|'authorInformation'|'collaborators' $field
1719-
* @return ?string */
1720-
private function deaccented_field($field) {
1721-
$this->_deaccents = $this->_deaccents ?? [];
1722-
if (!array_key_exists($field, $this->_deaccents)) {
1723-
$str = $this->{$field}();
1724-
if ($str !== "" && !is_usascii($str)) {
1725-
$this->_deaccents[$field] = UnicodeHelper::deaccent($str);
1726-
} else {
1727-
$this->_deaccents[$field] = null;
1728-
}
1729-
}
1730-
return $this->_deaccents[$field];
1731-
}
1732-
17331715
/** @param TextPregexes $reg
17341716
* @param 'title'|'abstract'|'authorInformation'|'collaborators' $field
1735-
* @return bool */
1717+
* @return bool
1718+
* @deprecated */
17361719
function field_match_pregexes($reg, $field) {
1737-
return Text::match_pregexes($reg, $this->{$field}(), $this->deaccented_field($field));
1720+
return $reg->match($this->{$field}());
17381721
}
17391722

17401723

src/papertable.php

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -388,11 +388,11 @@ private function _print_foldpaper_div() {
388388
if (!$match && $this->abstract_foldable($abstract)) {
389389
$this->_allow_collapse["abstract"] = true;
390390
}
391-
$match = false;
391+
$matches = 0;
392392
if ($this->matchPreg) {
393-
$this->highlight($this->prow->authorInformation, "au", $match);
393+
$this->highlight($this->prow->authorInformation, "au", $matches);
394394
}
395-
if (!$match && $vas !== 0) {
395+
if ($matches === 0 && $vas !== 0) {
396396
$this->_allow_collapse["authors"] = true;
397397
} else {
398398
$this->_allow_collapse["anonau"] = false;

src/reviewfields/rf_text.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ function sqlexpr() {
2525
function test_value($rrow, $fv) {
2626
if ($fv !== null
2727
&& $fv !== ""
28-
&& $rrow->field_match_pregexes($this->preg, $this->rf->order)) {
28+
&& $this->preg->match($rrow->fields[$this->rf->order])) {
2929
return true;
3030
}
3131
if (($this->op & CountMatcher::RELALL) !== 0 && $fv !== null) {

src/reviewinfo.php

Lines changed: 3 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -75,8 +75,6 @@ class ReviewInfo implements JsonSerializable {
7575

7676
/** @var list<null|int|string> */
7777
public $fields;
78-
/** @var ?list<null|false|string> */
79-
private $_deaccent_fields;
8078

8179
// scores
8280
// These scores are loaded from the database, but exposed only in `fields`
@@ -989,20 +987,11 @@ static function check_ambiguous_names($rrows) {
989987

990988
/** @param ?TextPregexes $reg
991989
* @param int $order
992-
* @return bool */
990+
* @return bool
991+
* @deprecated */
993992
function field_match_pregexes($reg, $order) {
994993
$data = $this->fields[$order];
995-
if (!isset($this->_deaccent_fields[$order])) {
996-
if (!isset($this->_deaccent_fields)) {
997-
$this->_deaccent_fields = $this->conf->review_form()->order_array(null);
998-
}
999-
if (is_usascii($data)) {
1000-
$this->_deaccent_fields[$order] = false;
1001-
} else {
1002-
$this->_deaccent_fields[$order] = UnicodeHelper::deaccent($data);
1003-
}
1004-
}
1005-
return Text::match_pregexes($reg, $data, $this->_deaccent_fields[$order]);
994+
return $reg && $reg->match($this->fields[$order]);
1006995
}
1007996

1008997

src/search/st_author.php

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -70,12 +70,9 @@ function test(PaperInfo $row, $xinfo) {
7070
}
7171
} else if ($can_view) {
7272
foreach ($row->author_list() as $au) {
73-
if ($this->regex) {
74-
$text = $au->name(NAME_E|NAME_A);
75-
if (!Text::match_pregexes($this->regex, $text,
76-
UnicodeHelper::deaccent($text))) {
77-
continue;
78-
}
73+
if ($this->regex
74+
&& !$this->regex->match($au->name(NAME_E|NAME_A))) {
75+
continue;
7976
}
8077
++$n;
8178
}

0 commit comments

Comments
 (0)