Skip to content

Commit eb35e12

Browse files
committed
Allowed search results for Django code terms which contain stop words.
1 parent 0070473 commit eb35e12

File tree

3 files changed

+171
-4
lines changed

3 files changed

+171
-4
lines changed

Diff for: docs/models.py

+14-4
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
TrigramSimilarity,
1616
)
1717
from django.core.cache import cache
18-
from django.db import models, transaction
18+
from django.db import connection, models, transaction
1919
from django.db.models import Prefetch, Q
2020
from django.db.models.fields.json import KeyTextTransform
2121
from django.utils.functional import cached_property
@@ -174,6 +174,18 @@ def sync_to_db(self, decoded_documents):
174174
if line.startswith(f"Disallow: /{self.lang}/{self.release_id}/")
175175
]
176176

177+
language_mapping = TSEARCH_CONFIG_LANGUAGES
178+
english = "custom_english"
179+
with connection.cursor() as cursor:
180+
cursor.execute(
181+
"SELECT EXISTS(SELECT 1 FROM pg_ts_config WHERE cfgname = %s)",
182+
[english],
183+
)
184+
has_custom_english_config = cursor.fetchone()[0]
185+
186+
if has_custom_english_config:
187+
language_mapping["en"] = english
188+
177189
for document in decoded_documents:
178190
if (
179191
"body" not in document
@@ -192,9 +204,7 @@ def sync_to_db(self, decoded_documents):
192204
path=document_path,
193205
title=html.unescape(strip_tags(document["title"])),
194206
metadata=document,
195-
config=TSEARCH_CONFIG_LANGUAGES.get(
196-
self.lang[:2], DEFAULT_TEXT_SEARCH_CONFIG
197-
),
207+
config=language_mapping.get(self.lang[:2], DEFAULT_TEXT_SEARCH_CONFIG),
198208
)
199209
for document in self.documents.all():
200210
document.metadata["breadcrumbs"] = list(

Diff for: docs/stopwords/README.md

+38
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# Instructions to create a new search dictionary
2+
3+
In this folder, there is `custom_english.stop`.
4+
5+
This copies the [snowball english stop words](https://github.com/postgres/postgres/blob/master/src/backend/snowball/stopwords/english.stop)
6+
but removes some stop words such as "through" and "when". This is because these
7+
terms are also used in Django code.
8+
9+
The file format is a list of words, one per line. Blank lines and trailing
10+
spaces are ignored, and upper case is folded to lower case, but no other
11+
processing is done on the file contents.
12+
13+
This file needs to be created in `$SHAREDIR/tsearch_data/custom_english.stop`,
14+
where `$SHAREDIR` means the PostgreSQL installation's shared-data directory,
15+
available via `pg_config --sharedir`.
16+
17+
See https://www.postgresql.org/docs/current/textsearch-dictionaries.html
18+
19+
Once the custom stop words file has been created, we can run the following SQL:
20+
21+
```sql
22+
CREATE TEXT SEARCH DICTIONARY custom_english (
23+
TEMPLATE = snowball,
24+
Language = english,
25+
StopWords = custom_english
26+
);
27+
28+
CREATE TEXT SEARCH CONFIGURATION public.custom_english (
29+
COPY = pg_catalog.english
30+
);
31+
32+
ALTER TEXT SEARCH CONFIGURATION public.custom_english
33+
ALTER MAPPING
34+
FOR asciiword, asciihword, hword_asciipart, hword, hword_part, word
35+
WITH custom_english;
36+
```
37+
38+
This should then mean the `custom_english` text search configuration (the name looked up in `pg_ts_config` by `docs/models.py`) is available.

Diff for: docs/stopwords/custom_english.stop

+119
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
i
2+
me
3+
my
4+
myself
5+
we
6+
our
7+
ours
8+
ourselves
9+
you
10+
your
11+
yours
12+
yourself
13+
yourselves
14+
he
15+
him
16+
his
17+
himself
18+
she
19+
her
20+
hers
21+
herself
22+
it
23+
its
24+
itself
25+
they
26+
them
27+
their
28+
theirs
29+
themselves
30+
what
31+
which
32+
who
33+
whom
34+
this
35+
that
36+
these
37+
those
38+
am
39+
is
40+
are
41+
was
42+
were
43+
be
44+
been
45+
being
46+
have
47+
has
48+
had
49+
having
50+
do
51+
does
52+
did
53+
doing
54+
a
55+
an
56+
the
57+
and
58+
but
59+
or
60+
because
61+
as
62+
until
63+
while
64+
of
65+
at
66+
by
67+
about
68+
against
69+
between
70+
into
71+
during
72+
before
73+
after
74+
above
75+
below
76+
to
77+
from
78+
up
79+
down
80+
in
81+
out
82+
on
83+
off
84+
over
85+
under
86+
again
87+
further
88+
then
89+
once
90+
here
91+
there
92+
where
93+
why
94+
how
95+
any
96+
both
97+
each
98+
few
99+
more
100+
most
101+
other
102+
some
103+
such
104+
no
105+
nor
106+
not
107+
own
108+
same
109+
so
110+
than
111+
too
112+
very
113+
s
114+
t
115+
can
116+
will
117+
just
118+
don
119+
should

0 commit comments

Comments
 (0)