@@ -45,7 +45,7 @@ def __init__(self, valid_pos_tags: Optional[Set[str]] = None) -> None:
4545 ----------
4646 valid_pos_tags:
4747 Set of valid part of speech tags, defaults to nouns and
48- adjectives. I.e. `{'N ', 'Ne', 'AJ', 'AJe '}`.
48+ adjectives. I.e. `{'NOUN ', 'ADJ '}`.
4949 """
5050 self .word_normalization_method : Optional [str ] = None
5151 self .sentences : List [Sentence ] = []
@@ -54,13 +54,14 @@ def __init__(self, valid_pos_tags: Optional[Set[str]] = None) -> None:
5454 punctuation_marks
5555 )
5656 if valid_pos_tags is None :
57- valid_pos_tags = {'N ' , 'Ne' , 'AJ' , 'AJe ' }
57+ valid_pos_tags = {'NOUN ' , 'ADJ ' }
5858 self .valid_pos_tags : Set [str ] = valid_pos_tags
5959
6060 def load_text (
6161 self ,
6262 input : Union [str , Path ],
6363 word_normalization_method : WordNormalizationMethod = 'stemming' ,
64+ universal_pos_tags : bool = True ,
6465 ) -> None :
6566 """
6667 Loads the text of a document or string.
@@ -74,9 +75,15 @@ def load_text(
7475 Word normalization method, defaults to `'stemming'`. See
7576 `perke.base.types.WordNormalizationMethod` for available
7677 methods.
78+
79+ universal_pos_tags:
80+ Whether to use universal part of speech tags or not,
81+ defaults to `True`.
7782 """
7883 # Initialize reader
79- reader = RawTextReader (input , word_normalization_method )
84+ reader = RawTextReader (
85+ input , word_normalization_method , universal_pos_tags
86+ )
8087
8188 # Load sentences
8289 self .sentences = reader .read ()
@@ -225,7 +232,7 @@ def _add_candidate_occurrence(
225232 The offset of the occurrence
226233
227234 normalized_words:
228- List of normalized of words of the occurrence
235+ List of normalized words of the occurrence
229236 """
230237 # Build the canonical form of the candidate
231238 canonical_form = ' ' .join (normalized_words )
@@ -306,7 +313,7 @@ def _select_candidates_with_longest_sequences(
306313 first = sequence_offsets [0 ]
307314 last = sequence_offsets [- 1 ]
308315
309- # Add the ngram as a new candidate occurrence
316+ # Add the n-gram as a new candidate occurrence
310317 self ._add_candidate_occurrence (
311318 words = sentence .words [first : last + 1 ],
312319 offset = offset_shift + first ,
@@ -336,20 +343,20 @@ def _select_candidates_with_grammar(
336343 defaults to::
337344 r\" ""
338345 NP:
339- <P>{<N >}<V >
346+ {<NOUN >}<VERB >
340347 NP:
341- {<DETe?|Ne?|NUMe?|AJe|PRO|CL|RESe?><DETe?|Ne?|NUMe?|AJe?|PRO|CL|RESe? >*}
342- <N >}{<.*e?>'
348+ {<DET(,EZ)?|NOUN(,EZ)?|NUM(,EZ)?|ADJ(,EZ)|PRON><DET(,EZ)|NOUN(,EZ)|NUM(,EZ)|ADJ(,EZ)|PRON >*}
349+ <NOUN >}{<.*(,EZ)?>
343350 \" ""
344351 """
345352 # Initialize default grammar if none provided
346353 if grammar is None :
347354 grammar = r"""
348355 NP:
349- <P>{<N >}<V >
356+ {<NOUN >}<VERB >
350357 NP:
351- {<DETe?|Ne?|NUMe?|AJe|PRO|CL|RESe?><DETe?|Ne?|NUMe?|AJe?|PRO|CL|RESe? >*}
352- <N >}{<.*e ?>
358+ {<DET(,EZ)?|NOUN(,EZ)?|NUM(,EZ)?|ADJ(,EZ)|PRON><DET(,EZ)|NOUN(,EZ)|NUM(,EZ)|ADJ(,EZ)|PRON >*}
359+ <NOUN >}{<.*(,EZ) ?>
353360 """
354361
355362 # Initialize parser
0 commit comments