Skip to content

Commit 4c40097

Browse files
committed
Adapt Perke to changes in the Hazm 0.9 release
1 parent 73e5dec commit 4c40097

25 files changed

+99
-136
lines changed

.github/workflows/tests.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,8 @@ jobs:
1010
runs-on: ${{ matrix.os }}
1111
strategy:
1212
matrix:
13-
os: [ubuntu-latest, macos-latest]
14-
python-version: ['3.8', '3.9', '3.10']
13+
os: [ubuntu-latest, macos-latest, windows-latest]
14+
python-version: ['3.8', '3.9', '3.10', '3.11']
1515

1616
steps:
1717
- name: Checkout repository
@@ -35,7 +35,7 @@ jobs:
3535
- name: Download resources
3636
run: |
3737
python3 -m pip install .
38-
python3 -m perke download ${{ secrets.GITHUB_TOKEN }}
38+
python3 -m perke download
3939
4040
- name: Run tests
4141
run: pytest

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ repos:
1818
- --profile=black
1919
- --line-length=79
2020
- repo: https://github.com/psf/black
21-
rev: 23.1.0
21+
rev: 23.3.0
2222
hooks:
2323
- id: black
2424
args:

CHANGELOG.md

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,13 @@ adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
77

88

99
## [Unreleased]
10+
### Added
11+
- Added support for Python `3.11`
12+
- Brought Windows tests back
13+
14+
### Changed
15+
- Adapted Perke to all changes in
16+
the [Hazm](https://github.com/roshan-research/hazm) `0.9` release
1017

1118
## [0.4.1] - 2023-03-15
1219
### Fixed
@@ -32,7 +39,8 @@ adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
3239
- Changed CI from Travis CI to GitHub workflows
3340

3441
### Removed
35-
- Removed Windows tests since hazm runs on WSL and WSL tests are the same as Linux tests
42+
- Removed Windows tests since [Hazm](https://github.com/roshan-research/hazm)
43+
runs on WSL and WSL tests are the same as Linux tests
3644

3745
### Fixed
3846
- Removed type hints from docstrings

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
[![pre-commit.ci](https://results.pre-commit.ci/badge/github/AlirezaTheH/perke/main.svg)](https://results.pre-commit.ci/latest/github/alirezatheh/perke/main)
44
[![PyPI Version](https://img.shields.io/pypi/v/perke)](https://pypi.python.org/pypi/perke)
55
[![Python Versions](https://img.shields.io/pypi/pyversions/perke)](https://pypi.org/project/perke)
6-
[![Documentation Status](https://readthedocs.org/projects/perke/badge/?version=stable)](https://perke.readthedocs.io/en/latest/?badge=stable)
6+
[![Documentation Status](https://readthedocs.org/projects/perke/badge/?version=stable)](https://perke.readthedocs.io/en/stable/?badge=stable)
77

88
Perke is a Python keyphrase extraction package for Persian language. It
99
provides an end-to-end keyphrase extraction pipeline in which each component

examples/unsupervised/graph_based/multipartite_rank.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from perke.unsupervised.graph_based import MultipartiteRank
44

55
# Define the set of valid part of speech tags to occur in the model.
6-
valid_pos_tags = {'N', 'Ne', 'AJ', 'AJe'}
6+
valid_pos_tags = {'NOUN', 'ADJ'}
77

88
# 1. Create a MultipartiteRank extractor.
99
extractor = MultipartiteRank(valid_pos_tags=valid_pos_tags)

examples/unsupervised/graph_based/position_rank.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,23 +3,27 @@
33
from perke.unsupervised.graph_based import PositionRank
44

55
# Define the set of valid part of speech tags to occur in the model.
6-
valid_pos_tags = {'N', 'Ne', 'AJ', 'AJe'}
6+
valid_pos_tags = {'NOUN', 'NOUN,EZ', 'ADJ', 'ADJ,EZ'}
77

88
# Define the grammar for selecting the keyphrase candidates
99
grammar = r"""
1010
NP:
11-
<P>{<N>}<V>
11+
{<NOUN>}<VERB>
1212
NP:
13-
{<DETe?|Ne?|NUMe?|AJe|PRO|CL|RESe?><DETe?|Ne?|NUMe?|AJe?|PRO|CL|RESe?>*}
14-
<N>}{<.*e?>
13+
{<DET(,EZ)?|NOUN(,EZ)?|NUM(,EZ)?|ADJ(,EZ)|PRON><DET(,EZ)|NOUN(,EZ)|NUM(,EZ)|ADJ(,EZ)|PRON>*}
14+
<NOUN>}{<.*(,EZ)?>
1515
"""
1616

1717
# 1. Create a PositionRank extractor.
1818
extractor = PositionRank(valid_pos_tags=valid_pos_tags)
1919

2020
# 2. Load the text.
2121
input_filepath = Path(__file__).parent.parent.parent / 'input.txt'
22-
extractor.load_text(input=input_filepath, word_normalization_method=None)
22+
extractor.load_text(
23+
input=input_filepath,
24+
word_normalization_method=None,
25+
universal_pos_tags=False,
26+
)
2327

2428
# 3. Select the noun phrases up to 3 words as keyphrase candidates.
2529
extractor.select_candidates(grammar=grammar, maximum_word_number=3)

examples/unsupervised/graph_based/single_rank.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from perke.unsupervised.graph_based import SingleRank
44

55
# Define the set of valid part of speech tags to occur in the model.
6-
valid_pos_tags = {'N', 'Ne', 'AJ', 'AJe'}
6+
valid_pos_tags = {'NOUN', 'ADJ'}
77

88
# 1. Create a SingleRank extractor.
99
extractor = SingleRank(valid_pos_tags=valid_pos_tags)

examples/unsupervised/graph_based/text_rank.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from perke.unsupervised.graph_based import TextRank
44

55
# Define the set of valid part of speech tags to occur in the model.
6-
valid_pos_tags = {'N', 'Ne', 'AJ', 'AJe'}
6+
valid_pos_tags = {'NOUN', 'ADJ'}
77

88
# 1. Create a TextRank extractor.
99
extractor = TextRank(valid_pos_tags=valid_pos_tags)

examples/unsupervised/graph_based/topic_rank.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,15 @@
1-
from os.path import dirname, join
1+
from pathlib import Path
22

33
from perke.unsupervised.graph_based import TopicRank
44

55
# Define the set of valid part of speech tags to occur in the model.
6-
valid_pos_tags = {'N', 'Ne', 'AJ', 'AJe'}
6+
valid_pos_tags = {'NOUN', 'ADJ'}
77

88
# 1. Create a TopicRank extractor.
99
extractor = TopicRank(valid_pos_tags=valid_pos_tags)
1010

1111
# 2. Load the text.
12-
input_filepath = join(dirname(dirname(dirname(__file__))), 'input.txt')
12+
input_filepath = Path(__file__).parent.parent.parent / 'input.txt'
1313
extractor.load_text(input=input_filepath, word_normalization_method='stemming')
1414

1515
# 3. Select the longest sequences of nouns and adjectives, that do

perke/base/extractor.py

Lines changed: 18 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ def __init__(self, valid_pos_tags: Optional[Set[str]] = None) -> None:
4545
----------
4646
valid_pos_tags:
4747
Set of valid part of speech tags, defaults to nouns and
48-
adjectives. I.e. `{'N', 'Ne', 'AJ', 'AJe'}`.
48+
adjectives. I.e. `{'NOUN', 'ADJ'}`.
4949
"""
5050
self.word_normalization_method: Optional[str] = None
5151
self.sentences: List[Sentence] = []
@@ -54,13 +54,14 @@ def __init__(self, valid_pos_tags: Optional[Set[str]] = None) -> None:
5454
punctuation_marks
5555
)
5656
if valid_pos_tags is None:
57-
valid_pos_tags = {'N', 'Ne', 'AJ', 'AJe'}
57+
valid_pos_tags = {'NOUN', 'ADJ'}
5858
self.valid_pos_tags: Set[str] = valid_pos_tags
5959

6060
def load_text(
6161
self,
6262
input: Union[str, Path],
6363
word_normalization_method: WordNormalizationMethod = 'stemming',
64+
universal_pos_tags: bool = True,
6465
) -> None:
6566
"""
6667
Loads the text of a document or string.
@@ -74,9 +75,15 @@ def load_text(
7475
Word normalization method, defaults to `'stemming'`. See
7576
`perke.base.types.WordNormalizationMethod` for available
7677
methods.
78+
79+
universal_pos_tags:
80+
Whether to use universal part of speech tags or not,
81+
defaults to `True`.
7782
"""
7883
# Initialize reader
79-
reader = RawTextReader(input, word_normalization_method)
84+
reader = RawTextReader(
85+
input, word_normalization_method, universal_pos_tags
86+
)
8087

8188
# Load sentences
8289
self.sentences = reader.read()
@@ -225,7 +232,7 @@ def _add_candidate_occurrence(
225232
The offset of the occurrence
226233
227234
normalized_words:
228-
List of normalized of words of the occurrence
235+
List of normalized words of the occurrence
229236
"""
230237
# Build the canonical form of the candidate
231238
canonical_form = ' '.join(normalized_words)
@@ -306,7 +313,7 @@ def _select_candidates_with_longest_sequences(
306313
first = sequence_offsets[0]
307314
last = sequence_offsets[-1]
308315

309-
# Add the ngram as a new candidate occurrence
316+
# Add the n-gram as a new candidate occurrence
310317
self._add_candidate_occurrence(
311318
words=sentence.words[first : last + 1],
312319
offset=offset_shift + first,
@@ -336,20 +343,20 @@ def _select_candidates_with_grammar(
336343
defaults to::
337344
r\"""
338345
NP:
339-
<P>{<N>}<V>
346+
{<NOUN>}<VERB>
340347
NP:
341-
{<DETe?|Ne?|NUMe?|AJe|PRO|CL|RESe?><DETe?|Ne?|NUMe?|AJe?|PRO|CL|RESe?>*}
342-
<N>}{<.*e?>'
348+
{<DET(,EZ)?|NOUN(,EZ)?|NUM(,EZ)?|ADJ(,EZ)|PRON><DET(,EZ)|NOUN(,EZ)|NUM(,EZ)|ADJ(,EZ)|PRON>*}
349+
<NOUN>}{<.*(,EZ)?>
343350
\"""
344351
"""
345352
# Initialize default grammar if none provided
346353
if grammar is None:
347354
grammar = r"""
348355
NP:
349-
<P>{<N>}<V>
356+
{<NOUN>}<VERB>
350357
NP:
351-
{<DETe?|Ne?|NUMe?|AJe|PRO|CL|RESe?><DETe?|Ne?|NUMe?|AJe?|PRO|CL|RESe?>*}
352-
<N>}{<.*e?>
358+
{<DET(,EZ)?|NOUN(,EZ)?|NUM(,EZ)?|ADJ(,EZ)|PRON><DET(,EZ)|NOUN(,EZ)|NUM(,EZ)|ADJ(,EZ)|PRON>*}
359+
<NOUN>}{<.*(,EZ)?>
353360
"""
354361

355362
# Initialize parser

0 commit comments

Comments
 (0)