
Commit 1a7fe0c

Merge pull request #59 from bigdata-ustc/dev
[FEATURE] version upgrade
2 parents: cc357e6 + 3c3ff14


78 files changed: +5865, -987 lines

AUTHORS.md

+2 -0

@@ -14,5 +14,7 @@

 [Pingzhi Li](https://github.com/pingzhiLi)

+[Meikai Bao](https://github.com/BAOOOOOM)
+

 The stared contributors are the corresponding authors.

CHANGE.txt

+7 -1

@@ -1,3 +1,9 @@
+v0.0.6:
+1. dev: add half-pretrained rnn model
+2. important!!!: rename TextTokenizer to PureTextTokenizer, and add a new tokenizer named TextTokenizer (the two have similar but not the same behaviours).
+3. sif: add $\textf{}$ syntax
+4. add two pretrained w2v model: w2v_sci_300 and w2v_lit_300
+
 v0.0.5:
 1. fix the missing stopwords.txt when use pip install

@@ -18,4 +24,4 @@ v0.0.2:
 v0.0.1:
 1. Add Formula class to parse latex formula, which will generate the abstract syntax tree.
 2. Add SIF v0.0.2.
-3. Add sif4sci function which serves as a preprocess function for downstream tasks.
+3. Add sif4sci function which serves as a preprocess function for downstream tasks.
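
The rename in item 2 is the only backward-incompatible entry above. A minimal migration sketch follows; it assumes both classes are importable from `EduNLP.Tokenizer` and are called on a list of items, yielding one token list per item, and the comments on their behavioural difference paraphrase the changelog rather than the code.

```python
# Hedged sketch of the v0.0.6 rename described in CHANGE.txt above.
from EduNLP.Tokenizer import PureTextTokenizer, TextTokenizer

items = ["如图$\\FigureID{088f15ea-xxx}$, 若$x,y$满足约束条件, 则$z=x+7y$的最大值为$\\SIFBlank$"]

# Code written against v0.0.5's TextTokenizer keeps its old behaviour under
# the new name PureTextTokenizer ...
legacy = PureTextTokenizer()
print(next(legacy(items)))

# ... while the v0.0.6 TextTokenizer is a distinct tokenizer whose output is
# similar but not identical (per the changelog entry).
current = TextTokenizer()
print(next(current(items)))
```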

EduNLP/Pretrain/gensim_vec.py

+37 -37

@@ -15,36 +15,36 @@


 class GensimWordTokenizer(object):
+    """
+
+    Parameters
+    ----------
+    symbol:
+        gm
+        fgm
+        gmas
+        fgmas
+    general:
+        True when item isn't in standard format, and want to tokenize formulas(except formulas in figure) linearly.
+        False when use 'ast' mothed to tokenize formulas instead of 'linear'.
+
+    Returns
+    ----------
+
+    Examples
+    ----------
+    >>> tokenizer = GensimWordTokenizer(symbol="gmas", general=True)
+    >>> token_item = tokenizer("有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\
+    ... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$")
+    >>> print(token_item.tokens[:10])
+    ['公式', '[FORMULA]', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[FORMULA]']
+    >>> tokenizer = GensimWordTokenizer(symbol="fgmas", general=False)
+    >>> token_item = tokenizer("有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\
+    ... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$")
+    >>> print(token_item.tokens[:10])
+    ['公式', '[FORMULA]', '如图', '[FIGURE]', '[FORMULA]', '约束条件', '公式', '[FORMULA]', '[SEP]', '[FORMULA]']
+    """
     def __init__(self, symbol="gm", general=False):
-        """
-
-        Parameters
-        ----------
-        symbol:
-            gm
-            fgm
-            gmas
-            fgmas
-        general:
-            True when item isn't in standard format, and want to tokenize formulas(except formulas in figure) linearly.
-            False when use 'ast' mothed to tokenize formulas instead of 'linear'.
-
-        Returns
-        ----------
-
-        Examples
-        ----------
-        >>> tokenizer = GensimWordTokenizer(symbol="gmas", general=True)
-        >>> token_item = tokenizer("有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\
-        ... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$")
-        >>> print(token_item.tokens[:10])
-        ['公式', '[FORMULA]', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[FORMULA]']
-        >>> tokenizer = GensimWordTokenizer(symbol="fgmas", general=False)
-        >>> token_item = tokenizer("有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\
-        ... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$")
-        >>> print(token_item.tokens[:10])
-        ['公式', '[FORMULA]', '如图', '[FIGURE]', '[FORMULA]', '约束条件', '公式', '[FORMULA]', '[SEP]', '[FORMULA]']
-        """
         self.symbol = symbol
         if general is True:
             self.tokenization_params = {

@@ -72,15 +72,15 @@ def __call__(self, item):


 class GensimSegTokenizer(object): # pragma: no cover
+    """
+
+    Parameters
+    ----------
+    symbol:
+        gms
+        fgm
+    """
     def __init__(self, symbol="gms", depth=None, flatten=False, **kwargs):
-        """
-
-        Parameters
-        ----------
-        symbol:
-            gms
-            fgm
-        """
         self.symbol = symbol
         self.tokenization_params = {
             "formula_params": {

EduNLP/utils/data.py

+1 -1

@@ -38,7 +38,7 @@ def dict2str4sif(obj: dict, key_as_tag=True, tag_mode="delimiter", add_list_no_t
     add_list_no_tag
     keys

-    Returns
+    Examples
     -------
     >>> item = {
     ...     "stem": r"若复数$z=1+2 i+i^{3}$,则$|z|=$",

EduNLP/utils/path.py

+2 -2

@@ -27,8 +27,8 @@ def path_append(path, *addition, to_str=False):
     --------
     .. code-block:: python

-        path_append("../", "../data", "../dataset1/", "train", to_str=True)
-        '../../data/../dataset1/train'
+        path_append("../", "../data", "../dataset1/", "train", to_str=True)
+        '../../data/../dataset1/train'

     Parameters
     ----------
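
The two changed lines appear identical apart from whitespace, so this hunk looks like a docstring re-indentation only. For completeness, the documented example is runnable once the function is imported (assuming `path_append` is re-exported from `EduNLP.utils`, as the API docs added in this commit imply):

```python
# The docstring example above, made self-contained. path_append joins the
# pieces without normalising "..", and to_str=True returns a plain string.
from EduNLP.utils import path_append

print(path_append("../", "../data", "../dataset1/", "train", to_str=True))
# '../../data/../dataset1/train'
```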

README.md

+8 -0

@@ -41,6 +41,14 @@ pip install EduNLP
 pip install EduNLP[full]
 ```

+### Usage
+
+```python
+from EduNLP import get_pretrained_i2v
+i2v = get_pretrained_i2v("d2v_all_256", "./model")
+item_vector, token_vector = i2v(["the content of item 1", "the content of item 2"])
+```
+
 ### Tutorial

 For more details, please refer to the full documentation ([latest](https://edunlp.readthedocs.io/en/latest) | [stable](https://edunlp.readthedocs.io/en/stable)).
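
CHANGE.txt in this commit also adds two pretrained word2vec models (w2v_sci_300 and w2v_lit_300). Below is a hedged variant of the usage above; that these names load through the same `get_pretrained_i2v` entry point is an assumption, as the README snippet only shows "d2v_all_256".

```python
# Hedged variant of the README usage. Loading "w2v_sci_300" (added per
# CHANGE.txt) via get_pretrained_i2v is an assumption, not shown in the diff.
from EduNLP import get_pretrained_i2v

i2v = get_pretrained_i2v("w2v_sci_300", "./model")
item_vector, token_vector = i2v(["the content of item 1"])
```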

Binary image files under asset/_static:

asset/_static/d2v.png (6.48 KB)
asset/_static/d2v_bow_tfidf.png (12.1 KB)
asset/_static/d2v_general.png (10 KB)
asset/_static/d2v_stem_tf.png (9.67 KB)
asset/_static/data.png (42.5 KB)
asset/_static/formula.png (13.6 KB)
asset/_static/i2v.png (41.2 KB)
asset/_static/parse.png (25.1 KB)
asset/_static/prepare_dataset.jpg (74.1 KB)
asset/_static/seg.png (27.4 KB)
asset/_static/sif.png (5.22 KB)
asset/_static/sif_addition.png (29.5 KB)
asset/_static/tokenizer.png (9.14 KB)
asset/_static/w2v_stem_text.png (34.1 KB)
asset/_static/w2v_stem_tf.png (48.3 KB)

docs/SIF4TI_CH.md

+1 -1

@@ -1,4 +1,4 @@
-# 标准测试项目格式
+# 标准项目格式

 version: 0.2

(The heading changes from "标准测试项目格式", "standard test item format", to "标准项目格式", "standard item format".)

docs/requirements.txt

+2 -1

@@ -2,4 +2,5 @@ sphinx
 sphinx_rtd_theme
 sphinx_toggleprompt
 sphinx-gallery>=0.6
-nbsphinx
+nbsphinx
+m2r2

docs/source/api/ModelZoo.rst

+16 -0 (new file)

@@ -0,0 +1,16 @@
+EduNLP.ModelZoo
+==============
+
+rnn
+-----------
+
+.. automodule:: EduNLP.ModelZoo.rnn
+    :members:
+    :imported-members:
+
+utils
+-----------
+
+.. automodule:: EduNLP.ModelZoo.utils
+    :members:
+    :imported-members:

docs/source/api/index.rst

+41 -0

@@ -1,2 +1,43 @@
 EduNLP
 ======
+
+SIF
+----------------------
+.. automodule:: EduNLP.SIF.sif
+    :members:
+    :imported-members:
+
+EduNLP.Formula
+---------------------
+
+.. automodule:: EduNLP.Formula.ast
+    :members:
+    :imported-members:
+
+EduNLP.I2V
+-----------------
+
+.. automodule:: EduNLP.I2V.i2v
+    :members:
+    :imported-members:
+
+EduNLP.Pretrain
+-------------------
+
+.. automodule:: EduNLP.Pretrain
+    :members:
+    :imported-members:
+
+EduNLP.Tokenizer
+----------------------
+
+.. automodule:: EduNLP.Tokenizer
+    :members:
+    :imported-members:
+
+Vector
+---------------
+
+.. automodule:: EduNLP.Vector
+    :members:
+    :imported-members:

docs/source/api/pretrain.rst

+6 -0 (new file)

@@ -0,0 +1,6 @@
+EduNLP.Pretrain
+==================
+
+.. automodule:: EduNLP.Pretrain
+    :members:
+    :imported-members:

docs/source/api/tokenizer.rst

+6 -0 (new file)

@@ -0,0 +1,6 @@
+EduNLP.Tokenizer
+=====================================
+
+.. automodule:: EduNLP.Tokenizer
+    :members:
+    :imported-members:

docs/source/api/utils.rst

+6 -0 (new file)

@@ -0,0 +1,6 @@
+EduNLP.utils
+====================
+
+.. automodule:: EduNLP.utils
+    :members:
+    :imported-members:

docs/source/api/vector.rst

+10 -0 (new file)

@@ -0,0 +1,10 @@
+EduNLP.Vector
+==========================
+
+Vector
+---------------
+
+.. automodule:: EduNLP.Vector
+    :members:
+    :imported-members:
+

docs/source/conf.py

+24 -4

@@ -46,14 +46,34 @@ def copy_tree(src, tar):
     'sphinx.ext.mathjax',
     'sphinx_toggleprompt',
     'nbsphinx',
-    'sphinx_gallery.load_style'
+    'sphinx_gallery.load_style',
+    'm2r2',
+    'IPython.sphinxext.ipython_console_highlighting',
+    'IPython.sphinxext.ipython_directive'
 ]

 # extension variables setting
 # npsphinx

 nbsphinx_thumbnails = {
-    'build/blitz/sif/sif': '_static/item_figure.png',
+    'build/blitz/sif/sif': '_static/sif.png',
+    'build/blitz/sif/sif_addition': '_static/sif_addition.png',
+    'build/blitz/utils/data': '_static/data.png',
+    'build/blitz/formula/formula': '_static/formula.png',
+    'build/blitz/seg/seg': '_static/seg.png',
+    'build/blitz/parse/parse': '_static/parse.png',
+    'build/blitz/formula/formula': '_static/formula.png',
+    'build/blitz/tokenizer/tokenizer': '_static/tokenizer.png',
+    'build/blitz/vectorization/i2v': '_static/i2v.png',
+    'build/blitz/pretrain/prepare_dataset': '_static/prepare_dataset.jpg',
+    'build/blitz/pretrain/gensim/d2v_bow_tfidf': '_static/d2v_bow_tfidf.png',
+    'build/blitz/pretrain/gensim/d2v_general': '_static/d2v_general.png',
+    'build/blitz/pretrain/gensim/d2v_stem_tf': '_static/d2v_stem_tf.png',
+    'build/blitz/pretrain/gensim/w2v_stem_text': '_static/w2v_stem_text.png',
+    'build/blitz/pretrain/gensim/w2v_stem_tf': '_static/w2v_stem_tf.png',
+    'build/blitz/pretrain/seg_token/d2v': '_static/d2v.png',
+    'build/blitz/pretrain/seg_token/d2v_d1': '_static/d2v_d1.png',
+    'build/blitz/pretrain/seg_token/d2v_d2': '_static/d2v_d2.png',
 }

 # Add any paths that contain templates here, relative to this directory.
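
One small wrinkle in the nbsphinx_thumbnails hunk above: the key 'build/blitz/formula/formula' appears twice. In a Python dict literal the later entry silently overrides the earlier one, so the duplicate is harmless (both map to the same thumbnail) but redundant:

```python
# A repeated key in a dict literal keeps only its last value, so the second
# 'build/blitz/formula/formula' line above is the one that takes effect.
thumbs = {
    'build/blitz/formula/formula': '_static/a.png',
    'build/blitz/formula/formula': '_static/b.png',
}
print(thumbs)  # {'build/blitz/formula/formula': '_static/b.png'}
```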
@@ -62,7 +82,7 @@ def copy_tree(src, tar):
 # The suffix(es) of source filenames.
 # You can specify multiple suffix as a list of string:
 #
-source_suffix = ['.rst', '.md', '.ipynb']
+source_suffix = ['.rst', '.md']
 # source_suffix = '.rst'

 # The language for content autogenerated by Sphinx. Refer to documentation

@@ -75,7 +95,7 @@ def copy_tree(src, tar):
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
 # This pattern also affects html_static_path and html_extra_path.
-exclude_patterns = ['_build']
+exclude_patterns = ['_build','**.ipynb_checkpoints']

 # -- Options for HTML output -------------------------------------------------
