
Commit 95cd9fc

Merge pull request #152 from bigdata-ustc/dev
[FEATURE] Upgrade version to 1.0.0
2 parents ee29eb6 + 598d788 commit 95cd9fc

63 files changed (+5282 −491 lines)

.github/workflows/python-test.yml
+6 −2

@@ -6,10 +6,14 @@ on: [push, pull_request]
 jobs:
   build:

-    runs-on: ubuntu-latest
+    runs-on: ${{ matrix.os }}
     strategy:
       matrix:
         python-version: [3.6, 3.7, 3.8, 3.9]
+        include:
+          - os: "ubuntu-latest"
+          - os: "ubuntu-20.04"
+            python-version: "3.6"

     steps:
     - uses: actions/checkout@v2
@@ -24,4 +28,4 @@ jobs:
     - name: Test with pytest
       run: |
         pytest
-        codecov
+        codecov

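Note on the matrix change: runs-on now resolves per job via matrix.os, and the include entries pin the Python 3.6 job to ubuntu-20.04 while all other interpreters stay on ubuntu-latest, presumably because newer ubuntu-latest runner images no longer ship Python 3.6.
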
AUTHORS.md
+1

@@ -20,5 +20,6 @@

 [Jundong Wu](https://github.com/wintermelon008)

+[Shangzi Xue](https://github.com/ShangziXue)

 The stared contributors are the corresponding authors.

CHANGE.txt
+6

@@ -1,3 +1,9 @@
+v1.0.0
+1. Support cuda for I2V and T2V.
+2. Add demos for downstream tasks including knowledge & difficulty & discrimination prediction, similarity prediction and paper segmentation.
+3. Refactor quesnet for pretrain and vectorization.
+4. Update documents about tutorials and API.
+
 v0.0.9
 1. Refactor tokenizer Basic Tokenizer and Pretrained Tokenizer
 2. Refactor model structures following huggingface styles for Elmo, BERT, DisenQNet and QuesNet

EduNLP/I2V/i2v.py
+44 −16

@@ -1,6 +1,7 @@
 # coding: utf-8
 # 2021/8/1 @ tongshiwei

+import torch
 import json
 import os.path
 from typing import List, Tuple
@@ -59,12 +60,12 @@ class I2V(object):
     """

     def __init__(self, tokenizer, t2v, *args, tokenizer_kwargs: dict = None,
-                 pretrained_t2v=False, model_dir=MODEL_DIR, **kwargs):
+                 pretrained_t2v=False, model_dir=MODEL_DIR, device='cpu', **kwargs):
         if pretrained_t2v:
             logger.info("Use pretrained t2v model %s" % t2v)
-            self.t2v = get_t2v_pretrained_model(t2v, model_dir)
+            self.t2v = get_t2v_pretrained_model(t2v, model_dir, device)
         else:
-            self.t2v = T2V(t2v, *args, **kwargs)
+            self.t2v = T2V(t2v, device=device, *args, **kwargs)
         if tokenizer == 'bert':
             self.tokenizer = BertTokenizer.from_pretrained(
                 **tokenizer_kwargs if tokenizer_kwargs is not None else {})
@@ -82,31 +83,53 @@ def __init__(self, tokenizer, t2v, *args, tokenizer_kwargs: dict = None,
                 **tokenizer_kwargs if tokenizer_kwargs is not None else {})
         self.params = {
             "tokenizer": tokenizer,
-            "tokenizer_kwargs": tokenizer_kwargs,
             "t2v": t2v,
             "args": args,
+            "tokenizer_kwargs": tokenizer_kwargs,
+            "pretrained_t2v": pretrained_t2v,
+            "model_dir": model_dir,
             "kwargs": kwargs,
-            "pretrained_t2v": pretrained_t2v
         }
+        self.device = torch.device(device)

     def __call__(self, items, *args, **kwargs):
         """transfer item to vector"""
         return self.infer_vector(items, *args, **kwargs)

     def tokenize(self, items, *args, key=lambda x: x, **kwargs) -> list:
-        # """tokenize item"""
+        """
+        tokenize item
+        Parameter
+        ----------
+        items: a list of questions
+        Return
+        ----------
+        tokens: list
+        """
         return self.tokenizer(items, *args, key=key, **kwargs)

     def infer_vector(self, items, key=lambda x: x, **kwargs) -> tuple:
+        """
+        get question embedding
+        NotImplemented
+        """
         raise NotImplementedError

     def infer_item_vector(self, tokens, *args, **kwargs) -> ...:
+        """NotImplemented"""
         return self.infer_vector(tokens, *args, **kwargs)[0]

     def infer_token_vector(self, tokens, *args, **kwargs) -> ...:
+        """NotImplemented"""
         return self.infer_vector(tokens, *args, **kwargs)[1]

     def save(self, config_path):
+        """
+        save model weights in config_path
+        Parameter:
+        ----------
+        config_path: str
+        """
         with open(config_path, "w", encoding="utf-8") as wf:
             json.dump(self.params, wf, ensure_ascii=False, indent=2)

@@ -123,6 +146,7 @@ def load(cls, config_path, *args, **kwargs):

     @classmethod
     def from_pretrained(cls, name, model_dir=MODEL_DIR, *args, **kwargs):
+        """NotImplemented"""
         raise NotImplementedError

     @property
@@ -327,13 +351,13 @@ def infer_vector(self, items: Tuple[List[str], List[dict], str, dict],
         return self.t2v.infer_vector(inputs, *args, **kwargs), self.t2v.infer_tokens(inputs, *args, **kwargs)

     @classmethod
-    def from_pretrained(cls, name, model_dir=MODEL_DIR, *args, **kwargs):
+    def from_pretrained(cls, name, model_dir=MODEL_DIR, device='cpu', *args, **kwargs):
         model_path = path_append(model_dir, get_pretrained_model_info(name)[0].split('/')[-1], to_str=True)
         for i in [".tar.gz", ".tar.bz2", ".tar.bz", ".tar.tgz", ".tar", ".tgz", ".zip", ".rar"]:
             model_path = model_path.replace(i, "")
         logger.info("model_path: %s" % model_path)
         tokenizer_kwargs = {"tokenizer_config_dir": model_path}
-        return cls("elmo", name, pretrained_t2v=True, model_dir=model_dir,
+        return cls("elmo", name, pretrained_t2v=True, model_dir=model_dir, device=device,
                    tokenizer_kwargs=tokenizer_kwargs)


@@ -386,17 +410,19 @@ def infer_vector(self, items: Tuple[List[str], List[dict], str, dict],
         --------
         vector:list
         """
+        is_batch = isinstance(items, list)
+        items = items if is_batch else [items]
         inputs = self.tokenize(items, key=key, return_tensors=return_tensors)
         return self.t2v.infer_vector(inputs, *args, **kwargs), self.t2v.infer_tokens(inputs, *args, **kwargs)

     @classmethod
-    def from_pretrained(cls, name, model_dir=MODEL_DIR, *args, **kwargs):
+    def from_pretrained(cls, name, model_dir=MODEL_DIR, device='cpu', *args, **kwargs):
         model_path = path_append(model_dir, get_pretrained_model_info(name)[0].split('/')[-1], to_str=True)
         for i in [".tar.gz", ".tar.bz2", ".tar.bz", ".tar.tgz", ".tar", ".tgz", ".zip", ".rar"]:
             model_path = model_path.replace(i, "")
         logger.info("model_path: %s" % model_path)
         tokenizer_kwargs = {"tokenizer_config_dir": model_path}
-        return cls("bert", name, pretrained_t2v=True, model_dir=model_dir,
+        return cls("bert", name, pretrained_t2v=True, model_dir=model_dir, device=device,
                    tokenizer_kwargs=tokenizer_kwargs)


@@ -452,7 +478,7 @@ def infer_vector(self, items: Tuple[List[str], List[dict], str, dict],
         return i_vec, t_vec

     @classmethod
-    def from_pretrained(cls, name, model_dir=MODEL_DIR, **kwargs):
+    def from_pretrained(cls, name, model_dir=MODEL_DIR, device='cpu', **kwargs):
         model_path = path_append(model_dir, get_pretrained_model_info(name)[0].split('/')[-1], to_str=True)
         for i in [".tar.gz", ".tar.bz2", ".tar.bz", ".tar.tgz", ".tar", ".tgz", ".zip", ".rar"]:
             model_path = model_path.replace(i, "")
@@ -461,7 +487,7 @@ def from_pretrained(cls, name, model_dir=MODEL_DIR, **kwargs):
         tokenizer_kwargs = {
             "tokenizer_config_dir": model_path,
         }
-        return cls("disenq", name, pretrained_t2v=True, model_dir=model_dir,
+        return cls("disenq", name, pretrained_t2v=True, model_dir=model_dir, device=device,
                    tokenizer_kwargs=tokenizer_kwargs, **kwargs)


@@ -495,18 +521,20 @@ def infer_vector(self, items: Tuple[List[str], List[dict], str, dict],
             token embeddings
             question embedding
         """
+        is_batch = isinstance(items, list)
+        items = items if is_batch else [items]
         encodes = self.tokenize(items, key=key, meta=meta, *args, **kwargs)
         return self.t2v.infer_vector(encodes), self.t2v.infer_tokens(encodes)

     @classmethod
-    def from_pretrained(cls, name, model_dir=MODEL_DIR, *args, **kwargs):
+    def from_pretrained(cls, name, model_dir=MODEL_DIR, device='cpu', *args, **kwargs):
         model_path = path_append(model_dir, get_pretrained_model_info(name)[0].split('/')[-1], to_str=True)
         for i in [".tar.gz", ".tar.bz2", ".tar.bz", ".tar.tgz", ".tar", ".tgz", ".zip", ".rar"]:
             model_path = model_path.replace(i, "")
         logger.info("model_path: %s" % model_path)
         tokenizer_kwargs = {
             "tokenizer_config_dir": model_path}
-        return cls("quesnet", name, pretrained_t2v=True, model_dir=model_dir,
+        return cls("quesnet", name, pretrained_t2v=True, model_dir=model_dir, device=device,
                    tokenizer_kwargs=tokenizer_kwargs)


@@ -520,7 +548,7 @@ def from_pretrained(cls, name, model_dir=MODEL_DIR, *args, **kwargs):
 }


-def get_pretrained_i2v(name, model_dir=MODEL_DIR):
+def get_pretrained_i2v(name, model_dir=MODEL_DIR, device='cpu'):
     """
     It is a good idea if you want to switch item to vector earily.
@@ -560,4 +588,4 @@ def get_pretrained_i2v(name, model_dir=MODEL_DIR):
     )
     _, t2v = get_pretrained_model_info(name)
     _class, *params = MODEL_MAP[t2v], name
-    return _class.from_pretrained(*params, model_dir=model_dir)
+    return _class.from_pretrained(*params, model_dir=model_dir, device=device)

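The device argument above is threaded from get_pretrained_i2v through from_pretrained into I2V.__init__ and the underlying T2V model, so a pretrained pipeline can be placed on GPU at load time. A minimal usage sketch, assuming the package re-exports get_pretrained_i2v as in earlier releases (the model name and directory below are hypothetical placeholders):

import torch
from EduNLP.I2V import get_pretrained_i2v

# Pick GPU when available; the string is passed straight to torch.device().
device = "cuda" if torch.cuda.is_available() else "cpu"
# "some_pretrained_name" stands in for any name known to the model registry.
i2v = get_pretrained_i2v("some_pretrained_name", model_dir="./models", device=device)
item_vec, token_vecs = i2v(["Find the roots of $x^2 - 1 = 0$"])
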
EduNLP/ModelZoo/__init__.py
+2

@@ -1,3 +1,5 @@
 from .utils import *
 from .bert import *
 from .rnn import *
+from .disenqnet import *
+from .quesnet import *

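These two re-exports presumably make the disenqnet and quesnet model classes importable directly from EduNLP.ModelZoo, matching how the bert and rnn modules were already exposed.
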
EduNLP/ModelZoo/base_model.py
+1 −1

@@ -31,7 +31,7 @@ def from_pretrained(cls, pretrained_model_path, *args, **kwargs):
         config_path = os.path.join(pretrained_model_path, "config.json")
         model_path = os.path.join(pretrained_model_path, "pytorch_model.bin")
         model = cls.from_config(config_path, *args, **kwargs)
-        loaded_state_dict = torch.load(model_path)
+        loaded_state_dict = torch.load(model_path, map_location=torch.device('cpu'))
         loaded_keys = loaded_state_dict.keys()
         expected_keys = model.state_dict().keys()

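Passing map_location makes checkpoints that were saved from GPU loadable on CPU-only machines; without it, torch.load tries to restore each tensor onto the CUDA device it was saved from and fails when none is present. A minimal sketch of the behavior (the path is a hypothetical placeholder):

import torch

# Remap every tensor in the checkpoint to CPU, wherever it was saved from.
state_dict = torch.load("pytorch_model.bin", map_location=torch.device("cpu"))
# The model can still be moved to GPU afterwards, e.g. model.to("cuda").
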
EduNLP/ModelZoo/bert/bert.py
+42 −29

@@ -1,36 +1,35 @@
 import torch
 from torch import nn
-from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence
-from baize.torch import load_net
-import torch.nn.functional as F
 import json
 import os
 from ..base_model import BaseModel
-from transformers.modeling_outputs import ModelOutput
-from transformers import BertModel, PretrainedConfig
-from typing import List, Optional
+from ..utils import PropertyPredictionOutput, KnowledgePredictionOutput
+from transformers import BertModel, PretrainedConfig, BertConfig
+from typing import List
 from ..rnn.harnn import HAM

-__all__ = ["BertForPropertyPrediction", "BertForKnowledgePrediction"]
-

-class BertForPPOutput(ModelOutput):
-    loss: torch.FloatTensor = None
-    logits: torch.FloatTensor = None
+__all__ = ["BertForPropertyPrediction", "BertForKnowledgePrediction"]


 class BertForPropertyPrediction(BaseModel):
-    def __init__(self, pretrained_model_dir=None, head_dropout=0.5):
+    def __init__(self, pretrained_model_dir=None, head_dropout=0.5, init=True):
         super(BertForPropertyPrediction, self).__init__()
-        self.bert = BertModel.from_pretrained(pretrained_model_dir)
+        bert_config = BertConfig.from_pretrained(pretrained_model_dir)
+        if init:
+            print(f'Load BertModel from checkpoint: {pretrained_model_dir}')
+            self.bert = BertModel.from_pretrained(pretrained_model_dir)
+        else:
+            print(f'Load BertModel from config: {pretrained_model_dir}')
+            self.bert = BertModel(bert_config)
         self.hidden_size = self.bert.config.hidden_size
         self.head_dropout = head_dropout
         self.dropout = nn.Dropout(head_dropout)
         self.classifier = nn.Linear(self.hidden_size, 1)
         self.sigmoid = nn.Sigmoid()
         self.criterion = nn.MSELoss()

-        self.config = {k: v for k, v in locals().items() if k not in ["self", "__class__"]}
+        self.config = {k: v for k, v in locals().items() if k not in ["self", "__class__", "bert_config"]}
         self.config['architecture'] = 'BertForPropertyPrediction'
         self.config = PretrainedConfig.from_dict(self.config)

@@ -47,44 +46,54 @@ def forward(self,
         loss = None
         if labels is not None:
             loss = self.criterion(logits, labels) if labels is not None else None
-        return BertForPPOutput(
+        return PropertyPredictionOutput(
             loss=loss,
             logits=logits,
         )

     @classmethod
     def from_config(cls, config_path, **kwargs):
+        config_path = os.path.join(os.path.dirname(config_path), 'model_config.json')
         with open(config_path, "r", encoding="utf-8") as rf:
             model_config = json.load(rf)
+        model_config['pretrained_model_dir'] = os.path.dirname(config_path)
         model_config.update(kwargs)
         return cls(
             pretrained_model_dir=model_config['pretrained_model_dir'],
-            head_dropout=model_config.get("head_dropout", 0.5)
+            head_dropout=model_config.get("head_dropout", 0.5),
+            init=model_config.get('init', False)
         )

-    # @classmethod
-    # def from_pretrained(cls):
-    #     NotImplementedError
-    #     # need to verify compatibility with huggingface models
+    def save_config(self, config_dir):
+        config_path = os.path.join(config_dir, "model_config.json")
+        with open(config_path, "w", encoding="utf-8") as wf:
+            json.dump(self.config.to_dict(), wf, ensure_ascii=False, indent=2)
+        self.bert.config.save_pretrained(config_dir)


 class BertForKnowledgePrediction(BaseModel):
     def __init__(self,
+                 pretrained_model_dir=None,
                  num_classes_list: List[int] = None,
                  num_total_classes: int = None,
-                 pretrained_model_dir=None,
                  head_dropout=0.5,
                  flat_cls_weight=0.5,
                  attention_unit_size=256,
                  fc_hidden_size=512,
                  beta=0.5,
+                 init=True
                  ):
         super(BertForKnowledgePrediction, self).__init__()
-        self.bert = BertModel.from_pretrained(pretrained_model_dir)
+        bert_config = BertConfig.from_pretrained(pretrained_model_dir)
+        if init:
+            print(f'Load BertModel from checkpoint: {pretrained_model_dir}')
+            self.bert = BertModel.from_pretrained(pretrained_model_dir)
+        else:
+            print(f'Load BertModel from config: {pretrained_model_dir}')
+            self.bert = BertModel(bert_config)
         self.hidden_size = self.bert.config.hidden_size
         self.head_dropout = head_dropout
         self.dropout = nn.Dropout(head_dropout)
-        self.classifier = nn.Linear(self.hidden_size, 1)
         self.sigmoid = nn.Sigmoid()
         self.criterion = nn.MSELoss()
         self.flat_classifier = nn.Linear(self.hidden_size, num_total_classes)
@@ -101,7 +110,7 @@ def __init__(self,
         self.num_classes_list = num_classes_list
         self.num_total_classes = num_total_classes

-        self.config = {k: v for k, v in locals().items() if k not in ["self", "__class__"]}
+        self.config = {k: v for k, v in locals().items() if k not in ["self", "__class__", "bert_config"]}
         self.config['architecture'] = 'BertForKnowledgePrediction'
         self.config = PretrainedConfig.from_dict(self.config)

@@ -124,15 +133,17 @@ def forward(self,
             labels = torch.sum(torch.nn.functional.one_hot(labels, num_classes=self.num_total_classes), dim=1)
             labels = labels.float()
             loss = self.criterion(logits, labels) if labels is not None else None
-        return BertForPPOutput(
+        return KnowledgePredictionOutput(
             loss=loss,
             logits=logits,
         )

     @classmethod
     def from_config(cls, config_path, **kwargs):
+        config_path = os.path.join(os.path.dirname(config_path), 'model_config.json')
         with open(config_path, "r", encoding="utf-8") as rf:
             model_config = json.load(rf)
+        model_config['pretrained_model_dir'] = os.path.dirname(config_path)
         model_config.update(kwargs)
         return cls(
             pretrained_model_dir=model_config['pretrained_model_dir'],
@@ -143,9 +154,11 @@ def from_config(cls, config_path, **kwargs):
             attention_unit_size=model_config.get('attention_unit_size', 256),
             fc_hidden_size=model_config.get('fc_hidden_size', 512),
             beta=model_config.get('beta', 0.5),
+            init=model_config.get('init', False)
         )

-    # @classmethod
-    # def from_pretrained(cls):
-    #     NotImplementedError
-    #     # need to verify compatibility with huggingface models
+    def save_config(self, config_dir):
+        config_path = os.path.join(config_dir, "model_config.json")
+        with open(config_path, "w", encoding="utf-8") as wf:
+            json.dump(self.config.to_dict(), wf, ensure_ascii=False, indent=2)
+        self.bert.config.save_pretrained(config_dir)

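The recurring pattern in this file is the new init flag: init=True initializes the BERT encoder from a HuggingFace-style checkpoint directory, while init=False builds it from the saved BertConfig alone, which is why from_config defaults to init=False (BaseModel.from_pretrained restores the full state dict afterwards). A minimal sketch, assuming the class is re-exported from EduNLP.ModelZoo.bert and that a local checkpoint directory exists (both hypothetical here):

from EduNLP.ModelZoo.bert import BertForPropertyPrediction

# init=True: pull encoder weights from the checkpoint directory.
model = BertForPropertyPrediction(pretrained_model_dir="./bert-base-chinese", init=True)
# Persist model_config.json plus the BERT config for later from_config/from_pretrained.
model.save_config("./output_dir")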