Skip to content

Commit 0d305e5

Browse files
committed
update version to '0.2.4'
1 parent 5c46b1c commit 0d305e5

13 files changed

Lines changed: 112 additions & 351 deletions

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,3 @@
11
.cache
2+
__pycache__/
3+
results/

README.md

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -181,8 +181,8 @@ Vocab(size=8002, unk="[UNK]", reserved="['[MASK]', '[SEP]', '[CLS]']")
181181

182182
```python
183183
>>> from gluonnlp.data import SentencepieceTokenizer
184-
>>> from kobert import get_tokenizer
185-
>>> tok_path = get_tokenizer()
184+
>>> from kobert import get_tokenizer_path
185+
>>> tok_path = get_tokenizer_path()
186186
>>> sp = SentencepieceTokenizer(tok_path)
187187
>>> sp('한국어 모델을 공유합니다.')
188188
['▁한국', '어', '▁모델', '을', '▁공유', '합니다', '.']
@@ -226,18 +226,20 @@ decoding_ner_sentence: [CLS] <SKTBrain:ORG>에서 <KoBERT:POH> 모델을 공개
226226

227227
## Release
228228

229+
* v0.2.4
230+
* 대용량 파일을 Hugging Face Hub에서 받도록 변경
229231
* v0.2.3
230-
* support `onnx 1.8.0`
232+
* `onnx 1.8.0` 지원
231233
* v0.2.2
232-
* fix `No module named 'kobert.utils'`
234+
* 에러 수정: `No module named 'kobert.utils'`
233235
* v0.2.1
234-
* guide default 'import statements'
236+
* import 구문 수정
235237
* v0.2
236-
* download large files from `aws s3`
237-
* rename functions
238+
* 대용량 파일을 `aws s3`에서 받도록 변경
239+
* 함수명 변경
238240
* v0.1.2
239-
* Guaranteed compatibility with higher versions of transformers
240-
* fix pad token index id
241+
* transformers 라이브러리 호환성 수정
242+
* pad token의 index 수정
241243
* v0.1.1
242244
* 사전(vocabulary)과 토크나이저 통합
243245
* v0.1

kobert/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,9 @@
1313
# See the License for the specific language governing permissions and
1414
# limitations under the License.
1515

16-
from kobert.utils.utils import download, get_tokenizer
16+
from kobert.utils.utils import download, get_tokenizer_path
1717
from kobert.pytorch_kobert import get_pytorch_kobert_model
1818
from kobert.mxnet_kobert import get_mxnet_kobert_model
1919
from kobert.onnx_kobert import get_onnx_kobert_model
2020

21-
__all__ = ("download", "get_tokenizer", "get_pytorch_kobert_model" ,"get_mxnet_kobert_model", "get_onnx_kobert_model")
21+
__all__ = ("download", "get_tokenizer_path", "get_pytorch_kobert_model" ,"get_mxnet_kobert_model", "get_onnx_kobert_model")

kobert/mxnet_kobert.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
import mxnet as mx
1818
from gluonnlp.model import BERTEncoder, BERTModel
1919

20-
from kobert import download, get_tokenizer
20+
from kobert import download, get_tokenizer_path
2121

2222

2323
def get_mxnet_kobert_model(
@@ -83,7 +83,7 @@ def get_kobert_model(
8383
return (net, vocab_b_obj)
8484

8585
mxnet_kobert = {
86-
"url": "s3://skt-lsl-nlp-model/KoBERT/models/mxnet_kobert_45b6957552.params",
86+
"url": "https://huggingface.co/skt/kobert-base-v1/resolve/main/legacy/mxnet_kobert_45b6957552.params",
8787
"chksum": "45b6957552",
8888
}
8989

@@ -93,7 +93,7 @@ def get_kobert_model(
9393
model_info["url"], model_info["chksum"], cachedir=cachedir
9494
)
9595
# download vocab
96-
vocab_path = get_tokenizer()
96+
vocab_path = get_tokenizer_path()
9797
return get_kobert_model(
9898
model_path, vocab_path, use_pooler, use_decoder, use_classifier, ctx
9999
)

kobert/onnx_kobert.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
def get_onnx_kobert_model(cachedir=".cache"):
2020
"""Get KoBERT ONNX file path after downloading"""
2121
onnx_kobert = {
22-
"url": "s3://skt-lsl-nlp-model/KoBERT/models/kobert.onnx1.8.0.onnx",
22+
"url": "https://huggingface.co/skt/kobert-base-v1/resolve/main/legacy/kobert.onnx1.8.0.onnx",
2323
"chksum": "6f6610f2e3b61da6de8dbce",
2424
}
2525

kobert/pytorch_kobert.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
from transformers import BertModel
2020
import gluonnlp as nlp
2121

22-
from kobert import download, get_tokenizer
22+
from kobert import download, get_tokenizer_path
2323

2424

2525
def get_pytorch_kobert_model(ctx="cpu", cachedir=".cache"):
@@ -34,7 +34,7 @@ def get_kobert_model(model_path, vocab_file, ctx="cpu"):
3434
return bertmodel, vocab_b_obj
3535

3636
pytorch_kobert = {
37-
"url": "s3://skt-lsl-nlp-model/KoBERT/models/kobert_v1.zip",
37+
"url": "https://huggingface.co/skt/kobert-base-v1/resolve/main/legacy/kobert_v1.zip",
3838
"chksum": "411b242919", # 411b2429199bc04558576acdcac6d498
3939
}
4040

@@ -48,7 +48,7 @@ def get_kobert_model(model_path, vocab_file, ctx="cpu"):
4848
zipf.extractall(path=cachedir_full)
4949
model_path = os.path.join(os.path.expanduser(cachedir), "kobert_from_pretrained")
5050
# download vocab
51-
vocab_path = get_tokenizer()
51+
vocab_path = get_tokenizer_path()
5252
return get_kobert_model(model_path, vocab_path, ctx)
5353

5454

kobert/utils/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
from kobert.utils.utils import download, get_tokenizer
1+
from kobert.utils.utils import download, get_tokenizer_path

kobert/utils/aws_s3_downloader.py

Lines changed: 0 additions & 67 deletions
This file was deleted.

kobert/utils/utils.py

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# coding=utf-8
2-
# Copyright 2019 SK T-Brain Authors.
2+
# Copyright 2019-2025 SKTelecom
33
#
44
# Licensed under the Apache License, Version 2.0 (the "License");
55
# you may not use this file except in compliance with the License.
@@ -15,8 +15,7 @@
1515

1616
import hashlib
1717
import os
18-
19-
from kobert.utils.aws_s3_downloader import AwsS3Downloader
18+
import urllib.request
2019

2120

2221
def download(url, chksum=None, cachedir=".cache"):
@@ -29,22 +28,27 @@ def download(url, chksum=None, cachedir=".cache"):
2928
print(f"using cached model. {file_path}")
3029
return file_path, True
3130

32-
s3 = AwsS3Downloader()
33-
file_path = s3.download(url, cachedir_full)
31+
print(f"downloading model from {url}...")
32+
try:
33+
urllib.request.urlretrieve(url, file_path)
34+
except Exception as e:
35+
print(f"download failed: {e}")
36+
return None, False
37+
3438
if chksum:
3539
assert (
3640
chksum[:10] == hashlib.md5(open(file_path, "rb").read()).hexdigest()[:10]
3741
), "corrupted file!"
3842
return file_path, False
3943

4044

41-
def get_tokenizer(cachedir=".cache"):
45+
def get_tokenizer_path(cachedir=".cache"):
4246
"""Get KoBERT Tokenizer file path after downloading"""
4347
tokenizer = {
44-
"url": "s3://skt-lsl-nlp-model/KoBERT/tokenizers/kobert_news_wiki_ko_cased-1087f8699e.spiece",
48+
"url": "https://huggingface.co/skt/kobert-base-v1/resolve/main/legacy/kobert_news_wiki_ko_cased-1087f8699e.spiece",
4549
"chksum": "ae5711deb3",
4650
}
4751

4852
model_info = tokenizer
49-
model_path, is_cached = download(model_info["url"], model_info["chksum"], cachedir=cachedir)
53+
model_path, _ = download(model_info["url"], model_info["chksum"], cachedir=cachedir)
5054
return model_path

requirements.txt

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
1-
boto3 <=1.15.18
1+
accelerate
2+
datasets
23
gluonnlp >= 0.6.0, <=0.10.0
34
mxnet >= 1.4.0, <=1.7.0.post2
4-
onnxruntime == 1.8.0, <=1.8.0
5+
onnxruntime
6+
protobuf
57
sentencepiece >= 0.1.6, <=0.1.96
6-
torch >= 1.7.0, <=1.10.1
7-
transformers >= 4.8.1, <=4.8.1
8+
torch
9+
transformers

0 commit comments

Comments
 (0)