update version to '0.2.4'

singleheart · singleheart · commit 0d305e5dc3ec · 2025-06-12T10:37:42.000+09:00
diff --git a/.gitignore b/.gitignore
@@ -1 +1,3 @@
 .cache
+__pycache__/
+results/
diff --git a/README.md b/README.md
@@ -181,8 +181,8 @@ Vocab(size=8002, unk="[UNK]", reserved="['[MASK]', '[SEP]', '[CLS]']")
 
 ```python
 >>> from gluonnlp.data import SentencepieceTokenizer
->>> from kobert import get_tokenizer
->>> tok_path = get_tokenizer()
+>>> from kobert import get_tokenizer_path
+>>> tok_path = get_tokenizer_path()
 >>> sp  = SentencepieceTokenizer(tok_path)
 >>> sp('한국어 모델을 공유합니다.')
 ['▁한국', '어', '▁모델', '을', '▁공유', '합니다', '.']
@@ -226,18 +226,20 @@ decoding_ner_sentence: [CLS] <SKTBrain:ORG>에서 <KoBERT:POH> 모델을 공개
 
 ## Release
 
+* v0.2.4
+  * 대용량 파일을 Hugging Face Hub에서 받도록 변경
 * v0.2.3
-  * support `onnx 1.8.0`
+  * `onnx 1.8.0` 지원
 * v0.2.2
-  * fix `No module named 'kobert.utils'`
+  * 에러 수정: `No module named 'kobert.utils'`
 * v0.2.1
-  * guide default 'import statements'
+  * import 구문 수정
 * v0.2
-  * download large files from `aws s3`
-  * rename functions
+  * 대용량 파일을 `aws s3`에서 받도록 변경
+  * 함수명 변경
 * v0.1.2
-  * Guaranteed compatibility with higher versions of transformers
-  * fix pad token index id
+  * transformers 라이브러리 호환성 수정
+  * pad token의 index 수정
 * v0.1.1
   * 사전(vocabulary)과 토크나이저 통합
 * v0.1
diff --git a/kobert/__init__.py b/kobert/__init__.py
@@ -13,9 +13,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from kobert.utils.utils import download, get_tokenizer
+from kobert.utils.utils import download, get_tokenizer_path
 from kobert.pytorch_kobert import get_pytorch_kobert_model
 from kobert.mxnet_kobert import get_mxnet_kobert_model
 from kobert.onnx_kobert import get_onnx_kobert_model
 
-__all__ = ("download", "get_tokenizer", "get_pytorch_kobert_model" ,"get_mxnet_kobert_model", "get_onnx_kobert_model")
+__all__ = ("download", "get_tokenizer_path", "get_pytorch_kobert_model" ,"get_mxnet_kobert_model", "get_onnx_kobert_model")
diff --git a/kobert/mxnet_kobert.py b/kobert/mxnet_kobert.py
@@ -17,7 +17,7 @@
 import mxnet as mx
 from gluonnlp.model import BERTEncoder, BERTModel
 
-from kobert import download, get_tokenizer
+from kobert import download, get_tokenizer_path
 
 
 def get_mxnet_kobert_model(
@@ -83,7 +83,7 @@ def get_kobert_model(
         return (net, vocab_b_obj)
 
     mxnet_kobert = {
-        "url": "s3://skt-lsl-nlp-model/KoBERT/models/mxnet_kobert_45b6957552.params",
+        "url": "https://huggingface.co/skt/kobert-base-v1/resolve/main/legacy/mxnet_kobert_45b6957552.params",
         "chksum": "45b6957552",
     }
 
@@ -93,7 +93,7 @@ def get_kobert_model(
         model_info["url"], model_info["chksum"], cachedir=cachedir
     )
     # download vocab
-    vocab_path = get_tokenizer()
+    vocab_path = get_tokenizer_path()
     return get_kobert_model(
         model_path, vocab_path, use_pooler, use_decoder, use_classifier, ctx
     )
diff --git a/kobert/onnx_kobert.py b/kobert/onnx_kobert.py
@@ -19,7 +19,7 @@
 def get_onnx_kobert_model(cachedir=".cache"):
     """Get KoBERT ONNX file path after downloading"""
     onnx_kobert = {
-        "url": "s3://skt-lsl-nlp-model/KoBERT/models/kobert.onnx1.8.0.onnx",
+        "url": "https://huggingface.co/skt/kobert-base-v1/resolve/main/legacy/kobert.onnx1.8.0.onnx",
         "chksum": "6f6610f2e3b61da6de8dbce",
     }
 
diff --git a/kobert/pytorch_kobert.py b/kobert/pytorch_kobert.py
@@ -19,7 +19,7 @@
 from transformers import BertModel
 import gluonnlp as nlp
 
-from kobert import download, get_tokenizer
+from kobert import download, get_tokenizer_path
 
 
 def get_pytorch_kobert_model(ctx="cpu", cachedir=".cache"):
@@ -34,7 +34,7 @@ def get_kobert_model(model_path, vocab_file, ctx="cpu"):
         return bertmodel, vocab_b_obj
 
     pytorch_kobert = {
-        "url": "s3://skt-lsl-nlp-model/KoBERT/models/kobert_v1.zip",
+        "url": "https://huggingface.co/skt/kobert-base-v1/resolve/main/legacy/kobert_v1.zip",
         "chksum": "411b242919",  # 411b2429199bc04558576acdcac6d498
     }
 
@@ -48,7 +48,7 @@ def get_kobert_model(model_path, vocab_file, ctx="cpu"):
     zipf.extractall(path=cachedir_full)
     model_path = os.path.join(os.path.expanduser(cachedir), "kobert_from_pretrained")
     # download vocab
-    vocab_path = get_tokenizer()
+    vocab_path = get_tokenizer_path()
     return get_kobert_model(model_path, vocab_path, ctx)
 
 
diff --git a/kobert/utils/__init__.py b/kobert/utils/__init__.py
@@ -1 +1 @@
-from kobert.utils.utils import download, get_tokenizer
+from kobert.utils.utils import download, get_tokenizer_path
diff --git a/kobert/utils/aws_s3_downloader.py b/kobert/utils/aws_s3_downloader.py
diff --git a/kobert/utils/utils.py b/kobert/utils/utils.py
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright 2019 SK T-Brain Authors.
+# Copyright 2019-2025 SKTelecom
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -15,8 +15,7 @@
 
 import hashlib
 import os
-
-from kobert.utils.aws_s3_downloader import AwsS3Downloader
+import urllib.request
 
 
 def download(url, chksum=None, cachedir=".cache"):
@@ -29,22 +28,27 @@ def download(url, chksum=None, cachedir=".cache"):
             print(f"using cached model. {file_path}")
             return file_path, True
 
-    s3 = AwsS3Downloader()
-    file_path = s3.download(url, cachedir_full)
+    print(f"downloading model from {url}...")
+    try:
+        urllib.request.urlretrieve(url, file_path)
+    except Exception as e:
+        print(f"download failed: {e}")
+        return None, False
+
     if chksum:
         assert (
             chksum[:10] == hashlib.md5(open(file_path, "rb").read()).hexdigest()[:10]
         ), "corrupted file!"
     return file_path, False
 
 
-def get_tokenizer(cachedir=".cache"):
+def get_tokenizer_path(cachedir=".cache"):
     """Get KoBERT Tokenizer file path after downloading"""
     tokenizer = {
-        "url": "s3://skt-lsl-nlp-model/KoBERT/tokenizers/kobert_news_wiki_ko_cased-1087f8699e.spiece",
+        "url": "https://huggingface.co/skt/kobert-base-v1/resolve/main/legacy/kobert_news_wiki_ko_cased-1087f8699e.spiece",
         "chksum": "ae5711deb3",
     }
 
     model_info = tokenizer
-    model_path, is_cached = download(model_info["url"], model_info["chksum"], cachedir=cachedir)
+    model_path, _ = download(model_info["url"], model_info["chksum"], cachedir=cachedir)
     return model_path
diff --git a/requirements.txt b/requirements.txt
@@ -1,7 +1,9 @@
-boto3 <=1.15.18
+accelerate
+datasets
 gluonnlp >= 0.6.0, <=0.10.0
 mxnet >= 1.4.0, <=1.7.0.post2
-onnxruntime == 1.8.0, <=1.8.0
+onnxruntime
+protobuf
 sentencepiece >= 0.1.6, <=0.1.96
-torch >= 1.7.0, <=1.10.1
-transformers >= 4.8.1, <=4.8.1
+torch
+transformers
diff --git a/scripts/NSMC/naver_review_classifications_gluon_kobert.ipynb b/scripts/NSMC/naver_review_classifications_gluon_kobert.ipynb
@@ -44,7 +44,7 @@
     "import gluonnlp as nlp\n",
     "\n",
     "from kobert import get_mxnet_kobert_model\n",
-    "from kobert import get_tokenizer"
+    "from kobert import get_tokenizer_path"
    ]
   },
   {
@@ -101,7 +101,7 @@
    },
    "outputs": [],
    "source": [
-    "tokenizer = get_tokenizer()\n",
+    "tokenizer = get_tokenizer_path()\n",
     "tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)"
    ]
   },
@@ -146,8 +146,8 @@
    },
    "outputs": [],
    "source": [
-    "!wget -O .cache/ratings_train.txt http://skt-lsl-nlp-model.s3.amazonaws.com/KoBERT/datasets/nsmc/ratings_train.txt\n",
-    "!wget -O .cache/ratings_test.txt http://skt-lsl-nlp-model.s3.amazonaws.com/KoBERT/datasets/nsmc/ratings_test.txt"
+    "from datasets import load_dataset\n",
+    "dataset = load_dataset(\"e9t/nsmc\", trust_remote_code=True)"
    ]
   },
   {
@@ -160,8 +160,8 @@
    },
    "outputs": [],
    "source": [
-    "dataset_train = nlp.data.TSVDataset(\".cache/ratings_train.txt\", field_indices=[1,2], num_discard_samples=1)\n",
-    "dataset_test = nlp.data.TSVDataset(\".cache/ratings_test.txt\", field_indices=[1,2], num_discard_samples=1)"
+    "dataset_train = dataset['train']\n",
+    "dataset_test = dataset['test']"
    ]
   },
   {
@@ -175,22 +175,21 @@
    "outputs": [],
    "source": [
     "class BERTDataset(mx.gluon.data.Dataset):\n",
-    "    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,\n",
-    "                 pad, pair):\n",
+    "    def __init__(self, dataset, bert_tokenizer, max_len, pad, pair):\n",
     "        transform = nlp.data.BERTSentenceTransform(\n",
     "            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)\n",
     "        sent_dataset = gluon.data.SimpleDataset([[\n",
-    "            i[sent_idx],\n",
+    "            i['document'],\n",
     "        ] for i in dataset])\n",
     "        self.sentences = sent_dataset.transform(transform)\n",
     "        self.labels = gluon.data.SimpleDataset(\n",
-    "            [np.array(np.int32(i[label_idx])) for i in dataset])\n",
+    "            [np.array(np.int32(i['label'])) for i in dataset])\n",
     "\n",
     "    def __getitem__(self, i):\n",
     "        return (self.sentences[i] + (self.labels[i], ))\n",
     "\n",
     "    def __len__(self):\n",
-    "        return (len(self.labels))\n"
+    "        return (len(self.labels))"
    ]
   },
   {
@@ -216,8 +215,8 @@
    },
    "outputs": [],
    "source": [
-    "data_train = BERTDataset(dataset_train, 0, 1, tok, max_len, True, False)\n",
-    "data_test = BERTDataset(dataset_test, 0, 1, tok, max_len, True, False)"
+    "data_train = BERTDataset(dataset_train, tok, max_len, True, False)\n",
+    "data_test = BERTDataset(dataset_test, tok, max_len, True, False)"
    ]
   },
   {
@@ -247,8 +246,7 @@
     "\n",
     "    def forward(self, inputs, token_types, valid_length=None):\n",
     "        _, pooler = self.bert(inputs, token_types, valid_length)\n",
-    "        return self.classifier(pooler)\n",
-    "                                           "
+    "        return self.classifier(pooler)\n"
    ]
   },
   {
@@ -262,7 +260,7 @@
    "outputs": [],
    "source": [
     "model = BERTClassifier(bert_base, num_classes=2, dropout=0.1)\n",
-    "# 분류 레이어만 초기화 한다. \n",
+    "# 분류 레이어만 초기화 한다.\n",
     "model.classifier.initialize(init=mx.init.Normal(0.02), ctx=ctx)\n",
     "model.hybridize()\n",
     "\n",
@@ -316,7 +314,7 @@
    },
    "outputs": [],
    "source": [
-    "# LayerNorm과 Bias에는 Weight Decay를 적용하지 않는다. \n",
+    "# LayerNorm과 Bias에는 Weight Decay를 적용하지 않는다.\n",
     "for _, v in model.collect_params('.*beta|.*gamma|.*bias').items():\n",
     "    v.wd_mult = 0.0\n",
     "params = [\n",
@@ -360,7 +358,7 @@
    },
    "outputs": [],
    "source": [
-    "#learning rate warmup을 위한 준비 \n",
+    "#learning rate warmup을 위한 준비\n",
     "accumulate = 4\n",
     "step_size = batch_size * accumulate if accumulate else batch_size\n",
     "num_train_examples = len(data_train)\n",
diff --git a/scripts/NSMC/naver_review_classifications_pytorch_kobert.ipynb b/scripts/NSMC/naver_review_classifications_pytorch_kobert.ipynb
diff --git a/setup.py b/setup.py

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -1 +1,3 @@` |
| 1 | 1 | `.cache` |
|  | 2 | `+__pycache__/` |
|  | 3 | `+results/` |

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -19,7 +19,7 @@` |
| 19 | 19 | `def get_onnx_kobert_model(cachedir=".cache"):` |
| 20 | 20 | `"""Get KoBERT ONNX file path after downloading"""` |
| 21 | 21 | `onnx_kobert = {` |
| 22 |  | `- "url": "s3://skt-lsl-nlp-model/KoBERT/models/kobert.onnx1.8.0.onnx",` |
|  | 22 | `+ "url": "https://huggingface.co/skt/kobert-base-v1/resolve/main/legacy/kobert.onnx1.8.0.onnx",` |
| 23 | 23 | `"chksum": "6f6610f2e3b61da6de8dbce",` |
| 24 | 24 | `}` |
| 25 | 25 |  |

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -1 +1 @@` |
| 1 |  | `-from kobert.utils.utils import download, get_tokenizer` |
|  | 1 | `+from kobert.utils.utils import download, get_tokenizer_path` |