11# coding=utf-8
2- # Copyright 2019 SK T-Brain Authors.
2+ # Copyright 2019-2025 SKTelecom
33#
44# Licensed under the Apache License, Version 2.0 (the "License");
55# you may not use this file except in compliance with the License.
1515
1616import hashlib
1717import os
18-
19- from kobert .utils .aws_s3_downloader import AwsS3Downloader
18+ import urllib .request
2019
2120
2221def download (url , chksum = None , cachedir = ".cache" ):
@@ -29,22 +28,27 @@ def download(url, chksum=None, cachedir=".cache"):
2928 print (f"using cached model. { file_path } " )
3029 return file_path , True
3130
32- s3 = AwsS3Downloader ()
33- file_path = s3 .download (url , cachedir_full )
31+ print (f"downloading model from { url } ..." )
32+ try :
33+ urllib .request .urlretrieve (url , file_path )
34+ except Exception as e :
35+ print (f"download failed: { e } " )
36+ return None , False
37+
3438 if chksum :
3539 assert (
3640 chksum [:10 ] == hashlib .md5 (open (file_path , "rb" ).read ()).hexdigest ()[:10 ]
3741 ), "corrupted file!"
3842 return file_path , False
3943
4044
def get_tokenizer_path(cachedir=".cache"):
    """Return the local path of the KoBERT tokenizer file, downloading it if needed.

    The work is delegated to ``download``, which either reuses a cached copy
    or fetches the file from the URL below. After a fresh fetch, ``download``
    compares the first 10 hex digits of the file's MD5 with ``chksum``.

    Args:
        cachedir: Cache directory, forwarded unchanged to ``download``.

    Returns:
        The file path reported by ``download``. This is ``None`` when the
        fetch failed, because ``download`` signals failure as
        ``(None, False)`` instead of raising.

    Raises:
        AssertionError: Propagated from ``download`` when a freshly fetched
            file does not match the expected checksum prefix.
    """
    tokenizer = {
        # The URL must not contain any whitespace; it is split across two
        # literals only to keep the line length readable.
        "url": (
            "https://huggingface.co/skt/kobert-base-v1/resolve/main/legacy/"
            "kobert_news_wiki_ko_cased-1087f8699e.spiece"
        ),
        "chksum": "ae5711deb3",
    }

    # The second element (cache-hit flag) is not needed by callers of this helper.
    model_path, _ = download(tokenizer["url"], tokenizer["chksum"], cachedir=cachedir)
    return model_path
0 commit comments