From 62f988d4d32e8ef878c248705da5bbeab21b3285 Mon Sep 17 00:00:00 2001 From: ymcui Date: Sun, 25 Jun 2023 20:56:20 +0800 Subject: [PATCH 01/24] stylistic update based on Codacy --- notebooks/README.md | 6 +++--- scripts/README.md | 14 +++++++------- scripts/ceval/evaluator.py | 2 +- scripts/langchain/langchain_sum.py | 5 ++--- scripts/merge_llama_with_chinese_lora_low_mem.py | 4 ++-- scripts/merge_tokenizer/merge_tokenizers.py | 1 - scripts/openai_server_demo/README.md | 2 +- scripts/openai_server_demo/openai_api_server.py | 4 ++-- scripts/training/run_clm_sft_with_peft.py | 4 ++-- 9 files changed, 20 insertions(+), 22 deletions(-) diff --git a/notebooks/README.md b/notebooks/README.md index 6183ae6..313daab 100644 --- a/notebooks/README.md +++ b/notebooks/README.md @@ -1,6 +1,6 @@ # 笔记本示例 Notebooks -### ceval_example_for_chinese_alpaca.ipynb +### ceval_example_for_chinese_alpaca.ipynb 利用Chinese Alpaca模型解码C-Eval数据集的示例。 @@ -8,7 +8,7 @@ Example of decoding C-Eval dataset with Chinese Alpaca. 建议查看Colab上的最新版 / Check latest notebook:Open In Colab -### convert_and_quantize_chinese_llama_and_alpaca.ipynb +### convert_and_quantize_chinese_llama_and_alpaca.ipynb Colab上的转换和量化中文LLaMA/Alpaca(含Plus版本)的运行示例(仅供流程参考)。 @@ -40,7 +40,7 @@ Example of running the Gradio demo on Colab. 在Colab中打开 / Open the notebook in Colab: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ymcui/Chinese-LLaMA-Alpaca/blob/main/notebooks/gradio_web_demo.ipynb) -### legacy/ +### legacy/ 旧版notebook,供参考,但不会再更新。 diff --git a/scripts/README.md b/scripts/README.md index 81ed2f8..8a390d5 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -1,6 +1,6 @@ # 代码与脚本 Code and Scripts -### training/ +### training/ 预训练与指令精调代码,Wiki: @@ -12,13 +12,13 @@ Pre-training and instruction finetuning code, Wiki: - Pre-training: https://github.com/ymcui/Chinese-LLaMA-Alpaca/wiki/Pretraining-Script - Instruction finetuning: https://github.com/ymcui/Chinese-LLaMA-Alpaca/wiki/SFT-Script -### inference/ +### inference/ 使用🤗transformers进行推理,Wiki:[https://github.com/ymcui/Chinese-LLaMA-Alpaca/wiki/使用Transformers推理](https://github.com/ymcui/Chinese-LLaMA-Alpaca/wiki/使用Transformers推理) Inference using 🤗transformers, Wiki: https://github.com/ymcui/Chinese-LLaMA-Alpaca/wiki/Inference-with-Transformers -### langchain/ +### langchain/ 使用LangChain进行检索式问答和文本摘要的示例,Wiki:[https://github.com/ymcui/Chinese-LLaMA-Alpaca/wiki/与LangChain进行集成](https://github.com/ymcui/Chinese-LLaMA-Alpaca/wiki/与LangChain进行集成) @@ -30,25 +30,25 @@ Using LangChain for Retrieval QA and Summarization, Wiki: https://github.com/ymc A server that implements OPENAI API using fastapi, Wiki: [https://github.com/ymcui/Chinese-LLaMA-Alpaca/wiki/API-Calls](https://github.com/ymcui/Chinese-LLaMA-Alpaca/wiki/API-Calls) -### merge_tokenizer/ +### merge_tokenizer/ 中文词表扩充代码,Wiki: [https://github.com/ymcui/Chinese-LLaMA-Alpaca/wiki/训练细节#准备工作词表扩充](https://github.com/ymcui/Chinese-LLaMA-Alpaca/wiki/训练细节#准备工作词表扩充) Code for extending Chinese vocabulary, Wiki: https://github.com/ymcui/Chinese-LLaMA-Alpaca/wiki/Training-Details#preparation-vocabulary-expansion -### merge_llama_with_chinese_lora.py +### merge_llama_with_chinese_lora.py 合并LLaMA/Alpaca LoRA脚本,Wiki: [https://github.com/ymcui/Chinese-LLaMA-Alpaca/wiki/手动模型合并与转换](https://github.com/ymcui/Chinese-LLaMA-Alpaca/wiki/手动模型合并与转换) Script for merging LLaMA/Alpaca LoRA. 
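For orientation, the merge these scripts perform is the standard LoRA update W ← W + scaling · (B @ A) applied to each target weight, as the hunks to merge_llama_with_chinese_lora.py later in this series show. A minimal sketch of the single-tensor operation, with illustrative names rather than the script's actual API:

```python
# Hedged sketch of one LoRA merge step: W' = W + scaling * (B @ A).
# Tensor/argument names are illustrative; the real script loops over
# every target module and handles sharded checkpoints.
import torch

def merge_lora_weight(base_weight: torch.Tensor,
                      lora_A: torch.Tensor,   # shape [r, fan_in]
                      lora_B: torch.Tensor,   # shape [fan_out, r]
                      scaling: float,
                      fan_in_fan_out: bool = False) -> torch.Tensor:
    delta = (lora_B.float() @ lora_A.float()) * scaling
    if fan_in_fan_out:  # some layers store their weights transposed
        delta = delta.T
    return (base_weight.float() + delta).to(base_weight.dtype)
```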
Wiki: https://github.com/ymcui/Chinese-LLaMA-Alpaca/wiki/Manual-Conversion -### merge_llama_with_chinese_lora_low_mem.py +### merge_llama_with_chinese_lora_low_mem.py (推荐)低资源版合并LLaMA/Alpaca LoRA脚本,Wiki: [https://github.com/ymcui/Chinese-LLaMA-Alpaca/wiki/手动模型合并与转换](https://github.com/ymcui/Chinese-LLaMA-Alpaca/wiki/手动模型合并与转换) (recommended)Script for merging LLaMA/Alpaca LoRA (low-resource version). Wiki: https://github.com/ymcui/Chinese-LLaMA-Alpaca/wiki/Manual-Conversion -### crawl_prompt.py +### crawl_prompt.py 指令数据爬取脚本,Wiki:[https://github.com/ymcui/Chinese-LLaMA-Alpaca/wiki/训练细节#训练数据](https://github.com/ymcui/Chinese-LLaMA-Alpaca/wiki/训练细节#训练数据) diff --git a/scripts/ceval/evaluator.py b/scripts/ceval/evaluator.py index 45e7964..c6027ae 100644 --- a/scripts/ceval/evaluator.py +++ b/scripts/ceval/evaluator.py @@ -26,7 +26,7 @@ def generate_few_shot_prompt(self, subject, dev_df): for i in range(k): prompt += self.format_example(dev_df.iloc[i, :]) return prompt - + def eval_subject(self, subject_name, test_df, dev_df=None, few_shot=False, save_result_dir=None): pass diff --git a/scripts/langchain/langchain_sum.py b/scripts/langchain/langchain_sum.py index 56585b2..de4bb37 100644 --- a/scripts/langchain/langchain_sum.py +++ b/scripts/langchain/langchain_sum.py @@ -15,10 +15,9 @@ from langchain import HuggingFacePipeline from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain.prompts import PromptTemplate -from langchain.docstore.document import Document from langchain.chains.summarize import load_summarize_chain -prompt_template = """Below is an instruction that describes a task. +prompt_template = """Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n ### Instruction:\n请为以下文字写一段摘要:\n{text}\n\n### Response: """ refine_template = ( @@ -41,7 +40,7 @@ device = torch.device(0) else: device = torch.device('cpu') - + text_splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=100, length_function=len) with open(file_path) as f: text = f.read() diff --git a/scripts/merge_llama_with_chinese_lora_low_mem.py b/scripts/merge_llama_with_chinese_lora_low_mem.py index 4c6b76c..2c4025e 100644 --- a/scripts/merge_llama_with_chinese_lora_low_mem.py +++ b/scripts/merge_llama_with_chinese_lora_low_mem.py @@ -210,7 +210,7 @@ def merge_shards(output_dir, num_shards: int): shards_merged = {} for d in shards_dicts: shards_merged |= d - + print(f"Saving the merged shard to " + os.path.join(output_dir, f"consolidated.0{i}.pth")) torch.save(shards_merged, os.path.join(output_dir, f"consolidated.0{i}.pth")) @@ -305,7 +305,7 @@ def merge_shards(output_dir, num_shards: int): print(f"merging {lora_key_A} and lora_B.weight form {tl_idx}-th LoRA weight to {k}") state_dict[k] += ( transpose( - t_and_l['state_dict'][lora_key_B].float() + t_and_l['state_dict'][lora_key_B].float() @ t_and_l['state_dict'][lora_key_A].float(), t_and_l['fan_in_fan_out']) * t_and_l['scaling'] ) weight_size = state_dict[k].numel() * dtype_byte_size(state_dict[k].dtype) diff --git a/scripts/merge_tokenizer/merge_tokenizers.py b/scripts/merge_tokenizer/merge_tokenizers.py index 622b008..e04aa89 100644 --- a/scripts/merge_tokenizer/merge_tokenizers.py +++ b/scripts/merge_tokenizer/merge_tokenizers.py @@ -62,6 +62,5 @@ text='''白日依山尽,黄河入海流。欲穷千里目,更上一层楼。 The primary use of LLaMA is research on large language models, including''' print("Test text:\n",text) -print print(f"Tokenized by LLaMA tokenizer:{llama_tokenizer.tokenize(text)}") print(f"Tokenized by 
Chinese-LLaMA tokenizer:{chinese_llama_tokenizer.tokenize(text)}") \ No newline at end of file diff --git a/scripts/openai_server_demo/README.md b/scripts/openai_server_demo/README.md index d84d8e8..05b4ffc 100644 --- a/scripts/openai_server_demo/README.md +++ b/scripts/openai_server_demo/README.md @@ -116,7 +116,7 @@ json返回体: `top_k`: 在随机采样(random sampling)时,前top_k高概率的token将作为候选token被随机采样。 -`top_p`: 在随机采样(random sampling)时,累积概率超过top_p的token将作为候选token被随机采样,越低随机性越大,举个例子,当top_p设定为0.6时,概率前5的token概率分别为[0.23, 0.20, 0.18, 0.11, 0.10]时,前三个token的累积概率为0.61,那么第4个token将被过滤掉,只有前三的token将作为候选token被随机采样。 +`top_p`: 在随机采样(random sampling)时,累积概率超过top_p的token将作为候选token被随机采样,越低随机性越大,举个例子,当top_p设定为0.6时,概率前5的token概率分别为{0.23, 0.20, 0.18, 0.11, 0.10}时,前三个token的累积概率为0.61,那么第4个token将被过滤掉,只有前三的token将作为候选token被随机采样。 `repetition_penalty`: 重复惩罚,具体细节可以参考这篇文章: 。 diff --git a/scripts/openai_server_demo/openai_api_server.py b/scripts/openai_server_demo/openai_api_server.py index 75dde96..13f0710 100644 --- a/scripts/openai_server_demo/openai_api_server.py +++ b/scripts/openai_server_demo/openai_api_server.py @@ -182,7 +182,7 @@ async def create_chat_completion(request: ChatCompletionRequest): else: msgs = [ChatMessage(role=x['role'],content=x['message']) for x in msgs] output = predict( - input=msgs, + input=msgs, max_new_tokens=request.max_tokens, top_p=request.top_p, top_k=request.top_k, @@ -200,7 +200,7 @@ async def create_chat_completion(request: ChatCompletionRequest): async def create_completion(request: CompletionRequest): """Creates a completion""" output = predict( - input=request.prompt, + input=request.prompt, max_new_tokens=request.max_tokens, top_p=request.top_p, top_k=request.top_k, diff --git a/scripts/training/run_clm_sft_with_peft.py b/scripts/training/run_clm_sft_with_peft.py index 246a797..4bfe7a3 100644 --- a/scripts/training/run_clm_sft_with_peft.py +++ b/scripts/training/run_clm_sft_with_peft.py @@ -322,8 +322,8 @@ def main(): files = [os.path.join(path,file.name) for file in path.glob("*.json")] logger.info(f"training files: {' '.join(files)}") train_dataset = buid_instruction_dataset( - data_path=files, - tokenizer=tokenizer, + data_path=files, + tokenizer=tokenizer, max_seq_length=data_args.max_seq_length, data_cache_dir = None, preprocessing_num_workers = data_args.preprocessing_num_workers) From e66c9f71614e262625582cda13f5238d8b65998a Mon Sep 17 00:00:00 2001 From: ymcui Date: Sun, 25 Jun 2023 20:58:01 +0800 Subject: [PATCH 02/24] stylistic update based on Codacy --- data/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data/README.md b/data/README.md index 3ca3a24..6a53968 100644 --- a/data/README.md +++ b/data/README.md @@ -1,12 +1,12 @@ # 数据 Data -### alpaca_data_zh_51k.json +### alpaca_data_zh_51k.json 中文Alpaca数据,包含51k个从ChatGPT (gpt-3.5-turbo)爬取的指令数据。 Chinese Alpaca dataset, containing 51k instruction data crawled from ChatGPT (gpt-3.5-turbo). 
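To sanity-check the file before training, a short inspection script helps; this assumes the standard Alpaca record layout (instruction / input / output keys), so adjust if the schema differs:

```python
# Hedged sketch: load the instruction data and peek at one record.
# Assumes Stanford-Alpaca-style keys (instruction / input / output).
import json

with open("alpaca_data_zh_51k.json", encoding="utf-8") as f:
    records = json.load(f)

print(f"loaded {len(records)} records")   # expected on the order of 51k
sample = records[0]
for key in ("instruction", "input", "output"):
    print(f"{key}: {str(sample.get(key, ''))[:80]}")
```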
-### pt_sample_data.txt +### pt_sample_data.txt CLM任务预训练样例数据 From 0c95b6178fd9951e6aa530795e795de03cdd4938 Mon Sep 17 00:00:00 2001 From: Ziqing Yang Date: Sun, 25 Jun 2023 21:16:34 +0800 Subject: [PATCH 03/24] Update prompt_template --- scripts/langchain/langchain_sum.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/langchain/langchain_sum.py b/scripts/langchain/langchain_sum.py index de4bb37..4958afb 100644 --- a/scripts/langchain/langchain_sum.py +++ b/scripts/langchain/langchain_sum.py @@ -17,9 +17,9 @@ from langchain.prompts import PromptTemplate from langchain.chains.summarize import load_summarize_chain -prompt_template = """Below is an instruction that describes a task. - Write a response that appropriately completes the request.\n\n - ### Instruction:\n请为以下文字写一段摘要:\n{text}\n\n### Response: """ +prompt_template = ("Below is an instruction that describes a task. " + "Write a response that appropriately completes the request.\n\n" + "### Instruction:\n请为以下文字写一段摘要:\n{text}\n\n### Response: ") refine_template = ( "Below is an instruction that describes a task." "Write a response that appropriately completes the request.\n\n" From ce78c23227ca2197609910013dcd6b6d010bb18f Mon Sep 17 00:00:00 2001 From: ymcui Date: Sun, 25 Jun 2023 21:34:46 +0800 Subject: [PATCH 04/24] stylistic update based on Codacy --- data/README.md | 2 +- notebooks/README.md | 2 +- scripts/crawl_prompt.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/data/README.md b/data/README.md index 6a53968..5732b3f 100644 --- a/data/README.md +++ b/data/README.md @@ -10,4 +10,4 @@ Chinese Alpaca dataset, containing 51k instruction data crawled from ChatGPT (gp CLM任务预训练样例数据 -Pre-training sample data \ No newline at end of file +Pre-training sample data diff --git a/notebooks/README.md b/notebooks/README.md index 313daab..82bcd4a 100644 --- a/notebooks/README.md +++ b/notebooks/README.md @@ -44,4 +44,4 @@ Example of running the Gradio demo on Colab. 旧版notebook,供参考,但不会再更新。 -Old notebook. Reference only, will not be updated. \ No newline at end of file +Old notebook. Reference only, will not be updated. diff --git a/scripts/crawl_prompt.py b/scripts/crawl_prompt.py index 0924ac2..df39098 100644 --- a/scripts/crawl_prompt.py +++ b/scripts/crawl_prompt.py @@ -1,5 +1,4 @@ import openai -import json import sys import random @@ -23,11 +22,12 @@ def return_random_prompt(): system_prompt += "4. 除非特别要求,请使用中文,指令可以是命令句、疑问句、或其他合适的类型。\n" system_prompt += "5. 为指令生成一个适当且涉及真实情况的,不应该只包含简单的占位符。应提供实质性的内容,具有挑战性。字数不超过" + str(random.randint(80, 120)) + "字。\n" system_prompt += "6. 应该是对指令的适当且真实的回应,不能只回复答应或拒绝请求。如果需要额外信息才能回复时,请努力预测用户意图并尝试回复。的内容应少于" + str(random.randint(128, 512)) + "字。\n\n" - + system_prompt += "请给出满足条件的20条JSON格式数据:\n" return system_prompt + if __name__ == "__main__": if len(sys.argv) != 2: print("Usage: python crawl_prompt.py ") From 7e926054740941c10ba81825ee7afa4292ce0222 Mon Sep 17 00:00:00 2001 From: ymcui Date: Sun, 25 Jun 2023 21:36:17 +0800 Subject: [PATCH 05/24] stylistic update based on Codacy --- scripts/ceval/eval.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/ceval/eval.py b/scripts/ceval/eval.py index 86747a1..c6d18dc 100644 --- a/scripts/ceval/eval.py +++ b/scripts/ceval/eval.py @@ -11,7 +11,6 @@ choices = ["A", "B", "C", "D"] def main(args, evaluator,take): - assert os.path.exists("subject_mapping.json"), "subject_mapping.json not found!" 
with open("subject_mapping.json") as f: subject_mapping = json.load(f) From f531c6f840d8671fc4d0d00703cebbf44115df99 Mon Sep 17 00:00:00 2001 From: ymcui Date: Sun, 25 Jun 2023 21:38:47 +0800 Subject: [PATCH 06/24] stylistic update based on Codacy --- scripts/ceval/evaluator.py | 1 - scripts/ceval/llama_evaluator.py | 16 ++++++++-------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/scripts/ceval/evaluator.py b/scripts/ceval/evaluator.py index c6027ae..691af6f 100644 --- a/scripts/ceval/evaluator.py +++ b/scripts/ceval/evaluator.py @@ -1,6 +1,5 @@ # This code is modified from C-Eval Project: https://github.com/SJTU-LIT/ceval -import re import string class Evaluator: def __init__(self, choices, model_name, k=-1): diff --git a/scripts/ceval/llama_evaluator.py b/scripts/ceval/llama_evaluator.py index 0929fc3..6f91f0d 100644 --- a/scripts/ceval/llama_evaluator.py +++ b/scripts/ceval/llama_evaluator.py @@ -42,13 +42,13 @@ def __init__(self, choices, k, model_path, device, temperature=0.2): self.D_id = self.tokenizer.encode(":D")[-1] - def eval_subject(self, subject_name, - test_df, - dev_df=None, - few_shot=False, - cot=False, - save_result_dir=None, - with_prompt=False, + def eval_subject(self, subject_name, + test_df, + dev_df=None, + few_shot=False, + cot=False, + save_result_dir=None, + with_prompt=False, constrained_decoding=False, do_test=False): all_answers = {} @@ -81,7 +81,7 @@ def eval_subject(self, subject_name, inputs = self.tokenizer(instruction, return_tensors="pt") generation_output = self.model.generate( - input_ids = inputs["input_ids"].to(self.device), + input_ids = inputs["input_ids"].to(self.device), attention_mask = inputs['attention_mask'].to(self.device), eos_token_id=self.tokenizer.eos_token_id, pad_token_id=self.tokenizer.pad_token_id, From d8bd567e30bc5a47891cbc34beca63eacf1cf690 Mon Sep 17 00:00:00 2001 From: ymcui Date: Sun, 25 Jun 2023 21:47:49 +0800 Subject: [PATCH 07/24] stylistic update based on Codacy --- scripts/ceval/eval.py | 2 -- scripts/crawl_prompt.py | 2 +- scripts/inference/gradio_demo.py | 2 +- scripts/inference/inference_hf.py | 4 ++-- scripts/langchain/langchain_qa.py | 8 ++++---- scripts/merge_llama_with_chinese_lora.py | 2 +- scripts/merge_llama_with_chinese_lora_low_mem.py | 2 +- 7 files changed, 10 insertions(+), 12 deletions(-) diff --git a/scripts/ceval/eval.py b/scripts/ceval/eval.py index c6d18dc..6e74cb2 100644 --- a/scripts/ceval/eval.py +++ b/scripts/ceval/eval.py @@ -76,8 +76,6 @@ def main(args, evaluator,take): json.dump(summary,open(save_result_dir+'/summary.json','w'),ensure_ascii=False,indent=2) - - if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--model_path", type=str) diff --git a/scripts/crawl_prompt.py b/scripts/crawl_prompt.py index df39098..357eb1a 100644 --- a/scripts/crawl_prompt.py +++ b/scripts/crawl_prompt.py @@ -32,7 +32,7 @@ def return_random_prompt(): if len(sys.argv) != 2: print("Usage: python crawl_prompt.py ") exit(1) - + output_file = open(sys.argv[1], 'w') MAX_EPOCHS = 1 # number of data to generate (each prompt contains 20 JSON-formatted data) diff --git a/scripts/inference/gradio_demo.py b/scripts/inference/gradio_demo.py index f10431a..328f047 100644 --- a/scripts/inference/gradio_demo.py +++ b/scripts/inference/gradio_demo.py @@ -43,7 +43,7 @@ tokenizer = LlamaTokenizer.from_pretrained(args.tokenizer_path) base_model = LlamaForCausalLM.from_pretrained( - args.base_model, + args.base_model, load_in_8bit=load_in_8bit, torch_dtype=load_type, low_cpu_mem_usage=True, 
diff --git a/scripts/inference/inference_hf.py b/scripts/inference/inference_hf.py index 3a15110..4b1f241 100644 --- a/scripts/inference/inference_hf.py +++ b/scripts/inference/inference_hf.py @@ -60,7 +60,7 @@ def generate_prompt(instruction, input=None): tokenizer = LlamaTokenizer.from_pretrained(args.tokenizer_path) base_model = LlamaForCausalLM.from_pretrained( - args.base_model, + args.base_model, load_in_8bit=False, torch_dtype=load_type, low_cpu_mem_usage=True, @@ -116,7 +116,7 @@ def generate_prompt(instruction, input=None): input_text = raw_input_text inputs = tokenizer(input_text,return_tensors="pt") #add_special_tokens=False ? generation_output = model.generate( - input_ids = inputs["input_ids"].to(device), + input_ids = inputs["input_ids"].to(device), attention_mask = inputs['attention_mask'].to(device), eos_token_id=tokenizer.eos_token_id, pad_token_id=tokenizer.pad_token_id, diff --git a/scripts/langchain/langchain_qa.py b/scripts/langchain/langchain_qa.py index 514f03c..8ed50b3 100644 --- a/scripts/langchain/langchain_qa.py +++ b/scripts/langchain/langchain_qa.py @@ -59,7 +59,7 @@ device = torch.device(0) else: device = torch.device('cpu') - + loader = TextLoader(file_path) documents = loader.load() text_splitter = RecursiveCharacterTextSplitter( @@ -89,8 +89,8 @@ chain_type_kwargs = {"prompt": PROMPT} qa = RetrievalQA.from_chain_type( llm=model, - chain_type="stuff", - retriever=docsearch.as_retriever(search_kwargs={"k": 1}), + chain_type="stuff", + retriever=docsearch.as_retriever(search_kwargs={"k": 1}), chain_type_kwargs=chain_type_kwargs) elif args.chain_type == "refine": @@ -104,7 +104,7 @@ ) chain_type_kwargs = {"question_prompt": initial_qa_prompt, "refine_prompt": refine_prompt} qa = RetrievalQA.from_chain_type( - llm=model, chain_type="refine", + llm=model, chain_type="refine", retriever=docsearch.as_retriever(search_kwargs={"k": 1}), chain_type_kwargs=chain_type_kwargs) diff --git a/scripts/merge_llama_with_chinese_lora.py b/scripts/merge_llama_with_chinese_lora.py index d85d0a7..7bb01b0 100644 --- a/scripts/merge_llama_with_chinese_lora.py +++ b/scripts/merge_llama_with_chinese_lora.py @@ -322,7 +322,7 @@ def save_shards(model_sd, num_shards: int): transpose(lora_model_sd[lora_b_key].float() @ lora_model_sd[lora_a_key].float(),fan_in_fan_out) * lora_scaling ) assert base_model_sd[original_key].dtype == torch.float16 - + # did we do anything? 
assert not torch.allclose(first_weight_old, first_weight) diff --git a/scripts/merge_llama_with_chinese_lora_low_mem.py b/scripts/merge_llama_with_chinese_lora_low_mem.py index 2c4025e..13c0d3f 100644 --- a/scripts/merge_llama_with_chinese_lora_low_mem.py +++ b/scripts/merge_llama_with_chinese_lora_low_mem.py @@ -22,7 +22,7 @@ type=str, help="Please specify a base model") parser.add_argument('--lora_model', default=None, required=True, type=str, help="Please specify LoRA models to be merged (ordered); use commas to separate multiple LoRA models") -parser.add_argument('--output_type', default='pth',choices=['pth','huggingface'], +parser.add_argument('--output_type', default='pth',choices=['pth','huggingface'], type=str, help="Save the merged model in pth or huggingface format") parser.add_argument('--output_dir', default='./merged_model', type=str, help="The output folder to save the merged model") From b88c2ed76c596858655464adbe4b4347f8b9de29 Mon Sep 17 00:00:00 2001 From: yaoxin <35353688+iMountTai@users.noreply.github.com> Date: Sun, 25 Jun 2023 21:48:00 +0800 Subject: [PATCH 08/24] Update openai_api_server.py remove unused import --- scripts/openai_server_demo/openai_api_server.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/openai_server_demo/openai_api_server.py b/scripts/openai_server_demo/openai_api_server.py index 13f0710..c725bfc 100644 --- a/scripts/openai_server_demo/openai_api_server.py +++ b/scripts/openai_server_demo/openai_api_server.py @@ -1,4 +1,3 @@ -import pdb import argparse import os from fastapi import FastAPI From 2935363148941ad477beacc7628aad8d356ab48a Mon Sep 17 00:00:00 2001 From: yaoxin <35353688+iMountTai@users.noreply.github.com> Date: Sun, 25 Jun 2023 22:07:15 +0800 Subject: [PATCH 09/24] stylistic update based on Codacy --- scripts/training/run_clm_pt_with_peft.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/scripts/training/run_clm_pt_with_peft.py b/scripts/training/run_clm_pt_with_peft.py index b918a49..ac6bf1a 100644 --- a/scripts/training/run_clm_pt_with_peft.py +++ b/scripts/training/run_clm_pt_with_peft.py @@ -46,13 +46,12 @@ HfArgumentParser, Trainer, TrainingArguments, - default_data_collator, is_torch_tpu_available, set_seed, ) from transformers.testing_utils import CaptureLogger from transformers.trainer_utils import get_last_checkpoint -from transformers.utils import check_min_version, send_example_telemetry +from transformers.utils import send_example_telemetry from transformers.utils.versions import require_version from sklearn.metrics import accuracy_score @@ -107,8 +106,6 @@ def preprocess_logits_for_metrics(logits, labels): def fault_tolerance_data_collator(features: List) -> Dict[str, Any]: - import torch - if not isinstance(features[0], Mapping): features = [vars(f) for f in features] first = features[0] @@ -483,7 +480,7 @@ def group_texts(examples): remove_columns="text", load_from_cache_file=True, keep_in_memory=False, - cache_file_names = {k: os.path.join(cache_dir, f'tokenized.arrow') for k in raw_dataset}, + cache_file_names = {k: os.path.join(cache_dir, 'tokenized.arrow') for k in raw_dataset}, desc="Running tokenizer on dataset", ) grouped_datasets = tokenized_dataset.map( @@ -492,7 +489,7 @@ def group_texts(examples): num_proc=data_args.preprocessing_num_workers, load_from_cache_file=True, keep_in_memory=False, - cache_file_names = {k: os.path.join(cache_dir, f'grouped.arrow') for k in tokenized_dataset}, + cache_file_names = {k: os.path.join(cache_dir, 'grouped.arrow') for k in 
tokenized_dataset}, desc=f"Grouping texts in chunks of {block_size}", ) processed_dataset = grouped_datasets @@ -579,7 +576,7 @@ def group_texts(examples): task_type=TaskType.CAUSAL_LM, target_modules=target_modules, inference_mode=False, - r=lora_rank, lora_alpha=lora_alpha, + r=lora_rank, lora_alpha=lora_alpha, lora_dropout=lora_dropout, modules_to_save=modules_to_save) model = get_peft_model(model, peft_config) From f493d9a542857f75572983ed9ad58d786668a717 Mon Sep 17 00:00:00 2001 From: yaoxin <35353688+iMountTai@users.noreply.github.com> Date: Sun, 25 Jun 2023 22:14:34 +0800 Subject: [PATCH 10/24] Update run_clm_sft_with_peft.py by Codacy --- scripts/training/run_clm_sft_with_peft.py | 24 ++++++++--------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/scripts/training/run_clm_sft_with_peft.py b/scripts/training/run_clm_sft_with_peft.py index 4bfe7a3..946ef2b 100644 --- a/scripts/training/run_clm_sft_with_peft.py +++ b/scripts/training/run_clm_sft_with_peft.py @@ -22,21 +22,18 @@ # You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments. import logging -import numpy as np import math import os import sys from dataclasses import dataclass, field -from typing import Optional, List, Dict, Any, Mapping +from typing import Optional, Dict, Any, Mapping from pathlib import Path import datasets -import json import torch from build_dataset import buid_instruction_dataset, DataCollatorForSupervisedDataset import transformers from transformers import ( CONFIG_MAPPING, - MODEL_FOR_CAUSAL_LM_MAPPING, AutoConfig, AutoModelForCausalLM, LlamaForCausalLM, @@ -47,24 +44,19 @@ TrainingArguments, set_seed, ) -from transformers.testing_utils import CaptureLogger from transformers.trainer_utils import get_last_checkpoint -from transformers.utils import check_min_version, send_example_telemetry +from transformers.utils import send_example_telemetry from transformers.utils.versions import require_version from peft import LoraConfig, TaskType, get_peft_model, PeftModel, get_peft_model_state_dict from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR - IGNORE_INDEX = -100 DEFAULT_PAD_TOKEN = "[PAD]" DEFAULT_EOS_TOKEN = "" DEFAULT_BOS_TOKEN = "" DEFAULT_UNK_TOKEN = "" -# Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-# check_min_version("4.28.0.dev0") - require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") @@ -335,10 +327,10 @@ def main(): files = [data_args.validation_file] logger.info(f"training files: {' '.join(files)}") eval_dataset = buid_instruction_dataset( - data_path=files, - tokenizer=tokenizer, + data_path=files, + tokenizer=tokenizer, max_seq_length=data_args.max_seq_length, - data_cache_dir = None, + data_cache_dir = None, preprocessing_num_workers = data_args.preprocessing_num_workers) logger.info(f"Num eval_samples {len(eval_dataset)}") logger.info("eval example:") @@ -386,10 +378,10 @@ def main(): logger.info(f"target_modules: {target_modules}") logger.info(f"lora_rank: {lora_rank}") peft_config = LoraConfig( - task_type=TaskType.CAUSAL_LM, + task_type=TaskType.CAUSAL_LM, target_modules=target_modules, - inference_mode=False, - r=lora_rank, lora_alpha=lora_alpha, + inference_mode=False, + r=lora_rank, lora_alpha=lora_alpha, lora_dropout=lora_dropout, modules_to_save=modules_to_save) model = get_peft_model(model, peft_config) From d89f53ca41a8954154ceeb6e2f9eaf075c20df98 Mon Sep 17 00:00:00 2001 From: yaoxin <35353688+iMountTai@users.noreply.github.com> Date: Sun, 25 Jun 2023 22:18:16 +0800 Subject: [PATCH 11/24] Stylistic fixes based on Codacy suggestions --- scripts/openai_server_demo/openai_api_server.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/openai_server_demo/openai_api_server.py b/scripts/openai_server_demo/openai_api_server.py index c725bfc..dbeeb1d 100644 --- a/scripts/openai_server_demo/openai_api_server.py +++ b/scripts/openai_server_demo/openai_api_server.py @@ -53,7 +53,7 @@ tokenizer = LlamaTokenizer.from_pretrained(args.tokenizer_path) base_model = LlamaForCausalLM.from_pretrained( - args.base_model, + args.base_model, load_in_8bit=load_in_8bit, torch_dtype=load_type, low_cpu_mem_usage=True, @@ -189,7 +189,7 @@ async def create_chat_completion(request: ChatCompletionRequest): num_beams=request.num_beams, repetition_penalty=request.repetition_penalty, ) - choices = [ChatCompletionResponseChoice(index = i, message = msg) + choices = [ChatCompletionResponseChoice(index = i, message = msg) for i, msg in enumerate(msgs)] choices += [ChatCompletionResponseChoice(index = len(choices), message = ChatMessage(role='assistant',content=output))] From c74c72f594164e288058e57834760942b54d99ab Mon Sep 17 00:00:00 2001 From: yaoxin <35353688+iMountTai@users.noreply.github.com> Date: Sun, 25 Jun 2023 22:19:38 +0800 Subject: [PATCH 12/24] Stylistic fixes based on Codacy suggestions --- scripts/openai_server_demo/openai_api_server.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/scripts/openai_server_demo/openai_api_server.py b/scripts/openai_server_demo/openai_api_server.py index dbeeb1d..77d5005 100644 --- a/scripts/openai_server_demo/openai_api_server.py +++ b/scripts/openai_server_demo/openai_api_server.py @@ -189,10 +189,8 @@ async def create_chat_completion(request: ChatCompletionRequest): num_beams=request.num_beams, repetition_penalty=request.repetition_penalty, ) - choices = [ChatCompletionResponseChoice(index = i, message = msg) - for i, msg in enumerate(msgs)] - choices += [ChatCompletionResponseChoice(index = len(choices), - message = ChatMessage(role='assistant',content=output))] + choices = [ChatCompletionResponseChoice(index = i, message = msg) for i, msg in enumerate(msgs)] + choices += [ChatCompletionResponseChoice(index = len(choices), message = 
ChatMessage(role='assistant',content=output))] return ChatCompletionResponse(choices = choices) @app.post("/v1/completions") From e926f872b8cd75d19ccb9806e766ffd33940d8d7 Mon Sep 17 00:00:00 2001 From: yaoxin <35353688+iMountTai@users.noreply.github.com> Date: Sun, 25 Jun 2023 22:22:25 +0800 Subject: [PATCH 13/24] Update build_dataset.py by Codacy --- scripts/training/build_dataset.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/scripts/training/build_dataset.py b/scripts/training/build_dataset.py index 83ec95d..17677a8 100644 --- a/scripts/training/build_dataset.py +++ b/scripts/training/build_dataset.py @@ -1,14 +1,12 @@ import logging import os from dataclasses import dataclass -from typing import Optional, Dict, Sequence, Union, List +from typing import Dict, Sequence, Union, List import datasets import torch -import logging from datasets import load_dataset, concatenate_datasets -import copy import transformers -import random + IGNORE_INDEX = -100 From 66fe1b6e45d5bbd54a1d71d3cd1928e0b6107af5 Mon Sep 17 00:00:00 2001 From: Ziqing Yang Date: Sun, 25 Jun 2023 22:36:00 +0800 Subject: [PATCH 14/24] Update openai_api_server.py --- scripts/openai_server_demo/openai_api_server.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/openai_server_demo/openai_api_server.py b/scripts/openai_server_demo/openai_api_server.py index 77d5005..de88bcf 100644 --- a/scripts/openai_server_demo/openai_api_server.py +++ b/scripts/openai_server_demo/openai_api_server.py @@ -120,7 +120,7 @@ def predict( type(input) == str -> /v1/completions type(input) == list -> /v1/chat/completions """ - if type(input) == str: + if isinstance(input,str): prompt = generate_completion_prompt(input) else: prompt = generate_chat_prompt(input) @@ -176,7 +176,7 @@ def get_embedding(input): async def create_chat_completion(request: ChatCompletionRequest): """Creates a completion for the chat message""" msgs = request.messages - if type(msgs) == str: + if isinstance(msgs, str): msgs = [ChatMessage(role='user',content=msgs)] else: msgs = [ChatMessage(role=x['role'],content=x['message']) for x in msgs] From 5817b40f21ada802345bcf40a81d6e091e75e804 Mon Sep 17 00:00:00 2001 From: Ziqing Yang Date: Sun, 25 Jun 2023 22:36:35 +0800 Subject: [PATCH 15/24] Update openai_api_server.py --- scripts/openai_server_demo/openai_api_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/openai_server_demo/openai_api_server.py b/scripts/openai_server_demo/openai_api_server.py index de88bcf..e2ae8e3 100644 --- a/scripts/openai_server_demo/openai_api_server.py +++ b/scripts/openai_server_demo/openai_api_server.py @@ -120,7 +120,7 @@ def predict( type(input) == str -> /v1/completions type(input) == list -> /v1/chat/completions """ - if isinstance(input,str): + if isinstance(input, str): prompt = generate_completion_prompt(input) else: prompt = generate_chat_prompt(input) From a9f9ed2aa19cacfdde8e4626116d98dd4ca334ea Mon Sep 17 00:00:00 2001 From: Ziqing Yang Date: Sun, 25 Jun 2023 22:54:37 +0800 Subject: [PATCH 16/24] Stylistic update run_clm_pt_with_peft.py based on Codacy --- scripts/training/run_clm_pt_with_peft.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/training/run_clm_pt_with_peft.py b/scripts/training/run_clm_pt_with_peft.py index ac6bf1a..9adaf66 100644 --- a/scripts/training/run_clm_pt_with_peft.py +++ b/scripts/training/run_clm_pt_with_peft.py @@ -122,7 +122,7 @@ def fault_tolerance_data_collator(features: List) -> Dict[str, 
Any]: if isinstance(first["label_ids"], torch.Tensor): batch["labels"] = torch.stack([f["label_ids"] for f in features]) else: - dtype = torch.long if type(first["label_ids"][0]) is int else torch.float + dtype = torch.long if isinstance(first["label_ids"][0], int) else torch.float batch["labels"] = torch.tensor([f["label_ids"] for f in features], dtype=dtype) # Handling of all other possible keys. From 4d02338def650acefd03a9deb31a80e2f850c98d Mon Sep 17 00:00:00 2001 From: Ziqing Yang Date: Sun, 25 Jun 2023 23:01:57 +0800 Subject: [PATCH 17/24] Update run_clm_sft_with_peft.py Stylistic update run_clm_sft_with_peft.py based on Codacy --- scripts/training/run_clm_sft_with_peft.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/training/run_clm_sft_with_peft.py b/scripts/training/run_clm_sft_with_peft.py index 946ef2b..a19dd12 100644 --- a/scripts/training/run_clm_sft_with_peft.py +++ b/scripts/training/run_clm_sft_with_peft.py @@ -26,7 +26,7 @@ import os import sys from dataclasses import dataclass, field -from typing import Optional, Dict, Any, Mapping +from typing import Optional, Dict from pathlib import Path import datasets import torch From 53b5607596bbc0ed6521b743e77ed0b23438de34 Mon Sep 17 00:00:00 2001 From: ymcui Date: Mon, 26 Jun 2023 08:17:52 +0800 Subject: [PATCH 18/24] Trailing whitespace --- scripts/ceval/eval.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/ceval/eval.py b/scripts/ceval/eval.py index 6e74cb2..35149e8 100644 --- a/scripts/ceval/eval.py +++ b/scripts/ceval/eval.py @@ -89,7 +89,6 @@ def main(args, evaluator,take): parser.add_argument("--do_save_csv", choices=["False","True"], default="False") parser.add_argument("--output_dir", type=str) parser.add_argument("--do_test", choices=["False","True"], default="False") - args = parser.parse_args() From c04757d7e8468f1667bbf424670a42df9d84f2e9 Mon Sep 17 00:00:00 2001 From: ymcui Date: Mon, 26 Jun 2023 08:20:48 +0800 Subject: [PATCH 19/24] Trailing whitespace --- scripts/inference/inference_hf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/inference/inference_hf.py b/scripts/inference/inference_hf.py index 4b1f241..a73ed5a 100644 --- a/scripts/inference/inference_hf.py +++ b/scripts/inference/inference_hf.py @@ -140,7 +140,7 @@ def generate_prompt(instruction, input=None): input_text = example inputs = tokenizer(input_text,return_tensors="pt") #add_special_tokens=False ? 
generation_output = model.generate( - input_ids = inputs["input_ids"].to(device), + input_ids = inputs["input_ids"].to(device), attention_mask = inputs['attention_mask'].to(device), eos_token_id=tokenizer.eos_token_id, pad_token_id=tokenizer.pad_token_id, From c98555219d0356b80e901eef00f241ed31f53ba2 Mon Sep 17 00:00:00 2001 From: ymcui Date: Mon, 26 Jun 2023 08:24:29 +0800 Subject: [PATCH 20/24] Trailing whitespace --- scripts/training/run_clm_pt_with_peft.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/training/run_clm_pt_with_peft.py b/scripts/training/run_clm_pt_with_peft.py index 9adaf66..3ec1aee 100644 --- a/scripts/training/run_clm_pt_with_peft.py +++ b/scripts/training/run_clm_pt_with_peft.py @@ -575,7 +575,7 @@ def group_texts(examples): peft_config = LoraConfig( task_type=TaskType.CAUSAL_LM, target_modules=target_modules, - inference_mode=False, + inference_mode=False, r=lora_rank, lora_alpha=lora_alpha, lora_dropout=lora_dropout, modules_to_save=modules_to_save) From 8f5f621d51fbc5d5c57bd63fcff6271acf93fc40 Mon Sep 17 00:00:00 2001 From: ymcui Date: Mon, 26 Jun 2023 08:25:44 +0800 Subject: [PATCH 21/24] Trailing whitespace --- scripts/training/run_clm_sft_with_peft.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/training/run_clm_sft_with_peft.py b/scripts/training/run_clm_sft_with_peft.py index a19dd12..2713698 100644 --- a/scripts/training/run_clm_sft_with_peft.py +++ b/scripts/training/run_clm_sft_with_peft.py @@ -317,7 +317,7 @@ def main(): data_path=files, tokenizer=tokenizer, max_seq_length=data_args.max_seq_length, - data_cache_dir = None, + data_cache_dir = None, preprocessing_num_workers = data_args.preprocessing_num_workers) logger.info(f"Num train_samples {len(train_dataset)}") logger.info("training example:") From 6406fe6bc8b2a0edee1ffc0505672f1051d5ec80 Mon Sep 17 00:00:00 2001 From: Ziqing Yang Date: Mon, 26 Jun 2023 08:32:20 +0800 Subject: [PATCH 22/24] remove unused variable in run_clm_sft_with_peft.py --- scripts/training/run_clm_sft_with_peft.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/training/run_clm_sft_with_peft.py b/scripts/training/run_clm_sft_with_peft.py index 2713698..e4cd87d 100644 --- a/scripts/training/run_clm_sft_with_peft.py +++ b/scripts/training/run_clm_sft_with_peft.py @@ -300,7 +300,7 @@ def main(): raise ValueError(f"The vocab size of the tokenizer must be 49954, but found {len(tokenizer)}.\n" "Please use Chinese Alpaca tokenizer!") if tokenizer.pad_token is None: - num_new_tokens = smart_tokenizer_and_embedding_resize( + smart_tokenizer_and_embedding_resize( special_tokens_dict=dict(pad_token=DEFAULT_PAD_TOKEN), tokenizer=tokenizer) From e5567180672b9a3bfadf17020ef30bf73b24dee1 Mon Sep 17 00:00:00 2001 From: Ziqing Yang Date: Mon, 26 Jun 2023 08:44:46 +0800 Subject: [PATCH 23/24] remove unused function smart_tokenizer_and_embedding_resize --- scripts/training/run_clm_sft_with_peft.py | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/scripts/training/run_clm_sft_with_peft.py b/scripts/training/run_clm_sft_with_peft.py index e4cd87d..1eca83b 100644 --- a/scripts/training/run_clm_sft_with_peft.py +++ b/scripts/training/run_clm_sft_with_peft.py @@ -300,9 +300,8 @@ def main(): raise ValueError(f"The vocab size of the tokenizer must be 49954, but found {len(tokenizer)}.\n" "Please use Chinese Alpaca tokenizer!") if tokenizer.pad_token is None: - smart_tokenizer_and_embedding_resize( - 
special_tokens_dict=dict(pad_token=DEFAULT_PAD_TOKEN), - tokenizer=tokenizer) + print(f"Adding pad token {DEFAULT_PAD_TOKEN}") + tokenizer.add_special_tokens(dict(pad_token=DEFAULT_PAD_TOKEN)) data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer) eval_dataset=None @@ -438,16 +437,5 @@ def main(): trainer.save_metrics("eval", metrics) -def smart_tokenizer_and_embedding_resize( - special_tokens_dict: Dict, - tokenizer: transformers.PreTrainedTokenizer, -): - """Resize tokenizer and embedding. - Note: This is the unoptimized version that may make your embedding size not be divisible by 64. - """ - num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict) - return num_new_tokens - - if __name__ == "__main__": main() From a96009b08e6281e3611349f81dbbf4ffd8e618b4 Mon Sep 17 00:00:00 2001 From: Ziqing Yang Date: Mon, 26 Jun 2023 08:50:24 +0800 Subject: [PATCH 24/24] remove unused Dict imported from typing --- scripts/training/run_clm_sft_with_peft.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/training/run_clm_sft_with_peft.py b/scripts/training/run_clm_sft_with_peft.py index 1eca83b..c0444d5 100644 --- a/scripts/training/run_clm_sft_with_peft.py +++ b/scripts/training/run_clm_sft_with_peft.py @@ -26,7 +26,7 @@ import os import sys from dataclasses import dataclass, field -from typing import Optional, Dict +from typing import Optional from pathlib import Path import datasets import torch
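A closing note on the pad-token change above: the deleted smart_tokenizer_and_embedding_resize helper existed to keep the tokenizer and the embedding matrix in sync. The scripts in this repo can drop it because they separately assert the fixed 49954-token Chinese Alpaca vocabulary, but anyone adapting the code to another tokenizer/model pair needs the fuller pattern. A hedged sketch (the function name is illustrative):

```python
# Hedged sketch of the general pad-token pattern outside this repo's
# fixed-vocabulary setup. The function name is illustrative.
from transformers import PreTrainedModel, PreTrainedTokenizer

DEFAULT_PAD_TOKEN = "[PAD]"

def ensure_pad_token(model: PreTrainedModel, tokenizer: PreTrainedTokenizer):
    # Mirrors the pad_token check kept in run_clm_sft_with_peft.py.
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({"pad_token": DEFAULT_PAD_TOKEN})
    # If the tokenizer grew past the embedding table, resize it. As the
    # removed helper's docstring warned, this plain resize may leave the
    # embedding size not divisible by 64.
    if len(tokenizer) > model.get_input_embeddings().weight.shape[0]:
        model.resize_token_embeddings(len(tokenizer))
```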