From 62f988d4d32e8ef878c248705da5bbeab21b3285 Mon Sep 17 00:00:00 2001 From: ymcui Date: Sun, 25 Jun 2023 20:56:20 +0800 Subject: [PATCH 01/24] stylistic update based on Codacy --- notebooks/README.md | 6 +++--- scripts/README.md | 14 +++++++------- scripts/ceval/evaluator.py | 2 +- scripts/langchain/langchain_sum.py | 5 ++--- scripts/merge_llama_with_chinese_lora_low_mem.py | 4 ++-- scripts/merge_tokenizer/merge_tokenizers.py | 1 - scripts/openai_server_demo/README.md | 2 +- scripts/openai_server_demo/openai_api_server.py | 4 ++-- scripts/training/run_clm_sft_with_peft.py | 4 ++-- 9 files changed, 20 insertions(+), 22 deletions(-) diff --git a/notebooks/README.md b/notebooks/README.md index 6183ae6..313daab 100644 --- a/notebooks/README.md +++ b/notebooks/README.md @@ -1,6 +1,6 @@ # 笔记本示例 Notebooks -### ceval_example_for_chinese_alpaca.ipynb +### ceval_example_for_chinese_alpaca.ipynb 利用Chinese Alpaca模型解码C-Eval数据集的示例。 @@ -8,7 +8,7 @@ Example of decoding C-Eval dataset with Chinese Alpaca. 建议查看Colab上的最新版 / Check latest notebook:Open In Colab -### convert_and_quantize_chinese_llama_and_alpaca.ipynb +### convert_and_quantize_chinese_llama_and_alpaca.ipynb Colab上的转换和量化中文LLaMA/Alpaca(含Plus版本)的运行示例(仅供流程参考)。 @@ -40,7 +40,7 @@ Example of running the Gradio demo on Colab. 在Colab中打开 / Open the notebook in Colab: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ymcui/Chinese-LLaMA-Alpaca/blob/main/notebooks/gradio_web_demo.ipynb) -### legacy/ +### legacy/ 旧版notebook,供参考,但不会再更新。 diff --git a/scripts/README.md b/scripts/README.md index 81ed2f8..8a390d5 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -1,6 +1,6 @@ # 代码与脚本 Code and Scripts -### training/ +### training/ 预训练与指令精调代码,Wiki: @@ -12,13 +12,13 @@ Pre-training and instruction finetuning code, Wiki: - Pre-training: https://github.com/ymcui/Chinese-LLaMA-Alpaca/wiki/Pretraining-Script - Instruction finetuning: https://github.com/ymcui/Chinese-LLaMA-Alpaca/wiki/SFT-Script -### inference/ +### inference/ 使用🤗transformers进行推理,Wiki:[https://github.com/ymcui/Chinese-LLaMA-Alpaca/wiki/使用Transformers推理](https://github.com/ymcui/Chinese-LLaMA-Alpaca/wiki/使用Transformers推理) Inference using 🤗transformers, Wiki: https://github.com/ymcui/Chinese-LLaMA-Alpaca/wiki/Inference-with-Transformers -### langchain/ +### langchain/ 使用LangChain进行检索式问答和文本摘要的示例,Wiki:[https://github.com/ymcui/Chinese-LLaMA-Alpaca/wiki/与LangChain进行集成](https://github.com/ymcui/Chinese-LLaMA-Alpaca/wiki/与LangChain进行集成) @@ -30,25 +30,25 @@ Using LangChain for Retrieval QA and Summarization, Wiki: https://github.com/ymc A server that implements OPENAI API using fastapi, Wiki: [https://github.com/ymcui/Chinese-LLaMA-Alpaca/wiki/API-Calls](https://github.com/ymcui/Chinese-LLaMA-Alpaca/wiki/API-Calls) -### merge_tokenizer/ +### merge_tokenizer/ 中文词表扩充代码,Wiki: [https://github.com/ymcui/Chinese-LLaMA-Alpaca/wiki/训练细节#准备工作词表扩充](https://github.com/ymcui/Chinese-LLaMA-Alpaca/wiki/训练细节#准备工作词表扩充) Code for extending Chinese vocabulary, Wiki: https://github.com/ymcui/Chinese-LLaMA-Alpaca/wiki/Training-Details#preparation-vocabulary-expansion -### merge_llama_with_chinese_lora.py +### merge_llama_with_chinese_lora.py 合并LLaMA/Alpaca LoRA脚本,Wiki: [https://github.com/ymcui/Chinese-LLaMA-Alpaca/wiki/手动模型合并与转换](https://github.com/ymcui/Chinese-LLaMA-Alpaca/wiki/手动模型合并与转换) Script for merging LLaMA/Alpaca LoRA. 
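For orientation, the merge these scripts perform is the standard LoRA update W ← W + scaling · (B @ A) applied to each target weight, as the hunks to merge_llama_with_chinese_lora.py later in this series show. A minimal sketch of the single-tensor operation, with illustrative names rather than the script's actual API:

```python
# Hedged sketch of one LoRA merge step: W' = W + scaling * (B @ A).
# Tensor/argument names are illustrative; the real script loops over
# every target module and handles sharded checkpoints.
import torch

def merge_lora_weight(base_weight: torch.Tensor,
                      lora_A: torch.Tensor,   # shape [r, fan_in]
                      lora_B: torch.Tensor,   # shape [fan_out, r]
                      scaling: float,
                      fan_in_fan_out: bool = False) -> torch.Tensor:
    delta = (lora_B.float() @ lora_A.float()) * scaling
    if fan_in_fan_out:  # some layers store their weights transposed
        delta = delta.T
    return (base_weight.float() + delta).to(base_weight.dtype)
```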
Wiki: https://github.com/ymcui/Chinese-LLaMA-Alpaca/wiki/Manual-Conversion -### merge_llama_with_chinese_lora_low_mem.py +### merge_llama_with_chinese_lora_low_mem.py (推荐)低资源版合并LLaMA/Alpaca LoRA脚本,Wiki: [https://github.com/ymcui/Chinese-LLaMA-Alpaca/wiki/手动模型合并与转换](https://github.com/ymcui/Chinese-LLaMA-Alpaca/wiki/手动模型合并与转换) (recommended)Script for merging LLaMA/Alpaca LoRA (low-resource version). Wiki: https://github.com/ymcui/Chinese-LLaMA-Alpaca/wiki/Manual-Conversion -### crawl_prompt.py +### crawl_prompt.py 指令数据爬取脚本,Wiki:[https://github.com/ymcui/Chinese-LLaMA-Alpaca/wiki/训练细节#训练数据](https://github.com/ymcui/Chinese-LLaMA-Alpaca/wiki/训练细节#训练数据) diff --git a/scripts/ceval/evaluator.py b/scripts/ceval/evaluator.py index 45e7964..c6027ae 100644 --- a/scripts/ceval/evaluator.py +++ b/scripts/ceval/evaluator.py @@ -26,7 +26,7 @@ def generate_few_shot_prompt(self, subject, dev_df): for i in range(k): prompt += self.format_example(dev_df.iloc[i, :]) return prompt - + def eval_subject(self, subject_name, test_df, dev_df=None, few_shot=False, save_result_dir=None): pass diff --git a/scripts/langchain/langchain_sum.py b/scripts/langchain/langchain_sum.py index 56585b2..de4bb37 100644 --- a/scripts/langchain/langchain_sum.py +++ b/scripts/langchain/langchain_sum.py @@ -15,10 +15,9 @@ from langchain import HuggingFacePipeline from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain.prompts import PromptTemplate -from langchain.docstore.document import Document from langchain.chains.summarize import load_summarize_chain -prompt_template = """Below is an instruction that describes a task. +prompt_template = """Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n ### Instruction:\n请为以下文字写一段摘要:\n{text}\n\n### Response: """ refine_template = ( @@ -41,7 +40,7 @@ device = torch.device(0) else: device = torch.device('cpu') - + text_splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=100, length_function=len) with open(file_path) as f: text = f.read() diff --git a/scripts/merge_llama_with_chinese_lora_low_mem.py b/scripts/merge_llama_with_chinese_lora_low_mem.py index 4c6b76c..2c4025e 100644 --- a/scripts/merge_llama_with_chinese_lora_low_mem.py +++ b/scripts/merge_llama_with_chinese_lora_low_mem.py @@ -210,7 +210,7 @@ def merge_shards(output_dir, num_shards: int): shards_merged = {} for d in shards_dicts: shards_merged |= d - + print(f"Saving the merged shard to " + os.path.join(output_dir, f"consolidated.0{i}.pth")) torch.save(shards_merged, os.path.join(output_dir, f"consolidated.0{i}.pth")) @@ -305,7 +305,7 @@ def merge_shards(output_dir, num_shards: int): print(f"merging {lora_key_A} and lora_B.weight form {tl_idx}-th LoRA weight to {k}") state_dict[k] += ( transpose( - t_and_l['state_dict'][lora_key_B].float() + t_and_l['state_dict'][lora_key_B].float() @ t_and_l['state_dict'][lora_key_A].float(), t_and_l['fan_in_fan_out']) * t_and_l['scaling'] ) weight_size = state_dict[k].numel() * dtype_byte_size(state_dict[k].dtype) diff --git a/scripts/merge_tokenizer/merge_tokenizers.py b/scripts/merge_tokenizer/merge_tokenizers.py index 622b008..e04aa89 100644 --- a/scripts/merge_tokenizer/merge_tokenizers.py +++ b/scripts/merge_tokenizer/merge_tokenizers.py @@ -62,6 +62,5 @@ text='''白日依山尽,黄河入海流。欲穷千里目,更上一层楼。 The primary use of LLaMA is research on large language models, including''' print("Test text:\n",text) -print print(f"Tokenized by LLaMA tokenizer:{llama_tokenizer.tokenize(text)}") print(f"Tokenized by 
Chinese-LLaMA tokenizer:{chinese_llama_tokenizer.tokenize(text)}") \ No newline at end of file diff --git a/scripts/openai_server_demo/README.md b/scripts/openai_server_demo/README.md index d84d8e8..05b4ffc 100644 --- a/scripts/openai_server_demo/README.md +++ b/scripts/openai_server_demo/README.md @@ -116,7 +116,7 @@ json返回体: `top_k`: 在随机采样(random sampling)时,前top_k高概率的token将作为候选token被随机采样。 -`top_p`: 在随机采样(random sampling)时,累积概率超过top_p的token将作为候选token被随机采样,越低随机性越大,举个例子,当top_p设定为0.6时,概率前5的token概率分别为[0.23, 0.20, 0.18, 0.11, 0.10]时,前三个token的累积概率为0.61,那么第4个token将被过滤掉,只有前三的token将作为候选token被随机采样。 +`top_p`: 在随机采样(random sampling)时,累积概率超过top_p的token将作为候选token被随机采样,越低随机性越大,举个例子,当top_p设定为0.6时,概率前5的token概率分别为{0.23, 0.20, 0.18, 0.11, 0.10}时,前三个token的累积概率为0.61,那么第4个token将被过滤掉,只有前三的token将作为候选token被随机采样。 `repetition_penalty`: 重复惩罚,具体细节可以参考这篇文章: 。 diff --git a/scripts/openai_server_demo/openai_api_server.py b/scripts/openai_server_demo/openai_api_server.py index 75dde96..13f0710 100644 --- a/scripts/openai_server_demo/openai_api_server.py +++ b/scripts/openai_server_demo/openai_api_server.py @@ -182,7 +182,7 @@ async def create_chat_completion(request: ChatCompletionRequest): else: msgs = [ChatMessage(role=x['role'],content=x['message']) for x in msgs] output = predict( - input=msgs, + input=msgs, max_new_tokens=request.max_tokens, top_p=request.top_p, top_k=request.top_k, @@ -200,7 +200,7 @@ async def create_chat_completion(request: ChatCompletionRequest): async def create_completion(request: CompletionRequest): """Creates a completion""" output = predict( - input=request.prompt, + input=request.prompt, max_new_tokens=request.max_tokens, top_p=request.top_p, top_k=request.top_k, diff --git a/scripts/training/run_clm_sft_with_peft.py b/scripts/training/run_clm_sft_with_peft.py index 246a797..4bfe7a3 100644 --- a/scripts/training/run_clm_sft_with_peft.py +++ b/scripts/training/run_clm_sft_with_peft.py @@ -322,8 +322,8 @@ def main(): files = [os.path.join(path,file.name) for file in path.glob("*.json")] logger.info(f"training files: {' '.join(files)}") train_dataset = buid_instruction_dataset( - data_path=files, - tokenizer=tokenizer, + data_path=files, + tokenizer=tokenizer, max_seq_length=data_args.max_seq_length, data_cache_dir = None, preprocessing_num_workers = data_args.preprocessing_num_workers) From e66c9f71614e262625582cda13f5238d8b65998a Mon Sep 17 00:00:00 2001 From: ymcui Date: Sun, 25 Jun 2023 20:58:01 +0800 Subject: [PATCH 02/24] stylistic update based on Codacy --- data/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data/README.md b/data/README.md index 3ca3a24..6a53968 100644 --- a/data/README.md +++ b/data/README.md @@ -1,12 +1,12 @@ # 数据 Data -### alpaca_data_zh_51k.json +### alpaca_data_zh_51k.json 中文Alpaca数据,包含51k个从ChatGPT (gpt-3.5-turbo)爬取的指令数据。 Chinese Alpaca dataset, containing 51k instruction data crawled from ChatGPT (gpt-3.5-turbo). 
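To sanity-check the file before training, a short inspection script helps; this assumes the standard Alpaca record layout (instruction / input / output keys), so adjust if the schema differs:

```python
# Hedged sketch: load the instruction data and peek at one record.
# Assumes Stanford-Alpaca-style keys (instruction / input / output).
import json

with open("alpaca_data_zh_51k.json", encoding="utf-8") as f:
    records = json.load(f)

print(f"loaded {len(records)} records")   # expected on the order of 51k
sample = records[0]
for key in ("instruction", "input", "output"):
    print(f"{key}: {str(sample.get(key, ''))[:80]}")
```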
-### pt_sample_data.txt +### pt_sample_data.txt CLM任务预训练样例数据 From 0c95b6178fd9951e6aa530795e795de03cdd4938 Mon Sep 17 00:00:00 2001 From: Ziqing Yang Date: Sun, 25 Jun 2023 21:16:34 +0800 Subject: [PATCH 03/24] Update prompt_template --- scripts/langchain/langchain_sum.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/langchain/langchain_sum.py b/scripts/langchain/langchain_sum.py index de4bb37..4958afb 100644 --- a/scripts/langchain/langchain_sum.py +++ b/scripts/langchain/langchain_sum.py @@ -17,9 +17,9 @@ from langchain.prompts import PromptTemplate from langchain.chains.summarize import load_summarize_chain -prompt_template = """Below is an instruction that describes a task. - Write a response that appropriately completes the request.\n\n - ### Instruction:\n请为以下文字写一段摘要:\n{text}\n\n### Response: """ +prompt_template = ("Below is an instruction that describes a task. " + "Write a response that appropriately completes the request.\n\n" + "### Instruction:\n请为以下文字写一段摘要:\n{text}\n\n### Response: ") refine_template = ( "Below is an instruction that describes a task." "Write a response that appropriately completes the request.\n\n" From ce78c23227ca2197609910013dcd6b6d010bb18f Mon Sep 17 00:00:00 2001 From: ymcui Date: Sun, 25 Jun 2023 21:34:46 +0800 Subject: [PATCH 04/24] stylistic update based on Codacy --- data/README.md | 2 +- notebooks/README.md | 2 +- scripts/crawl_prompt.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/data/README.md b/data/README.md index 6a53968..5732b3f 100644 --- a/data/README.md +++ b/data/README.md @@ -10,4 +10,4 @@ Chinese Alpaca dataset, containing 51k instruction data crawled from ChatGPT (gp CLM任务预训练样例数据 -Pre-training sample data \ No newline at end of file +Pre-training sample data diff --git a/notebooks/README.md b/notebooks/README.md index 313daab..82bcd4a 100644 --- a/notebooks/README.md +++ b/notebooks/README.md @@ -44,4 +44,4 @@ Example of running the Gradio demo on Colab. 旧版notebook,供参考,但不会再更新。 -Old notebook. Reference only, will not be updated. \ No newline at end of file +Old notebook. Reference only, will not be updated. diff --git a/scripts/crawl_prompt.py b/scripts/crawl_prompt.py index 0924ac2..df39098 100644 --- a/scripts/crawl_prompt.py +++ b/scripts/crawl_prompt.py @@ -1,5 +1,4 @@ import openai -import json import sys import random @@ -23,11 +22,12 @@ def return_random_prompt(): system_prompt += "4. 除非特别要求,请使用中文,指令可以是命令句、疑问句、或其他合适的类型。\n" system_prompt += "5. 为指令生成一个适当且涉及真实情况的,不应该只包含简单的占位符。应提供实质性的内容,具有挑战性。字数不超过" + str(random.randint(80, 120)) + "字。\n" system_prompt += "6. 应该是对指令的适当且真实的回应,不能只回复答应或拒绝请求。如果需要额外信息才能回复时,请努力预测用户意图并尝试回复。的内容应少于" + str(random.randint(128, 512)) + "字。\n\n" - + system_prompt += "请给出满足条件的20条JSON格式数据:\n" return system_prompt + if __name__ == "__main__": if len(sys.argv) != 2: print("Usage: python crawl_prompt.py ") From 7e926054740941c10ba81825ee7afa4292ce0222 Mon Sep 17 00:00:00 2001 From: ymcui Date: Sun, 25 Jun 2023 21:36:17 +0800 Subject: [PATCH 05/24] stylistic update based on Codacy --- scripts/ceval/eval.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/ceval/eval.py b/scripts/ceval/eval.py index 86747a1..c6d18dc 100644 --- a/scripts/ceval/eval.py +++ b/scripts/ceval/eval.py @@ -11,7 +11,6 @@ choices = ["A", "B", "C", "D"] def main(args, evaluator,take): - assert os.path.exists("subject_mapping.json"), "subject_mapping.json not found!" 
with open("subject_mapping.json") as f: subject_mapping = json.load(f) From f531c6f840d8671fc4d0d00703cebbf44115df99 Mon Sep 17 00:00:00 2001 From: ymcui Date: Sun, 25 Jun 2023 21:38:47 +0800 Subject: [PATCH 06/24] stylistic update based on Codacy --- scripts/ceval/evaluator.py | 1 - scripts/ceval/llama_evaluator.py | 16 ++++++++-------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/scripts/ceval/evaluator.py b/scripts/ceval/evaluator.py index c6027ae..691af6f 100644 --- a/scripts/ceval/evaluator.py +++ b/scripts/ceval/evaluator.py @@ -1,6 +1,5 @@ # This code is modified from C-Eval Project: https://github.com/SJTU-LIT/ceval -import re import string class Evaluator: def __init__(self, choices, model_name, k=-1): diff --git a/scripts/ceval/llama_evaluator.py b/scripts/ceval/llama_evaluator.py index 0929fc3..6f91f0d 100644 --- a/scripts/ceval/llama_evaluator.py +++ b/scripts/ceval/llama_evaluator.py @@ -42,13 +42,13 @@ def __init__(self, choices, k, model_path, device, temperature=0.2): self.D_id = self.tokenizer.encode(":D")[-1] - def eval_subject(self, subject_name, - test_df, - dev_df=None, - few_shot=False, - cot=False, - save_result_dir=None, - with_prompt=False, + def eval_subject(self, subject_name, + test_df, + dev_df=None, + few_shot=False, + cot=False, + save_result_dir=None, + with_prompt=False, constrained_decoding=False, do_test=False): all_answers = {} @@ -81,7 +81,7 @@ def eval_subject(self, subject_name, inputs = self.tokenizer(instruction, return_tensors="pt") generation_output = self.model.generate( - input_ids = inputs["input_ids"].to(self.device), + input_ids = inputs["input_ids"].to(self.device), attention_mask = inputs['attention_mask'].to(self.device), eos_token_id=self.tokenizer.eos_token_id, pad_token_id=self.tokenizer.pad_token_id, From d8bd567e30bc5a47891cbc34beca63eacf1cf690 Mon Sep 17 00:00:00 2001 From: ymcui Date: Sun, 25 Jun 2023 21:47:49 +0800 Subject: [PATCH 07/24] stylistic update based on Codacy --- scripts/ceval/eval.py | 2 -- scripts/crawl_prompt.py | 2 +- scripts/inference/gradio_demo.py | 2 +- scripts/inference/inference_hf.py | 4 ++-- scripts/langchain/langchain_qa.py | 8 ++++---- scripts/merge_llama_with_chinese_lora.py | 2 +- scripts/merge_llama_with_chinese_lora_low_mem.py | 2 +- 7 files changed, 10 insertions(+), 12 deletions(-) diff --git a/scripts/ceval/eval.py b/scripts/ceval/eval.py index c6d18dc..6e74cb2 100644 --- a/scripts/ceval/eval.py +++ b/scripts/ceval/eval.py @@ -76,8 +76,6 @@ def main(args, evaluator,take): json.dump(summary,open(save_result_dir+'/summary.json','w'),ensure_ascii=False,indent=2) - - if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--model_path", type=str) diff --git a/scripts/crawl_prompt.py b/scripts/crawl_prompt.py index df39098..357eb1a 100644 --- a/scripts/crawl_prompt.py +++ b/scripts/crawl_prompt.py @@ -32,7 +32,7 @@ def return_random_prompt(): if len(sys.argv) != 2: print("Usage: python crawl_prompt.py ") exit(1) - + output_file = open(sys.argv[1], 'w') MAX_EPOCHS = 1 # number of data to generate (each prompt contains 20 JSON-formatted data) diff --git a/scripts/inference/gradio_demo.py b/scripts/inference/gradio_demo.py index f10431a..328f047 100644 --- a/scripts/inference/gradio_demo.py +++ b/scripts/inference/gradio_demo.py @@ -43,7 +43,7 @@ tokenizer = LlamaTokenizer.from_pretrained(args.tokenizer_path) base_model = LlamaForCausalLM.from_pretrained( - args.base_model, + args.base_model, load_in_8bit=load_in_8bit, torch_dtype=load_type, low_cpu_mem_usage=True, 
diff --git a/scripts/inference/inference_hf.py b/scripts/inference/inference_hf.py index 3a15110..4b1f241 100644 --- a/scripts/inference/inference_hf.py +++ b/scripts/inference/inference_hf.py @@ -60,7 +60,7 @@ def generate_prompt(instruction, input=None): tokenizer = LlamaTokenizer.from_pretrained(args.tokenizer_path) base_model = LlamaForCausalLM.from_pretrained( - args.base_model, + args.base_model, load_in_8bit=False, torch_dtype=load_type, low_cpu_mem_usage=True, @@ -116,7 +116,7 @@ def generate_prompt(instruction, input=None): input_text = raw_input_text inputs = tokenizer(input_text,return_tensors="pt") #add_special_tokens=False ? generation_output = model.generate( - input_ids = inputs["input_ids"].to(device), + input_ids = inputs["input_ids"].to(device), attention_mask = inputs['attention_mask'].to(device), eos_token_id=tokenizer.eos_token_id, pad_token_id=tokenizer.pad_token_id, diff --git a/scripts/langchain/langchain_qa.py b/scripts/langchain/langchain_qa.py index 514f03c..8ed50b3 100644 --- a/scripts/langchain/langchain_qa.py +++ b/scripts/langchain/langchain_qa.py @@ -59,7 +59,7 @@ device = torch.device(0) else: device = torch.device('cpu') - + loader = TextLoader(file_path) documents = loader.load() text_splitter = RecursiveCharacterTextSplitter( @@ -89,8 +89,8 @@ chain_type_kwargs = {"prompt": PROMPT} qa = RetrievalQA.from_chain_type( llm=model, - chain_type="stuff", - retriever=docsearch.as_retriever(search_kwargs={"k": 1}), + chain_type="stuff", + retriever=docsearch.as_retriever(search_kwargs={"k": 1}), chain_type_kwargs=chain_type_kwargs) elif args.chain_type == "refine": @@ -104,7 +104,7 @@ ) chain_type_kwargs = {"question_prompt": initial_qa_prompt, "refine_prompt": refine_prompt} qa = RetrievalQA.from_chain_type( - llm=model, chain_type="refine", + llm=model, chain_type="refine", retriever=docsearch.as_retriever(search_kwargs={"k": 1}), chain_type_kwargs=chain_type_kwargs) diff --git a/scripts/merge_llama_with_chinese_lora.py b/scripts/merge_llama_with_chinese_lora.py index d85d0a7..7bb01b0 100644 --- a/scripts/merge_llama_with_chinese_lora.py +++ b/scripts/merge_llama_with_chinese_lora.py @@ -322,7 +322,7 @@ def save_shards(model_sd, num_shards: int): transpose(lora_model_sd[lora_b_key].float() @ lora_model_sd[lora_a_key].float(),fan_in_fan_out) * lora_scaling ) assert base_model_sd[original_key].dtype == torch.float16 - + # did we do anything? 
assert not torch.allclose(first_weight_old, first_weight) diff --git a/scripts/merge_llama_with_chinese_lora_low_mem.py b/scripts/merge_llama_with_chinese_lora_low_mem.py index 2c4025e..13c0d3f 100644 --- a/scripts/merge_llama_with_chinese_lora_low_mem.py +++ b/scripts/merge_llama_with_chinese_lora_low_mem.py @@ -22,7 +22,7 @@ type=str, help="Please specify a base model") parser.add_argument('--lora_model', default=None, required=True, type=str, help="Please specify LoRA models to be merged (ordered); use commas to separate multiple LoRA models") -parser.add_argument('--output_type', default='pth',choices=['pth','huggingface'], +parser.add_argument('--output_type', default='pth',choices=['pth','huggingface'], type=str, help="Save the merged model in pth or huggingface format") parser.add_argument('--output_dir', default='./merged_model', type=str, help="The output folder to save the merged model") From b88c2ed76c596858655464adbe4b4347f8b9de29 Mon Sep 17 00:00:00 2001 From: yaoxin <35353688+iMountTai@users.noreply.github.com> Date: Sun, 25 Jun 2023 21:48:00 +0800 Subject: [PATCH 08/24] Update openai_api_server.py remove unused import --- scripts/openai_server_demo/openai_api_server.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/openai_server_demo/openai_api_server.py b/scripts/openai_server_demo/openai_api_server.py index 13f0710..c725bfc 100644 --- a/scripts/openai_server_demo/openai_api_server.py +++ b/scripts/openai_server_demo/openai_api_server.py @@ -1,4 +1,3 @@ -import pdb import argparse import os from fastapi import FastAPI From 2935363148941ad477beacc7628aad8d356ab48a Mon Sep 17 00:00:00 2001 From: yaoxin <35353688+iMountTai@users.noreply.github.com> Date: Sun, 25 Jun 2023 22:07:15 +0800 Subject: [PATCH 09/24] stylistic update based on Codacy --- scripts/training/run_clm_pt_with_peft.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/scripts/training/run_clm_pt_with_peft.py b/scripts/training/run_clm_pt_with_peft.py index b918a49..ac6bf1a 100644 --- a/scripts/training/run_clm_pt_with_peft.py +++ b/scripts/training/run_clm_pt_with_peft.py @@ -46,13 +46,12 @@ HfArgumentParser, Trainer, TrainingArguments, - default_data_collator, is_torch_tpu_available, set_seed, ) from transformers.testing_utils import CaptureLogger from transformers.trainer_utils import get_last_checkpoint -from transformers.utils import check_min_version, send_example_telemetry +from transformers.utils import send_example_telemetry from transformers.utils.versions import require_version from sklearn.metrics import accuracy_score @@ -107,8 +106,6 @@ def preprocess_logits_for_metrics(logits, labels): def fault_tolerance_data_collator(features: List) -> Dict[str, Any]: - import torch - if not isinstance(features[0], Mapping): features = [vars(f) for f in features] first = features[0] @@ -483,7 +480,7 @@ def group_texts(examples): remove_columns="text", load_from_cache_file=True, keep_in_memory=False, - cache_file_names = {k: os.path.join(cache_dir, f'tokenized.arrow') for k in raw_dataset}, + cache_file_names = {k: os.path.join(cache_dir, 'tokenized.arrow') for k in raw_dataset}, desc="Running tokenizer on dataset", ) grouped_datasets = tokenized_dataset.map( @@ -492,7 +489,7 @@ def group_texts(examples): num_proc=data_args.preprocessing_num_workers, load_from_cache_file=True, keep_in_memory=False, - cache_file_names = {k: os.path.join(cache_dir, f'grouped.arrow') for k in tokenized_dataset}, + cache_file_names = {k: os.path.join(cache_dir, 'grouped.arrow') for k in 
tokenized_dataset}, desc=f"Grouping texts in chunks of {block_size}", ) processed_dataset = grouped_datasets @@ -579,7 +576,7 @@ def group_texts(examples): task_type=TaskType.CAUSAL_LM, target_modules=target_modules, inference_mode=False, - r=lora_rank, lora_alpha=lora_alpha, + r=lora_rank, lora_alpha=lora_alpha, lora_dropout=lora_dropout, modules_to_save=modules_to_save) model = get_peft_model(model, peft_config) From f493d9a542857f75572983ed9ad58d786668a717 Mon Sep 17 00:00:00 2001 From: yaoxin <35353688+iMountTai@users.noreply.github.com> Date: Sun, 25 Jun 2023 22:14:34 +0800 Subject: [PATCH 10/24] Update run_clm_sft_with_peft.py by Codacy --- scripts/training/run_clm_sft_with_peft.py | 24 ++++++++--------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/scripts/training/run_clm_sft_with_peft.py b/scripts/training/run_clm_sft_with_peft.py index 4bfe7a3..946ef2b 100644 --- a/scripts/training/run_clm_sft_with_peft.py +++ b/scripts/training/run_clm_sft_with_peft.py @@ -22,21 +22,18 @@ # You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments. import logging -import numpy as np import math import os import sys from dataclasses import dataclass, field -from typing import Optional, List, Dict, Any, Mapping +from typing import Optional, Dict, Any, Mapping from pathlib import Path import datasets -import json import torch from build_dataset import buid_instruction_dataset, DataCollatorForSupervisedDataset import transformers from transformers import ( CONFIG_MAPPING, - MODEL_FOR_CAUSAL_LM_MAPPING, AutoConfig, AutoModelForCausalLM, LlamaForCausalLM, @@ -47,24 +44,19 @@ TrainingArguments, set_seed, ) -from transformers.testing_utils import CaptureLogger from transformers.trainer_utils import get_last_checkpoint -from transformers.utils import check_min_version, send_example_telemetry +from transformers.utils import send_example_telemetry from transformers.utils.versions import require_version from peft import LoraConfig, TaskType, get_peft_model, PeftModel, get_peft_model_state_dict from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR - IGNORE_INDEX = -100 DEFAULT_PAD_TOKEN = "[PAD]" DEFAULT_EOS_TOKEN = "" DEFAULT_BOS_TOKEN = "" DEFAULT_UNK_TOKEN = "" -# Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-# check_min_version("4.28.0.dev0") - require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") @@ -335,10 +327,10 @@ def main(): files = [data_args.validation_file] logger.info(f"training files: {' '.join(files)}") eval_dataset = buid_instruction_dataset( - data_path=files, - tokenizer=tokenizer, + data_path=files, + tokenizer=tokenizer, max_seq_length=data_args.max_seq_length, - data_cache_dir = None, + data_cache_dir = None, preprocessing_num_workers = data_args.preprocessing_num_workers) logger.info(f"Num eval_samples {len(eval_dataset)}") logger.info("eval example:") @@ -386,10 +378,10 @@ def main(): logger.info(f"target_modules: {target_modules}") logger.info(f"lora_rank: {lora_rank}") peft_config = LoraConfig( - task_type=TaskType.CAUSAL_LM, + task_type=TaskType.CAUSAL_LM, target_modules=target_modules, - inference_mode=False, - r=lora_rank, lora_alpha=lora_alpha, + inference_mode=False, + r=lora_rank, lora_alpha=lora_alpha, lora_dropout=lora_dropout, modules_to_save=modules_to_save) model = get_peft_model(model, peft_config) From d89f53ca41a8954154ceeb6e2f9eaf075c20df98 Mon Sep 17 00:00:00 2001 From: yaoxin <35353688+iMountTai@users.noreply.github.com> Date: Sun, 25 Jun 2023 22:18:16 +0800 Subject: [PATCH 11/24] Stylistic fixes based on Codacy suggestions --- scripts/openai_server_demo/openai_api_server.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/openai_server_demo/openai_api_server.py b/scripts/openai_server_demo/openai_api_server.py index c725bfc..dbeeb1d 100644 --- a/scripts/openai_server_demo/openai_api_server.py +++ b/scripts/openai_server_demo/openai_api_server.py @@ -53,7 +53,7 @@ tokenizer = LlamaTokenizer.from_pretrained(args.tokenizer_path) base_model = LlamaForCausalLM.from_pretrained( - args.base_model, + args.base_model, load_in_8bit=load_in_8bit, torch_dtype=load_type, low_cpu_mem_usage=True, @@ -189,7 +189,7 @@ async def create_chat_completion(request: ChatCompletionRequest): num_beams=request.num_beams, repetition_penalty=request.repetition_penalty, ) - choices = [ChatCompletionResponseChoice(index = i, message = msg) + choices = [ChatCompletionResponseChoice(index = i, message = msg) for i, msg in enumerate(msgs)] choices += [ChatCompletionResponseChoice(index = len(choices), message = ChatMessage(role='assistant',content=output))] From c74c72f594164e288058e57834760942b54d99ab Mon Sep 17 00:00:00 2001 From: yaoxin <35353688+iMountTai@users.noreply.github.com> Date: Sun, 25 Jun 2023 22:19:38 +0800 Subject: [PATCH 12/24] Stylistic fixes based on Codacy suggestions --- scripts/openai_server_demo/openai_api_server.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/scripts/openai_server_demo/openai_api_server.py b/scripts/openai_server_demo/openai_api_server.py index dbeeb1d..77d5005 100644 --- a/scripts/openai_server_demo/openai_api_server.py +++ b/scripts/openai_server_demo/openai_api_server.py @@ -189,10 +189,8 @@ async def create_chat_completion(request: ChatCompletionRequest): num_beams=request.num_beams, repetition_penalty=request.repetition_penalty, ) - choices = [ChatCompletionResponseChoice(index = i, message = msg) - for i, msg in enumerate(msgs)] - choices += [ChatCompletionResponseChoice(index = len(choices), - message = ChatMessage(role='assistant',content=output))] + choices = [ChatCompletionResponseChoice(index = i, message = msg) for i, msg in enumerate(msgs)] + choices += [ChatCompletionResponseChoice(index = len(choices), message = 
ChatMessage(role='assistant',content=output))] return ChatCompletionResponse(choices = choices) @app.post("/v1/completions") From e926f872b8cd75d19ccb9806e766ffd33940d8d7 Mon Sep 17 00:00:00 2001 From: yaoxin <35353688+iMountTai@users.noreply.github.com> Date: Sun, 25 Jun 2023 22:22:25 +0800 Subject: [PATCH 13/24] Update build_dataset.py by Codacy --- scripts/training/build_dataset.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/scripts/training/build_dataset.py b/scripts/training/build_dataset.py index 83ec95d..17677a8 100644 --- a/scripts/training/build_dataset.py +++ b/scripts/training/build_dataset.py @@ -1,14 +1,12 @@ import logging import os from dataclasses import dataclass -from typing import Optional, Dict, Sequence, Union, List +from typing import Dict, Sequence, Union, List import datasets import torch -import logging from datasets import load_dataset, concatenate_datasets -import copy import transformers -import random + IGNORE_INDEX = -100 From 66fe1b6e45d5bbd54a1d71d3cd1928e0b6107af5 Mon Sep 17 00:00:00 2001 From: Ziqing Yang Date: Sun, 25 Jun 2023 22:36:00 +0800 Subject: [PATCH 14/24] Update openai_api_server.py --- scripts/openai_server_demo/openai_api_server.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/openai_server_demo/openai_api_server.py b/scripts/openai_server_demo/openai_api_server.py index 77d5005..de88bcf 100644 --- a/scripts/openai_server_demo/openai_api_server.py +++ b/scripts/openai_server_demo/openai_api_server.py @@ -120,7 +120,7 @@ def predict( type(input) == str -> /v1/completions type(input) == list -> /v1/chat/completions """ - if type(input) == str: + if isinstance(input,str): prompt = generate_completion_prompt(input) else: prompt = generate_chat_prompt(input) @@ -176,7 +176,7 @@ def get_embedding(input): async def create_chat_completion(request: ChatCompletionRequest): """Creates a completion for the chat message""" msgs = request.messages - if type(msgs) == str: + if isinstance(msgs, str): msgs = [ChatMessage(role='user',content=msgs)] else: msgs = [ChatMessage(role=x['role'],content=x['message']) for x in msgs] From 5817b40f21ada802345bcf40a81d6e091e75e804 Mon Sep 17 00:00:00 2001 From: Ziqing Yang Date: Sun, 25 Jun 2023 22:36:35 +0800 Subject: [PATCH 15/24] Update openai_api_server.py --- scripts/openai_server_demo/openai_api_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/openai_server_demo/openai_api_server.py b/scripts/openai_server_demo/openai_api_server.py index de88bcf..e2ae8e3 100644 --- a/scripts/openai_server_demo/openai_api_server.py +++ b/scripts/openai_server_demo/openai_api_server.py @@ -120,7 +120,7 @@ def predict( type(input) == str -> /v1/completions type(input) == list -> /v1/chat/completions """ - if isinstance(input,str): + if isinstance(input, str): prompt = generate_completion_prompt(input) else: prompt = generate_chat_prompt(input) From a9f9ed2aa19cacfdde8e4626116d98dd4ca334ea Mon Sep 17 00:00:00 2001 From: Ziqing Yang Date: Sun, 25 Jun 2023 22:54:37 +0800 Subject: [PATCH 16/24] Stylistic update run_clm_pt_with_peft.py based on Codacy --- scripts/training/run_clm_pt_with_peft.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/training/run_clm_pt_with_peft.py b/scripts/training/run_clm_pt_with_peft.py index ac6bf1a..9adaf66 100644 --- a/scripts/training/run_clm_pt_with_peft.py +++ b/scripts/training/run_clm_pt_with_peft.py @@ -122,7 +122,7 @@ def fault_tolerance_data_collator(features: List) -> Dict[str, 
Any]: if isinstance(first["label_ids"], torch.Tensor): batch["labels"] = torch.stack([f["label_ids"] for f in features]) else: - dtype = torch.long if type(first["label_ids"][0]) is int else torch.float + dtype = torch.long if isinstance(first["label_ids"][0], int) else torch.float batch["labels"] = torch.tensor([f["label_ids"] for f in features], dtype=dtype) # Handling of all other possible keys. From 4d02338def650acefd03a9deb31a80e2f850c98d Mon Sep 17 00:00:00 2001 From: Ziqing Yang Date: Sun, 25 Jun 2023 23:01:57 +0800 Subject: [PATCH 17/24] Update run_clm_sft_with_peft.py Stylistic update run_clm_sft_with_peft.py based on Codacy --- scripts/training/run_clm_sft_with_peft.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/training/run_clm_sft_with_peft.py b/scripts/training/run_clm_sft_with_peft.py index 946ef2b..a19dd12 100644 --- a/scripts/training/run_clm_sft_with_peft.py +++ b/scripts/training/run_clm_sft_with_peft.py @@ -26,7 +26,7 @@ import os import sys from dataclasses import dataclass, field -from typing import Optional, Dict, Any, Mapping +from typing import Optional, Dict from pathlib import Path import datasets import torch From 53b5607596bbc0ed6521b743e77ed0b23438de34 Mon Sep 17 00:00:00 2001 From: ymcui Date: Mon, 26 Jun 2023 08:17:52 +0800 Subject: [PATCH 18/24] Trailing whitespace --- scripts/ceval/eval.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/ceval/eval.py b/scripts/ceval/eval.py index 6e74cb2..35149e8 100644 --- a/scripts/ceval/eval.py +++ b/scripts/ceval/eval.py @@ -89,7 +89,6 @@ def main(args, evaluator,take): parser.add_argument("--do_save_csv", choices=["False","True"], default="False") parser.add_argument("--output_dir", type=str) parser.add_argument("--do_test", choices=["False","True"], default="False") - args = parser.parse_args() From c04757d7e8468f1667bbf424670a42df9d84f2e9 Mon Sep 17 00:00:00 2001 From: ymcui Date: Mon, 26 Jun 2023 08:20:48 +0800 Subject: [PATCH 19/24] Trailing whitespace --- scripts/inference/inference_hf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/inference/inference_hf.py b/scripts/inference/inference_hf.py index 4b1f241..a73ed5a 100644 --- a/scripts/inference/inference_hf.py +++ b/scripts/inference/inference_hf.py @@ -140,7 +140,7 @@ def generate_prompt(instruction, input=None): input_text = example inputs = tokenizer(input_text,return_tensors="pt") #add_special_tokens=False ? 
generation_output = model.generate( - input_ids = inputs["input_ids"].to(device), + input_ids = inputs["input_ids"].to(device), attention_mask = inputs['attention_mask'].to(device), eos_token_id=tokenizer.eos_token_id, pad_token_id=tokenizer.pad_token_id, From c98555219d0356b80e901eef00f241ed31f53ba2 Mon Sep 17 00:00:00 2001 From: ymcui Date: Mon, 26 Jun 2023 08:24:29 +0800 Subject: [PATCH 20/24] Trailing whitespace --- scripts/training/run_clm_pt_with_peft.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/training/run_clm_pt_with_peft.py b/scripts/training/run_clm_pt_with_peft.py index 9adaf66..3ec1aee 100644 --- a/scripts/training/run_clm_pt_with_peft.py +++ b/scripts/training/run_clm_pt_with_peft.py @@ -575,7 +575,7 @@ def group_texts(examples): peft_config = LoraConfig( task_type=TaskType.CAUSAL_LM, target_modules=target_modules, - inference_mode=False, + inference_mode=False, r=lora_rank, lora_alpha=lora_alpha, lora_dropout=lora_dropout, modules_to_save=modules_to_save) From 8f5f621d51fbc5d5c57bd63fcff6271acf93fc40 Mon Sep 17 00:00:00 2001 From: ymcui Date: Mon, 26 Jun 2023 08:25:44 +0800 Subject: [PATCH 21/24] Trailing whitespace --- scripts/training/run_clm_sft_with_peft.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/training/run_clm_sft_with_peft.py b/scripts/training/run_clm_sft_with_peft.py index a19dd12..2713698 100644 --- a/scripts/training/run_clm_sft_with_peft.py +++ b/scripts/training/run_clm_sft_with_peft.py @@ -317,7 +317,7 @@ def main(): data_path=files, tokenizer=tokenizer, max_seq_length=data_args.max_seq_length, - data_cache_dir = None, + data_cache_dir = None, preprocessing_num_workers = data_args.preprocessing_num_workers) logger.info(f"Num train_samples {len(train_dataset)}") logger.info("training example:") From 6406fe6bc8b2a0edee1ffc0505672f1051d5ec80 Mon Sep 17 00:00:00 2001 From: Ziqing Yang Date: Mon, 26 Jun 2023 08:32:20 +0800 Subject: [PATCH 22/24] remove unused variable in run_clm_sft_with_peft.py --- scripts/training/run_clm_sft_with_peft.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/training/run_clm_sft_with_peft.py b/scripts/training/run_clm_sft_with_peft.py index 2713698..e4cd87d 100644 --- a/scripts/training/run_clm_sft_with_peft.py +++ b/scripts/training/run_clm_sft_with_peft.py @@ -300,7 +300,7 @@ def main(): raise ValueError(f"The vocab size of the tokenizer must be 49954, but found {len(tokenizer)}.\n" "Please use Chinese Alpaca tokenizer!") if tokenizer.pad_token is None: - num_new_tokens = smart_tokenizer_and_embedding_resize( + smart_tokenizer_and_embedding_resize( special_tokens_dict=dict(pad_token=DEFAULT_PAD_TOKEN), tokenizer=tokenizer) From e5567180672b9a3bfadf17020ef30bf73b24dee1 Mon Sep 17 00:00:00 2001 From: Ziqing Yang Date: Mon, 26 Jun 2023 08:44:46 +0800 Subject: [PATCH 23/24] remove unused function smart_tokenizer_and_embedding_resize --- scripts/training/run_clm_sft_with_peft.py | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/scripts/training/run_clm_sft_with_peft.py b/scripts/training/run_clm_sft_with_peft.py index e4cd87d..1eca83b 100644 --- a/scripts/training/run_clm_sft_with_peft.py +++ b/scripts/training/run_clm_sft_with_peft.py @@ -300,9 +300,8 @@ def main(): raise ValueError(f"The vocab size of the tokenizer must be 49954, but found {len(tokenizer)}.\n" "Please use Chinese Alpaca tokenizer!") if tokenizer.pad_token is None: - smart_tokenizer_and_embedding_resize( - 
special_tokens_dict=dict(pad_token=DEFAULT_PAD_TOKEN), - tokenizer=tokenizer) + print(f"Adding pad token {DEFAULT_PAD_TOKEN}") + tokenizer.add_special_tokens(dict(pad_token=DEFAULT_PAD_TOKEN)) data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer) eval_dataset=None @@ -438,16 +437,5 @@ def main(): trainer.save_metrics("eval", metrics) -def smart_tokenizer_and_embedding_resize( - special_tokens_dict: Dict, - tokenizer: transformers.PreTrainedTokenizer, -): - """Resize tokenizer and embedding. - Note: This is the unoptimized version that may make your embedding size not be divisible by 64. - """ - num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict) - return num_new_tokens - - if __name__ == "__main__": main() From a96009b08e6281e3611349f81dbbf4ffd8e618b4 Mon Sep 17 00:00:00 2001 From: Ziqing Yang Date: Mon, 26 Jun 2023 08:50:24 +0800 Subject: [PATCH 24/24] remove unused Dict imported from typing --- scripts/training/run_clm_sft_with_peft.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/training/run_clm_sft_with_peft.py b/scripts/training/run_clm_sft_with_peft.py index 1eca83b..c0444d5 100644 --- a/scripts/training/run_clm_sft_with_peft.py +++ b/scripts/training/run_clm_sft_with_peft.py @@ -26,7 +26,7 @@ import os import sys from dataclasses import dataclass, field -from typing import Optional, Dict +from typing import Optional from pathlib import Path import datasets import torch
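A closing note on the pad-token change above: the deleted smart_tokenizer_and_embedding_resize helper existed to keep the tokenizer and the embedding matrix in sync. The scripts in this repo can drop it because they separately assert the fixed 49954-token Chinese Alpaca vocabulary, but anyone adapting the code to another tokenizer/model pair needs the fuller pattern. A hedged sketch (the function name is illustrative):

```python
# Hedged sketch of the general pad-token pattern outside this repo's
# fixed-vocabulary setup. The function name is illustrative.
from transformers import PreTrainedModel, PreTrainedTokenizer

DEFAULT_PAD_TOKEN = "[PAD]"

def ensure_pad_token(model: PreTrainedModel, tokenizer: PreTrainedTokenizer):
    # Mirrors the pad_token check kept in run_clm_sft_with_peft.py.
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({"pad_token": DEFAULT_PAD_TOKEN})
    # If the tokenizer grew past the embedding table, resize it. As the
    # removed helper's docstring warned, this plain resize may leave the
    # embedding size not divisible by 64.
    if len(tokenizer) > model.get_input_embeddings().weight.shape[0]:
        model.resize_token_embeddings(len(tokenizer))
```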