-
Notifications
You must be signed in to change notification settings - Fork 539
load pretokenized user query (v0) #965
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 18 commits
82d2ad4
c804e07
d38806d
058fcb5
f09d5ef
cabe288
34939c4
941a1e0
5150683
0f5f6e2
f0d4b55
b0ea266
3a87a75
6a7dccd
5f22cfd
a420aa1
09cbcf7
3dc9e73
46159ba
3cd6e31
b9884c6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -763,6 +763,7 @@ def get_tokenizer_tulu_v2_2(tc: "TokenizerConfig"): | |
| DEFAULT_SFT_MESSAGES_KEY = "messages" | ||
| GROUND_TRUTHS_KEY = "ground_truth" | ||
| VERIFIER_SOURCE_KEY = "dataset" | ||
| RAW_PROMPT_KEY = "prompt" | ||
|
|
||
|
|
||
| @dataclass | ||
|
|
@@ -814,8 +815,14 @@ def tokenizer(self): | |
| ATTENTION_MASK_KEY = "attention_mask" | ||
| LABELS_KEY = "labels" | ||
| DATASET_ORIGIN_KEY = "dataset_source" # just 'dataset' clashes with RLVR stuff (see VERIFIER_SOURCE_KEY) | ||
| TOKENIZED_SFT_DATASET_KEYS = [INPUT_IDS_KEY, ATTENTION_MASK_KEY, LABELS_KEY] | ||
| TOKENIZED_SFT_DATASET_KEYS_WITH_SOURCE = [INPUT_IDS_KEY, ATTENTION_MASK_KEY, LABELS_KEY, DATASET_ORIGIN_KEY] | ||
| TOKENIZED_SFT_DATASET_KEYS = [INPUT_IDS_KEY, ATTENTION_MASK_KEY, LABELS_KEY, RAW_PROMPT_KEY] | ||
| TOKENIZED_SFT_DATASET_KEYS_WITH_SOURCE = [ | ||
| INPUT_IDS_KEY, | ||
| ATTENTION_MASK_KEY, | ||
| LABELS_KEY, | ||
| DATASET_ORIGIN_KEY, | ||
| RAW_PROMPT_KEY, | ||
| ] | ||
|
|
||
|
|
||
| def remove_dataset_source_field(dataset: Dataset) -> Dataset: | ||
|
|
@@ -1178,13 +1185,16 @@ def rlvr_tokenize_v1( | |
| prompt = row[sft_messages_key] | ||
| else: | ||
| prompt = row[sft_messages_key][:-1] | ||
|
|
||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Remove please! |
||
| row[INPUT_IDS_PROMPT_KEY] = tokenizer.apply_chat_template(prompt, add_generation_prompt=True) | ||
| row[INPUT_IDS_KEY] = tokenizer.apply_chat_template(row[sft_messages_key]) | ||
| row[ATTENTION_MASK_KEY] = [1] * len(row[INPUT_IDS_KEY]) | ||
| labels = copy.deepcopy(row[INPUT_IDS_KEY]) | ||
| row[LABELS_KEY] = labels | ||
| row[GROUND_TRUTHS_KEY] = row[ground_truths_key] | ||
| row[VERIFIER_SOURCE_KEY] = row[verifier_source_key] | ||
| # concatenate all the previous messages as <role>: <content>\n <role>: <content>\n ... | ||
| row[RAW_PROMPT_KEY] = "\n".join(f"{msg['role']}: {msg['content']}" for msg in prompt) | ||
| return row | ||
|
|
||
|
|
||
|
|
@@ -1212,6 +1222,10 @@ def rlvr_tokenize_v2( | |
| row[LABELS_KEY] = labels | ||
| row[GROUND_TRUTHS_KEY] = row[ground_truths_key] | ||
| row[VERIFIER_SOURCE_KEY] = row[verifier_source_key] | ||
| # concatenate all the previous messages as <role>: <content>\n <role>: <content>\n ... | ||
| # row[DEFAULT_SFT_MESSAGES_KEY] = prompt | ||
| # concatenate all the previous messages as <role>: <content>\n <role>: <content>\n ... | ||
| row[RAW_PROMPT_KEY] = "\n".join(f"{msg['role']}: {msg['content']}" for msg in prompt) | ||
| # some basic transformations: | ||
| # if ground truths is a string, make it a list | ||
| if isinstance(row[ground_truths_key], str): | ||
|
|
@@ -1673,7 +1687,6 @@ def get_cached_dataset_tulu_with_statistics( | |
| frac_or_num_samples = float(frac_or_num_samples) | ||
| else: | ||
| frac_or_num_samples = int(frac_or_num_samples) | ||
|
|
||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Remove please! |
||
| dataset_config = DatasetConfig( | ||
| dataset_name=dataset_name, | ||
| dataset_split=dataset_mixer_list_splits[i], | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -39,6 +39,14 @@ | |
| logger = logger_utils.setup_logger(__name__) | ||
|
|
||
|
|
||
| logging.getLogger("LiteLLM").setLevel(logging.WARNING) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you add a comment as to why you're doing this? |
||
| logging.getLogger("litellm").setLevel(logging.ERROR) | ||
| logging.getLogger("litellm.cost_calculator").setLevel(logging.CRITICAL) | ||
| logging.getLogger("litellm._client").setLevel(logging.CRITICAL) | ||
| logging.getLogger("cost_calculator").setLevel(logging.WARNING) | ||
| logging.getLogger("httpx").setLevel(logging.WARNING) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Does this work to remove the logging from litellm?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, it worked both locally and in the debug Beaker job. |
||
|
|
||
|
|
||
| @dataclass | ||
| class VerifierConfig: | ||
| """For now this config exists to support LMJudgeVerifer, can be expanded to support other verifers""" | ||
|
|
@@ -663,8 +671,7 @@ async def async_call( | |
| for attempt in range(max_retries): | ||
| # judges the quality of a response | ||
| try: | ||
| system_prompt = "Do not generate text between the <think> and </think> tags." # "You are a concise assistant who gives very short explanations before giving a quality score." | ||
| messages = build_messages(prompt, system_prompt) | ||
| messages = build_messages(prompt) | ||
|
|
||
| # Faeze: check if the request would exceed context window | ||
| # Import the context window checker | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If there aren't multiple prompt key variables, let's just use
`PROMPT_KEY`?