
Commit 9e32bf4

Merge pull request AI-Hypercomputer#2596 from AI-Hypercomputer:shuningjin-ckpt-ds3
PiperOrigin-RevId: 828207772
2 parents: 8d9588a + d0ce195

7 files changed: 464 additions, 81 deletions

src/MaxText/scratch_code/generate_hf_golden_logits.py (38 additions, 4 deletions)

@@ -60,7 +60,16 @@ def upload_blob(bucket_name, source_file_name, destination_blob_name):
 
 
 def save_golden_logits(
-    model_id, output_path, prompt_texts, apply_chat_template, gcs_bucket, hf_model_path, image_paths, output_format
+    model_id,
+    output_path,
+    prompt_texts,
+    apply_chat_template,
+    gcs_bucket,
+    hf_model_path,
+    hf_load_dtype,
+    trust_remote_code,
+    image_paths,
+    output_format,
 ):
   """save golden logits"""
   if hf_model_path is None:
@@ -77,10 +86,18 @@ def save_golden_logits(
 
   tokenizer = AutoTokenizer.from_pretrained(model_id)
   print(f"loading model from {hf_model_path}")
+
+  if hf_load_dtype == "float32":
+    torch_dtype = torch.float32
+  elif hf_load_dtype == "bfloat16":
+    torch_dtype = torch.bfloat16
+  else:
+    raise ValueError
+
   model = model_class.from_pretrained(
       hf_model_path,
-      torch_dtype=torch.float32,
-      trust_remote_code=True,
+      dtype=torch_dtype,
+      trust_remote_code=trust_remote_code,
   )
 
   all_data_to_save = []
@@ -110,7 +127,7 @@ def save_golden_logits(
     # 2. Run inference
     with torch.no_grad():
       outputs = model(**inputs)
-      logits = outputs.logits.cpu().numpy().astype("float32")
+      logits = outputs.logits.cpu().to(torch.float32).numpy()
 
     # 3. Populate final data dictionary with tensors from inputs and logits
     for key, value in inputs.items():
@@ -159,6 +176,21 @@ def main(raw_args=None) -> None:
   parser.add_argument(
       "--hf-model-path", type=str, required=False, default=None, help="local path to checkpoint if exists."
   )
+  parser.add_argument(
+      "--hf-load-dtype",
+      type=str,
+      required=False,
+      choices=["float32", "bfloat16"],
+      default="float32",
+      help="model_class.from_pretrained: dtype",
+  )
+  # `args.trust_remote_code` is True by default; it becomes False only if the flag `--not-trust-remote-code` is passed
+  parser.add_argument(
+      "--not-trust-remote-code",
+      dest="trust_remote_code",
+      action="store_false",
+      help="model_class.from_pretrained: trust_remote_code",
+  )
   parser.add_argument(
       "--image-paths", type=str, required=False, default=None, help="A semicolon-separated list of image_paths."
   )
@@ -185,6 +217,8 @@ def main(raw_args=None) -> None:
       args.apply_chat_template,
       args.gcs_bucket,
       args.hf_model_path,
+      args.hf_load_dtype,
+      args.trust_remote_code,
       image_paths,
       args.output_format,
   )
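The new `--not-trust-remote-code` flag relies on argparse's `store_false` action, which is easy to misread: the destination defaults to True and the flag flips it off. A minimal standalone sketch of that behavior (a hypothetical parser, not the script's full CLI):

    import argparse

    # action="store_false" makes argparse default the destination to True;
    # passing the flag stores False.
    parser = argparse.ArgumentParser()
    parser.add_argument("--not-trust-remote-code", dest="trust_remote_code", action="store_false")

    print(parser.parse_args([]).trust_remote_code)                           # True
    print(parser.parse_args(["--not-trust-remote-code"]).trust_remote_code)  # False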

src/MaxText/utils/ckpt_conversion/to_huggingface.py (18 additions, 5 deletions)

@@ -54,6 +54,8 @@
 import jax
 import os
 from typing import Sequence, Any
+import time
+from tqdm import tqdm
 
 from transformers import AutoTokenizer, AutoProcessor
 
@@ -72,7 +74,8 @@
 from MaxText.utils.ckpt_conversion.utils.utils import (process_leaf_param, save_model_files, HF_IDS)
 
 
-jax.config.update("jax_platform_name", "cpu")
+os.environ["JAX_PLATFORMS"] = "cpu"
+os.environ["XLA_FLAGS"] = "--xla_force_host_platform_device_count=16"
 
 
 def _get_model_mappings(model_name: str, scan_layers: bool, config_dict: dict):
@@ -114,17 +117,22 @@ def main(argv: Sequence[str]) -> None:
   jax.config.update("jax_default_prng_impl", "unsafe_rbg")
   os.environ["TF_CPP_MIN_LOG_LEVEL"] = "0"
 
+  # Initialize MaxText config
   config = pyconfig.initialize(argv)
   assert (
       config.load_full_state_path == ""
   ), "This script expects parameters, not a full state. Use generate_param_only_checkpoint first if needed."
   max_utils.print_system_information()
 
+  # Load MaxText checkpoint
+  max_logging.log("\nLoading Orbax checkpoint...")
+  start = time.time()
   engine = maxengine.MaxEngine(config)
   rng = jax.random.PRNGKey(1234)
   rng, rng_load_params = jax.random.split(rng)
   # load params from maxengine
   loaded_params_from_engine = engine.load_params(rng_load_params)
+  max_logging.log(f"Elapsed: {(time.time() - start) / 60:.2f} min")
 
   if not config.base_output_directory:
     output_directory = f"tmp/{config.run_name}"
@@ -165,18 +173,22 @@ def main(argv: Sequence[str]) -> None:
   leaves_with_paths = jax.tree_util.tree_leaves_with_path(actual_weights_dict)
 
   # traverse leaves to build: mt_param_key:mt_weights
+  max_logging.log("\nProcessing weights...")
+  start = time.time()
   processed_params_list = []
-  for path_tuple_iter, leaf_value_iter in leaves_with_paths:
-    processed_params_list.extend(
-        process_leaf_param(path_tuple_iter, leaf_value_iter, param_map, shape_map, hook_fn_map, config)
-    )
+  for path_tuple_iter, leaf_value_iter in tqdm(leaves_with_paths, total=len(leaves_with_paths)):
+    processed_params = process_leaf_param(path_tuple_iter, leaf_value_iter, param_map, shape_map, hook_fn_map, config)
+    processed_params_list.extend(processed_params)
   transformed_hf_weights = dict(processed_params_list)
+  max_logging.log(f"Elapsed: {(time.time() - start) / 60:.2f} min")
 
   # 5. Save in HuggingFace Format
   if not transformed_hf_weights:
     print("Error: No weights were transformed. Check mappings and parameter paths.")
     return
 
+  max_logging.log("\nSaving HuggingFace model...")
+  start = time.time()
   save_model_files(
       weight_arrays=transformed_hf_weights,
       config=hf_config_obj,
@@ -185,6 +197,7 @@ def main(argv: Sequence[str]) -> None:
       output_dir=output_directory,
   )
   max_logging.log(f"✅ MaxText model successfully saved in HuggingFace format at {output_directory}")
+  max_logging.log(f"Elapsed: {(time.time() - start) / 60:.2f} min")
 
 
 if __name__ == "__main__":
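Note that `JAX_PLATFORMS` and `XLA_FLAGS` only take effect if they are set before JAX initializes its backend, which is presumably why the diff swaps the `jax.config.update(...)` call for module-level `os.environ` assignments. A minimal standalone sketch of the intended effect:

    import os

    # Must run before JAX initializes: pin the backend to CPU and split the
    # host into 16 virtual devices.
    os.environ["JAX_PLATFORMS"] = "cpu"
    os.environ["XLA_FLAGS"] = "--xla_force_host_platform_device_count=16"

    import jax

    print(jax.device_count())         # 16
    print(jax.devices()[0].platform)  # "cpu"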

src/MaxText/utils/ckpt_conversion/utils/hf_model_configs.py (60 additions, 0 deletions)

@@ -469,6 +469,65 @@
     vocab_size=151936,
 )
 
+deepseek3_671b_dict = {
+    "architectures": ["DeepseekV3ForCausalLM"],
+    "attention_bias": False,
+    "attention_dropout": 0.0,
+    "auto_map": {
+        "AutoConfig": "configuration_deepseek.DeepseekV3Config",
+        "AutoModel": "modeling_deepseek.DeepseekV3Model",
+        "AutoModelForCausalLM": "modeling_deepseek.DeepseekV3ForCausalLM",
+    },
+    "bos_token_id": 0,
+    "eos_token_id": 1,
+    "ep_size": 1,
+    "first_k_dense_replace": 3,
+    "hidden_act": "silu",
+    "hidden_size": 7168,
+    "initializer_range": 0.02,
+    "intermediate_size": 18432,
+    "kv_lora_rank": 512,
+    "max_position_embeddings": 163840,
+    "model_type": "deepseek_v3",
+    "moe_intermediate_size": 2048,
+    "moe_layer_freq": 1,
+    "n_group": 8,
+    "n_routed_experts": 256,
+    "n_shared_experts": 1,
+    "norm_topk_prob": True,
+    "num_attention_heads": 128,
+    "num_experts_per_tok": 8,
+    "num_hidden_layers": 61,
+    "num_key_value_heads": 128,
+    "num_nextn_predict_layers": 1,
+    "q_lora_rank": 1536,
+    "qk_nope_head_dim": 128,
+    "qk_rope_head_dim": 64,
+    "rms_norm_eps": 1e-06,
+    "rope_scaling": {
+        "beta_fast": 32,
+        "beta_slow": 1,
+        "factor": 40,
+        "mscale": 1.0,
+        "mscale_all_dim": 1.0,
+        "original_max_position_embeddings": 4096,
+        "type": "yarn",
+    },
+    "rope_theta": 10000,
+    "routed_scaling_factor": 2.5,
+    "scoring_func": "sigmoid",
+    "tie_word_embeddings": False,
+    "topk_group": 4,
+    "topk_method": "noaux_tc",
+    "torch_dtype": "bfloat16",
+    "transformers_version": "4.33.1",
+    "use_cache": True,
+    "v_head_dim": 128,
+    "vocab_size": 129280,
+}
+deepseek3_671b_config = transformers.DeepseekV3Config(**deepseek3_671b_dict)
+
+# {maxtext model name: hf model config}
 qwen3_omni_30b_a3b_config = transformers.Qwen3OmniMoeConfig(
     # TODO(hengtaoguo): Pure-text Omni model, need to fill in visual/audio/code2wav parts
     architectures=["Qwen3OmniMoeForConditionalGeneration"],
@@ -500,5 +559,6 @@
     "qwen3-30b-a3b": qwen3_30b_a3b_thinking_2507_config,
     "qwen3-235b-a22b": qwen3_235b_a22b_thinking_2507_config,
     "qwen3-480b-a35b": qwen3_coder_480b_a35b_config,
+    "deepseek3-671b": deepseek3_671b_config,
     "qwen3-omni-30b-a3b": qwen3_omni_30b_a3b_config,
 }
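Since `deepseek3_671b_config` is just the dict above unpacked into `transformers.DeepseekV3Config`, it can be sanity-checked directly. The sketch below assumes a transformers release that ships DeepSeek-V3 support and rebuilds only a few of the keys:

    import transformers

    # Rebuild a trimmed version of the config above; unspecified keys fall
    # back to DeepseekV3Config defaults.
    cfg = transformers.DeepseekV3Config(
        num_hidden_layers=61,
        hidden_size=7168,
        n_routed_experts=256,
        vocab_size=129280,
    )

    print(cfg.model_type)         # "deepseek_v3"
    print(cfg.num_hidden_layers)  # 61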
