pytorch
diff --git a/docs/source/tune_cli.rst
+71-2 b/docs/source/tune_cli.rst
+71-2
diff --git a/tests/torchtune/_cli/test_cat.py
+81 b/tests/torchtune/_cli/test_cat.py
+81
diff --git a/tests/torchtune/modules/loss/test_kd_losses.py
+208-1 b/tests/torchtune/modules/loss/test_kd_losses.py
+208-1
@@ -17,15 +17,15 @@ with a short description of each.
 .. code-block:: bash
 
     $ tune --help
-    usage: tune [-h] {download,ls,cp,run,validate} ...
+    usage: tune [-h] {download,ls,cp,run,validate,cat} ...
 
     Welcome to the torchtune CLI!
 
     options:
     -h, --help            show this help message and exit
 
     subcommands:
-      {download,ls,cp,run,validate}
+      {download,ls,cp,run,validate,cat}
         download            Download a model from the Hugging Face Hub.
         ls                  List all built-in recipes and configs
         ...
@@ -233,3 +233,72 @@ The ``tune validate <config>`` command will validate that your config is formatt
     # If you've copied over a built-in config and want to validate custom changes
     $ tune validate my_configs/llama3/8B_full.yaml
     Config is well-formed!
+
+.. _tune_cat_cli_label:
+
+Inspect a config
+---------------------
+
+The ``tune cat <config>`` command pretty-prints a configuration file, making it easy to use ``tune run`` with confidence. This command is useful for inspecting the structure and contents of a config file before running a recipe, ensuring that all parameters are correctly set.
+
+You can also use the ``--sort`` option to print the config in sorted order, which can help in quickly locating specific keys.
+
+.. list-table::
+   :widths: 30 60
+
+   * - \--sort
+     - Print the config in sorted order.
+
+**Workflow Example**
+
+1. **List all available configs:**
+
+   Use the ``tune ls`` command to list all the built-in recipes and configs within torchtune.
+
+   .. code-block:: bash
+
+       $ tune ls
+       RECIPE                                   CONFIG
+       full_finetune_single_device              llama2/7B_full_low_memory
+                                                code_llama2/7B_full_low_memory
+                                                llama3/8B_full_single_device
+                                                mistral/7B_full_low_memory
+                                                phi3/mini_full_low_memory
+       full_finetune_distributed                llama2/7B_full
+                                                llama2/13B_full
+                                                llama3/8B_full
+                                                llama3/70B_full
+       ...
+
+2. **Inspect the contents of a config:**
+
+   Use the ``tune cat`` command to pretty-print the contents of a specific config. This helps you understand the structure and parameters of the config.
+
+   .. code-block:: bash
+
+       $ tune cat llama2/7B_full
+       output_dir: /tmp/torchtune/llama2_7B/full
+       tokenizer:
+           _component_: torchtune.models.llama2.llama2_tokenizer
+           path: /tmp/Llama-2-7b-hf/tokenizer.model
+           max_seq_len: null
+       ...
+
+   You can also print the config in sorted order:
+
+   .. code-block:: bash
+
+       $ tune cat llama2/7B_full --sort
+
+3. **Run a recipe with parameter override:**
+
+   After inspecting the config, you can use the ``tune run`` command to run a recipe with the config. You can also override specific parameters directly from the command line. For example, to override the ``output_dir`` parameter:
+
+   .. code-block:: bash
+
+       $ tune run full_finetune_distributed --config llama2/7B_full output_dir=./
+
+   Learn more about config overrides :ref:`here <cli_override>`.
+
+.. note::
+    You can find all the cat-able configs via the ``tune ls`` command.
@@ -0,0 +1,81 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import runpy
+import sys
+
+import pytest
+from tests.common import TUNE_PATH
+
+
+class TestTuneCatCommand:
+    """This class tests the `tune cat` command."""
+
+    def test_cat_valid_config(self, capsys, monkeypatch):
+        """A built-in config name prints YAML with the expected section keys."""
+        testargs = ["tune", "cat", "llama2/7B_full"]
+        monkeypatch.setattr(sys, "argv", testargs)
+        runpy.run_path(TUNE_PATH, run_name="__main__")
+
+        output = capsys.readouterr().out.rstrip("\n")
+
+        # Check for key sections that should be in the YAML output
+        assert "output_dir:" in output
+        assert "tokenizer:" in output
+        assert "model:" in output
+
+    def test_cat_recipe_name_shows_error(self, capsys, monkeypatch):
+        """Passing a recipe name prints a hint on stdout instead of a config."""
+        testargs = ["tune", "cat", "full_finetune_single_device"]
+        monkeypatch.setattr(sys, "argv", testargs)
+        runpy.run_path(TUNE_PATH, run_name="__main__")
+
+        output = capsys.readouterr().out.rstrip("\n")
+
+        assert "is a recipe, not a config" in output
+
+    def test_cat_non_existent_config(self, capsys, monkeypatch):
+        """An unknown, non-YAML name exits and reports the format error on stderr."""
+        testargs = ["tune", "cat", "non_existent_config"]
+        monkeypatch.setattr(sys, "argv", testargs)
+
+        with pytest.raises(SystemExit):
+            runpy.run_path(TUNE_PATH, run_name="__main__")
+
+        err = capsys.readouterr().err.rstrip("\n")
+
+        assert (
+            "Invalid config format: 'non_existent_config'. Must be YAML (.yaml/.yml)"
+            in err
+        )
+
+    def test_cat_invalid_yaml_file(self, capsys, monkeypatch, tmpdir):
+        """A file that is not valid YAML exits with a parse error on stderr."""
+        invalid_yaml = tmpdir / "invalid.yaml"
+        invalid_yaml.write_text("invalid: yaml: file", encoding="utf-8")
+
+        # Build argv as a list: str.split() would break a temp path with spaces
+        testargs = ["tune", "cat", str(invalid_yaml)]
+        monkeypatch.setattr(sys, "argv", testargs)
+
+        with pytest.raises(SystemExit):
+            runpy.run_path(TUNE_PATH, run_name="__main__")
+
+        err = capsys.readouterr().err.rstrip("\n")
+        assert "Error parsing YAML file" in err
+
+    def test_cat_external_yaml_file(self, capsys, monkeypatch, tmpdir):
+        """A valid YAML file outside the built-in configs is printed to stdout."""
+        valid_yaml = tmpdir / "external.yaml"
+        valid_yaml.write_text("key: value", encoding="utf-8")
+
+        # Build argv as a list: str.split() would break a temp path with spaces
+        testargs = ["tune", "cat", str(valid_yaml)]
+        monkeypatch.setattr(sys, "argv", testargs)
+        runpy.run_path(TUNE_PATH, run_name="__main__")
+
+        output = capsys.readouterr().out.rstrip("\n")
+        assert "key: value" in output
@@ -7,7 +7,14 @@
 import pytest
 import torch
 from tests.test_utils import assert_expected
-from torchtune.modules.loss import ForwardKLLoss, ForwardKLWithChunkedOutputLoss
+from torchtune.modules.loss import (
+    ForwardKLLoss,
+    ForwardKLWithChunkedOutputLoss,
+    ReverseKLLoss,
+    ReverseKLWithChunkedOutputLoss,
+    SymmetricKLLoss,
+    SymmetricKLWithChunkedOutputLoss,
+)
 from torchtune.training.seed import set_seed
 
 
@@ -140,3 +147,203 @@ def test_forward_kl_loss_expected(self):
         # assert
         assert_expected(chunked_loss, expected_loss, rtol=1e-2, atol=1e-2)
         assert_expected(standard_loss, expected_loss, rtol=1e-2, atol=1e-2)
+
+
+class TestReverseKLWithChunkedOutputLoss:
+    """Tests the chunked reverse KL loss against the unchunked one and a fixed value."""
+
+    def test_reverse_kl_loss(self):
+        """Chunked and unchunked reverse KL agree on random, partly ignored input."""
+        # Random student/teacher logits and integer labels
+        ignore_index = -100
+        batch_size, num_tokens, vocab_size = 3, 50, 50
+        logits_shape = (batch_size, num_tokens, vocab_size)
+        labels_shape = (batch_size, num_tokens)
+        logits = torch.randn(logits_shape, dtype=torch.bfloat16)
+        teacher_logits = torch.randn(logits_shape, dtype=torch.bfloat16)
+        labels = torch.randint(0, vocab_size, labels_shape, dtype=torch.long)
+
+        # Overwrite a random subset of label positions with the ignore index
+        random_indices = torch.randint(0, num_tokens, labels_shape)
+        labels[random_indices < num_tokens // 5] = ignore_index
+
+        # Chunked RKL: logits are split along the token dimension
+        chunked_loss_fn = ReverseKLWithChunkedOutputLoss(
+            num_output_chunks=8, ignore_index=ignore_index
+        )
+        num_chunks = chunked_loss_fn.num_output_chunks
+        chunked_loss = chunked_loss_fn(
+            logits.chunk(num_chunks, dim=1),
+            teacher_logits.chunk(num_chunks, dim=1),
+            labels,
+        )
+
+        # Vanilla RKL: logits and labels are flattened over batch and tokens
+        standard_loss_fn = ReverseKLLoss(ignore_index=ignore_index)
+        standard_loss = standard_loss_fn(
+            logits.reshape(-1, logits.size(-1)),
+            teacher_logits.reshape(-1, teacher_logits.size(-1)),
+            labels.reshape(-1),
+        )
+
+        # The two implementations must match within tolerance
+        assert_expected(chunked_loss, standard_loss, rtol=1e-2, atol=1e-2)
+
+    def test_reverse_kl_loss_expected(self):
+        """Both reverse KL losses reproduce the stored expected value on fixed logits."""
+        # Fixed student and teacher logits, each of shape (2, 4, 4)
+        student_logits = torch.tensor(
+            [
+                [
+                    [1.1250, -0.4102, -0.0879, -2.5000],
+                    [0.2676, 0.3535, 0.8711, -1.4688],
+                    [-0.1084, 1.6641, 0.0084, 0.1196],
+                    [0.5000, -0.6406, -0.2236, -1.5938],
+                ],
+                [
+                    [-1.5312, -1.9219, 0.0000, -0.5039],
+                    [-1.5391, 1.5312, 0.5820, 0.2695],
+                    [-0.3887, 1.2188, 0.0000, 0.6055],
+                    [0.5000, 1.3828, 0.1309, -1.0312],
+                ],
+            ],
+            dtype=torch.bfloat16,
+        )
+        teacher_logits = torch.tensor(
+            [
+                [
+                    [-0.0381, -1.2578, -1.2031, 0.0947],
+                    [-0.7852, 0.4492, 1.5547, 0.0972],
+                    [0.8203, 0.0012, 0.7656, 0.3477],
+                    [-1.5781, 0.4297, 0.5977, 0.3926],
+                ],
+                [
+                    [1.5156, 0.1641, 2.0781, -0.7734],
+                    [-0.5898, 0.4453, -0.7969, 0.6328],
+                    [0.6289, -0.8359, 0.9258, 0.2109],
+                    [0.0006, 0.5195, 3.2344, -1.5781],
+                ],
+            ],
+            dtype=torch.bfloat16,
+        )
+        labels = torch.tensor([[0, 3, 3, 1], [1, 1, 1, 1]])
+        expected_loss = torch.tensor(0.6775, dtype=torch.float32)
+
+        # Chunked RKL loss over two chunks of the token dimension
+        chunked_loss_fn = ReverseKLWithChunkedOutputLoss(
+            num_output_chunks=2, ignore_index=-100
+        )
+        num_chunks = chunked_loss_fn.num_output_chunks
+        chunked_loss = chunked_loss_fn(
+            student_logits.chunk(num_chunks, dim=1),
+            teacher_logits.chunk(num_chunks, dim=1),
+            labels,
+        )
+
+        # Vanilla RKL loss on the unchunked tensors
+        standard_loss_fn = ReverseKLLoss(ignore_index=-100)
+        standard_loss = standard_loss_fn(student_logits, teacher_logits, labels)
+
+        # Each implementation must reproduce the expected value
+        for loss in (chunked_loss, standard_loss):
+            assert_expected(loss, expected_loss, rtol=1e-2, atol=1e-2)
+
+
+class TestSymmetricKLWithChunkedOutputLoss:
+    """Tests the chunked symmetric KL loss against the unchunked one and a fixed value."""
+
+    def test_symmetric_kl_loss(self):
+        """Chunked and unchunked symmetric KL agree on random, partly ignored input."""
+        # Random student/teacher logits and integer labels
+        ignore_index = -100
+        batch_size, num_tokens, vocab_size = 3, 50, 50
+        logits_shape = (batch_size, num_tokens, vocab_size)
+        labels_shape = (batch_size, num_tokens)
+        logits = torch.randn(logits_shape, dtype=torch.bfloat16)
+        teacher_logits = torch.randn(logits_shape, dtype=torch.bfloat16)
+        labels = torch.randint(0, vocab_size, labels_shape, dtype=torch.long)
+
+        # Overwrite a random subset of label positions with the ignore index
+        random_indices = torch.randint(0, num_tokens, labels_shape)
+        labels[random_indices < num_tokens // 5] = ignore_index
+
+        # Chunked symmetric KL: logits are split along the token dimension
+        chunked_loss_fn = SymmetricKLWithChunkedOutputLoss(
+            num_output_chunks=8, ignore_index=ignore_index
+        )
+        num_chunks = chunked_loss_fn.num_output_chunks
+        chunked_loss = chunked_loss_fn(
+            logits.chunk(num_chunks, dim=1),
+            teacher_logits.chunk(num_chunks, dim=1),
+            labels,
+        )
+
+        # Vanilla symmetric KL: inputs are flattened over batch and tokens
+        standard_loss_fn = SymmetricKLLoss(ignore_index=ignore_index)
+        standard_loss = standard_loss_fn(
+            logits.reshape(-1, logits.size(-1)),
+            teacher_logits.reshape(-1, teacher_logits.size(-1)),
+            labels.reshape(-1),
+        )
+
+        # The two implementations must match within tolerance
+        assert_expected(chunked_loss, standard_loss, rtol=1e-2, atol=1e-2)
+
+    def test_symmetric_kl_loss_expected(self):
+        """Both symmetric KL losses reproduce the stored expected value on fixed logits."""
+        # Fixed student and teacher logits, each of shape (2, 4, 4)
+        student_logits = torch.tensor(
+            [
+                [
+                    [1.1250, -0.4102, -0.0879, -2.5000],
+                    [0.2676, 0.3535, 0.8711, -1.4688],
+                    [-0.1084, 1.6641, 0.0084, 0.1196],
+                    [0.5000, -0.6406, -0.2236, -1.5938],
+                ],
+                [
+                    [-1.5312, -1.9219, 0.0000, -0.5039],
+                    [-1.5391, 1.5312, 0.5820, 0.2695],
+                    [-0.3887, 1.2188, 0.0000, 0.6055],
+                    [0.5000, 1.3828, 0.1309, -1.0312],
+                ],
+            ],
+            dtype=torch.bfloat16,
+        )
+        teacher_logits = torch.tensor(
+            [
+                [
+                    [-0.0381, -1.2578, -1.2031, 0.0947],
+                    [-0.7852, 0.4492, 1.5547, 0.0972],
+                    [0.8203, 0.0012, 0.7656, 0.3477],
+                    [-1.5781, 0.4297, 0.5977, 0.3926],
+                ],
+                [
+                    [1.5156, 0.1641, 2.0781, -0.7734],
+                    [-0.5898, 0.4453, -0.7969, 0.6328],
+                    [0.6289, -0.8359, 0.9258, 0.2109],
+                    [0.0006, 0.5195, 3.2344, -1.5781],
+                ],
+            ],
+            dtype=torch.bfloat16,
+        )
+        labels = torch.tensor([[0, 3, 3, 1], [1, 1, 1, 1]])
+        expected_loss = torch.tensor(1.1992, dtype=torch.float32)
+
+        # Chunked symmetric KL loss over two chunks of the token dimension
+        chunked_loss_fn = SymmetricKLWithChunkedOutputLoss(
+            num_output_chunks=2, ignore_index=-100
+        )
+        num_chunks = chunked_loss_fn.num_output_chunks
+        chunked_loss = chunked_loss_fn(
+            student_logits.chunk(num_chunks, dim=1),
+            teacher_logits.chunk(num_chunks, dim=1),
+            labels,
+        )
+
+        # Vanilla symmetric KL loss on the unchunked tensors
+        standard_loss_fn = SymmetricKLLoss(ignore_index=-100)
+        standard_loss = standard_loss_fn(student_logits, teacher_logits, labels)
+
+        # Each implementation must reproduce the expected value
+        for loss in (chunked_loss, standard_loss):
+            assert_expected(loss, expected_loss, rtol=1e-2, atol=1e-2)