#4003: run test_performance_of_bloom_for_question_answering using L1 Config and assuming fused softmax

arakhmati · arakhmati · commit de44c2c00b64 · 2023-12-07T18:07:39.000-05:00
diff --git a/tests/ttnn/integration_tests/bloom/test_bloom_for_question_answering.py b/tests/ttnn/integration_tests/bloom/test_bloom_for_question_answering.py
@@ -11,7 +11,6 @@
 
 from models.experimental.functional_bloom.tt import ttnn_functional_bloom
 from models.experimental.functional_bloom.tt import ttnn_optimized_functional_bloom
-from models.utility_functions import enable_persistent_kernel_cache, disable_persistent_kernel_cache
 from models.utility_functions import skip_for_wormhole_b0
 
 import ttnn
@@ -87,8 +86,6 @@ def test_performance_of_bloom_for_question_answering(
 ):
     torch.manual_seed(0)
 
-    enable_persistent_kernel_cache()
-
     model_name = "bigscience/bloom-560m"
     config = BloomConfig.from_pretrained(model_name)
     tokenizer = BloomTokenizerFast.from_pretrained(model_name)
@@ -117,6 +114,10 @@ def test_performance_of_bloom_for_question_answering(
         input_ids=input_ids, device=device, num_heads=num_heads, attention_mask=attention_mask, max_length=max_length
     )
 
+    # TODO: don't modify the config globally. Pass it into the functions instead
+    ttnn_optimized_functional_bloom.BLOOM_MEMORY_CONFIG = ttnn.L1_MEMORY_CONFIG
+    ttnn_optimized_functional_bloom.ASSUME_FUSED_SOFTMAX = True
+
     # Run twice to measure the time with and without the program cache
     for _ in range(2):
         start = time.time()
@@ -129,4 +130,6 @@ def test_performance_of_bloom_for_question_answering(
         logger.info(f"Duration: {duration}")
         logger.info(f"Samples per second: {1 / duration * batch_size}")
 
-    disable_persistent_kernel_cache()
+    # Reset the module-level settings overridden before the timing loop. TODO: don't modify the config globally. Pass it into the functions instead
+    ttnn_optimized_functional_bloom.BLOOM_MEMORY_CONFIG = ttnn.DRAM_MEMORY_CONFIG
+    ttnn_optimized_functional_bloom.ASSUME_FUSED_SOFTMAX = False