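Update GNN models: build the graph network and layers in initialize() instead of __init__, and use a single super().__init__() call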

boomanaiden154 · boomanaiden154 · commit 994a9a8d28d4 · 2025-04-12T21:02:22.000Z
Created using spr 1.3.4
diff --git a/gematria/granite/python/gnn_model_base.py b/gematria/granite/python/gnn_model_base.py
@@ -226,13 +226,15 @@ def __init__(
         shape=global_feature_shape, dtype=global_feature_dtype or self.dtype
     )
 
-    self._graph_network = self._create_graph_network_modules()
-    assert self._graph_network is not None
-
     self._num_message_passing_iterations = num_message_passing_iterations
     self._graph_module_residual_connections = graph_module_residual_connections
     self._graph_module_layer_normalization = graph_module_layer_normalization
 
+  def initialize(self):
+    super().initialize()
+    self._graph_network = self._create_graph_network_modules()
+    assert self._graph_network is not None
+
     self._norm_layers = {}
     self._residual_layers = {}
     nodes_residual_shape = None
diff --git a/gematria/granite/python/gnn_model_base_test.py b/gematria/granite/python/gnn_model_base_test.py
@@ -229,6 +229,8 @@ def __init__(self, *, decoder_residual_connection=False, **kwargs):
         **kwargs,
     )
 
+  def initialize(self):
+    super().initialize()
     self._linear_layer = tf_keras.layers.Dense(
         self.num_tasks, activation='linear'
     )
@@ -476,18 +478,17 @@ def test_train_seq2seq_single_task(self, loss_type, loss_normalization):
   def test_train_seq2num_encoder_decoder_model(
       self, loss_type, loss_normalization
   ):
+    model = TestEncoderDecoderGnnModel(
+        graph_module_layer_normalization=False,
+        loss_normalization=loss_normalization,
+        loss_type=loss_type,
+        learning_rate=0.01,
+    )
     with mock.patch(
         'tf_keras.layers.LayerNormalization',
         side_effect=tf_keras.layers.LayerNormalization,
     ) as tf_keras_layer_norm:
-      model = TestEncoderDecoderGnnModel(
-          graph_module_layer_normalization=False,
-          loss_normalization=loss_normalization,
-          loss_type=loss_type,
-          learning_rate=0.01,
-      )
-
-    model.initialize()
+      model.initialize()
     self.assertEqual(
         tf_keras_layer_norm.call_args_list,
         [
@@ -504,6 +505,14 @@ def test_train_seq2num_encoder_decoder_model(
   def test_train_seq2seq_encoder_decoder_model(
       self, loss_type, loss_normalization
   ):
+    model = TestEncoderDecoderGnnModel(
+        graph_module_layer_normalization=True,
+        graph_module_residual_connections=False,
+        loss_normalization=loss_normalization,
+        loss_type=loss_type,
+        use_deltas=True,
+        learning_rate=0.01,
+    )
     with (
         mock.patch(
             'tf_keras.layers.LayerNormalization',
@@ -514,16 +523,7 @@ def test_train_seq2seq_encoder_decoder_model(
             side_effect=model_blocks.ResidualConnectionLayer,
         ) as residual_connection_layer,
     ):
-      model = TestEncoderDecoderGnnModel(
-          graph_module_layer_normalization=True,
-          graph_module_residual_connections=False,
-          loss_normalization=loss_normalization,
-          loss_type=loss_type,
-          use_deltas=True,
-          learning_rate=0.01,
-      )
-
-    model.initialize()
+      model.initialize()
     # NOTE(ondrasej): tf.math.add is called only when adding residual
     # connections. Since they are disabled in this test case, we should not see
     # any calls to this function.
@@ -563,6 +563,12 @@ def test_train_seq2seq_encoder_decoder_model(
     self.check_training_model(model)
 
   def test_train_seq2seq_model_with_residual_connections(self):
+    model = TestEncoderDecoderGnnModel(
+        graph_module_layer_normalization=True,
+        graph_module_residual_connections=True,
+        use_deltas=True,
+        learning_rate=0.01,
+    )
     with (
         mock.patch(
             'gematria.model.python.model_blocks.ResidualConnectionLayer',
@@ -573,14 +579,7 @@ def test_train_seq2seq_model_with_residual_connections(self):
             side_effect=tf_keras.layers.Dense,
         ) as tf_keras_dense,
     ):
-      model = TestEncoderDecoderGnnModel(
-          graph_module_layer_normalization=True,
-          graph_module_residual_connections=True,
-          use_deltas=True,
-          learning_rate=0.01,
-      )
-
-    model.initialize()
+      model.initialize()
     self.assertEqual(
         residual_connection_layer.call_args_list,
         [
@@ -618,6 +617,14 @@ def test_train_seq2seq_model_with_residual_connections(self):
   def test_train_seq2seq_model_with_residual_connections_with_linear_transform(
       self,
   ):
+    model = TestEncoderDecoderGnnModel(
+        graph_module_layer_normalization=False,
+        graph_module_residual_connections=False,
+        decoder_residual_connection=True,
+        use_deltas=True,
+        learning_rate=0.01,
+    )
+
     with (
         mock.patch(
             'gematria.model.python.model_blocks.ResidualConnectionLayer',
@@ -628,14 +635,7 @@ def test_train_seq2seq_model_with_residual_connections_with_linear_transform(
             side_effect=tf_keras.layers.Dense,
         ) as tf_keras_dense,
     ):
-      model = TestEncoderDecoderGnnModel(
-          graph_module_layer_normalization=False,
-          graph_module_residual_connections=False,
-          decoder_residual_connection=True,
-          use_deltas=True,
-          learning_rate=0.01,
-      )
-    model.initialize()
+      model.initialize()
     self.assertEqual(
         residual_connection_layer.call_args_list,
         [
@@ -648,22 +648,22 @@ def test_train_seq2seq_model_with_residual_connections_with_linear_transform(
         tf_keras_dense.call_args_list,
         [
             mock.call(
-                activation=tf_keras.activations.linear,
-                name='residual_connection_2_0_nodes_transformation',
                 units=5,
+                activation=tf_keras.activations.linear,
                 use_bias=False,
+                name='residual_connection_2_0_nodes_transformation',
             ),
             mock.call(
-                activation=tf_keras.activations.linear,
-                name='residual_connection_2_0_edges_transformation',
                 units=6,
+                activation=tf_keras.activations.linear,
                 use_bias=False,
+                name='residual_connection_2_0_edges_transformation',
             ),
             mock.call(
-                activation=tf_keras.activations.linear,
-                name='residual_connection_2_0_globals_transformation',
                 units=4,
+                activation=tf_keras.activations.linear,
                 use_bias=False,
+                name='residual_connection_2_0_globals_transformation',
             ),
             mock.call(1, activation='linear'),
         ],
diff --git a/gematria/granite/python/graph_builder_model_base.py b/gematria/granite/python/graph_builder_model_base.py
@@ -132,11 +132,21 @@ def __init__(
       **kwargs: Additional keyword arguments are passed to the constructor of
         the base class.
     """
-    token_model.TokenModel.__init__(
-        self,
+    # NOTE(ondrasej): We set the node/edge feature dtypes to int32. They are
+    # indices to the token list/edge type; an int32 should be sufficient for all
+    # our use cases and fixing the type will make it easier to move the array
+    # construction to the C++ code if needed in the future. Similarly for the
+    # graph index dtype.
+    super().__init__(
+        node_feature_shape=(),
+        node_feature_dtype=tf.dtypes.int32,
+        edge_feature_shape=(),
+        edge_feature_dtype=tf.dtypes.int32,
+        global_feature_shape=(len(tokens),),
+        global_feature_dtype=tf.dtypes.int32,
+        graph_index_dtype=tf.dtypes.int32,
         tokens=tokens,
-        out_of_vocabulary_behavior=kwargs['out_of_vocabulary_behavior'],
-        dtype=kwargs['dtype'],
+        **kwargs,
     )
 
     self._instruction_features = None
@@ -157,24 +167,6 @@ def __init__(
     )
     self._num_annotations = len(self._annotation_names_list)
 
-    # NOTE(ondrasej): We set the node/edge feature dtypes to int32. They are
-    # indices to the token list/edge type; an int32 should be sufficient for all
-    # our use cases and fixing the type will make it easier to move the array
-    # construction to the C++ code if needed in the future. Similarly for the
-    # graph index dtype.
-    gnn_model_base.GnnModelBase.__init__(
-        self,
-        node_feature_shape=(),
-        node_feature_dtype=tf.dtypes.int32,
-        edge_feature_shape=(),
-        edge_feature_dtype=tf.dtypes.int32,
-        global_feature_shape=(len(tokens),),
-        global_feature_dtype=tf.dtypes.int32,
-        graph_index_dtype=tf.dtypes.int32,
-        tokens=tokens,
-        **kwargs,
-    )
-
     special_tokens = np.array(
         (
             self._batch_graph_builder.immediate_token,
@@ -220,9 +212,11 @@ def _make_batch_feed_dict(self) -> model_base.FeedDict:
     feed_dict['instruction_node_mask'] = np.array(
         self._batch_graph_builder.instruction_node_mask, dtype=bool
     )
+    self._instruction_node_mask = feed_dict['instruction_node_mask']
     feed_dict['instruction_annotations'] = (
         self._batch_graph_builder.instruction_annotations
     )
+    self._instruction_annotations = feed_dict['instruction_annotations']
     return feed_dict
 
   # @Override