diff --git a/tests/clip/test_modeling_clip.py b/tests/clip/test_modeling_clip.py index b9fc50c4b6b8..88649b31455c 100644 --- a/tests/clip/test_modeling_clip.py +++ b/tests/clip/test_modeling_clip.py @@ -28,7 +28,6 @@ from transformers.testing_utils import ( is_flax_available, is_pt_flax_cross_test, - is_pt_tf_cross_test, require_torch, require_vision, slow, @@ -602,149 +601,6 @@ def test_load_vision_text_config(self): text_config = CLIPTextConfig.from_pretrained(tmp_dir_name) self.assertDictEqual(config.text_config.to_dict(), text_config.to_dict()) - # overwrite from common since CLIPModel/TFCLIPModel return CLIPOutput/TFCLIPOutput - @is_pt_tf_cross_test - def test_pt_tf_model_equivalence(self): - import numpy as np - import tensorflow as tf - - import transformers - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - tf_model_class_name = "TF" + model_class.__name__ # Add the "TF" at the beginning - - if not hasattr(transformers, tf_model_class_name): - # transformers does not have TF version yet - return - - tf_model_class = getattr(transformers, tf_model_class_name) - - config.output_hidden_states = True - - tf_model = tf_model_class(config) - pt_model = model_class(config) - - # make sure only tf inputs are forward that actually exist in function args - tf_input_keys = set(inspect.signature(tf_model.call).parameters.keys()) - - # remove all head masks - tf_input_keys.discard("head_mask") - tf_input_keys.discard("cross_attn_head_mask") - tf_input_keys.discard("decoder_head_mask") - - pt_inputs = self._prepare_for_class(inputs_dict, model_class) - pt_inputs = {k: v for k, v in pt_inputs.items() if k in tf_input_keys} - - # Check predictions on first output (logits/hidden-states) are close enought given low-level computational differences - pt_model.eval() - tf_inputs_dict = {} - for key, tensor in pt_inputs.items(): - # skip key that does not exist in tf - if type(tensor) == bool: - tf_inputs_dict[key] = tensor - elif key == "input_values": - tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32) - elif key == "pixel_values": - tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32) - else: - tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.int32) - - # Check we can load pt model in tf and vice-versa with model => model functions - tf_model = transformers.load_pytorch_model_in_tf2_model(tf_model, pt_model, tf_inputs=tf_inputs_dict) - pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model).to(torch_device) - - # need to rename encoder-decoder "inputs" for PyTorch - # if "inputs" in pt_inputs_dict and self.is_encoder_decoder: - # pt_inputs_dict["input_ids"] = pt_inputs_dict.pop("inputs") - - with torch.no_grad(): - pto = pt_model(**pt_inputs) - tfo = tf_model(tf_inputs_dict, training=False) - - self.assertEqual(len(tfo), len(pto), "Output lengths differ between TF and PyTorch") - for tf_output, pt_output in zip(tfo.to_tuple(), pto.to_tuple()): - - if not (isinstance(tf_output, tf.Tensor) and isinstance(pt_output, torch.Tensor)): - continue - - tf_out = tf_output.numpy() - pt_out = pt_output.cpu().numpy() - - self.assertEqual(tf_out.shape, pt_out.shape, "Output component shapes differ between TF and PyTorch") - - if len(tf_out.shape) > 0: - - tf_nans = np.copy(np.isnan(tf_out)) - pt_nans = np.copy(np.isnan(pt_out)) - - pt_out[tf_nans] = 0 - tf_out[tf_nans] = 0 - pt_out[pt_nans] = 0 - tf_out[pt_nans] = 0 - - max_diff 
= np.amax(np.abs(tf_out - pt_out)) - self.assertLessEqual(max_diff, 4e-2) - - # Check we can load pt model in tf and vice-versa with checkpoint => model functions - with tempfile.TemporaryDirectory() as tmpdirname: - pt_checkpoint_path = os.path.join(tmpdirname, "pt_model.bin") - torch.save(pt_model.state_dict(), pt_checkpoint_path) - tf_model = transformers.load_pytorch_checkpoint_in_tf2_model(tf_model, pt_checkpoint_path) - - tf_checkpoint_path = os.path.join(tmpdirname, "tf_model.h5") - tf_model.save_weights(tf_checkpoint_path) - pt_model = transformers.load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path) - pt_model = pt_model.to(torch_device) - - # Check predictions on first output (logits/hidden-states) are close enought given low-level computational differences - pt_model.eval() - tf_inputs_dict = {} - for key, tensor in pt_inputs.items(): - # skip key that does not exist in tf - if type(tensor) == bool: - tensor = np.array(tensor, dtype=bool) - tf_inputs_dict[key] = tf.convert_to_tensor(tensor, dtype=tf.int32) - elif key == "input_values": - tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32) - elif key == "pixel_values": - tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32) - else: - tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.int32) - - # need to rename encoder-decoder "inputs" for PyTorch - # if "inputs" in pt_inputs_dict and self.is_encoder_decoder: - # pt_inputs_dict["input_ids"] = pt_inputs_dict.pop("inputs") - - with torch.no_grad(): - pto = pt_model(**pt_inputs) - - tfo = tf_model(tf_inputs_dict) - - self.assertEqual(len(tfo), len(pto), "Output lengths differ between TF and PyTorch") - for tf_output, pt_output in zip(tfo.to_tuple(), pto.to_tuple()): - - if not (isinstance(tf_output, tf.Tensor) and isinstance(pt_output, torch.Tensor)): - continue - - tf_out = tf_output.numpy() - pt_out = pt_output.cpu().numpy() - - self.assertEqual(tf_out.shape, pt_out.shape, "Output component shapes differ between TF and PyTorch") - - if len(tf_out.shape) > 0: - tf_nans = np.copy(np.isnan(tf_out)) - pt_nans = np.copy(np.isnan(pt_out)) - - pt_out[tf_nans] = 0 - tf_out[tf_nans] = 0 - pt_out[pt_nans] = 0 - tf_out[pt_nans] = 0 - - max_diff = np.amax(np.abs(tf_out - pt_out)) - self.assertLessEqual(max_diff, 4e-2) - # overwrite from common since FlaxCLIPModel returns nested output # which is not supported in the common test @is_pt_flax_cross_test diff --git a/tests/lxmert/test_modeling_lxmert.py b/tests/lxmert/test_modeling_lxmert.py index adbfbb2ab1c0..f1209d132dc6 100644 --- a/tests/lxmert/test_modeling_lxmert.py +++ b/tests/lxmert/test_modeling_lxmert.py @@ -15,16 +15,13 @@ import copy -import os -import tempfile import unittest import numpy as np -import transformers from transformers import LxmertConfig, is_tf_available, is_torch_available from transformers.models.auto import get_values -from transformers.testing_utils import is_pt_tf_cross_test, require_torch, slow, torch_device +from transformers.testing_utils import require_torch, slow, torch_device from ..test_configuration_common import ConfigTester from ..test_modeling_common import ModelTesterMixin, ids_tensor @@ -527,6 +524,8 @@ def prepare_config_and_inputs_for_common(self, return_obj_labels=False): if return_obj_labels: inputs_dict["obj_labels"] = obj_labels + else: + config.task_obj_predict = False return config, inputs_dict @@ -740,121 +739,30 @@ def test_retain_grad_hidden_states_attentions(self): 
self.assertIsNotNone(hidden_states_vision.grad) self.assertIsNotNone(attentions_vision.grad) - @is_pt_tf_cross_test - def test_pt_tf_model_equivalence(self): - for model_class in self.all_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common( - return_obj_labels="PreTraining" in model_class.__name__ - ) - - tf_model_class_name = "TF" + model_class.__name__ # Add the "TF" at the beginning - - if not hasattr(transformers, tf_model_class_name): - # transformers does not have TF version yet - return - - tf_model_class = getattr(transformers, tf_model_class_name) - - config.output_hidden_states = True - config.task_obj_predict = False - - pt_model = model_class(config) - tf_model = tf_model_class(config) - - # Check we can load pt model in tf and vice-versa with model => model functions - pt_inputs = self._prepare_for_class(inputs_dict, model_class) - - def recursive_numpy_convert(iterable): - return_dict = {} - for key, value in iterable.items(): - if type(value) == bool: - return_dict[key] = value - if isinstance(value, dict): - return_dict[key] = recursive_numpy_convert(value) - else: - if isinstance(value, (list, tuple)): - return_dict[key] = ( - tf.convert_to_tensor(iter_value.cpu().numpy(), dtype=tf.int32) for iter_value in value - ) - else: - return_dict[key] = tf.convert_to_tensor(value.cpu().numpy(), dtype=tf.int32) - return return_dict - - tf_inputs_dict = recursive_numpy_convert(pt_inputs) - - tf_model = transformers.load_pytorch_model_in_tf2_model(tf_model, pt_model, tf_inputs=tf_inputs_dict) - pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model).to(torch_device) - - # Check predictions on first output (logits/hidden-states) are close enought given low-level computational differences - pt_model.eval() - - # Delete obj labels as we want to compute the hidden states and not the loss - - if "obj_labels" in inputs_dict: - del inputs_dict["obj_labels"] - - pt_inputs = self._prepare_for_class(inputs_dict, model_class) - tf_inputs_dict = recursive_numpy_convert(pt_inputs) - - with torch.no_grad(): - pto = pt_model(**pt_inputs) - tfo = tf_model(tf_inputs_dict, training=False) - tf_hidden_states = tfo[0].numpy() - pt_hidden_states = pto[0].cpu().numpy() - - tf_nans = np.copy(np.isnan(tf_hidden_states)) - pt_nans = np.copy(np.isnan(pt_hidden_states)) - - pt_hidden_states[tf_nans] = 0 - tf_hidden_states[tf_nans] = 0 - pt_hidden_states[pt_nans] = 0 - tf_hidden_states[pt_nans] = 0 - - max_diff = np.amax(np.abs(tf_hidden_states - pt_hidden_states)) - # Debug info (remove when fixed) - if max_diff >= 2e-2: - print("===") - print(model_class) - print(config) - print(inputs_dict) - print(pt_inputs) - self.assertLessEqual(max_diff, 6e-2) - - # Check we can load pt model in tf and vice-versa with checkpoint => model functions - with tempfile.TemporaryDirectory() as tmpdirname: - pt_checkpoint_path = os.path.join(tmpdirname, "pt_model.bin") - torch.save(pt_model.state_dict(), pt_checkpoint_path) - tf_model = transformers.load_pytorch_checkpoint_in_tf2_model(tf_model, pt_checkpoint_path) - - tf_checkpoint_path = os.path.join(tmpdirname, "tf_model.h5") - tf_model.save_weights(tf_checkpoint_path) - pt_model = transformers.load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path) - - # Check predictions on first output (logits/hidden-states) are close enought given low-level computational differences - pt_model.eval() - - for key, value in pt_inputs.items(): - if key in ("visual_feats", "visual_pos"): - pt_inputs[key] = 
value.to(torch.float32) - else: - pt_inputs[key] = value.to(torch.long) - - with torch.no_grad(): - pto = pt_model(**pt_inputs) - - tfo = tf_model(tf_inputs_dict) - tfo = tfo[0].numpy() - pto = pto[0].cpu().numpy() - tf_nans = np.copy(np.isnan(tfo)) - pt_nans = np.copy(np.isnan(pto)) - - pto[tf_nans] = 0 - tfo[tf_nans] = 0 - pto[pt_nans] = 0 - tfo[pt_nans] = 0 - - max_diff = np.amax(np.abs(tfo - pto)) - self.assertLessEqual(max_diff, 6e-2) + def prepare_tf_inputs_from_pt_inputs(self, pt_inputs_dict): + + tf_inputs_dict = {} + for key, value in pt_inputs_dict.items(): + if isinstance(value, dict): + tf_inputs_dict[key] = self.prepare_tf_inputs_from_pt_inputs(value) + elif isinstance(value, (list, tuple)): + tf_inputs_dict[key] = tuple(tf.convert_to_tensor(iter_value.cpu().numpy(), dtype=tf.int32) for iter_value in value) + # skip key that does not exist in tf + elif type(value) == bool: + tf_inputs_dict[key] = value + elif key == "input_values": + tf_inputs_dict[key] = tf.convert_to_tensor(value.cpu().numpy(), dtype=tf.float32) + elif key == "pixel_values": + tf_inputs_dict[key] = tf.convert_to_tensor(value.cpu().numpy(), dtype=tf.float32) + elif key == "input_features": + tf_inputs_dict[key] = tf.convert_to_tensor(value.cpu().numpy(), dtype=tf.float32) + # other general float inputs + elif value.is_floating_point(): + tf_inputs_dict[key] = tf.convert_to_tensor(value.cpu().numpy(), dtype=tf.float32) + else: + tf_inputs_dict[key] = tf.convert_to_tensor(value.cpu().numpy(), dtype=tf.int32) + + return tf_inputs_dict @require_torch diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 4a4a0eba044f..ac45a1c10822 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -56,7 +56,14 @@ slow, torch_device, ) -from transformers.utils import WEIGHTS_INDEX_NAME, WEIGHTS_NAME, is_flax_available, is_torch_fx_available +from transformers.utils import ( + WEIGHTS_INDEX_NAME, + WEIGHTS_NAME, + is_flax_available, + is_tf_available, + is_torch_fx_available, +) +from transformers.utils.generic import ModelOutput sys.path.append(str(Path(__file__).parent.parent / "utils")) @@ -94,6 +101,9 @@ ) from transformers.modeling_utils import shard_checkpoint +if is_tf_available(): + import tensorflow as tf + if is_flax_available(): import jax.numpy as jnp from transformers.modeling_flax_pytorch_utils import ( @@ -1478,237 +1488,240 @@ def recursive_check(tuple_object, dict_object): model, tuple_inputs, dict_inputs, {"output_hidden_states": True, "output_attentions": True} ) - @is_pt_tf_cross_test - def test_pt_tf_model_equivalence(self): - import numpy as np - import tensorflow as tf + # Don't copy this method to model specific test file! + # TODO: remove this method once the issues are all fixed!
+ def _make_attention_mask_non_null(self, inputs_dict): + """Make sure no sequence has all zeros as attention mask""" - import transformers + for k in ["attention_mask", "encoder_attention_mask", "decoder_attention_mask"]: + if k in inputs_dict: + attention_mask = inputs_dict[k] - def prepare_tf_inputs_from_pt_inputs(pt_inputs_dict): - - tf_inputs_dict = {} - for key, tensor in pt_inputs_dict.items(): - # skip key that does not exist in tf - if type(tensor) == bool: - tf_inputs_dict[key] = tensor - elif key == "input_values": - tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32) - elif key == "pixel_values": - tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32) - elif key == "input_features": - tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32) - # To deal with the edge cases from `TFTapasForQuestionAnswering`. - # PyTorch can deal with type casting automatically, but TensorFlow is more strict! - # TODO: find a clean/better way to deal with these extra keys that are not common. - elif key in ["float_answer", "numeric_values", "numeric_values_scale"]: - tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32) - else: - tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.int32) + # Make sure no all 0s attention masks - to avoid failure at this moment. + # Put `1` at the beginning of sequences to make it still work when combining causal attention masks. + # TODO: remove this line once a fix regarding large negative values for attention mask is done. + attention_mask = torch.cat( + [torch.ones_like(attention_mask[:, :1], dtype=attention_mask.dtype), attention_mask[:, 1:]], dim=-1 + ) - return tf_inputs_dict + # Here we make the first sequence with all 0s as attention mask. + # Currently, this will fail for `TFWav2Vec2Model`. This is caused by the different large negative + # values, like `1e-4`, `1e-9`, `1e-30` and `-inf` for attention mask across models/frameworks. + # TODO: enable this block once the large negative values thing is cleaned up. + # (see https://github.com/huggingface/transformers/issues/14859) + # attention_mask = torch.cat( + # [torch.zeros_like(attention_mask[:1], dtype=attention_mask.dtype), attention_mask[1:]], + # dim=0 + # ) + + inputs_dict[k] = attention_mask + + # Don't copy this method to model specific test file! + # TODO: remove this method once the issues are all fixed! + def _postprocessing_to_ignore_test_cases(self, tf_outputs, pt_outputs, model_class): + """For temporarily ignoring some failed test cases (issues to be fixed)""" + + tf_keys = set([k for k, v in tf_outputs.items() if v is not None]) + pt_keys = set([k for k, v in pt_outputs.items() if v is not None]) + + key_differences = tf_keys.symmetric_difference(pt_keys) + + if model_class.__name__ in [ + "FlaubertWithLMHeadModel", + "FunnelForPreTraining", + "ElectraForPreTraining", + "XLMWithLMHeadModel", + "TransfoXLLMHeadModel", + ]: + for k in key_differences: + if k in ["loss", "losses"]: + tf_keys.discard(k) + pt_keys.discard(k) + elif model_class.__name__.startswith("GPT2"): + # `TFGPT2` has `past_key_values` as a tensor while `GPT2` has it as a tuple. 
+ tf_keys.discard("past_key_values") + pt_keys.discard("past_key_values") + + # create new outputs from the remaining fields + new_tf_outputs = type(tf_outputs)(**{k: tf_outputs[k] for k in tf_keys}) + new_pt_outputs = type(pt_outputs)(**{k: pt_outputs[k] for k in pt_keys}) + + return new_tf_outputs, new_pt_outputs + + # Copied from tests.test_modeling_tf_common.TFModelTesterMixin.check_pt_tf_outputs + def check_pt_tf_outputs(self, tf_outputs, pt_outputs, model_class, tol=1e-5, name="outputs", attributes=None): + """Check the outputs from PyTorch and TensorFlow models are close enough. Checks are done in a recursive way. - def check_outputs(tf_outputs, pt_outputs, model_class, names): - """ - Args: - model_class: The class of the model that is currently testing. For example, `TFBertModel`, - TFBertForMaskedLM`, `TFBertForSequenceClassification`, etc. Currently unused, but it could make - debugging easier and faster. + Args: + model_class: The class of the model that is currently being tested. For example, `TFBertModel`, + `TFBertForMaskedLM`, `TFBertForSequenceClassification`, etc. Mainly used for providing more informative + error messages. + name (`str`): The name of the output. For example, `output.hidden_states`, `output.attentions`, etc. + attributes (`Tuple[str]`): The names of the output's elements if the output is a tuple/list, with each element + being a named field in the output. + """ - names: A string, or a tuple of strings. These specify what tf_outputs/pt_outputs represent in the model outputs. - Currently unused, but in the future, we could use this information to make the error message clearer - by giving the name(s) of the output tensor(s) with large difference(s) between PT and TF. - """ + self.assertEqual(type(name), str) + if attributes is not None: + self.assertEqual(type(attributes), tuple, f"{name}: The argument `attributes` should be a `tuple`") - # Some issue (`about past_key_values`) to solve (e.g. `TFPegasusForConditionalGeneration`) in a separate PR. - if names == "past_key_values": - return + # Allow `ModelOutput` (e.g. `CLIPOutput` has `text_model_output` and `vision_model_output`). + if isinstance(tf_outputs, ModelOutput): + self.assertTrue( + isinstance(pt_outputs, ModelOutput), + f"{name}: `pt_outputs` should be an instance of `ModelOutput` when `tf_outputs` is", + ) - # Allow `list` because `(TF)TransfoXLModelOutput.mems` is a list of tensors. - if type(tf_outputs) in [tuple, list]: - self.assertEqual(type(tf_outputs), type(pt_outputs)) - self.assertEqual(len(tf_outputs), len(pt_outputs)) - if type(names) == tuple: - for tf_output, pt_output, name in zip(tf_outputs, pt_outputs, names): - check_outputs(tf_output, pt_output, model_class, names=name) - elif type(names) == str: - for idx, (tf_output, pt_output) in enumerate(zip(tf_outputs, pt_outputs)): - check_outputs(tf_output, pt_output, model_class, names=f"{names}_{idx}") - else: - raise ValueError(f"`names` should be a `tuple` or a string. Got {type(names)} instead.") - elif isinstance(tf_outputs, tf.Tensor): - self.assertTrue(isinstance(pt_outputs, torch.Tensor)) + # Don't copy this block to model specific test file!
+ # TODO: remove this method and this line after issues are fixed + tf_outputs, pt_outputs = self._postprocessing_to_ignore_test_cases(tf_outputs, pt_outputs, model_class) - tf_outputs = tf_outputs.numpy() - pt_outputs = pt_outputs.detach().to("cpu").numpy() + tf_keys = tuple([k for k, v in tf_outputs.items() if v is not None]) + pt_keys = tuple([k for k, v in pt_outputs.items() if v is not None]) - tf_nans = np.isnan(tf_outputs) - pt_nans = np.isnan(pt_outputs) + self.assertEqual(tf_keys, pt_keys, f"{name}: Output keys differ between TF and PyTorch") - pt_outputs[tf_nans] = 0 - tf_outputs[tf_nans] = 0 - pt_outputs[pt_nans] = 0 - tf_outputs[pt_nans] = 0 + # convert to the case of `tuple` + # appending each key to the current (string) `name` + attributes = tuple([f"{name}.{k}" for k in tf_keys]) + self.check_pt_tf_outputs( + tf_outputs.to_tuple(), pt_outputs.to_tuple(), model_class, tol=tol, name=name, attributes=attributes + ) - max_diff = np.amax(np.abs(tf_outputs - pt_outputs)) - self.assertLessEqual(max_diff, 1e-5) - else: - raise ValueError( - f"`tf_outputs` should be a `tuple` or an instance of `tf.Tensor`. Got {type(tf_outputs)} instead." + # Allow `list` (e.g. `TransfoXLModelOutput.mems` is a list of tensors.) + elif type(tf_outputs) in [tuple, list]: + self.assertEqual(type(tf_outputs), type(pt_outputs), f"{name}: Output types differ between TF and PyTorch") + self.assertEqual(len(tf_outputs), len(pt_outputs), f"{name}: Output lengths differ between TF and PyTorch") + + if attributes is not None: + # case 1: each output has assigned name (e.g. a tuple form of a `ModelOutput`) + self.assertEqual( + len(attributes), + len(tf_outputs), + f"{name}: The tuple `attributes` should have the same length as `tf_outputs`", + ) + else: + # case 2: each output has no assigned name (e.g. hidden states of each layer) -> add an index to `attributes` + attributes = tuple([f"{name}_{idx}" for idx in range(len(tf_outputs))]) - def check_pt_tf_models(tf_model, pt_model, pt_inputs_dict, pt_inputs_dict_maybe_with_labels): - - # send pytorch model to the correct device - pt_model.to(torch_device) + for tf_output, pt_output, attr in zip(tf_outputs, pt_outputs, attributes): + self.check_pt_tf_outputs(tf_output, pt_output, model_class, tol=tol, name=attr) - # Check predictions on first output (logits/hidden-states) are close enough given low-level computational differences - pt_model.eval() + elif isinstance(tf_outputs, tf.Tensor): + self.assertTrue( + isinstance(pt_outputs, torch.Tensor), f"{name}: `pt_outputs` should be a tensor when `tf_outputs` is" + ) - tf_inputs_dict = prepare_tf_inputs_from_pt_inputs(pt_inputs_dict) - tf_inputs_dict_maybe_with_labels = prepare_tf_inputs_from_pt_inputs(pt_inputs_dict_maybe_with_labels) + tf_outputs = tf_outputs.numpy() + pt_outputs = pt_outputs.detach().to("cpu").numpy() - # send pytorch inputs to the correct device - pt_inputs_dict = { - k: v.to(device=torch_device) if isinstance(v, torch.Tensor) else v for k, v in pt_inputs_dict.items() - } - pt_inputs_dict_maybe_with_labels = { - k: v.to(device=torch_device) if isinstance(v, torch.Tensor) else v - for k, v in pt_inputs_dict_maybe_with_labels.items() - } + self.assertEqual( + tf_outputs.shape, pt_outputs.shape, f"{name}: Output shapes differ between TF and PyTorch" + ) - # Original test: check without `labels` - with torch.no_grad(): - pt_outputs = pt_model(**pt_inputs_dict) - tf_outputs = tf_model(tf_inputs_dict) + # deal with NumPy's scalars to make replacing nan values by 0 work.
+ if np.isscalar(tf_outputs): + tf_outputs = np.array([tf_outputs]) + pt_outputs = np.array([pt_outputs]) - tf_keys = tuple([k for k, v in tf_outputs.items() if v is not None]) - pt_keys = tuple([k for k, v in pt_outputs.items() if v is not None]) + tf_nans = np.isnan(tf_outputs) + pt_nans = np.isnan(pt_outputs) - self.assertEqual(tf_keys, pt_keys) - check_outputs(tf_outputs.to_tuple(), pt_outputs.to_tuple(), model_class, names=tf_keys) + pt_outputs[tf_nans] = 0 + tf_outputs[tf_nans] = 0 + pt_outputs[pt_nans] = 0 + tf_outputs[pt_nans] = 0 - # check the case where `labels` is passed - has_labels = any( - x in tf_inputs_dict_maybe_with_labels for x in ["labels", "next_sentence_label", "start_positions"] + max_diff = np.amax(np.abs(tf_outputs - pt_outputs)) + self.assertLessEqual(max_diff, tol, f"{name}: Difference between torch and tf is {max_diff} (>= {tol}).") + else: + raise ValueError( + f"`tf_outputs` should be an instance of `ModelOutput`, a `tuple`, a `list`, or an instance of `tf.Tensor`. Got {type(tf_outputs)} instead." ) - if has_labels: - with torch.no_grad(): - pt_outputs = pt_model(**pt_inputs_dict_maybe_with_labels) - tf_outputs = tf_model(tf_inputs_dict_maybe_with_labels) - - # Some models' output class don't have `loss` attribute despite `labels` is used. - # TODO: identify which models - tf_loss = getattr(tf_outputs, "loss", None) - pt_loss = getattr(pt_outputs, "loss", None) - - # Some PT models return loss while the corresponding TF models don't (i.e. `None` for `loss`). - # - FlaubertWithLMHeadModel - # - FunnelForPreTraining - # - ElectraForPreTraining - # - XLMWithLMHeadModel - # TODO: Fix PT/TF diff -> remove this condition to fail the test if a diff occurs - if not ((tf_loss is None and pt_loss is None) or (tf_loss is not None and pt_loss is not None)): - if model_class.__name__ not in [ - "FlaubertWithLMHeadModel", - "FunnelForPreTraining", - "ElectraForPreTraining", - "XLMWithLMHeadModel", - "TransfoXLLMHeadModel", - ]: - self.assertEqual(tf_loss is None, pt_loss is None) - - tf_keys = tuple([k for k, v in tf_outputs.items() if v is not None]) - pt_keys = tuple([k for k, v in pt_outputs.items() if v is not None]) + def prepare_tf_inputs_from_pt_inputs(self, pt_inputs_dict): + + tf_inputs_dict = {} + for key, tensor in pt_inputs_dict.items(): + # skip key that does not exist in tf + if type(tensor) == bool: + tf_inputs_dict[key] = tensor + elif key == "input_values": + tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32) + elif key == "pixel_values": + tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32) + elif key == "input_features": + tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32) + # other general float inputs + elif tensor.is_floating_point(): + tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32) + else: + tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.int32) - # TODO: remove these 2 conditions once the above TODOs (above loss) are implemented - # (Also, `TFTransfoXLLMHeadModel` has no `loss` while `TransfoXLLMHeadModel` return `losses`) - if tf_keys != pt_keys: - if model_class.__name__ not in [ - "FlaubertWithLMHeadModel", - "FunnelForPreTraining", - "ElectraForPreTraining", - "XLMWithLMHeadModel", - "TransfoXLLMHeadModel", - ]: - self.assertEqual(tf_keys, pt_keys) - - # Since we deliberately make some tests pass above (regarding the `loss`), let's still try to test
- # TODO: remove this block of `index` computing once the above TODOs (above loss) are implemented - # compute the 1st `index` where `tf_keys` and `pt_keys` is different - index = 0 - for _ in range(min(len(tf_keys), len(pt_keys))): - if tf_keys[index] == pt_keys[index]: - index += 1 - else: - break - if tf_keys[:index] != pt_keys[:index]: - self.assertEqual(tf_keys, pt_keys) + return tf_inputs_dict - # Some models require extra condition to return loss. For example, `(TF)BertForPreTraining` requires - # both`labels` and `next_sentence_label`. - if tf_loss is not None and pt_loss is not None: + def check_pt_tf_models(self, tf_model, pt_model, pt_inputs_dict): - # check anything else than `loss` - keys = tuple([k for k in tf_keys]) - check_outputs(tf_outputs[1:index], pt_outputs[1:index], model_class, names=keys[1:index]) + tf_inputs_dict = self.prepare_tf_inputs_from_pt_inputs(pt_inputs_dict) - # check `loss` + # send pytorch inputs to the correct device + pt_inputs_dict = { + k: v.to(device=torch_device) if isinstance(v, torch.Tensor) else v for k, v in pt_inputs_dict.items() + } - # tf models returned loss is usually a tensor rather than a scalar. - # (see `hf_compute_loss`: it uses `tf.keras.losses.Reduction.NONE`) - # Change it here to a scalar to match PyTorch models' loss - tf_loss = tf.math.reduce_mean(tf_loss).numpy() - pt_loss = pt_loss.detach().to("cpu").numpy() + # send pytorch model to the correct device + pt_model.to(torch_device) - tf_nans = np.isnan(tf_loss) - pt_nans = np.isnan(pt_loss) - # the 2 losses need to be both nan or both not nan - self.assertEqual(tf_nans, pt_nans) + # Check predictions on first output (logits/hidden-states) are close enough given low-level computational differences + pt_model.eval() - if not tf_nans: - max_diff = np.amax(np.abs(tf_loss - pt_loss)) - self.assertLessEqual(max_diff, 1e-5) + with torch.no_grad(): + pt_outputs = pt_model(**pt_inputs_dict) + tf_outputs = tf_model(tf_inputs_dict) - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + # tf models returned loss is usually a tensor rather than a scalar. + # (see `hf_compute_loss`: it uses `tf.keras.losses.Reduction.NONE`) + # Change it here to a scalar to match PyTorch models' loss + tf_loss = getattr(tf_outputs, "loss", None) + if tf_loss is not None: + tf_outputs.loss = tf.math.reduce_mean(tf_loss) + + self.check_pt_tf_outputs(tf_outputs, pt_outputs, type(pt_model)) + + @is_pt_tf_cross_test + def test_pt_tf_model_equivalence(self): + import transformers for model_class in self.all_model_classes: - tf_model_class_name = "TF" + model_class.__name__ # Add the "TF" at the beginning + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + tf_model_class_name = "TF" + model_class.__name__ # Add the "TF" at the beginning if not hasattr(transformers, tf_model_class_name): - # transformers does not have TF version yet + # transformers does not have this model in TF version yet return # Output all for aggressive testing config.output_hidden_states = True config.output_attentions = self.has_attentions - for k in ["attention_mask", "encoder_attention_mask", "decoder_attention_mask"]: - if k in inputs_dict: - attention_mask = inputs_dict[k] - # make sure no all 0s attention masks - to avoid failure at this moment. - # TODO: remove this line once the TODO below is implemented. - attention_mask = torch.ones_like(attention_mask, dtype=torch.int32) - # Here we make the first sequence with all 0s as attention mask. 
- # Currently, this will fail for `TFWav2Vec2Model`. This is caused by the different large negative - # values, like `1e-4`, `1e-9`, `1e-30` and `-inf` for attention mask across models/frameworks. - # TODO: enable this block once the large negative values thing is cleaned up. - # (see https://github.com/huggingface/transformers/issues/14859) - # attention_mask = torch.cat( - # [ - # torch.zeros_like(attention_mask[:1], dtype=torch.int32), - # attention_mask[1:].type(dtype=torch.int32) - # ], - # dim=0 - # ) - inputs_dict[k] = attention_mask + # Make sure no sequence has all zeros as attention mask, otherwise some tests fail due to the inconsistency + # of the usage `1e-4`, `1e-9`, `1e-30`, `-inf`. + # TODO: Use a uniform value for all models, make sure all tests pass without this processing, and remove it. + self._make_attention_mask_non_null(inputs_dict) tf_model_class = getattr(transformers, tf_model_class_name) - tf_model = tf_model_class(config) pt_model = model_class(config) + tf_model = tf_model_class(config) + + pt_inputs_dict = self._prepare_for_class(inputs_dict, model_class) + pt_inputs_dict_with_labels = self._prepare_for_class( + inputs_dict, + model_class, + # Not all models accept "labels" in the forward pass (yet :) ) + return_labels=True if "labels" in inspect.signature(model_class.forward).parameters.keys() else False, + ) # make sure only tf inputs are forward that actually exist in function args tf_input_keys = set(inspect.signature(tf_model.call).parameters.keys()) @@ -1718,20 +1731,25 @@ def check_pt_tf_models(tf_model, pt_model, pt_inputs_dict, pt_inputs_dict_maybe_ tf_input_keys.discard("cross_attn_head_mask") tf_input_keys.discard("decoder_head_mask") - pt_inputs_dict = self._prepare_for_class(inputs_dict, model_class) - pt_inputs_dict_maybe_with_labels = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - pt_inputs_dict = {k: v for k, v in pt_inputs_dict.items() if k in tf_input_keys} - pt_inputs_dict_maybe_with_labels = { - k: v for k, v in pt_inputs_dict_maybe_with_labels.items() if k in tf_input_keys - } + pt_inputs_dict_with_labels = {k: v for k, v in pt_inputs_dict_with_labels.items() if k in tf_input_keys} + + # For some models (e.g. base models), there is no label returned. + # Set the input dict to `None` to avoid check outputs twice for the same input dicts. 
+ if set(pt_inputs_dict_with_labels.keys()).symmetric_difference(pt_inputs_dict.keys()): + pt_inputs_dict_with_labels = None # Check we can load pt model in tf and vice-versa with model => model functions - tf_inputs_dict = prepare_tf_inputs_from_pt_inputs(pt_inputs_dict) + # Here requires `tf_inputs_dict` to build `tf_model` + tf_inputs_dict = self.prepare_tf_inputs_from_pt_inputs(pt_inputs_dict) tf_model = transformers.load_pytorch_model_in_tf2_model(tf_model, pt_model, tf_inputs=tf_inputs_dict) pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model) - check_pt_tf_models(tf_model, pt_model, pt_inputs_dict, pt_inputs_dict_maybe_with_labels) + # Original test: check without `labels` + self.check_pt_tf_models(tf_model, pt_model, pt_inputs_dict) + # check with `labels` + if pt_inputs_dict_with_labels: + self.check_pt_tf_models(tf_model, pt_model, pt_inputs_dict_with_labels) # Check we can load pt model in tf and vice-versa with checkpoint => model functions with tempfile.TemporaryDirectory() as tmpdirname: @@ -1742,9 +1760,12 @@ def check_pt_tf_models(tf_model, pt_model, pt_inputs_dict, pt_inputs_dict_maybe_ tf_checkpoint_path = os.path.join(tmpdirname, "tf_model.h5") tf_model.save_weights(tf_checkpoint_path) pt_model = transformers.load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path) - pt_model = pt_model.to(torch_device) - check_pt_tf_models(tf_model, pt_model, pt_inputs_dict, pt_inputs_dict_maybe_with_labels) + # Original test: check without `labels` + self.check_pt_tf_models(tf_model, pt_model, pt_inputs_dict) + # check with `labels` + if pt_inputs_dict_with_labels: + self.check_pt_tf_models(tf_model, pt_model, pt_inputs_dict_with_labels) def assert_almost_equals(self, a: np.ndarray, b: np.ndarray, tol: float): diff = np.abs((a - b)).max() diff --git a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py index 195c7daa84dd..6fb0f845c6dd 100644 --- a/tests/test_modeling_tf_common.py +++ b/tests/test_modeling_tf_common.py @@ -565,8 +565,7 @@ def test_pt_tf_model_equivalence(self): # Output all for aggressive testing config.output_hidden_states = True - if self.has_attentions: - config.output_attentions = True + config.output_attentions = self.has_attentions # Make sure no sequence has all zeros as attention mask, otherwise some tests fail due to the inconsistency # of the usage `1e-4`, `1e-9`, `1e-30`, `-inf`. 
diff --git a/tests/vit_mae/test_modeling_vit_mae.py b/tests/vit_mae/test_modeling_vit_mae.py index 8cbde5b2ce92..fae72a8ad7be 100644 --- a/tests/vit_mae/test_modeling_vit_mae.py +++ b/tests/vit_mae/test_modeling_vit_mae.py @@ -17,14 +17,13 @@ import inspect import math -import os import tempfile import unittest import numpy as np from transformers import ViTMAEConfig -from transformers.testing_utils import is_pt_tf_cross_test, require_torch, require_vision, slow, torch_device +from transformers.testing_utils import require_torch, require_vision, slow, torch_device from transformers.utils import cached_property, is_torch_available, is_vision_available from ..test_configuration_common import ConfigTester @@ -321,150 +320,20 @@ def check_hidden_states_output(inputs_dict, config, model_class): # overwrite from common since ViTMAEForPretraining has random masking, we need to fix the noise # to generate masks during test - @is_pt_tf_cross_test - def test_pt_tf_model_equivalence(self): - import numpy as np - import tensorflow as tf - - import transformers + def check_pt_tf_models(self, tf_model, pt_model, pt_inputs_dict): # make masks reproducible np.random.seed(2) - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - num_patches = int((config.image_size // config.patch_size) ** 2) + num_patches = int((pt_model.config.image_size // pt_model.config.patch_size) ** 2) noise = np.random.uniform(size=(self.model_tester.batch_size, num_patches)) - pt_noise = torch.from_numpy(noise).to(device=torch_device) - tf_noise = tf.constant(noise) - - def prepare_tf_inputs_from_pt_inputs(pt_inputs_dict): - - tf_inputs_dict = {} - for key, tensor in pt_inputs_dict.items(): - tf_inputs_dict[key] = tf.convert_to_tensor(tensor.cpu().numpy(), dtype=tf.float32) - - return tf_inputs_dict - - def check_outputs(tf_outputs, pt_outputs, model_class, names): - """ - Args: - model_class: The class of the model that is currently testing. For example, `TFBertModel`, - TFBertForMaskedLM`, `TFBertForSequenceClassification`, etc. Currently unused, but it could make - debugging easier and faster. - - names: A string, or a tuple of strings. These specify what tf_outputs/pt_outputs represent in the model outputs. - Currently unused, but in the future, we could use this information to make the error message clearer - by giving the name(s) of the output tensor(s) with large difference(s) between PT and TF. - """ - - # Allow `list` because `(TF)TransfoXLModelOutput.mems` is a list of tensors. - if type(tf_outputs) in [tuple, list]: - self.assertEqual(type(tf_outputs), type(pt_outputs)) - self.assertEqual(len(tf_outputs), len(pt_outputs)) - if type(names) == tuple: - for tf_output, pt_output, name in zip(tf_outputs, pt_outputs, names): - check_outputs(tf_output, pt_output, model_class, names=name) - elif type(names) == str: - for idx, (tf_output, pt_output) in enumerate(zip(tf_outputs, pt_outputs)): - check_outputs(tf_output, pt_output, model_class, names=f"{names}_{idx}") - else: - raise ValueError(f"`names` should be a `tuple` or a string. 
Got {type(names)} instead.") - elif isinstance(tf_outputs, tf.Tensor): - self.assertTrue(isinstance(pt_outputs, torch.Tensor)) - - tf_outputs = tf_outputs.numpy() - if isinstance(tf_outputs, np.float32): - tf_outputs = np.array(tf_outputs, dtype=np.float32) - pt_outputs = pt_outputs.detach().to("cpu").numpy() - - tf_nans = np.isnan(tf_outputs) - pt_nans = np.isnan(pt_outputs) - - pt_outputs[tf_nans] = 0 - tf_outputs[tf_nans] = 0 - pt_outputs[pt_nans] = 0 - tf_outputs[pt_nans] = 0 - - max_diff = np.amax(np.abs(tf_outputs - pt_outputs)) - self.assertLessEqual(max_diff, 1e-5) - else: - raise ValueError( - f"`tf_outputs` should be a `tuple` or an instance of `tf.Tensor`. Got {type(tf_outputs)} instead." - ) - - def check_pt_tf_models(tf_model, pt_model, pt_inputs_dict): - # we are not preparing a model with labels because of the formation - # of the ViT MAE model - - # send pytorch model to the correct device - pt_model.to(torch_device) - - # Check predictions on first output (logits/hidden-states) are close enough given low-level computational differences - pt_model.eval() - - tf_inputs_dict = prepare_tf_inputs_from_pt_inputs(pt_inputs_dict) - - # send pytorch inputs to the correct device - pt_inputs_dict = { - k: v.to(device=torch_device) if isinstance(v, torch.Tensor) else v for k, v in pt_inputs_dict.items() - } - - # Original test: check without `labels` - with torch.no_grad(): - pt_outputs = pt_model(**pt_inputs_dict, noise=pt_noise) - tf_outputs = tf_model(tf_inputs_dict, noise=tf_noise) - - tf_keys = tuple([k for k, v in tf_outputs.items() if v is not None]) - pt_keys = tuple([k for k, v in pt_outputs.items() if v is not None]) - - self.assertEqual(tf_keys, pt_keys) - check_outputs(tf_outputs.to_tuple(), pt_outputs.to_tuple(), model_class, names=tf_keys) - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - tf_model_class_name = "TF" + model_class.__name__ # Add the "TF" at the beginning - - # Output all for aggressive testing - config.output_hidden_states = True - config.output_attentions = self.has_attentions - - tf_model_class = getattr(transformers, tf_model_class_name) - - tf_model = tf_model_class(config) - pt_model = model_class(config) - - # make sure only tf inputs are forward that actually exist in function args - tf_input_keys = set(inspect.signature(tf_model.call).parameters.keys()) - - # remove all head masks - tf_input_keys.discard("head_mask") - tf_input_keys.discard("cross_attn_head_mask") - tf_input_keys.discard("decoder_head_mask") - - pt_inputs_dict = self._prepare_for_class(inputs_dict, model_class) - - pt_inputs_dict = {k: v for k, v in pt_inputs_dict.items() if k in tf_input_keys} - - # Check we can load pt model in tf and vice-versa with model => model functions - tf_inputs_dict = prepare_tf_inputs_from_pt_inputs(pt_inputs_dict) - tf_model = transformers.load_pytorch_model_in_tf2_model(tf_model, pt_model, tf_inputs=tf_inputs_dict) - pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model) - - check_pt_tf_models(tf_model, pt_model, pt_inputs_dict) - - # Check we can load pt model in tf and vice-versa with checkpoint => model functions - with tempfile.TemporaryDirectory() as tmpdirname: - pt_checkpoint_path = os.path.join(tmpdirname, "pt_model.bin") - torch.save(pt_model.state_dict(), pt_checkpoint_path) - tf_model = transformers.load_pytorch_checkpoint_in_tf2_model(tf_model, pt_checkpoint_path) + pt_noise = torch.from_numpy(noise) - tf_checkpoint_path = 
os.path.join(tmpdirname, "tf_model.h5") - tf_model.save_weights(tf_checkpoint_path) - pt_model = transformers.load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path) - pt_model = pt_model.to(torch_device) + # Add `noise` argument. + # PT inputs will be prepared in `super().check_pt_tf_models()` with this added `noise` argument + pt_inputs_dict["noise"] = pt_noise - check_pt_tf_models(tf_model, pt_model, pt_inputs_dict) + super().check_pt_tf_models(tf_model, pt_model, pt_inputs_dict) def test_save_load(self):
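For reference, the leaf-level numeric check that the new `check_pt_tf_outputs` applies to each TF/PT tensor pair boils down to a NaN-masked maximum absolute difference compared against `tol`. A minimal standalone sketch of that idea follows (illustration only, not part of this patch; the helper name `max_abs_diff` is made up here and simply mirrors the logic added in tests/test_modeling_common.py above):

import numpy as np
import tensorflow as tf
import torch

def max_abs_diff(tf_tensor, pt_tensor):
    # Convert both tensors to NumPy, zero out positions that are NaN in
    # either framework, and return the largest absolute difference.
    tf_np = tf_tensor.numpy()
    pt_np = pt_tensor.detach().cpu().numpy()
    nans = np.isnan(tf_np) | np.isnan(pt_np)
    tf_np = np.where(nans, 0.0, tf_np)
    pt_np = np.where(nans, 0.0, pt_np)
    return float(np.amax(np.abs(tf_np - pt_np)))

x = np.random.rand(2, 3).astype(np.float32)
assert max_abs_diff(tf.constant(x), torch.from_numpy(x)) <= 1e-5  # same data -> diff ~ 0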