From 1b1f69bc644fe0954567d2ca472b3d08e7a7c775 Mon Sep 17 00:00:00 2001 From: Shaobo Hou Date: Wed, 17 Jul 2024 02:32:12 -0700 Subject: [PATCH] Remove usage of enable_xla. PiperOrigin-RevId: 653152832 --- tf2jax/_src/roundtrip_test.py | 497 +++++++++++++++++----------------- tf2jax/_src/sharding_test.py | 19 +- 2 files changed, 247 insertions(+), 269 deletions(-) diff --git a/tf2jax/_src/roundtrip_test.py b/tf2jax/_src/roundtrip_test.py index 5302710..74e0513 100644 --- a/tf2jax/_src/roundtrip_test.py +++ b/tf2jax/_src/roundtrip_test.py @@ -19,13 +19,11 @@ from absl import flags from absl.testing import absltest from absl.testing import parameterized - import chex import haiku as hk import jax from jax.experimental import jax2tf import jax.numpy as jnp - import numpy as np import tensorflow as tf from tf2jax._src import config @@ -54,6 +52,7 @@ def _bool_env(varname: str, value: bool) -> bool: def _compute_gradients(func, *inputs): def fn(*args): return jax.tree_util.tree_leaves(func(*args))[0] + return jax.grad(lambda *args: jnp.sum(fn(*args)))(*inputs) @@ -86,19 +85,18 @@ def _test_convert( inputs, *, with_grad, - enable_xla, with_custom_grad=False, grad_tols=None, ): if uses_native_serialization(): - if not enable_xla: - self.skipTest("native_serialization does not support enable_xla=False.") if with_grad and not with_custom_grad: self.skipTest( "native_serialization does not support differentiation without " - "custom gradient.") + "custom gradient." + ) grad_tols = grad_tols or {} + def assert_grad_all_close(*args): return self.assertAllClose(*args, **grad_tols) @@ -109,9 +107,7 @@ def assert_grad_all_close(*args): jax_grads = _compute_gradients(jax_func, *inputs) # Jax -> TF - tf_func = _jax2tf_convert( - jax_func, with_gradient=with_grad, enable_xla=enable_xla - ) + tf_func = _jax2tf_convert(jax_func, with_gradient=with_grad) tf_func = tf.function(tf_func, jit_compile=True, autograph=False) tf_outputs = tf_func(*inputs) jax.tree.map(self.assertAllClose, jax_outputs, tf_outputs) @@ -119,7 +115,8 @@ def assert_grad_all_close(*args): # Jax -> TF -> Jax with config.override_config("convert_custom_gradient", with_custom_grad): rejax_func = tf2jax.convert_functional( - tf_func, *tree.map_structure(np.zeros_like, inputs)) + tf_func, *tree.map_structure(np.zeros_like, inputs) + ) rejax_func = self.variant(rejax_func) rejax_outputs = rejax_func(*inputs) jax.tree.map(self.assertAllClose, rejax_outputs, tf_outputs) @@ -140,7 +137,8 @@ def assert_grad_all_close(*args): # Jax -> TF -> SavedModel -> TF -> Jax with config.override_config("convert_custom_gradient", with_custom_grad): rejax_too_func = tf2jax.convert_functional( - restored.f, *tree.map_structure(np.zeros_like, inputs)) + restored.f, *tree.map_structure(np.zeros_like, inputs) + ) rejax_too_func = self.variant(rejax_too_func) rejax_too_outputs = rejax_too_func(*inputs) jax.tree.map(self.assertAllClose, rejax_too_outputs, tf_outputs) @@ -152,11 +150,11 @@ def assert_grad_all_close(*args): @parameterized.named_parameters( chex.params_product( (("without_gradient", False), ("with_gradient", True)), - (("disable_xla", False), ("enable_xla", True)), (("without_custom_gradient", False), ("with_custom_gradient", True)), named=True, - )) - def test_simple(self, with_grad, enable_xla, with_custom_grad): + ) + ) + def test_simple(self, with_grad, with_custom_grad): np.random.seed(42) def forward(x): @@ -164,20 +162,21 @@ def forward(x): inputs = np.random.normal((3, 2)).astype(np.float32) self._test_convert( - forward, [inputs], + forward, + [inputs], with_grad=with_grad, - enable_xla=enable_xla, - with_custom_grad=with_custom_grad) + with_custom_grad=with_custom_grad, + ) @chex.variants(with_jit=True) @parameterized.named_parameters( chex.params_product( (("without_gradient", False), ("with_gradient", True)), - (("disable_xla", False), ("enable_xla", True)), (("without_custom_gradient", False), ("with_custom_gradient", True)), named=True, - )) - def test_mlp(self, with_grad, enable_xla, with_custom_grad): + ) + ) + def test_mlp(self, with_grad, with_custom_grad): np.random.seed(42) def forward(x): @@ -190,20 +189,21 @@ def forward(x): variables = hk.data_structures.to_mutable_dict(variables) jax_fn = hk.without_apply_rng(forward).apply self._test_convert( - jax_fn, [variables, inputs], + jax_fn, + [variables, inputs], with_grad=with_grad, - enable_xla=enable_xla, - with_custom_grad=with_custom_grad) + with_custom_grad=with_custom_grad, + ) @chex.variants(with_jit=True) @parameterized.named_parameters( chex.params_product( (("without_gradient", False), ("with_gradient", True)), - (("disable_xla", False), ("enable_xla", True)), (("without_custom_gradient", False), ("with_custom_gradient", True)), named=True, - )) - def test_batch_norm(self, with_grad, enable_xla, with_custom_grad): + ) + ) + def test_batch_norm(self, with_grad, with_custom_grad): np.random.seed(42) def forward(x): @@ -213,40 +213,46 @@ def forward(x): inputs = np.random.normal(size=(8, 17)) forward = hk.transform_with_state(forward) variables, states = forward.init( - jax.random.PRNGKey(42), jnp.zeros_like(inputs)) + jax.random.PRNGKey(42), jnp.zeros_like(inputs) + ) variables = hk.data_structures.to_mutable_dict(variables) states = hk.data_structures.to_mutable_dict(states) + def jax_fn(params, states, x): outputs, states = hk.without_apply_rng(forward).apply(params, states, x) return outputs, hk.data_structures.to_mutable_dict(states) # Perturb variables and states. variables = tree.map_structure( - lambda x: x + np.random.uniform(size=x.shape), variables) + lambda x: x + np.random.uniform(size=x.shape), variables + ) states = tree.map_structure( - lambda x: x + np.random.normal(size=x.shape), states) + lambda x: x + np.random.normal(size=x.shape), states + ) self._test_convert( - jax_fn, [variables, states, inputs], + jax_fn, + [variables, states, inputs], with_grad=with_grad, - enable_xla=enable_xla, - with_custom_grad=with_custom_grad) + with_custom_grad=with_custom_grad, + ) # Conv2D uses jax.lax.conv_general_dilated which is translated to XlaConv. @chex.variants(with_jit=True) @parameterized.named_parameters( chex.params_product( (("without_gradient", False), ("with_gradient", True)), - (("disable_xla", False), ("enable_xla", True)), named=True, - )) - def test_conv2d(self, with_grad, enable_xla): + ) + ) + def test_conv2d(self, with_grad): np.random.seed(42) tols = dict(rtol=1e-5) if jax.default_backend().lower() == "gpu" else {} def forward(x): conv = hk.Conv2D( - output_channels=7, kernel_shape=3, stride=1, padding="SAME") + output_channels=7, kernel_shape=3, stride=1, padding="SAME" + ) return conv(x) inputs = np.random.normal(size=(8, 28, 28, 3)) @@ -255,29 +261,28 @@ def forward(x): variables = hk.data_structures.to_mutable_dict(variables) jax_fn = hk.without_apply_rng(forward).apply self._test_convert( - jax_fn, [variables, inputs], - with_grad=with_grad, - enable_xla=enable_xla, - grad_tols=tols) + jax_fn, [variables, inputs], with_grad=with_grad, grad_tols=tols + ) @chex.variants(with_jit=True) @parameterized.named_parameters( chex.params_product( (("without_gradient", False), ("with_gradient", True)), - (("enable_xla", True),), ( ("default_group_counts", dict()), ("feature_group_count", dict(feature_group_count=3)), ("batch_group_count", dict(batch_group_count=2)), ), named=True, - )) - def test_xla_conv(self, with_grad, enable_xla, group_counts): + ) + ) + def test_xla_conv(self, with_grad, group_counts): np.random.seed(42) kernels = np.random.normal(size=(3, 3, 3, 12)) dimension_numbers = jax.lax.ConvDimensionNumbers( - lhs_spec=(0, 3, 1, 2), rhs_spec=(3, 2, 0, 1), out_spec=(0, 3, 1, 2)) + lhs_spec=(0, 3, 1, 2), rhs_spec=(3, 2, 0, 1), out_spec=(0, 3, 1, 2) + ) def forward(x): return jax.lax.conv_general_dilated( @@ -288,7 +293,8 @@ def forward(x): lhs_dilation=(1, 1), rhs_dilation=(1, 1), dimension_numbers=dimension_numbers, - **group_counts) + **group_counts, + ) feature_dim = 3 * group_counts.get("feature_group_count", 1) inputs = np.random.normal(size=(8, 28, 28, feature_dim)) @@ -296,17 +302,16 @@ def forward(x): variables = forward.init(jax.random.PRNGKey(42), jnp.zeros_like(inputs)) variables = hk.data_structures.to_mutable_dict(variables) jax_fn = hk.without_apply_rng(forward).apply - self._test_convert( - jax_fn, [variables, inputs], with_grad=with_grad, enable_xla=enable_xla) + self._test_convert(jax_fn, [variables, inputs], with_grad=with_grad) @chex.variants(with_jit=True) @parameterized.named_parameters( chex.params_product( (("without_gradient", False), ("with_gradient", True)), - (("enable_xla", True),), named=True, - )) - def test_dot(self, with_grad, enable_xla): + ) + ) + def test_dot(self, with_grad): def forward(lhs, rhs): return jax.lax.dot(lhs, rhs) @@ -314,17 +319,16 @@ def forward(lhs, rhs): np.linspace(0, 1, 10 * 5).reshape(10, 5), np.linspace(-1, 0, 5 * 3).reshape(5, 3), ) - self._test_convert( - forward, inputs, with_grad=with_grad, enable_xla=enable_xla) + self._test_convert(forward, inputs, with_grad=with_grad) @chex.variants(with_jit=True) @parameterized.named_parameters( chex.params_product( (("without_gradient", False), ("with_gradient", True)), - (("enable_xla", True),), named=True, - )) - def test_dot_general(self, with_grad, enable_xla): + ) + ) + def test_dot_general(self, with_grad): dimension_numbers = (((2,), (1,)), ((0,), (0,))) def forward(lhs, rhs): @@ -334,32 +338,30 @@ def forward(lhs, rhs): np.linspace(0, 1, 2 * 10 * 5).reshape((2, 10, 5)), np.linspace(-1, 0, 2 * 5 * 3).reshape((2, 5, 3)), ) - self._test_convert( - forward, inputs, with_grad=with_grad, enable_xla=enable_xla) + self._test_convert(forward, inputs, with_grad=with_grad) @chex.variants(with_jit=True) @parameterized.named_parameters( chex.params_product( (("without_gradient", False), ("with_gradient", True)), - (("disable_xla", False), ("enable_xla", True)), named=True, - )) - def test_dynamic_slice(self, with_grad, enable_xla): + ) + ) + def test_dynamic_slice(self, with_grad): def forward(x): return jax.lax.dynamic_slice(x, (1, 1), (2, 3)) inputs = np.linspace(0, 1, 12).reshape(3, 4) - self._test_convert( - forward, [inputs], with_grad=with_grad, enable_xla=enable_xla) + self._test_convert(forward, [inputs], with_grad=with_grad) @chex.variants(with_jit=True) @parameterized.named_parameters( chex.params_product( (("without_gradient", False), ("with_gradient", True)), - (("disable_xla", False), ("enable_xla", True)), named=True, - )) - def test_dynamic_update_slice(self, with_grad, enable_xla): + ) + ) + def test_dynamic_update_slice(self, with_grad): def forward(x, y): return jax.lax.dynamic_update_slice(x, y, (1, 2)) @@ -367,17 +369,16 @@ def forward(x, y): np.linspace(0, 1, 12).reshape(3, 4), 1.0 - np.linspace(0, 1, 12).reshape(3, 4), ] - self._test_convert( - forward, inputs, with_grad=with_grad, enable_xla=enable_xla) + self._test_convert(forward, inputs, with_grad=with_grad) @chex.variants(with_jit=True) @parameterized.named_parameters( chex.params_product( (("without_gradient", False), ("with_gradient", True)), - (("enable_xla", True),), named=True, - )) - def test_gather(self, with_grad, enable_xla): + ) + ) + def test_gather(self, with_grad): dimension_numbers = jax.lax.GatherDimensionNumbers((1,), (0,), (0, 1)) slice_sizes = (1, 3) @@ -389,17 +390,16 @@ def forward(operand, indices): np.linspace(0, 1, 10 * 5).reshape(10, 5), np.array([[4, 2], [3, 2]]), ) - self._test_convert( - forward, inputs, with_grad=with_grad, enable_xla=enable_xla) + self._test_convert(forward, inputs, with_grad=with_grad) @chex.variants(with_jit=True) @parameterized.named_parameters( chex.params_product( (("without_gradient", False), ("with_gradient", True)), - (("enable_xla", True),), named=True, - )) - def test_pad(self, with_grad, enable_xla): + ) + ) + def test_pad(self, with_grad): padding_config = [(1, 2, 1), (0, 1, 0)] def forward(operand, padding_value): @@ -409,39 +409,37 @@ def forward(operand, padding_value): np.linspace(0, 1, 2 * 3).reshape(2, 3), np.array(0.42), ) - self._test_convert( - forward, inputs, with_grad=with_grad, enable_xla=enable_xla) + self._test_convert(forward, inputs, with_grad=with_grad) @chex.variants(with_jit=True) @parameterized.named_parameters( chex.params_product( (("without_gradient", False), ("with_gradient", True)), - (("disable_xla", False), ("enable_xla", True)), ( ("min", jax.lax.min, jnp.inf), ("max", jax.lax.max, -jnp.inf), ("add", jax.lax.add, 0.0), ), named=True, - )) - def test_reduce(self, with_grad, enable_xla, reduce_fn, init_value): + ) + ) + def test_reduce(self, with_grad, reduce_fn, init_value): def forward(x): dimensions = [1, 2] return jax.lax.reduce(x, init_value, reduce_fn, dimensions) inputs = np.linspace(0, 1, 2 * 5 * 5 * 3).reshape((2, 5, 5, 3)) - self._test_convert( - forward, [inputs], with_grad=with_grad, enable_xla=enable_xla) + self._test_convert(forward, [inputs], with_grad=with_grad) @chex.variants(with_jit=True) @parameterized.named_parameters( chex.params_product( (("without_gradient", False), ("with_gradient", True)), - (("enable_xla", True),), named=True, - )) - def test_reduce_variadic(self, with_grad, enable_xla): + ) + ) + def test_reduce_variadic(self, with_grad): def forward(args): return jax.lax.reduce(args, (0.0, 1.0), lambda xs, ys: xs, [1, 2]) @@ -450,39 +448,37 @@ def forward(args): np.linspace(0, 1, 2 * 5 * 5 * 3).reshape((2, 5, 5, 3)), np.linspace(2, 3, 2 * 5 * 5 * 3).reshape((2, 5, 5, 3)), ) - self._test_convert( - forward, [inputs], with_grad=with_grad, enable_xla=enable_xla) + self._test_convert(forward, [inputs], with_grad=with_grad) # jax.lax.reduce_window is translated to XlaReduceWindow. @chex.variants(with_jit=True) @parameterized.named_parameters( chex.params_product( (("without_gradient", False), ("with_gradient", True)), - (("enable_xla", True),), ( ("min", jax.lax.min, jnp.inf), ("max", jax.lax.max, -jnp.inf), ("add", jax.lax.add, 0.0), ), named=True, - )) - def test_reduce_window(self, with_grad, enable_xla, reduce_fn, init_value): + ) + ) + def test_reduce_window(self, with_grad, reduce_fn, init_value): np.random.seed(42) def forward(x): window_shape = [1, 2, 2, 1] - return jax.lax.reduce_window(x, init_value, reduce_fn, window_shape, - window_shape, "SAME") + return jax.lax.reduce_window( + x, init_value, reduce_fn, window_shape, window_shape, "SAME" + ) inputs = np.random.normal(size=(8, 28, 28, 3)) - self._test_convert( - forward, [inputs], with_grad=with_grad, enable_xla=enable_xla) + self._test_convert(forward, [inputs], with_grad=with_grad) @chex.variants(with_jit=True) @parameterized.named_parameters( chex.params_product( (("without_gradient", False), ("with_gradient", True)), - (("enable_xla", True),), ( ("min", jax.lax.cummin), ("max", jax.lax.cummax), @@ -502,11 +498,16 @@ def forward(x): ("no_heuristic", False), ), named=True, - )) - def test_cumulative_reduction(self, with_grad, enable_xla, reducer, axis, - reverse, use_heuristic): - if (with_grad and not use_heuristic and - jax.default_backend().lower() == "tpu"): + ) + ) + def test_cumulative_reduction( + self, with_grad, reducer, axis, reverse, use_heuristic + ): + if ( + with_grad + and not use_heuristic + and jax.default_backend().lower() == "tpu" + ): self.skipTest("Gradient of reduce-window not always supported on TPU") np.random.seed(42) @@ -516,28 +517,28 @@ def forward(x): inputs = np.random.normal(size=(4, 3)) - with config.override_config("infer_cumulative_reduction_from_jax2tf", - use_heuristic): + with config.override_config( + "infer_cumulative_reduction_from_jax2tf", use_heuristic + ): roundtrip_forward = tf2jax.convert_functional( - tf.function(_jax2tf_convert(forward), autograph=False), inputs) + tf.function(_jax2tf_convert(forward), autograph=False), inputs + ) roundtrip_jaxpr = jax.make_jaxpr(roundtrip_forward)(inputs) - if (use_heuristic and - not uses_native_serialization()): + if use_heuristic and not uses_native_serialization(): self.assertNotIn("reduce_window", roundtrip_jaxpr.pretty_print()) - if (with_grad and enable_xla and reducer is jax.lax.cumprod and - not use_heuristic): - self.skipTest("No differentiation rule for `reduce_window` with " - "`jax.lax.cumprod`.") + if with_grad and reducer is jax.lax.cumprod and not use_heuristic: + self.skipTest( + "No differentiation rule for `reduce_window` with " + "`jax.lax.cumprod`." + ) - self._test_convert( - forward, [inputs], with_grad=with_grad, enable_xla=enable_xla) + self._test_convert(forward, [inputs], with_grad=with_grad) @chex.variants(with_jit=True) @parameterized.named_parameters( chex.params_product( (("without_gradient", False),), - (("enable_xla", True),), (("uint32", np.uint32),), ( ("default", xla_data_pb2.RandomAlgorithm.RNG_DEFAULT), @@ -545,19 +546,20 @@ def forward(x): ("philox", xla_data_pb2.RandomAlgorithm.RNG_PHILOX), ), named=True, - )) - def test_rng_bit_generator(self, with_grad, enable_xla, dtype, algorithm): + ) + ) + def test_rng_bit_generator(self, with_grad, dtype, algorithm): def forward(key): return jax.lax.rng_bit_generator( - key, shape=(10, 5), dtype=dtype, algorithm=algorithm) + key, shape=(10, 5), dtype=dtype, algorithm=algorithm + ) if dtype == np.uint32: key = np.array([6, 7, 8, 9], dtype=np.uint32) else: raise ValueError(f"Unsupported dtype={dtype}") - self._test_convert( - forward, [key], with_grad=with_grad, enable_xla=enable_xla) + self._test_convert(forward, [key], with_grad=with_grad) @chex.variants(with_jit=True) @parameterized.named_parameters( @@ -570,14 +572,16 @@ def forward(key): ("scatter_max", jax.lax.scatter_max), ), (("without_gradient", False), ("with_gradient", True)), - (("enable_xla", True),), (("unique_indices", True), ("non_unique_indices", False)), named=True, - )) - def test_scatter(self, scatter_fn, with_grad, enable_xla, unique_indices): + ) + ) + def test_scatter(self, scatter_fn, with_grad, unique_indices): if scatter_fn is jax.lax.scatter_mul and with_grad and not unique_indices: - self.skipTest("Gradient is disallowed for jax.lax.scatter_mul if " - "unique_indices=False") + self.skipTest( + "Gradient is disallowed for jax.lax.scatter_mul if " + "unique_indices=False" + ) dimension_numbers = jax.lax.ScatterDimensionNumbers((1,), (0,), (0,)) @@ -587,44 +591,43 @@ def forward(operand, indices, updates): indices, updates, dimension_numbers, - unique_indices=unique_indices) + unique_indices=unique_indices, + ) inputs = ( - np.linspace(0, 1, 10*5).reshape(10, 5), + np.linspace(0, 1, 10 * 5).reshape(10, 5), np.array([[1], [8], [4]]), np.linspace(0, 9, 9).reshape(3, 3), ) - self._test_convert( - forward, inputs, with_grad=with_grad, enable_xla=enable_xla) + self._test_convert(forward, inputs, with_grad=with_grad) # Derivative of jax.lax.reduce_window uses XlaSelectAndScatter. @chex.variants(with_jit=True) @parameterized.named_parameters( chex.params_product( (("without_gradient", False),), - (("enable_xla", True),), ( ("min", jax.lax.min, jnp.inf), ("max", jax.lax.max, -jnp.inf), ("add", jax.lax.add, 0.0), ), named=True, - )) - def test_select_and_scatter(self, with_grad, enable_xla, reduce_fn, - init_value): + ) + ) + def test_select_and_scatter(self, with_grad, reduce_fn, init_value): np.random.seed(42) def forward(x): window_shape = [1, 2, 2, 1] - return jax.lax.reduce_window(x, init_value, reduce_fn, window_shape, - window_shape, "SAME") + return jax.lax.reduce_window( + x, init_value, reduce_fn, window_shape, window_shape, "SAME" + ) inputs = np.random.normal(size=(8, 5, 5, 3)) jax_fn = jax.jacrev(forward) try: - self._test_convert( - jax_fn, [inputs], with_grad=with_grad, enable_xla=enable_xla) + self._test_convert(jax_fn, [inputs], with_grad=with_grad) except tf.errors.InvalidArgumentError as e: if jax.default_backend().lower() == "tpu": # Can fail on older TPUs. @@ -636,39 +639,34 @@ def forward(x): @parameterized.named_parameters( chex.params_product( (("without_gradient", False), ("with_gradient", True)), - (("enable_xla", True),), (("2nd_last_dim", -2), ("last_dim", -1)), (("not_stable", False), ("is_stable", True)), (("one_keys", 1), ("two_keys", 2), ("three_keys", 3)), named=True, - )) - def test_sort_variadic(self, with_grad, enable_xla, dim, is_stable, num_keys): + ) + ) + def test_sort_variadic(self, with_grad, dim, is_stable, num_keys): def forward(args): return jax.lax.sort( - args, dimension=dim, is_stable=is_stable, num_keys=num_keys) + args, dimension=dim, is_stable=is_stable, num_keys=num_keys + ) inputs = ( - np.array([[6., 2.], [4., 2.], [4., 1.]], np.float32), - np.array([[1., 2.], [3., 4.], [5., 6.]], np.float32), - np.array([[6., 5.], [4., 3.], [2., 1.]], np.float32), + np.array([[6.0, 2.0], [4.0, 2.0], [4.0, 1.0]], np.float32), + np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], np.float32), + np.array([[6.0, 5.0], [4.0, 3.0], [2.0, 1.0]], np.float32), ) - self._test_convert( - forward, [inputs], with_grad=with_grad, enable_xla=enable_xla) + self._test_convert(forward, [inputs], with_grad=with_grad) @chex.variants(with_jit=True) @parameterized.named_parameters( chex.params_product( (("without_gradient", False), ("with_gradient", True)), - (("disable_xla", False), ("enable_xla", True)), named=True, ) ) - def test_polymorphic_shape(self, with_grad, enable_xla): - if uses_native_serialization(): - if not enable_xla: - self.skipTest("native_serialization does not support enable_xla=False.") - + def test_polymorphic_shape(self, with_grad): inputs = np.array(range(36), dtype=np.float32).reshape(9, 4) # TF @@ -681,6 +679,7 @@ def forward(x): outputs = tf.concat([outputs] * 2, axis=0) # Stack along unknown dim outputs = tf.concat([outputs] * 2, axis=1) # Stack along knonwn dim return outputs / tf.cast(shape[0], tf.float32) # Divide by unknown dim + tf_outputs = forward(inputs) # TF -> JAX @@ -691,15 +690,15 @@ def forward(x): # TF -> JAX -> TF new_tf_forward = _jax2tf_convert( - jax_func, - polymorphic_shapes=["(b, _)"], - with_gradient=with_grad, - enable_xla=enable_xla) + jax_func, polymorphic_shapes=["(b, _)"], with_gradient=with_grad + ) new_tf_forward = tf.function(new_tf_forward, autograph=False) concrete_new_tf_forward = new_tf_forward.get_concrete_function( - tf.TensorSpec(shape=(None, 4))) - self.assertEqual(concrete_new_tf_forward.structured_outputs.shape.as_list(), - [None, 8]) + tf.TensorSpec(shape=(None, 4)) + ) + self.assertEqual( + concrete_new_tf_forward.structured_outputs.shape.as_list(), [None, 8] + ) new_tf_outputs = concrete_new_tf_forward(inputs) self.assertAllClose(new_tf_outputs, jax_outputs) @@ -707,15 +706,10 @@ def forward(x): @parameterized.named_parameters( chex.params_product( (("without_gradient", False), ("with_gradient", True)), - (("disable_xla", False), ("enable_xla", True)), named=True, ) ) - def test_polymorphic_shape_refinement_dot(self, with_grad, enable_xla): - if uses_native_serialization(): - if not enable_xla: - self.skipTest("native_serialization does not support enable_xla=False.") - + def test_polymorphic_shape_refinement_dot(self, with_grad): @jax.jit def forward(x, w): return jnp.dot(x, w) @@ -726,13 +720,12 @@ def forward(x, w): # JAX -> TF tf_fn = _jax2tf_convert( - forward, - polymorphic_shapes=["(b, _)", None], - with_gradient=with_grad, - enable_xla=enable_xla) + forward, polymorphic_shapes=["(b, _)", None], with_gradient=with_grad + ) tf_fn = tf.function(tf_fn, autograph=False) concrete_tf_fn = tf_fn.get_concrete_function( - tf.TensorSpec(shape=(None, 4)), tf.TensorSpec(shape=(4, 5))) + tf.TensorSpec(shape=(None, 4)), tf.TensorSpec(shape=(4, 5)) + ) tf_outputs = concrete_tf_fn(x, w) self.assertAllClose(expected_outputs, tf_outputs) @@ -745,13 +738,12 @@ def forward(x, w): # JAX -> TF -> JAX -> TF tf_fn2 = _jax2tf_convert( - jax_fn, - polymorphic_shapes=["(b, _)", None], - with_gradient=with_grad, - enable_xla=enable_xla) + jax_fn, polymorphic_shapes=["(b, _)", None], with_gradient=with_grad + ) tf_fn2 = tf.function(tf_fn2, autograph=False) concrete_tf_fn2 = tf_fn2.get_concrete_function( - tf.TensorSpec(shape=(None, 4)), tf.TensorSpec(shape=(4, 5))) + tf.TensorSpec(shape=(None, 4)), tf.TensorSpec(shape=(4, 5)) + ) tf_outputs2 = concrete_tf_fn2(x, w) self.assertAllClose(expected_outputs, tf_outputs2) @@ -766,15 +758,10 @@ def forward(x, w): @parameterized.named_parameters( chex.params_product( (("without_gradient", False), ("with_gradient", True)), - (("disable_xla", False), ("enable_xla", True)), named=True, ) ) - def test_polymorphic_shape_refinement_broadcast(self, with_grad, enable_xla): - if uses_native_serialization(): - if not enable_xla: - self.skipTest("native_serialization does not support enable_xla=False.") - + def test_polymorphic_shape_refinement_broadcast(self, with_grad): @jax.jit def forward(x, y): return (jnp.broadcast_to(x, y.shape), x + y) @@ -788,10 +775,11 @@ def forward(x, y): forward, polymorphic_shapes=["(b, _)", "(_, b, _)"], with_gradient=with_grad, - enable_xla=enable_xla) + ) tf_fn = tf.function(tf_fn, autograph=False) concrete_tf_fn = tf_fn.get_concrete_function( - tf.TensorSpec(shape=(None, 4)), tf.TensorSpec(shape=(2, None, 4))) + tf.TensorSpec(shape=(None, 4)), tf.TensorSpec(shape=(2, None, 4)) + ) tf_outputs = concrete_tf_fn(x, y) self.assertAllClose(expected_outputs, tf_outputs) @@ -807,10 +795,11 @@ def forward(x, y): jax_fn, polymorphic_shapes=["(b, _)", "(_, b, _)"], with_gradient=with_grad, - enable_xla=enable_xla) + ) tf_fn2 = tf.function(tf_fn2, autograph=False) concrete_tf_fn2 = tf_fn2.get_concrete_function( - tf.TensorSpec(shape=(None, 4)), tf.TensorSpec(shape=(2, None, 4))) + tf.TensorSpec(shape=(None, 4)), tf.TensorSpec(shape=(2, None, 4)) + ) tf_outputs2 = concrete_tf_fn2(x, y) self.assertAllClose(expected_outputs, tf_outputs2) @@ -825,37 +814,34 @@ def forward(x, y): @parameterized.named_parameters( chex.params_product( (("with_gradient", True),), - (("disable_xla", False), ("enable_xla", True)), named=True, - )) - def test_custom_gradient(self, with_grad, enable_xla): - if uses_native_serialization(): - if not enable_xla: - self.skipTest("native_serialization does not support enable_xla=False.") - + ) + ) + def test_custom_gradient(self, with_grad): inputs = np.array(range(6), dtype=np.float32).reshape(3, 2) # JAX @jax.custom_gradient def forward(x): e = jnp.exp(x) + def grad(dy): # This is deliberately the wrong gradient. return dy * (1 - 1 / (1 + e)) * jnp.sin(x) + 0.42 + return jnp.sum(jnp.log(1 + e)), grad + forward = self.variant(forward) expected_outputs = forward(inputs) expected_grads = jax.grad(forward)(inputs) # JAX -> TF - tf_forward = _jax2tf_convert( - forward, with_gradient=with_grad, enable_xla=enable_xla) + tf_forward = _jax2tf_convert(forward, with_gradient=with_grad) tf_forward = tf.function(tf_forward, autograph=False) # JAX -> TF -> JAX with config.override_config("convert_custom_gradient", True): - jax_forward = tf2jax.convert_functional(tf_forward, - tf.zeros_like(inputs)) + jax_forward = tf2jax.convert_functional(tf_forward, tf.zeros_like(inputs)) jax_forward = self.variant(jax_forward) jax_outputs = jax_forward(inputs) jax_grads = jax.grad(jax_forward)(inputs) @@ -872,8 +858,9 @@ def grad(dy): # Jax -> TF -> SavedModel -> TF -> Jax with config.override_config("convert_custom_gradient", True): - re_jax_forward = tf2jax.convert_functional(restored.f, - tf.zeros_like(inputs)) + re_jax_forward = tf2jax.convert_functional( + restored.f, tf.zeros_like(inputs) + ) re_jax_forward = self.variant(re_jax_forward) re_jax_outputs = re_jax_forward(inputs) re_jax_grads = jax.grad(re_jax_forward)(inputs) @@ -911,23 +898,21 @@ def test_custom_gradient_saved_model(self): @parameterized.named_parameters( chex.params_product( (("with_gradient", True),), - (("disable_xla", False), ("enable_xla", True)), named=True, - )) - def test_custom_gradient_nested(self, with_grad, enable_xla): - if uses_native_serialization(): - if not enable_xla: - self.skipTest("native_serialization does not support enable_xla=False.") - + ) + ) + def test_custom_gradient_nested(self, with_grad): inputs = np.array(range(6), dtype=np.float32).reshape(3, 2) # JAX @jax.custom_gradient def forward(x): e = jnp.exp(x) + def grad(dy): # This is deliberately the wrong gradient. return dy * (1 - 1 / (1 + e)) * jnp.sin(x) + 0.42 + return jnp.sum(jnp.log(1 + e)), grad forward = self.variant(forward) @@ -935,8 +920,7 @@ def grad(dy): expected_grads = jax.grad(forward)(inputs) # JAX -> TF - tf_fn = _jax2tf_convert( - forward, with_gradient=with_grad, enable_xla=enable_xla) + tf_fn = _jax2tf_convert(forward, with_gradient=with_grad) tf_fn = tf.function(tf_fn, autograph=False) # JAX -> TF -> CALL_TF -> TF. @@ -958,29 +942,26 @@ def grad(dy): @parameterized.named_parameters( chex.params_product( (("without_gradient", False), ("with_gradient", True)), - (("disable_xla", False), ("enable_xla", True)), (("without_custom_gradient", False), ("with_custom_gradient", True)), named=True, - )) - def test_relu(self, with_grad, enable_xla, with_custom_grad): + ) + ) + def test_relu(self, with_grad, with_custom_grad): inputs = np.array([-1.0, 0.0, 1.0], np.float32) self._test_convert( - jax.nn.relu, [inputs], + jax.nn.relu, + [inputs], with_grad=with_grad, - enable_xla=enable_xla, - with_custom_grad=with_custom_grad) + with_custom_grad=with_custom_grad, + ) @chex.variants(with_jit=True, without_jit=True) @parameterized.named_parameters( chex.params_product( - (("disable_xla", False), ("enable_xla", True)), named=True, - )) - def test_empty_return(self, enable_xla): - if uses_native_serialization(): - if not enable_xla: - self.skipTest("native_serialization does not support enable_xla=False.") - + ) + ) + def test_empty_return(self): np.random.seed(42) def forward(x): @@ -992,23 +973,23 @@ def forward(x): inputs = np.random.normal((3, 2)) with self.assertRaisesRegex( TypeError, - "Gradient only defined for scalar-output functions. Output was ()."): + "Gradient only defined for scalar-output functions. Output was ().", + ): jax.grad(forward)(inputs) # JAX -> TF - tf_forward = _jax2tf_convert( - forward, with_gradient=True, enable_xla=enable_xla) + tf_forward = _jax2tf_convert(forward, with_gradient=True) tf_forward = tf.function(tf_forward, autograph=False) # JAX -> TF -> JAX with config.override_config("convert_custom_gradient", True): - jax_forward = tf2jax.convert_functional(tf_forward, - tf.zeros_like(inputs)) + jax_forward = tf2jax.convert_functional(tf_forward, tf.zeros_like(inputs)) jax_forward = self.variant(jax_forward) with self.assertRaisesRegex( TypeError, - "Gradient only defined for scalar-output functions. Output was ()."): + "Gradient only defined for scalar-output functions. Output was ().", + ): jax.grad(jax_forward)(inputs) # Jax -> TF -> SavedModel -> TF @@ -1021,13 +1002,15 @@ def forward(x): # Jax -> TF -> SavedModel -> TF -> Jax with config.override_config("convert_custom_gradient", True): - re_jax_forward = tf2jax.convert_functional(restored.f, - tf.zeros_like(inputs)) + re_jax_forward = tf2jax.convert_functional( + restored.f, tf.zeros_like(inputs) + ) re_jax_forward = self.variant(re_jax_forward) with self.assertRaisesRegex( TypeError, - "Gradient only defined for scalar-output functions. Output was ()."): + "Gradient only defined for scalar-output functions. Output was ().", + ): jax.grad(re_jax_forward)(inputs) @chex.variants(with_jit=True, without_jit=True) @@ -1043,7 +1026,7 @@ def tf2jax_fn(x): tf2jax2tf_fn = _jax2tf_convert(tf2jax_fn) tf2jax2tf_fn = tf.function(tf2jax2tf_fn, autograph=False) - inputs = np.linspace(-1., 1., 6, dtype=np.float32).reshape((2, 3)) + inputs = np.linspace(-1.0, 1.0, 6, dtype=np.float32).reshape((2, 3)) self.assertAllClose(tf.sin(tf_fn(inputs)), tf2jax2tf_fn(inputs)) @chex.variants(with_jit=True, without_jit=True) @@ -1051,31 +1034,29 @@ def tf2jax_fn(x): chex.params_product( (("without_gradient", False), ("with_gradient", True)), named=True, - )) + ) + ) def test_remat(self, with_gradient): def fn(x): return jnp.sin(jnp.sin(x)) + remat_fn = jax.checkpoint(fn) - inputs = ( - np.linspace(0, 1, 10 * 5, dtype=np.float32).reshape(10, 5), - ) + inputs = (np.linspace(0, 1, 10 * 5, dtype=np.float32).reshape(10, 5),) self._test_convert( - remat_fn, - inputs, - with_grad=with_gradient, - enable_xla=True, - with_custom_grad=True) + remat_fn, inputs, with_grad=with_gradient, with_custom_grad=True + ) if uses_native_serialization(): self.skipTest("Skip remat jaxpr test with native_serialization.") # Check jaxpr. tf_fn = tf.function( - _jax2tf_convert(remat_fn, with_gradient=True, enable_xla=True), - autograph=False) - jax_fn = tf2jax.convert_functional(tf_fn, tf.TensorSpec((10, 5), - tf.float32)) + _jax2tf_convert(remat_fn, with_gradient=True), autograph=False + ) + jax_fn = tf2jax.convert_functional( + tf_fn, tf.TensorSpec((10, 5), tf.float32) + ) jax_fn = self.variant(jax_fn) out_jaxpr = jax.make_jaxpr(jax_fn)(*inputs) self.assertNotRegex(str(out_jaxpr), "remat") @@ -1086,14 +1067,16 @@ def fn(x): @parameterized.named_parameters( chex.params_product( (("without_gradient", False),), - (("enable_xla", True),), (("without_custom_gradient", False), ("with_custom_gradient", True)), named=True, - )) - def test_reduce_precision(self, with_grad, enable_xla, with_custom_grad): + ) + ) + def test_reduce_precision(self, with_grad, with_custom_grad): if jax.__version_info__ <= (0, 4, 4): - self.skipTest("jax.lax.reduce_precision is only supported from 0.4.4 and " - f"onward, found {jax.__version__}.") + self.skipTest( + "jax.lax.reduce_precision is only supported from 0.4.4 and " + f"onward, found {jax.__version__}." + ) np.random.seed(42) @@ -1103,16 +1086,16 @@ def forward(x): inputs = np.random.normal((3, 2)).astype(np.float32) self._test_convert( - forward, [inputs], + forward, + [inputs], with_grad=with_grad, - enable_xla=enable_xla, - with_custom_grad=with_custom_grad) + with_custom_grad=with_custom_grad, + ) @chex.variants(with_jit=True, without_jit=True) @parameterized.named_parameters( chex.params_product( (("without_gradient", True),), - (("enable_xla", True), ("disable_xla", False)), (("with_custom_gradient", True),), ( ("lower", True), @@ -1128,11 +1111,11 @@ def forward(x): ("more_batched", ((2, 3, 5, 5), (2, 3, 5, 6))), ), named=True, - )) + ) + ) def test_triangular_solve( self, with_grad, - enable_xla, with_custom_grad, lower, unit_diagonal, @@ -1141,7 +1124,8 @@ def test_triangular_solve( if uses_native_serialization(): self.skipTest( "native_serialization: Cannot serialize code with custom calls whose " - "targets have no compatibility guarantees.") + "targets have no compatibility guarantees." + ) np.random.seed(42) @@ -1159,11 +1143,12 @@ def forward(a, b): tols = dict(atol=1e-5) if jax.default_backend().lower() == "tpu" else {} self._test_convert( - forward, inputs, + forward, + inputs, with_grad=with_grad, - enable_xla=enable_xla, with_custom_grad=with_custom_grad, - grad_tols=tols) + grad_tols=tols, + ) def test_explicit_native_serialization(self): def forward(x): @@ -1177,7 +1162,7 @@ def forward(x): tf_fn, tf.TensorSpec((2, 3), tf.float32) ) jax_fn = jax.jit(jax_fn) - inputs = np.linspace(-1., 1., 6, dtype=np.float32).reshape((2, 3)) + inputs = np.linspace(-1.0, 1.0, 6, dtype=np.float32).reshape((2, 3)) self.assertAllClose(jax_fn(inputs), tf_fn(inputs)) except ValueError as e: if uses_native_serialization(): diff --git a/tf2jax/_src/sharding_test.py b/tf2jax/_src/sharding_test.py index e516a5b..444afcc 100644 --- a/tf2jax/_src/sharding_test.py +++ b/tf2jax/_src/sharding_test.py @@ -16,13 +16,11 @@ from absl.testing import absltest from absl.testing import parameterized - import chex import haiku as hk import jax from jax.experimental import jax2tf import jax.numpy as jnp - import numpy as np import tensorflow as tf from tf2jax._src import test_util @@ -53,20 +51,14 @@ class ShardingTest(test_util.TestCase): @parameterized.named_parameters( chex.params_product( - (('enable_xla', True), ('disable_xla', False)), (('native_serialization', True), ('graph_serialization', False)), named=True, ) ) - def test_sharding(self, enable_xla, native_serialization): + def test_sharding(self, native_serialization): if jax.default_backend().upper() != 'TPU': self.skipTest('Only run sharding tests on TPU.') - if not enable_xla and native_serialization: - self.skipTest( - 'native_serializaton is only supported with enable_xla=True.' - ) - # Set up network and inputs. transformed = hk.without_apply_rng(hk.transform(_net)) rng = jax.random.PRNGKey(42) @@ -79,11 +71,12 @@ def test_sharding(self, enable_xla, native_serialization): # Partitioned to 8 devices. assert jax.device_count() == 8, jax.device_count() mesh = jax.sharding.Mesh( - np.array(jax.devices()).reshape((2, 4)), ('data', 'model')) + np.array(jax.devices()).reshape((2, 4)), ('data', 'model') + ) params_pspecs = _get_param_pspecs() + def to_xla_sharding(pspecs): - return jax.tree.map( - lambda x: jax.sharding.NamedSharding(mesh, x), pspecs) + return jax.tree.map(lambda x: jax.sharding.NamedSharding(mesh, x), pspecs) partitioned_apply = jax.jit( transformed.apply, @@ -116,7 +109,6 @@ def partitioned_grad(params, xs): def tf_fn(params, inputs): return jax2tf.convert( partitioned_apply, - enable_xla=enable_xla, native_serialization=native_serialization, )(params, inputs) @@ -145,6 +137,7 @@ def tf_fn(params, inputs): @jax.grad def reloaded_grad(params, xs): return jnp.sum(jax.jit(jax_fn)(params, xs)) + self.assertAllClose( jax.jit(unpartitioned_grad)(params, images), jax.jit(reloaded_grad)(params, images),