From bacdf8eaf4f5bd1a467b7e9d9703e571ed37c897 Mon Sep 17 00:00:00 2001 From: James Martens Date: Sun, 29 Oct 2023 16:25:11 -0700 Subject: [PATCH] - Adding norm_to_scale_identity_weight_per_block to multiply and update_cache methods of estimator which allows the identity_weight to be scaled differently for each block according to some kind of norm (or norm-like function) of the curvature for that block. - Fixing minor bug that would cause some curvature blocks to use an improperly scaled damping when multiplying with power=1 and use_cached=True, for classes that have non-trivial state_dependent_scale methods. - Adding whitespace to improve readability. PiperOrigin-RevId: 577679697 --- examples/optimizers.py | 97 +++++++++++++++++++-- kfac_jax/_src/curvature_blocks.py | 126 +++++++++++++++++++++++++-- kfac_jax/_src/curvature_estimator.py | 92 ++++++++++++++----- kfac_jax/_src/optimizer.py | 18 ++++ kfac_jax/_src/utils/math.py | 99 ++++++++++++++++++--- 5 files changed, 382 insertions(+), 50 deletions(-) diff --git a/examples/optimizers.py b/examples/optimizers.py index 59bf75d..829ef31 100644 --- a/examples/optimizers.py +++ b/examples/optimizers.py @@ -73,6 +73,7 @@ def __init__( distributed_precon_apply: bool = True, num_samples: int = 1, should_vmap_samples: bool = False, + norm_to_scale_identity_weight_per_block: Optional[str] = None, ): """Initializes the curvature estimator and preconditioner. @@ -141,10 +142,14 @@ def __init__( '[fisher,ggn]_curvature_prop'``. (Default: 1) should_vmap_samples: Whether to use ``jax.vmap`` to compute samples when ``num_samples > 1``. (Default: False) + norm_to_scale_identity_weight_per_block: The name of a norm to use to + compute extra per-block scaling for the damping. See psd_matrix_norm() + in utils/math.py for the definition of these. (Default: None) """ self._l2_reg = l2_reg self._damping = damping self._damping_schedule = damping_schedule + if (self._damping_schedule is None) == (self._damping is None): raise ValueError( "Only one of `damping_schedule` or `damping` has to be specified." 
@@ -159,6 +164,10 @@ def __init__( self._use_cached_inverses = self._inverse_update_period != 1 self._use_exact_inverses = use_exact_inverses + self._norm_to_scale_identity_weight_per_block = ( + norm_to_scale_identity_weight_per_block + ) + # Curvature estimator self._estimator = kfac_jax.curvature_estimator.BlockDiagonalCurvature( func=value_func, @@ -180,6 +189,7 @@ def init( rng: PRNGKey, ) -> PreconditionState: """Initializes the preconditioner and returns the state.""" + return PreconditionState( count=jnp.array(0, dtype=jnp.int32), estimator_state=self.estimator.init( @@ -193,6 +203,7 @@ def init( @property def _exact_powers_to_cache(self) -> Optional[Union[int, Sequence[int]]]: + if self._use_exact_inverses and self._use_cached_inverses: return -1 else: @@ -200,6 +211,7 @@ def _exact_powers_to_cache(self) -> Optional[Union[int, Sequence[int]]]: @property def _approx_powers_to_cache(self) -> Optional[Union[int, Sequence[int]]]: + if not self._use_exact_inverses and self._use_cached_inverses: return -1 else: @@ -217,9 +229,12 @@ def pmap_axis_name(self): def get_identity_weight( self, state: PreconditionState ) -> Union[Array, float]: + damping = self._damping + if damping is None: damping = self._damping_schedule(state.count) + return damping + self._l2_reg def sync_estimator_state( @@ -227,36 +242,43 @@ def sync_estimator_state( state: PreconditionState, ) -> PreconditionState: """Syncs the estimator state.""" + return PreconditionState( count=state.count, estimator_state=self.estimator.sync( state.estimator_state, pmap_axis_name=self.pmap_axis_name), ) - def should_update_estimate_curvature( + def should_update_estimator_curvature( self, state: PreconditionState ) -> Union[Array, bool]: """Whether at the current step the preconditioner should update the curvature estimates.""" + if self._curvature_update_period == 1: return True + return state.count % self._curvature_update_period == 0 def should_sync_estimate_curvature( self, state: PreconditionState ) -> Union[Array, bool]: """Whether at the current step the preconditioner should synchronize (pmean) the curvature estimates.""" + # sync only before inverses are calculated (either for updating the # cache or for preconditioning). if not self._use_cached_inverses: return True + return self.should_update_inverse_cache(state) def should_update_inverse_cache( self, state: PreconditionState ) -> Union[Array, bool]: """Whether at the current step the preconditioner should update the inverse cache.""" + if not self._use_cached_inverses: return False + return state.count % self._inverse_update_period == 0 def maybe_update( @@ -266,6 +288,7 @@ def maybe_update( rng: PRNGKey, ) -> PreconditionState: """Updates the estimates if it is the right iteration.""" + # NOTE: This maybe update curvatures and inverses at an iteration. 
But # if curvatures should be accumulated for multiple iterations # before updating inverses (for micro-batching), call @@ -277,7 +300,9 @@ def maybe_update( rng=rng, sync=self.should_sync_estimate_curvature(state), ) + state = self.maybe_update_inverse_cache(state) + return PreconditionState(state.count, state.estimator_state) def _update_estimator_curvature( @@ -300,6 +325,7 @@ def _update_estimator_curvature( rng=rng, func_args=func_args, ) + return jax.lax.cond( sync, functools.partial(self.estimator.sync, @@ -322,7 +348,7 @@ def maybe_update_estimator_curvature( return self._maybe_update_estimator_state( state, - self.should_update_estimate_curvature(state), + self.should_update_estimator_curvature(state), self._update_estimator_curvature, func_args=func_args, rng=rng, @@ -336,6 +362,7 @@ def maybe_update_inverse_cache( state: PreconditionState, ) -> PreconditionState: """Updates the estimator state cache if it is the right iteration.""" + if state.count is None: raise ValueError( "PreconditionState is not initialized. Call" @@ -351,6 +378,7 @@ def maybe_update_inverse_cache( approx_powers=self._approx_powers_to_cache, eigenvalues=False, pmap_axis_name=self.pmap_axis_name, + norm_to_scale_identity_weight_per_block=self._norm_to_scale_identity_weight_per_block, ) def _maybe_update_estimator_state( @@ -361,12 +389,14 @@ def _maybe_update_estimator_state( **update_func_kwargs, ) -> PreconditionState: """Updates the estimator state if it should update.""" + estimator_state = lax.cond( should_update, functools.partial(update_func, **update_func_kwargs), lambda s: s, state.estimator_state, ) + return PreconditionState(state.count, estimator_state) def apply( @@ -375,6 +405,7 @@ def apply( state: PreconditionState, ) -> optax.Updates: """Preconditions (= multiplies the inverse curvature estimation matrix to) updates.""" + new_updates = self.estimator.multiply_inverse( state=state.estimator_state, parameter_structured_vector=updates, @@ -382,15 +413,22 @@ def apply( exact_power=self._use_exact_inverses, use_cached=self._use_cached_inverses, pmap_axis_name=self.pmap_axis_name, + norm_to_scale_identity_weight_per_block=self._norm_to_scale_identity_weight_per_block, ) + if self._norm_constraint is not None: + sq_norm_grads = kfac_jax.utils.inner_product(new_updates, updates) del updates + max_coefficient = jnp.sqrt(self._norm_constraint / sq_norm_grads) coeff = jnp.minimum(max_coefficient, 1) + new_updates = kfac_jax.utils.scalar_mul(new_updates, coeff) + else: del updates + return new_updates def multiply_curvature( @@ -412,9 +450,10 @@ def multiply_curvature( state=state.estimator_state, parameter_structured_vector=updates, identity_weight=self.get_identity_weight(state), - exact_power=self._use_exact_inverses, # this argument will not be used. - use_cached=self._use_cached_inverses, # this argument will not be used. + exact_power=self._use_exact_inverses, + use_cached=self._use_cached_inverses, pmap_axis_name=self.pmap_axis_name, + norm_to_scale_identity_weight_per_block=self._norm_to_scale_identity_weight_per_block, ) return updates @@ -506,10 +545,12 @@ def __init__( self._value_func_has_aux = value_func_has_aux self._value_func_has_state = value_func_has_state self._value_func_has_rng = value_func_has_rng + if not callable(learning_rate): self._learning_rate = lambda _: learning_rate else: self._learning_rate = learning_rate + # Wraps the optax optimizer (gradient transformation), so that it ignores # extra args (i.e. `precond_state` for preconditioner) if not needed. 
self._optax_optimizer = optax.with_extra_args_support( @@ -542,13 +583,16 @@ def __init__( ) if self._preconditioner is not None: + if not isinstance(self._preconditioner, Preconditioner): raise ValueError( "preconditioner must be a {}, but {} is given.".format( Preconditioner, type(self._preconditioner) ) ) + preconditioner: Preconditioner = self._preconditioner + def _init_preconditioner( params: Params, rng: PRNGKey, @@ -556,7 +600,9 @@ def _init_preconditioner( func_state: Optional[FuncState] = None, ) -> PreconditionState: """Maybe initializes the PreconditionState.""" + batch = self._batch_process_func(batch) + func_args = kfac_jax.optimizer.make_func_args( params, func_state, @@ -565,6 +611,7 @@ def _init_preconditioner( has_state=self._value_func_has_state, has_rng=self._value_func_has_rng, ) + return preconditioner.init(func_args, rng) self._pmap_init_preconditioner = jax.pmap( @@ -597,8 +644,8 @@ def _step( """A single step of optax.""" rng_func, rng_precon = jax.random.split(rng) - batch = self._batch_process_func(batch) + func_args = kfac_jax.optimizer.make_func_args( params, func_state, rng_func, batch, has_state=self._value_func_has_state, @@ -606,6 +653,7 @@ def _step( ) optax_state, precond_state = state.optax_state, state.precond_state + if self._preconditioner is not None: precond_state = self._preconditioner.maybe_update( precond_state, @@ -613,15 +661,19 @@ def _step( rng_precon, ) precond_state = self._preconditioner.increment_count(precond_state) + out, grads = self._value_and_grad_func(*func_args) + loss, new_func_state, stats = kfac_jax.optimizer.extract_func_outputs( out, has_aux=self._value_func_has_aux, has_state=self._value_func_has_state, ) + loss, stats, grads = kfac_jax.utils.pmean_if_pmap( # pytype: disable=wrong-keyword-args (loss, stats, grads), axis_name=self.pmap_axis_name ) + stats = stats or {} stats["loss"] = loss @@ -641,21 +693,27 @@ def _step( stats["batch_size"] = batch_size * jax.device_count() stats["data_seen"] = stats["step"] * stats["batch_size"] stats["learning_rate"] = self._learning_rate(global_step_int) + if self._include_norms_in_stats: stats["grad_norm"] = kfac_jax.utils.norm(grads) stats["update_norm"] = kfac_jax.utils.norm(updates) stats["param_norm"] = kfac_jax.utils.norm(params) stats["rel_grad_norm"] = stats["grad_norm"] / stats["param_norm"] stats["rel_update_norm"] = stats["update_norm"] / stats["param_norm"] + if self._include_per_param_norms_in_stats: stats.update(kfac_jax.utils.per_parameter_norm(grads, "grad_norm")) stats.update(kfac_jax.utils.per_parameter_norm(updates, "update_norm")) param_norms = kfac_jax.utils.per_parameter_norm(params, "param_norm") + for key in param_norms: + norm = param_norms[key] stats[key] = norm + grad_key = key.replace("param", "grad") stats["rel_" + grad_key] = stats[grad_key] / norm + upd_key = key.replace("param", "update") stats["rel_" + upd_key] = stats[upd_key] / norm @@ -709,18 +767,27 @@ def tf1_rmsprop( def tf1_scale_by_rms(decay_=0.9, epsilon_=1e-8): """Same as optax.scale_by_rms, but initializes second moment to one.""" + def init_fn(params): nu = jax.tree_util.tree_map(jnp.ones_like, params) # second moment return optax.ScaleByRmsState(nu=nu) + def _update_moment(updates, moments, decay, order): + return jax.tree_util.tree_map( lambda g, t: (1 - decay) * (g ** order) + decay * t, updates, moments) + def update_fn(updates, state, params=None): + del params + nu = _update_moment(updates, state.nu, decay_, 2) + updates = jax.tree_util.tree_map( lambda g, n: g / (jnp.sqrt(n + epsilon_)), 
updates, nu) + return updates, optax.ScaleByRmsState(nu=nu) + return optax.GradientTransformation(init_fn, update_fn) return optax.chain( @@ -735,26 +802,34 @@ def linear_interpolation( interpolation_points: Tuple[Tuple[float, float], ...] ) -> Array: """Performs linear interpolation between the interpolation points.""" + xs, ys = zip(*interpolation_points) masks = [x < ci for ci in xs[1:]] + min_iter = jnp.zeros_like(x) max_iter = jnp.zeros_like(x) max_val = jnp.zeros_like(x) min_val = jnp.zeros_like(x) p = jnp.ones_like(x) + for i in range(len(masks) - 1): pi = p * masks[i] + min_iter = pi * xs[i] + (1 - pi) * min_iter max_iter = pi * xs[i + 1] + (1 - pi) * max_iter max_val = pi * ys[i] + (1 - pi) * max_val min_val = pi * ys[i + 1] + (1 - pi) * min_val + p = p * (1 - masks[i]) + min_iter = p * xs[-2] + (1 - p) * min_iter max_iter = p * xs[-1] + (1 - p) * max_iter max_val = p * ys[-2] + (1 - p) * max_val min_val = p * ys[-1] + (1 - p) * min_val + diff = (min_val - max_val) progress = (x - min_iter) / (max_iter - min_iter - 1) + return max_val + diff * jnp.minimum(progress, 1.0) @@ -772,12 +847,16 @@ def imagenet_sgd_schedule( # Can be found in Section 5.1 of https://arxiv.org/pdf/1706.02677.pdf steps_per_epoch = dataset_size / train_total_batch_size current_epoch = global_step / steps_per_epoch + lr = (0.1 * train_total_batch_size) / 256 lr_linear_till = 5 + boundaries = jnp.array((30, 60, 80)) * steps_per_epoch values = jnp.array([1., 0.1, 0.01, 0.001]) * lr + index = jnp.sum(boundaries < global_step) lr = jnp.take(values, index) + return lr * jnp.minimum(1., current_epoch / lr_linear_till) @@ -795,6 +874,7 @@ def kfac_resnet50_schedule( **_: Any, ) -> Array: """Custom schedule for KFAC.""" + return jnp.power(10.0, linear_interpolation( x=global_step, interpolation_points=( @@ -1033,6 +1113,7 @@ def construct_schedule( **kwargs, ) -> Callable[[Numeric], Array]: """Constructs the actual schedule from its name and extra kwargs.""" + if name == "fixed": return functools.partial(fixed_schedule, **kwargs) elif name == "imagenet_sgd": @@ -1053,16 +1134,21 @@ def kfac_bn_registration_kwargs(bn_registration: str) -> Mapping[ str, Union[Tuple[str, ...], Mapping[str, Type[kfac_jax.CurvatureBlock]]] ]: """Constructs KFAC kwargs for the given batch-norm registration strategy.""" + if bn_registration == "generic": return dict(patterns_to_skip=("scale_and_shift", "scale_only")) + elif bn_registration == "full": + return dict( layer_tag_to_block_cls=dict( scale_and_shift_tag=kfac_jax.ScaleAndShiftFull, ) ) + elif bn_registration != "diag": raise ValueError(f"Unknown batch_norm_registration={bn_registration}.") + return {} @@ -1129,6 +1215,7 @@ def create_optimizer( **kwargs.pop("learning_rate_schedule") ) optax_ctor = lambda lr: (getattr(optax, name)(learning_rate=lr, **kwargs)) + return OptaxWrapper( value_and_grad_func=value_and_grad_func, value_func_has_aux=has_aux, diff --git a/kfac_jax/_src/curvature_blocks.py b/kfac_jax/_src/curvature_blocks.py index b299a5d..c2f67ff 100644 --- a/kfac_jax/_src/curvature_blocks.py +++ b/kfac_jax/_src/curvature_blocks.py @@ -254,6 +254,12 @@ def scale(self, state: "CurvatureBlock.State", use_cache: bool) -> Numeric: Returns: A scalar value to be multiplied with any unscaled block representation. """ + + # TODO(jamesmartens,botev): This way of handling state dependent scale is + # a bit hacky and leads to complexity in other parts of the code that must + # be aware of how this part works. Should try to replace this with something + # better. 
+ if use_cache: return self.fixed_scale() @@ -365,7 +371,9 @@ def multiply_matpower( Returns: A tuple of arrays, representing the result of the matrix-vector product. """ + scale = self.scale(state, use_cached) + result = self._multiply_matpower_unscaled( state=state, vector=vector, @@ -541,6 +549,19 @@ def to_dense_matrix(self, state: "CurvatureBlock.State") -> Array: def _to_dense_unscaled(self, state: "CurvatureBlock.State") -> Array: """A dense representation of the curvature, ignoring ``self.scale``.""" + def norm(self, state: "CurvatureBlock.State", norm_type: str) -> Numeric: + """Computes the norm of the curvature block, according to ``norm_type``.""" + + return self.scale(state, False) * self._norm_unscaled(state, norm_type) + + @abc.abstractmethod + def _norm_unscaled( + self, + state: "CurvatureBlock.State", + norm_type: str + ) -> Numeric: + """Like ``norm`` but with ``self.scale`` not included.""" + class ScaledIdentity(CurvatureBlock): """A block that assumes that the curvature is a scaled identity matrix.""" @@ -596,9 +617,13 @@ def _multiply_matpower_unscaled( use_cached: bool, ) -> Tuple[Array, ...]: - del exact_power, use_cached # Unused + del exact_power # Unused - identity_weight = identity_weight + 1.0 + # state_dependent_scale needs to be included because it won't be by the + # caller of this function (multiply_matpower) when use_cached=True + scale = self.state_dependent_scale(state) if use_cached else 1.0 + + identity_weight = identity_weight + scale if power == 1: return jax.tree_util.tree_map(lambda x: identity_weight * x, vector) @@ -644,6 +669,14 @@ def _to_dense_unscaled(self, state: CurvatureBlock.State) -> Array: del state # not used return jnp.eye(self.dim) + def _norm_unscaled( + self, + state: CurvatureBlock.State, + norm_type: str + ) -> Numeric: + + return utils.psd_matrix_norm(jnp.ones([self.dim]), norm_type=norm_type) + class Diagonal(CurvatureBlock, abc.ABC): """An abstract class for approximating only the diagonal of curvature.""" @@ -701,7 +734,12 @@ def _multiply_matpower_unscaled( use_cached: bool, ) -> Tuple[Array, ...]: - factors = tuple(f.value + identity_weight for f in state.diagonal_factors) + # state_dependent_scale needs to be included because it won't be by the + # caller of this function (multiply_matpower) when use_cached=True + scale = self.state_dependent_scale(state) if use_cached else 1.0 + + factors = tuple(scale * f.value + identity_weight + for f in state.diagonal_factors) assert len(factors) == len(vector) @@ -728,6 +766,7 @@ def _update_cache( approx_powers: Set[Scalar], eigenvalues: bool, ) -> "Diagonal.State": + return state.copy() def _to_dense_unscaled(self, state: "Diagonal.State") -> Array: @@ -739,6 +778,16 @@ def _to_dense_unscaled(self, state: "Diagonal.State") -> Array: # Construct diagonal matrix return jnp.diag(jnp.concatenate(factors, axis=0)) + def _norm_unscaled( + self, + state: CurvatureBlock.State, + norm_type: str + ) -> Numeric: + + return utils.product( + utils.psd_matrix_norm(f.value.flatten(), norm_type=norm_type) + for f in state.diagonal_factors) + class Full(CurvatureBlock, abc.ABC): """An abstract class for approximating the block matrix with a full matrix.""" @@ -776,6 +825,7 @@ def __init__( if eigen_decomposition_threshold is None: threshold = get_default_eigen_decomposition_threshold() self._eigen_decomposition_threshold = threshold + else: self._eigen_decomposition_threshold = eigen_decomposition_threshold @@ -788,10 +838,12 @@ def parameters_list_to_single_vector( """Converts values 
corresponding to parameters of the block to vector.""" if len(parameters_shaped_list) != self.number_of_parameters: + raise ValueError(f"Expected a list of {self.number_of_parameters} values," f" but got {len(parameters_shaped_list)} instead.") for array, shape in zip(parameters_shaped_list, self.parameters_shapes): + if array.shape != shape: raise ValueError(f"Expected a value of shape {shape}, but got " f"{array.shape} instead.") @@ -815,6 +867,7 @@ def single_vector_to_parameters_list( index = 0 for shape in self.parameters_shapes: + size = utils.product(shape) parameters_shaped_list.append(vector[index: index + size].reshape(shape)) index += size @@ -880,7 +933,17 @@ def _multiply_matpower_unscaled( vector = self.parameters_list_to_single_vector(vector) if power == 1: - result = jnp.matmul(state.matrix.value, vector) + identity_weight * vector + + result = jnp.matmul(state.matrix.value, vector) + + if use_cached: + # state_dependent_scale needs to be included here because it won't be by + # the caller of this function (multiply_matpower) when use_cached=True. + # This is not an issue for other powers because they bake in + # state_dependent_scale. + result *= self.state_dependent_scale(state) + + result += identity_weight * vector elif not use_cached: @@ -911,8 +974,10 @@ def _eigenvalues_unscaled( state: "Full.State", use_cached: bool, ) -> Array: + if not use_cached: return utils.safe_psd_eigh(state.matrix.value)[0] + else: return state.cache["eigenvalues"] @@ -957,6 +1022,7 @@ def _update_cache( return state def _to_dense_unscaled(self, state: "Full.State") -> Array: + # Permute the matrix according to the parameters canonical order return utils.block_permuted( state.matrix.value, @@ -964,6 +1030,14 @@ def _to_dense_unscaled(self, state: "Full.State") -> Array: block_order=self.parameters_canonical_order ) + def _norm_unscaled( + self, + state: CurvatureBlock.State, + norm_type: str + ) -> Numeric: + + return utils.psd_matrix_norm(state.matrix.value, norm_type=norm_type) + class KroneckerFactored(CurvatureBlock, abc.ABC): """An abstract class for approximating the block with a Kronecker product.""" @@ -1073,6 +1147,7 @@ def _init( approx_powers_to_cache: Set[Scalar], cache_eigenvalues: bool, ) -> "KroneckerFactored.State": + cache = {} factors = [] @@ -1088,10 +1163,12 @@ def _init( cache[f"{i}_factor_eigen_vectors"] = jnp.zeros((d, d), dtype=self.dtype) for power in approx_powers_to_cache: + if power != -1: raise NotImplementedError( f"Approximations for power {power} is not yet implemented." ) + if str(power) not in cache: cache[str(power)] = {} @@ -1125,6 +1202,7 @@ def _multiply_matpower_unscaled( exact_power: bool, use_cached: bool, ) -> Tuple[Array, ...]: + assert len(state.factors) == len(self.axis_groups) vector = self.parameter_shaped_list_to_grouped_array(vector) @@ -1133,8 +1211,14 @@ def _multiply_matpower_unscaled( factors = [f.value for f in state.factors] + # state_dependent_scale needs to be included here because it won't be by + # the caller of this function (multiply_matpower) when use_cached=True. + # This is not an issue for other powers because they bake in + # state_dependent_scale. + scale = self.state_dependent_scale(state) if use_cached else 1.0 + if exact_power: - result = utils.kronecker_product_axis_mul_v(factors, vector) + result = scale * utils.kronecker_product_axis_mul_v(factors, vector) result = result + identity_weight * vector else: @@ -1142,9 +1226,9 @@ def _multiply_matpower_unscaled( # norm in its computation, it might make sense to cache it. 
But we # currently don't do that. - result = utils.kronecker_product_axis_mul_v( - utils.pi_adjusted_kronecker_factors(*factors, - damping=identity_weight), + result = scale * utils.kronecker_product_axis_mul_v( + utils.pi_adjusted_kronecker_factors( + *factors, damping=identity_weight / scale), vector) elif exact_power: @@ -1177,6 +1261,7 @@ def _multiply_matpower_unscaled( ) if use_cached: + factors = [ state.cache[str(power)][f"{i}_factor"] for i in range(len(state.factors)) @@ -1218,6 +1303,7 @@ def _update_cache( # pytype: disable=signature-mismatch # numpy-scalars approx_powers: Numeric, eigenvalues: bool, ) -> "KroneckerFactored.State": + assert len(state.factors) == len(self.axis_groups) # Copy this first since we mutate it later in this function. @@ -1227,8 +1313,11 @@ def _update_cache( # pytype: disable=signature-mismatch # numpy-scalars factor_scale = jnp.power(scale, 1.0 / len(self.axis_groups)) if eigenvalues or exact_powers: + s_q = [utils.safe_psd_eigh(factor.value) for factor in state.factors] + s, q = zip(*s_q) + for i in range(len(state.factors)): state.cache[f"{i}_factor_eigenvalues"] = factor_scale * s[i] @@ -1236,24 +1325,36 @@ def _update_cache( # pytype: disable=signature-mismatch # numpy-scalars state.cache[f"{i}_factor_eigen_vectors"] = q[i] for power in approx_powers: + if power != -1: raise NotImplementedError( f"Approximations for power {power} is not yet implemented." ) cache = state.cache[str(power)] + # This computes the approximate inverse factors using the generalization # of the pi-adjusted inversion from the original KFAC paper. - inv_factors = utils.pi_adjusted_kronecker_inverse( *[factor.value for factor in state.factors], damping=identity_weight, ) + for i in range(len(state.factors)): cache[f"{i}_factor"] = inv_factors[i] / factor_scale return state + def _norm_unscaled( + self, + state: CurvatureBlock.State, + norm_type: str + ) -> Numeric: + + return utils.product( + utils.psd_matrix_norm(f.value, norm_type=norm_type) + for f in state.factors) + class TwoKroneckerFactored(KroneckerFactored): """A Kronecker factored block for layers with weights and an optional bias.""" @@ -1274,29 +1375,35 @@ def parameters_shaped_list_to_array( self, parameters_shaped_list: Sequence[Array], ) -> Array: + for p, s in zip(parameters_shaped_list, self.parameters_shapes): assert p.shape == s if self.has_bias: w, b = parameters_shaped_list return jnp.concatenate([w.reshape([-1, w.shape[-1]]), b[None]], axis=0) + else: # This correctly reshapes the parameters of both dense and conv2d blocks [w] = parameters_shaped_list return w.reshape([-1, w.shape[-1]]) def array_to_parameters_shaped_list(self, array: Array) -> Tuple[Array, ...]: + if self.has_bias: w, b = array[:-1], array[-1] return w.reshape(self.parameters_shapes[0]), b + else: return tuple([array.reshape(self.parameters_shapes[0])]) def _to_dense_unscaled(self, state: "KroneckerFactored.State") -> Array: + assert 0 < self.number_of_parameters <= 2 inputs_factor = state.factors[0].value if self.has_bias and self.parameters_canonical_order[0] != 0: + # Permute the matrix according to the parameters canonical order inputs_factor = utils.block_permuted( state.factors[0].value, @@ -1330,6 +1437,7 @@ def update_curvature_matrix_estimate( for factor, dw in zip(state.diagonal_factors, estimation_data["params_tangent"]): + factor.update(dw * dw / batch_size, ema_old, ema_new) return state diff --git a/kfac_jax/_src/curvature_estimator.py b/kfac_jax/_src/curvature_estimator.py index 8f6575a..ab48ece 100644 --- 
a/kfac_jax/_src/curvature_estimator.py +++ b/kfac_jax/_src/curvature_estimator.py @@ -546,8 +546,8 @@ class CurvatureEstimator(Generic[StateType], utils.Finalizable): The cached values are only updated once you call the method :func:`~CurvatureEstimator.update_cache`. Multiple methods contain the keyword argument ``use_cached`` which specify whether you want to compute the - corresponding expression using the current curvature estimate or used a cached - version. + corresponding expression using the current curvature estimate or using a + cached version. Attributes: func: The model evaluation function. @@ -643,6 +643,7 @@ def multiply_matpower( exact_power: bool, use_cached: bool, pmap_axis_name: Optional[str], + norm_to_scale_identity_weight_per_block: Optional[str] = None, ) -> utils.Params: """Computes ``(CurvatureMatrix + identity_weight I)**power`` times ``vector``. @@ -665,6 +666,9 @@ def multiply_matpower( pmap_axis_name: The name of any pmap axis, which will be used for aggregating any computed values over multiple devices, as well as parallelizing the computation over devices in a block-wise fashion. + norm_to_scale_identity_weight_per_block: The name of a norm to use to + compute extra per-block scaling for identity_weight. See + psd_matrix_norm() in utils/math.py for the definition of these. Returns: A parameter structured vector containing the product. @@ -678,6 +682,7 @@ def multiply( exact_power: bool, use_cached: bool, pmap_axis_name: Optional[str], + norm_to_scale_identity_weight_per_block: Optional[str] = None, ) -> utils.Params: """Computes ``(CurvatureMatrix + identity_weight I)`` times ``vector``.""" @@ -688,7 +693,8 @@ def multiply( power=1, exact_power=exact_power, use_cached=use_cached, - pmap_axis_name=pmap_axis_name + pmap_axis_name=pmap_axis_name, + norm_to_scale_identity_weight_per_block=norm_to_scale_identity_weight_per_block, ) def multiply_inverse( @@ -699,6 +705,7 @@ def multiply_inverse( exact_power: bool, use_cached: bool, pmap_axis_name: Optional[str], + norm_to_scale_identity_weight_per_block: Optional[str] = None, ) -> utils.Params: """Computes ``(CurvatureMatrix + identity_weight I)^-1`` times ``vector``.""" @@ -709,7 +716,8 @@ def multiply_inverse( power=-1, exact_power=exact_power, use_cached=use_cached, - pmap_axis_name=pmap_axis_name + pmap_axis_name=pmap_axis_name, + norm_to_scale_identity_weight_per_block=norm_to_scale_identity_weight_per_block, ) @abc.abstractmethod @@ -908,6 +916,7 @@ def __init__( auto_register_tags=auto_register_tags, **auto_register_kwargs ) + # Initialized during finalization self._jaxpr: Optional[tracer.ProcessedJaxpr] = None self._blocks: Optional[Tuple[curvature_blocks.CurvatureBlock]] = None @@ -1118,9 +1127,12 @@ def _sync_state( state: "BlockDiagonalCurvature.State", pmap_axis_name: Optional[str], ) -> "BlockDiagonalCurvature.State": + block_states = [] + for block, block_state in zip(self.blocks, state.blocks_states): block_states.append(block.sync(block_state.copy(), pmap_axis_name)) + return BlockDiagonalCurvature.State( synced=jnp.asarray(True), blocks_states=tuple(block_states), @@ -1132,6 +1144,7 @@ def sync( state: "BlockDiagonalCurvature.State", pmap_axis_name: Optional[str], ) -> "BlockDiagonalCurvature.State": + return jax.lax.cond( state.synced, lambda s: s, @@ -1149,6 +1162,7 @@ def multiply_matpower( exact_power: bool, use_cached: bool, pmap_axis_name: Optional[str], + norm_to_scale_identity_weight_per_block: Optional[str] = None, ) -> utils.Params: blocks_vectors = self.params_vector_to_blocks_vectors( @@ 
-1156,21 +1170,35 @@ def multiply_matpower( identity_weight = utils.to_tuple_or_repeat(identity_weight, self.num_blocks) + def make_thunk(block, block_state, block_vector, block_identity_weight): + + def thunk(): + + weight = block_identity_weight + + if (norm_to_scale_identity_weight_per_block is not None + and norm_to_scale_identity_weight_per_block != "none"): + + weight *= block.norm( + block_state, norm_to_scale_identity_weight_per_block) + + return block.multiply_matpower( + state=block_state, + vector=block_vector, + identity_weight=weight, + power=power, + exact_power=exact_power, + use_cached=use_cached, + ) + + return thunk + thunks = [] for block, block_state, block_vector, block_identity_weight in zip( self.blocks, state.blocks_states, blocks_vectors, identity_weight): thunks.append( - functools.partial( - block.multiply_matpower, - state=block_state, - vector=block_vector, - identity_weight=block_identity_weight, - power=power, - exact_power=exact_power, - use_cached=use_cached, - ) - ) + make_thunk(block, block_state, block_vector, block_identity_weight)) if self._distributed_multiplies and pmap_axis_name is not None: @@ -1419,23 +1447,39 @@ def update_cache( approx_powers: Optional[curvature_blocks.ScalarOrSequence], eigenvalues: bool, pmap_axis_name: Optional[str], + norm_to_scale_identity_weight_per_block: Optional[str] = None, ) -> "BlockDiagonalCurvature.State": + identity_weight = utils.to_tuple_or_repeat(identity_weight, self.num_blocks) + def make_thunk(block, block_state, block_identity_weight): + + def thunk(): + + weight = block_identity_weight + + if (norm_to_scale_identity_weight_per_block is not None + and norm_to_scale_identity_weight_per_block != "none"): + + weight *= block.norm( + block_state, norm_to_scale_identity_weight_per_block) + + return block.update_cache( + state=block_state, + identity_weight=block_identity_weight, + exact_powers=exact_powers, + approx_powers=approx_powers, + eigenvalues=eigenvalues, + ) + + return thunk + thunks = [] for block, block_state, block_identity_weight in zip(self.blocks, state.blocks_states, identity_weight): - thunks.append( - functools.partial( - block.update_cache, - state=block_state, - identity_weight=block_identity_weight, - exact_powers=exact_powers, - approx_powers=approx_powers, - eigenvalues=eigenvalues, - ) - ) + + thunks.append(make_thunk(block, block_state, block_identity_weight)) if self._distributed_cache_updates and pmap_axis_name is not None: diff --git a/kfac_jax/_src/optimizer.py b/kfac_jax/_src/optimizer.py index 0c99d8c..e32f8d0 100644 --- a/kfac_jax/_src/optimizer.py +++ b/kfac_jax/_src/optimizer.py @@ -141,6 +141,7 @@ def __init__( distributed_inverses: bool = True, num_estimator_samples: int = 1, should_vmap_estimator_samples: bool = False, + norm_to_scale_identity_weight_per_block: Optional[str] = None, ): """Initializes the K-FAC optimizer with the provided settings. @@ -350,6 +351,11 @@ def __init__( '[fisher,ggn]_curvature_prop'``. (Default: 1) should_vmap_estimator_samples: Whether to use ``jax.vmap`` to compute samples when ``num_estimator_samples > 1``. (Default: False) + norm_to_scale_identity_weight_per_block: The name of a norm to use to + compute extra per-block scaling for the damping. See psd_matrix_norm() + in utils/math.py for the definition of these. Note that this will not + effect the exact quadratic model that is used as part of the "adaptive" + learning rate, momentum, and damping methods. 
(Default: None) """ super().__init__( @@ -444,6 +450,16 @@ def schedule_with_first_step_zero( self._use_cached_inverses = (self._inverse_update_period != 1) self._use_exact_inverses = use_exact_inverses + self._norm_to_scale_identity_weight_per_block = ( + norm_to_scale_identity_weight_per_block + ) + + if (norm_to_scale_identity_weight_per_block is not None + and norm_to_scale_identity_weight_per_block != "none"): + + assert (not use_adaptive_learning_rate and not use_adaptive_momentum + and not use_adaptive_damping) # not currently supported + # Curvature estimator self._estimator = curvature_estimator.BlockDiagonalCurvature( func=self._value_func, @@ -783,6 +799,7 @@ def _compute_preconditioned_gradient( exact_power=self._use_exact_inverses, use_cached=self._use_cached_inverses, pmap_axis_name=self.pmap_axis_name, + norm_to_scale_identity_weight_per_block=self._norm_to_scale_identity_weight_per_block, ) if self._norm_constraint is not None: @@ -1275,6 +1292,7 @@ def c_times_v(v): exact_power=True, use_cached=False, pmap_axis_name=self.pmap_axis_name, + norm_to_scale_identity_weight_per_block=self._norm_to_scale_identity_weight_per_block, ) c_vectors = [c_times_v(v_i) for v_i in vectors] diff --git a/kfac_jax/_src/utils/math.py b/kfac_jax/_src/utils/math.py index 2684084..58b9b6d 100644 --- a/kfac_jax/_src/utils/math.py +++ b/kfac_jax/_src/utils/math.py @@ -342,18 +342,21 @@ def psd_inv_cholesky(matrix: Array) -> Array: def psd_matrix_norm( matrix: Array, - norm_type: str = "avg_trace", + norm_type: str = "avg_diag", method_2norm: str = "lobpcg", rng_key: Optional[PRNGKey] = None -) -> Array: +) -> Numeric: """Computes one of several different matrix norms for PSD matrices. + NOTE: not all of the options provided here are actually norms, but most + are. + Args: matrix: a square matrix represented as a 2D array, a 1D vector giving the diagonal, or a 0D scalar (which gets interpreted as a 1x1 matrix). Must be positive semi-definite (PSD). norm_type: a string specifying the type of matrix norm. Can be "2_norm" for - the matrix 2-norm aka the spectral norm, "avg_trace" for the average of + the matrix 2-norm aka the spectral norm, "avg_diag" for the average of diagonal entries, "1_norm" for the matrix 1-norm, or "avg_fro" for the Frobenius norm divided by the square root of the number of rows. method_2norm: a string specifying the method used to compute 2-norms.
Can @@ -386,6 +389,7 @@ def psd_matrix_norm( matrix, v, m=300, tol=1e-8)[0][0] elif method_2norm == "power_iteration": + return optax.power_iteration( matrix, num_iters=300, error_tolerance=1e-7)[1] @@ -395,7 +399,7 @@ def psd_matrix_norm( else: raise ValueError(f"Unsupported shape for factor array: {matrix.shape}") - elif norm_type == "avg_trace": + elif norm_type == "avg_diag": if matrix.ndim == 0: return matrix @@ -409,7 +413,65 @@ def psd_matrix_norm( else: raise ValueError(f"Unsupported shape for factor array: {matrix.shape}") - elif norm_type == "1_norm": + elif norm_type == "median_diag": + + if matrix.ndim == 0: + return matrix + + elif matrix.ndim == 1: + return jnp.median(matrix) + + elif matrix.ndim == 2 and matrix.shape[0] == matrix.shape[1]: + return jnp.median(jnp.diag(matrix)) + + else: + raise ValueError(f"Unsupported shape for factor array: {matrix.shape}") + + elif norm_type == "trace": + + if matrix.ndim == 0: + return matrix + + elif matrix.ndim == 1: + return jnp.sum(matrix) + + elif matrix.ndim == 2 and matrix.shape[0] == matrix.shape[1]: + return jnp.trace(matrix) + + else: + raise ValueError(f"Unsupported shape for factor array: {matrix.shape}") + + elif norm_type == "median_eig": + + if matrix.ndim == 0: + return matrix + + elif matrix.ndim == 1: + return jnp.median(matrix) + + elif matrix.ndim == 2 and matrix.shape[0] == matrix.shape[1]: + # call safe_psd_eigh instead? + s, _ = jnp.linalg.eigh(matrix) + return jnp.median(s) + + else: + raise ValueError(f"Unsupported shape for factor array: {matrix.shape}") + + elif norm_type == "one_over_dim": # this isn't a norm + + if matrix.ndim == 0: + return 1.0 + + elif matrix.ndim == 1: + return 1.0 / matrix.shape[0] + + elif matrix.ndim == 2 and matrix.shape[0] == matrix.shape[1]: + return 1.0 / matrix.shape[0] + + else: + raise ValueError(f"Unsupported shape for factor array: {matrix.shape}") + + elif norm_type == "1_norm": # equiv to inf norm for symmetric matrices if matrix.ndim == 0: return matrix @@ -437,8 +499,21 @@ def psd_matrix_norm( else: raise ValueError(f"Unsupported shape for factor array: {matrix.shape}") - else: - raise ValueError(f"Unrecognized norm type: '{norm_type}'") + elif norm_type == "fro": + + if matrix.ndim == 0: + return matrix + + elif matrix.ndim == 1: + return jnp.linalg.norm(matrix) + + elif matrix.ndim == 2 and matrix.shape[0] == matrix.shape[1]: + return jnp.linalg.norm(matrix) + + else: + raise ValueError(f"Unsupported shape for factor array: {matrix.shape}") + + raise ValueError(f"Unrecognized norm type: '{norm_type}'") def pi_adjusted_kronecker_factors( @@ -475,15 +550,15 @@ def pi_adjusted_kronecker_factors( # scalar factors `c_i` into a single overall scaling coefficient and # distribute the damping to each single non-scalar factor `u_i` equally. - norm_type = "avg_trace" + norm_type = "avg_diag" norms = [psd_matrix_norm(f, norm_type=norm_type) for f in factors] # Compute the normalized factors `u_i`, such that Trace(u_i) / dim(u_i) = 1 us = [fi / ni for fi, ni in zip(factors, norms)] - # kron(arrays) = c * kron(us) - + # Compute the overall norm for the whole Kronecker product. We should have + # kron(arrays) == c * kron(us). 
c = jnp.prod(jnp.array(norms)) damping = damping.astype(c.dtype) # pytype: disable=attribute-error # numpy-scalars @@ -508,10 +583,10 @@ def regular_case() -> Tuple[Array, ...]: for u in us: - if u.size == 1: + if u.size == 1: # scalar case u_hat = jnp.ones_like(u) # damping not used in the scalar factors - elif u.ndim == 2: + elif u.ndim == 2: # matrix case u_hat = u + d_hat * jnp.eye(u.shape[0], dtype=u.dtype) else: # diagonal case
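Illustrative sketch (not part of the patch): with norm_to_scale_identity_weight_per_block set, each block multiplies its identity_weight (damping) by block.norm(block_state, norm_type) before the matrix-vector product or cache update, where norm_type is one of the psd_matrix_norm() variants added or kept here ("avg_diag", "trace", "median_diag", "median_eig", "one_over_dim", "2_norm", "1_norm", "fro", "avg_fro"). The self-contained snippet below mimics that behaviour for a single block; the helper names (norm_like, damped_block_multiply) are hypothetical and exist only for this example.

# Sketch of the per-block damping rescaling described in the commit message.
import jax.numpy as jnp


def norm_like(matrix, norm_type="avg_diag"):
  # Norm (or norm-like value) of a square PSD matrix, mirroring a subset of
  # the psd_matrix_norm() options.
  if norm_type == "avg_diag":        # average of the diagonal entries
    return jnp.mean(jnp.diag(matrix))
  elif norm_type == "trace":         # sum of the diagonal entries
    return jnp.trace(matrix)
  elif norm_type == "median_eig":    # median eigenvalue
    return jnp.median(jnp.linalg.eigh(matrix)[0])
  elif norm_type == "one_over_dim":  # not actually a norm
    return 1.0 / matrix.shape[0]
  raise ValueError(f"Unrecognized norm type: '{norm_type}'")


def damped_block_multiply(block_matrix, vector, identity_weight,
                          norm_to_scale_identity_weight_per_block=None):
  # Computes (C + identity_weight * s * I) v for one block, where s = 1 by
  # default and s = norm_like(C, norm_type) when a norm name is given.
  scale = 1.0
  if norm_to_scale_identity_weight_per_block not in (None, "none"):
    scale = norm_like(block_matrix, norm_to_scale_identity_weight_per_block)
  return block_matrix @ vector + identity_weight * scale * vector


# Example: with "avg_diag", s = mean(diag(C)) = 20, so the effective damping
# becomes 1e-2 * 20 = 0.2 for this block instead of the global 1e-2.
c = jnp.diag(jnp.array([10.0, 20.0, 30.0]))
v = jnp.ones(3)
print(damped_block_multiply(c, v, identity_weight=1e-2,
                            norm_to_scale_identity_weight_per_block="avg_diag"))

The effect is that blocks whose curvature has a larger overall magnitude (as measured by the chosen norm-like function) receive a proportionally larger damping, which is the per-block scaling the commit message describes.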