Source code for ablator.modules.optimizer

import typing as ty
from abc import abstractmethod

import torch
from torch import nn
from torch.optim import SGD, Adam, AdamW, Optimizer

from ablator.config.main import ConfigBase, configclass
from ablator.config.types import Tuple


[docs]def get_parameter_names(model: torch.nn.Module, forbidden_layer_types: list[type]): """ Recurse into the module and return parameter names of all submodules, excluding modules that are of any type defined in ``forbidden_layer_types``. Parameters ---------- model : torch.nn.Module The model for which to get parameter names. forbidden_layer_types : list[type] A list of types of modules inside which parameter names should not be included. Returns ------- list[str] The names of the parameters with the following format: ``<submodule-name>.<parameter-name>``. Examples -------- >>> class MyModel(nn.Module): >>> def __init__(self, embedding_dim=10, vocab_size=10, *args, **kwargs) -> None: >>> super().__init__(*args, **kwargs) >>> self.param = nn.Parameter(torch.ones(100)) >>> self.embedding = nn.Embedding(num_embeddings=vocab_size, >>> embedding_dim=embedding_dim) >>> self.norm_layer = nn.LayerNorm(embedding_dim) >>> def forward(self): >>> x = self.param + torch.rand_like(self.param) * 0.01 >>> return x.sum().abs() >>> mM = MyModel() >>> get_parameter_names(mM,[]) ['embedding.weight', 'norm_layer.weight', 'norm_layer.bias', 'param'] >>> get_parameter_names(mM, [torch.nn.LayerNorm]) ['embedding.weight', 'param'] """ result = [] for name, child in model.named_children(): result += [ f"{name}.{n}" for n in get_parameter_names(child, forbidden_layer_types) if not isinstance(child, tuple(forbidden_layer_types)) ] # Add model specific parameters (defined with nn.Parameter) since they are not in any child. result += list(model._parameters.keys()) return result
[docs]def get_optim_parameters( model: torch.nn.Module, weight_decay: float | None = None, only_requires_grad: bool = True, ): """ Setup the optimizer. Get model parameters to be optimized. If ``weight_decay`` is a ``float``, apply weight decaying to the parameters too (except for bias and parameters from layer normalization module). Parameters ---------- model : torch.nn.Module The model for which to get parameters that will be optimized. weight_decay : float | None The amount of weight decay to use, by default ``None``. only_requires_grad : bool Whether to only use parameters that require gradient or all parameters, by default ``True``. Returns ------- dict | list - If weight_decay is ``None``, return all model parameters. - If weight_decay is not ``None``, return a dictionary of parameter groups of different weight decay. In specific, bias parameters and parameters from layer normalization module will have weight decay of ``0.0``, while any other parameters will have weight decay of ``weight_decay``. Notes ----- We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the Trainer's init through :obj:`optimizers`, or subclass and override this method in a subclass. Examples -------- >>> class MyModel(nn.Module): >>> def __init__(self, embedding_dim=10, vocab_size=10, *args, **kwargs) -> None: >>> super().__init__(*args, **kwargs) >>> self.param = nn.Parameter(torch.ones(100)) >>> self.embedding = nn.Embedding(num_embeddings=vocab_size, >>> embedding_dim=embedding_dim) >>> self.norm_layer = nn.LayerNorm(embedding_dim) >>> def forward(self): >>> x = self.param + torch.rand_like(self.param) * 0.01 >>> return x.sum().abs() >>> mM = MyModel() >>> get_optim_parameters(mM, 0.2) [ {'params': ['param', 'embedding.weight'], 'weight_decay': 0.2}, {'params': ['norm_layer.weight', 'norm_layer.bias'], 'weight_decay': 0.0} ] """ # default_val = lambda k, v: kwargs[k] if k in kwargs else v params_to_update = {} if only_requires_grad: for name, param in model.named_parameters(): if param.requires_grad: params_to_update[name] = param else: params_to_update = dict(model.named_parameters()) if weight_decay is not None: decay_parameters = get_parameter_names(model, [torch.nn.LayerNorm]) decay_parameters = [ name for name in decay_parameters if "bias" not in name and name in params_to_update ] optimization_params = [ { "params": [ p for n, p in params_to_update.items() if n in decay_parameters ], "weight_decay": weight_decay, }, { "params": [ p for n, p in params_to_update.items() if n not in decay_parameters ], "weight_decay": 0.0, }, ] return optimization_params return list(params_to_update.values())
[docs]@configclass class OptimizerArgs(ConfigBase): """ A base class for optimizer arguments, here we define learning rate lr. Attributes ---------- lr : float Learning rate of the optimizer """ lr: float
[docs] @abstractmethod def init_optimizer(self, model: nn.Module): """ Abstract method to be implemented by derived classes, which initializes the optimizer. """ raise NotImplementedError("init_optimizer method not implemented.")
[docs]@configclass class OptimizerConfig(ConfigBase): """ Configuration for an optimizer, including optimizer name and arguments (these arguments are specific to a certain type of optimizer like SGD, Adam, AdamW). Attributes ---------- name : str Name of the optimizer. arguments : OptimizerArgs Arguments for the optimizer, specific to a certain type of optimizer. """ name: str arguments: OptimizerArgs
[docs] def __init__(self, name, arguments: dict[str, ty.Any]): """ Initializes the optimizer configuration. Add any provided settings to the optimizer. Parameters ---------- name : str Name of the optimizer, this can be any in ``['adamw', 'adam', 'sgd']``. arguments : dict[str, ty.Any] Arguments for the optimizer, specific to a certain type of optimizer. A common argument can be learning rate, e.g ``{'lr': 0.5}``. If ``name`` is ``"adamw"``, can add ``eps`` to ``arguments``, e.g ``{'lr': 0.5, 'eps': 0.001}``. Examples -------- In the following example, ``optim_config`` will initialize property ``arguments`` of type ``SGDConfig``, setting ``lr=0.5`` as its property. We also have access to ``init_optimizer()`` method of the property, which initalizes an SGD optimizer. This method is actually called in ``make_optimizer()`` >>> optim_config = OptimizerConfig("sgd", {"lr": 0.5}) """ argument_cls = OPTIMIZER_CONFIG_MAP[name] _arguments = argument_cls(**arguments) super().__init__(name=name, arguments=_arguments)
[docs] def make_optimizer(self, model: nn.Module) -> Optimizer: """ Creates and returns an optimizer for the given model. Parameters ---------- model : torch.nn.Module The model to optimize. Returns ------- optimizer : torch.optim.Optimizer The created optimizer. Examples -------- >>> optim_config = OptimizerConfig("sgd", {"lr": 0.5, "weight_decay": 0.5}) >>> optim_config.make_optimizer(my_module) SGD ( Parameter Group 0 dampening: 0 differentiable: False foreach: None lr: 0.5 maximize: False momentum: 0.0 nesterov: False weight_decay: 0.5 Parameter Group 1 dampening: 0 differentiable: False foreach: None lr: 0.5 maximize: False momentum: 0.0 nesterov: False weight_decay: 0.0 ) """ return self.arguments.init_optimizer(model)
[docs]@configclass class SGDConfig(OptimizerArgs): """ Configuration for an SGD optimizer. This class has ``init_optimizer()`` method, which is used to initialize and return an SGD optimizer. Attributes ---------- weight_decay : float Weight decay rate. momentum : float Momentum factor. Examples -------- >>> config = SGDConfig(lr=0.1, momentum=0.9) """ weight_decay: float = 0.0 momentum: float = 0.0
[docs] def init_optimizer(self, model: nn.Module): """ Creates and returns an SGD optimizer that optimizes the model's parameters. These parameters will be processed via ``get_optim_parameters`` before used to initalized the optimizer. Parameters ---------- model : torch.nn.Module The model that has parameters that the optimizer will optimize. Returns ------- optimizer : torch.optim.SGD The created SGD optimizer. Examples -------- >>> config = SGDConfig(lr=0.1, weight_decay=0.5, momentum=0.9) >>> config.init_optimizer(MyModel()) SGD ( Parameter Group 0 dampening: 0 differentiable: False foreach: None lr: 0.1 maximize: False momentum: 0.9 nesterov: False weight_decay: 0.5 Parameter Group 1 dampening: 0 differentiable: False foreach: None lr: 0.1 maximize: False momentum: 0.9 nesterov: False weight_decay: 0.0 ) """ kwargs = self.to_dict() weight_decay = getattr(self, "weight_decay", None) # 1e-4 model_parameters = get_optim_parameters(model, weight_decay) return SGD(model_parameters, **kwargs)
[docs]@configclass class AdamWConfig(OptimizerArgs): """ Configuration for an AdamW optimizer. This class has ``init_optimizer()`` method used to initialize and return an ``AdamW`` optimizer. Attributes ---------- betas : Tuple[float, float] Coefficients for computing running averages of gradient and its square (default is ``(0.9, 0.999)``). eps : float Term added to the denominator to improve numerical stability (default is ``1e-8``). weight_decay : float Weight decay rate (default is ``0.0``). Examples -------- >>> config = AdamWConfig(lr=0.1, weight_decay=0.5, betas=(0.9,0.99)) """ betas: Tuple[float, float] = (0.9, 0.999) eps: float = 1e-8 weight_decay: float = 0.0
[docs] def init_optimizer(self, model: nn.Module): """ Creates and returns an ``AdamW`` optimizer that optimizes the model's parameters. These parameters will be processed via ``get_optim_parameters`` before used to initalized the optimizer. Parameters ---------- model : torch.nn.Module The model that has parameters that the optimizer will optimize. Returns ------- Optimizer An instance of the ``AdamW`` optimizer. Examples -------- >>> config = AdamWConfig(lr=0.1, weight_decay=0.5, betas=(0.9,0.99), eps=0.001) >>> config.init_optimizer(MyModel()) AdamW ( Parameter Group 0 amsgrad: False betas: (0.9, 0.99) capturable: False eps: 0.001 foreach: None lr: 0.1 maximize: False weight_decay: 0.5 Parameter Group 1 amsgrad: False betas: (0.9, 0.99) capturable: False eps: 0.001 foreach: None lr: 0.1 maximize: False weight_decay: 0.0 ) """ kwargs = self.to_dict() weight_decay = getattr(self, "weight_decay", None) # 1e-4 model_parameters = get_optim_parameters(model, weight_decay) return AdamW(model_parameters, **kwargs)
[docs]@configclass class AdamConfig(OptimizerArgs): """ Configuration for an ``Adam`` optimizer. This class has ``init_optimizer()`` method used to initialize and return an ``Adam`` optimizer. Attributes ---------- betas : Tuple[float, float] Coefficients for computing running averages of gradient and its square (default is ``(0.5, 0.9)``). weight_decay : float Weight decay rate (default is ``0.0``). """ betas: Tuple[float, float] = (0.5, 0.9) weight_decay: float = 0.0
[docs] def init_optimizer(self, model: nn.Module): """ Creates and returns an ``Adam`` optimizer that optimizes the model's parameters. These parameters will be processed via ``get_optim_parameters`` before used to initalized the optimizer. Parameters ---------- model : torch.nn.Module The model that has parameters that the optimizer will optimize. Returns ------- Optimizer An instance of the ``Adam`` optimizer. Examples -------- >>> config = AdamConfig(lr=0.1, weight_decay=0.5, betas=(0.6,0.9)) >>> config.init_optimizer(MyModel()) Adam ( Parameter Group 0 amsgrad: False betas: (0.6, 0.9) capturable: False differentiable: False eps: 1e-08 foreach: None fused: False lr: 0.1 maximize: False weight_decay: 0.5 Parameter Group 1 amsgrad: False betas: (0.6, 0.9) capturable: False differentiable: False eps: 1e-08 foreach: None fused: False lr: 0.1 maximize: False weight_decay: 0.0 ) """ kwargs = self.to_dict() weight_decay = getattr(self, "weight_decay", None) # 1e-4 model_parameters = get_optim_parameters(model, weight_decay) return Adam(model_parameters, **kwargs)
OPTIMIZER_CONFIG_MAP: dict[str, type] = { "adamw": AdamWConfig, "adam": AdamConfig, "sgd": SGDConfig, }