diff --git a/neuralprophet/configure.py b/neuralprophet/configure.py
index a8e01cce3..6ec815379 100644
--- a/neuralprophet/configure.py
+++ b/neuralprophet/configure.py
@@ -17,6 +17,26 @@

 @dataclass
 class Model:
+    """
+    General configuration settings of the forecasting model.
+
+    Attributes
+    ----------
+    n_forecasts : int
+        Number of forecasts to be made.
+    quantiles : Optional[List[float]]
+        List of quantiles for prediction intervals. Default is None.
+    prediction_frequency : Optional[Dict[str]]
+        Frequency of predictions. Default is None.
+    max_lags : Optional[int]
+        Maximum number of lags used in the model. This is set during model configuration.
+
+    Methods
+    -------
+    setup_quantiles()
+        Configures the quantiles for prediction intervals.
+    set_max_num_lags(n_lags, config_lagged_regressors)
+        Determines the maximum number of lags, considering both autoregression lags and lagged covariate lags.
+    """
+
     n_forecasts: int
     quantiles: Optional[List[float]] = None
     prediction_frequency: Optional[Dict[str]] = None
@@ -70,6 +90,33 @@ def set_max_num_lags(

 @dataclass
 class Normalization:
+    """
+    Configuration settings for normalization of data.
+
+    Attributes
+    ----------
+    normalize : str
+        The type of normalization to apply.
+    global_normalization : bool
+        Flag indicating whether to apply global normalization.
+    global_time_normalization : bool
+        Flag indicating whether to apply global time normalization.
+    unknown_data_normalization : bool
+        Flag indicating whether to apply normalization to unknown data.
+    local_data_params : dict
+        Dictionary containing local data parameters, where the key is the dataset name and the value is another dictionary keyed by variable name.
+    global_data_params : dict
+        Dictionary containing global data parameters, where the key is the variable name.
+
+    Methods
+    -------
+    init_data_params(df, config_lagged_regressors=None, config_regressors=None, config_events=None, config_seasonality=None)
+        Initializes the data parameters for normalization based on the provided dataframe and configuration components.
+    get_data_params(df_name)
+        Retrieves the data parameters for a given dataset name, handling both local and global normalization scenarios.
+    """
+
     normalize: str
     global_normalization: bool
     global_time_normalization: bool
@@ -85,6 +132,23 @@ def init_data_params(
         config_events: Optional[configure_components.Events] = None,
         config_seasonality: Optional[configure_components.Seasonalities] = None,
     ):
+        """
+        Compute parameters for data normalization.
+
+        This method sets up the local and global data parameters required for normalization of the data,
+        based on the provided dataframe and configuration options. If only one dataframe
+        is provided and global normalization is not set, it will enable global normalization.
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            The input dataframe containing the data.
+        config_lagged_regressors : Optional[configure_components.LaggedRegressors]
+            Configuration for lagged regressors.
+        config_regressors : Optional
+            Configuration for additional regressors.
+        config_events : Optional[configure_components.Events]
+            Configuration for events.
+        config_seasonality : Optional[configure_components.Seasonalities]
+            Configuration for seasonalities.
+
+        Returns
+        -------
+        None
+        """
         if len(df["ID"].unique()) == 1 and not self.global_normalization:
             log.info("Setting normalization to global as only one dataframe provided for training.")
             self.global_normalization = True
@@ -100,17 +164,37 @@ def init_data_params(
         )

     def get_data_params(self, df_name):
+        """
+        Retrieve the data normalization parameters for a given dataset name.
+
+        Parameters
+        ----------
+        df_name : str
+            The name of the dataset for which to retrieve the data parameters.
+
+        Returns
+        -------
+        dict
+            The data parameters associated with the given dataset name.
+
+        Raises
+        ------
+        ValueError
+            If the dataset name is not found in the local data parameters and
+            `unknown_data_normalization` is False.
+        """
         if self.global_normalization:
             data_params = self.global_data_params
         else:
             if df_name in self.local_data_params.keys() and df_name != "__df__":
-                log.debug(f"Dataset name {df_name!r} found in training data_params")
+                # log.debug(f"Dataset name {df_name!r} found in training data_params")
                 data_params = self.local_data_params[df_name]
             elif self.unknown_data_normalization:
-                log.debug(
-                    f"Dataset name {df_name!r} is not present in valid data_params but unknown_data_normalization is \
-                        True. Using global_data_params"
-                )
+                # log.debug(
+                #     f"Dataset name {df_name!r} is not present in valid data_params but unknown_data_normalization is \
+                #         True. Using global_data_params"
+                # )
                 data_params = self.global_data_params
             else:
                 raise ValueError(
@@ -122,6 +206,16 @@ def get_data_params(self, df_name):

 @dataclass
 class MissingDataHandling:
+    """
+    Configuration for handling missing data in the dataset.
+
+    Attributes
+    ----------
+    impute_missing : bool
+        Flag to indicate if missing data should be imputed. Default is True.
+    impute_linear : int
+        Number of missing data points to impute using linear interpolation. Default is 10.
+    impute_rolling : int
+        Number of missing data points to impute using rolling average. Default is 10.
+    drop_missing : bool
+        Flag to indicate if rows with missing data should be dropped. Default is False.
+    """
+
     impute_missing: bool = True
     impute_linear: int = 10
     impute_rolling: int = 10
@@ -130,6 +224,48 @@ class MissingDataHandling:

 @dataclass
 class Train:
+    """
+    Settings for model training.
+
+    This class encapsulates the configuration parameters and methods for training the model, including PyTorch Lightning arguments.
+
+    Attributes
+    ----------
+    learning_rate : Optional[float]
+        Learning rate for the optimizer.
+    epochs : Optional[int]
+        Number of epochs for training.
+    batch_size : Optional[int]
+        Batch size for training.
+    loss_func : Union[str, torch.nn.modules.loss._Loss, Callable]
+        Loss function for training.
+    optimizer : Union[str, Type[torch.optim.Optimizer]]
+        Optimizer for training.
+    optimizer_args : dict
+        Arguments for the optimizer.
+    scheduler : Optional[Union[str, Type[torch.optim.lr_scheduler.LRScheduler]]]
+        Learning rate scheduler.
+    scheduler_args : dict
+        Arguments for the scheduler.
+    early_stopping : Optional[bool]
+        Whether to use early stopping.
+    newer_samples_weight : float
+        Weight for newer samples.
+    newer_samples_start : float
+        Start point for newer samples.
+    reg_delay_pct : float
+        Regularization delay percentage.
+    reg_lambda_trend : Optional[float]
+        Regularization strength for the trend.
+    trend_reg_threshold : Optional[Union[bool, float]]
+        Trend regularization threshold.
+    n_data : int
+        Number of data points in the dataset.
+    loss_func_name : str
+        Name of the loss function.
+    pl_trainer_config : dict
+        Configuration for the PyTorch Lightning trainer.
+    """
+
     learning_rate: Optional[float]
     epochs: Optional[int]
     batch_size: Optional[int]
@@ -159,6 +295,15 @@ def __post_init__(self):
         # self.set_scheduler()

     def set_loss_func(self, quantiles: List[float]):
+        """
+        Set the loss function based on the provided quantiles.
+
+        If quantiles are provided, the loss function is wrapped in a PinballLoss.
+
+        Parameters
+        ----------
+        quantiles : List[float]
+            List of quantiles for the loss function.
+        """
         if isinstance(self.loss_func, str):
             if self.loss_func.lower() in ["smoothl1", "smoothl1loss", "huber"]:
                 # keeping 'huber' for backwards compatiblility, though not identical
@@ -189,6 +334,28 @@ def set_auto_batch_epoch(
         min_epoch: int = 20,
         max_epoch: int = 500,
     ):
+        """
+        Automatically set the batch size and number of epochs based on the size of the dataset.
+
+        Parameters
+        ----------
+        n_data : int
+            The number of data points in the dataset. Must be greater than or equal to 1.
+        min_batch : int, optional
+            The minimum batch size. Default is 8.
+        max_batch : int, optional
+            The maximum batch size. Default is 2048.
+        min_epoch : int, optional
+            The minimum number of epochs. Default is 20.
+        max_epoch : int, optional
+            The maximum number of epochs. Default is 500.
+
+        Notes
+        -----
+        - If `self.batch_size` is not set, it will be automatically determined based on the size of the dataset.
+        - If `self.epochs` is not set, it will be automatically determined to ensure a minimum of 1000 steps and a maximum of 100,000 steps.
+        - The `lambda_delay` attribute is also set, based on the regularization delay percentage and the number of epochs.
+        """
         assert n_data >= 1
         self.n_data = n_data
         if self.batch_size is None:
@@ -206,16 +373,17 @@ def set_auto_batch_epoch(

     def set_optimizer(self):
         """
-        Set the optimizer and optimizer args. If optimizer is a string, then it will be converted to the corresponding
-        torch optimizer. The optimizer is not initialized yet as this is done in configure_optimizers in TimeNet.
+        Set the optimizer and optimizer args from stored values in self.

-        Parameters
-        ----------
-        optimizer_name : int
-            Object provided to NeuralProphet as optimizer.
-        optimizer_args : dict
-            Arguments for the optimizer.
+        If optimizer is a string, then it will be converted to the corresponding torch optimizer class.
+        The optimizer is not initialized yet as this is done in configure_optimizers in TimeNet.

+        Notes
+        -----
+        - `self.optimizer` : str or Type[torch.optim.Optimizer]
+            Optimizer name or class provided to NeuralProphet.
+        - `self.optimizer_args` : dict
+            Arguments for the optimizer.
         """
         if isinstance(self.optimizer, str):
             if self.optimizer.lower() == "adamw":
@@ -238,6 +406,10 @@ def set_scheduler(self):
         """
         Set the scheduler and scheduler arg depending on the user selection.
         The scheduler is not initialized yet as this is done in configure_optimizers in TimeNet.
+
+        Notes
+        -----
+        - If no scheduler is specified, falls back to the ExponentialLR scheduler.
         """

         if self.scheduler is None:
@@ -289,6 +461,23 @@ def set_scheduler(self):
         ), "Scheduler must be a subclass of torch.optim.lr_scheduler.LRScheduler"

     def get_reg_delay_weight(self, progress, reg_start_pct: float = 0.66, reg_full_pct: float = 1.0):
+        """
+        Get the regularization delay weight based on the current position in training progress.
+
+        Parameters
+        ----------
+        progress : float
+            Current progress of the training.
+        reg_start_pct : float, optional
+            Percentage of progress at which to start regularization. Default is 0.66.
+        reg_full_pct : float, optional
+            Percentage of progress at which regularization is fully applied. Default is 1.0.
+
+        Returns
+        -------
+        float
+            Regularization delay weight.
+        """
         # Ignore type warning of epochs possibly being None (does not work with dataclasses)
         if reg_start_pct == reg_full_pct:
             reg_progress = float(progress > reg_start_pct)
@@ -303,4 +492,12 @@ def get_reg_delay_weight(self, progress, reg_start_pct: float = 0.66, reg_full_p
         return delay_weight

     def set_batches_per_epoch(self, batches_per_epoch: int):
+        """
+        Set the number of batches per epoch.
+
+        Parameters
+        ----------
+        batches_per_epoch : int
+            Number of batches per epoch.
+        """
         self.batches_per_epoch = batches_per_epoch
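
Usage sketch (not part of the diff above): the lookup order documented in the new `Raises` section of `Normalization.get_data_params` can be illustrated with a small standalone function. This is a minimal sketch mirroring the precedence visible in that hunk; the names `resolve_data_params`, `local_params`, `global_params`, and `unknown_ok` are hypothetical stand-ins for the corresponding `Normalization` attributes, and the parameter dictionaries are simplified placeholders rather than the library's actual data-param objects.

    def resolve_data_params(df_name, local_params, global_params,
                            global_normalization=False, unknown_ok=False):
        # Global mode: one shared set of parameters for all datasets.
        if global_normalization:
            return global_params
        # A dataset seen during training resolves to its local parameters.
        if df_name in local_params and df_name != "__df__":
            return local_params[df_name]
        # Unknown dataset: fall back to global parameters only if allowed.
        if unknown_ok:
            return global_params
        raise ValueError(f"Dataset name {df_name!r} missing from training data params.")

    # Example: "store_1" was seen in training, "store_9" was not.
    local = {"store_1": {"y": {"shift": 0.0, "scale": 1.0}}}
    global_ = {"y": {"shift": 0.0, "scale": 1.0}}
    assert resolve_data_params("store_1", local, global_) == local["store_1"]
    assert resolve_data_params("store_9", local, global_, unknown_ok=True) == global_

With `unknown_ok=False` (the default, matching `unknown_data_normalization=False`), the last lookup would instead raise a ValueError, which is exactly the behavior the new docstring documents.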