Skip to content

hooks

EarlyStoppingHook #

Bases: HookBase

Source code in focoos/trainer/hooks/early_stop.py
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
class EarlyStoppingHook(HookBase):
    def __init__(
        self,
        enabled: bool,
        eval_period: int,
        patience: int,
        val_metric: str,
        mode: str = "max",
    ):
        """
        Initializes the EarlyStoppingHook.

        This hook is designed to monitor a specific validation metric during the training process
        and stop training when no improvement is observed in the metric for a specified number of
        iterations. This is particularly useful for preventing overfitting by halting training
        once the model's performance on the validation set no longer improves.

        Args:
            enabled (bool): Whether early stopping is active. When False, `after_step` performs
                            no checks and training is never interrupted by this hook.
            eval_period (int): The frequency (in iterations) at which the validation metric is evaluated.
                               For example, if `eval_period` is 100, the validation metric will be checked
                               every 100 iterations. A value <= 0 disables the checks.
            patience (int): Number of consecutive evaluations with no improvement after which training will be stopped.
                            For example, if `patience` is set to 5, training will stop if the validation metric does not
                            improve for 5 consecutive evaluations.
            val_metric (str): The name of the validation metric to monitor. This should correspond to one of the metrics
                              calculated during the validation phase, such as "accuracy", "loss", etc.
            mode (str, optional): One of "min" or "max". This parameter dictates the direction of improvement
                                  for the validation metric. In "min" mode, the training will stop when the monitored
                                  quantity (e.g., loss) stops decreasing. In "max" mode, training will stop when
                                  the monitored quantity (e.g., accuracy) stops increasing. Defaults to "max".

        """
        self.enabled = enabled
        self.patience = patience
        self.val_metric = val_metric
        # NOTE: values other than "min"/"max" make every evaluation count as
        # "no improvement" after the first one, triggering an early stop once
        # `patience` evaluations pass.
        self.mode = mode
        self.best_metric = None  # best value of val_metric observed so far
        self.num_bad_epochs = 0  # consecutive evaluations without improvement
        self._period = eval_period
        self._logger = get_logger(__name__)

    def after_step(self):
        """Check the monitored metric after each step; raise EarlyStopException when patience is exhausted.

        Runs every `eval_period` iterations (skipping the final iteration, where
        stopping early is pointless). Reads the latest value of `val_metric` from
        the trainer's event storage; if the metric has not been logged yet, the
        check is silently skipped.

        Raises:
            EarlyStopException: when the metric has not improved for `patience`
                consecutive evaluations. A scalar "early_stopping" is logged to
                storage just before raising.
        """
        # Early guards: hook disabled or checking disabled via non-positive period.
        if not self.enabled or self._period <= 0:
            return

        next_iter = self.trainer.iter + 1
        if next_iter % self._period != 0 or next_iter == self.trainer.max_iter:
            return

        metric_tuple = self.trainer.storage.latest().get(self.val_metric)
        if metric_tuple is None:
            # Metric not logged yet (e.g. first evaluation hasn't happened).
            return
        # Second element is the iteration the value was logged at; not needed here.
        current_metric, _ = metric_tuple

        improved = (
            self.best_metric is None
            or (self.mode == "max" and current_metric > self.best_metric)
            or (self.mode == "min" and current_metric < self.best_metric)
        )
        if improved:
            self.best_metric = current_metric
            self.num_bad_epochs = 0
        else:
            self.num_bad_epochs += 1
            self._logger.info(f"{self.num_bad_epochs}/{self.patience} without improvements..")

        if self.num_bad_epochs >= self.patience:
            # Record the stop in storage so it is visible in training logs/metrics.
            self.trainer.storage.put_scalar("early_stopping", True)
            raise EarlyStopException("Early Stopping Exception to stop the training..")

__init__(enabled, eval_period, patience, val_metric, mode='max') #

Initializes the EarlyStoppingHook.

This hook is designed to monitor a specific validation metric during the training process and stop training when no improvement is observed in the metric for a specified number of iterations. This is particularly useful for preventing overfitting by halting training once the model's performance on the validation set no longer improves.

Parameters:

Name Type Description Default
enabled bool

Whether early stopping is active. When False, the hook performs no checks and never interrupts training.

required
eval_period int

The frequency (in iterations) at which the validation metric is evaluated. For example, if eval_period is 100, the validation metric will be checked every 100 iterations.

required
patience int

Number of consecutive evaluations with no improvement after which training will be stopped. For example, if patience is set to 5, training will stop if the validation metric does not improve for 5 consecutive evaluations.

required
val_metric str

The name of the validation metric to monitor. This should correspond to one of the metrics calculated during the validation phase, such as "accuracy", "loss", etc.

required
mode str

One of "min" or "max". This parameter dictates the direction of improvement for the validation metric. In "min" mode, the training will stop when the monitored quantity (e.g., loss) stops decreasing. In "max" mode, training will stop when the monitored quantity (e.g., accuracy) stops increasing. Defaults to "max".

'max'
Source code in focoos/trainer/hooks/early_stop.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
def __init__(
    self,
    enabled: bool,
    eval_period: int,
    patience: int,
    val_metric: str,
    mode: str = "max",
):
    """
    Initializes the EarlyStoppingHook.

    This hook is designed to monitor a specific validation metric during the training process
    and stop training when no improvement is observed in the metric for a specified number of
    iterations. This is particularly useful for preventing overfitting by halting training
    once the model's performance on the validation set no longer improves.

    Args:
        enabled (bool): Whether early stopping is active. When False, the hook performs
                        no checks and never interrupts training.
        eval_period (int): The frequency (in iterations) at which the validation metric is evaluated.
                           For example, if `eval_period` is 100, the validation metric will be checked
                           every 100 iterations.
        patience (int): Number of consecutive evaluations with no improvement after which training will be stopped.
                        For example, if `patience` is set to 5, training will stop if the validation metric does not
                        improve for 5 consecutive evaluations.
        val_metric (str): The name of the validation metric to monitor. This should correspond to one of the metrics
                          calculated during the validation phase, such as "accuracy", "loss", etc.
        mode (str, optional): One of "min" or "max". This parameter dictates the direction of improvement
                              for the validation metric. In "min" mode, the training will stop when the monitored
                              quantity (e.g., loss) stops decreasing. In "max" mode, training will stop when
                              the monitored quantity (e.g., accuracy) stops increasing. Defaults to "max".

    """
    self.enabled = enabled
    self.patience = patience
    self.val_metric = val_metric
    self.mode = mode
    self.best_metric = None  # best value of val_metric observed so far
    self.num_bad_epochs = 0  # consecutive evaluations without improvement
    self._period = eval_period
    self._logger = get_logger(__name__)

SyncToHubHook #

Bases: HookBase

Source code in focoos/trainer/hooks/sync_to_hub.py
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
class SyncToHubHook(HookBase):
    """Trainer hook that periodically syncs local training state and artifacts to the hub.

    Syncs a lightweight status payload every `sync_period` iterations and
    additionally uploads model weights on a second schedule tied to
    `eval_period`. On training failure it records the error into
    `model_info.training_info` and pushes a final TRAINING_ERROR sync with
    all artifacts. All sync calls are best-effort: failures are logged, never raised.
    """

    def __init__(
        self,
        remote_model: RemoteModel,
        model_info: ModelInfo,
        output_dir: str,
        sync_period: int = 100,
        eval_period: int = 100,
    ):
        # Remote endpoint the training job is mirrored to.
        self.remote_model = remote_model
        # Local model metadata; mutated in after_train on failure.
        self.model_info = model_info
        # Directory containing the artifacts (weights, logs, metrics, info) to upload.
        self.output_dir = output_dir
        # How often (iterations) to push a status-only sync.
        self.sync_period = sync_period
        # Base period for the weight-upload sync schedule (see after_step).
        self.eval_period = eval_period

    @property
    def iteration(self):
        # Current trainer iteration. The broad except covers the case where the
        # hook is not yet attached to a trainer (self.trainer raises); it then
        # falls back to 1 rather than failing the sync.
        try:
            _iter = self.trainer.iter
        except Exception:
            _iter = 1
        return _iter

    def before_train(self):
        """
        Called before the first iteration.
        """
        info = HubSyncLocalTraining(
            status=ModelStatus.TRAINING_RUNNING,  # type: ignore
            iterations=0,
            metrics=None,
        )

        self._sync_train_job(info)

    def after_step(self):
        """Push periodic status syncs; upload weights on the eval-offset schedule."""
        # Status-only sync every sync_period iterations.
        if (self.iteration % self.sync_period == 0) and self.iteration > 0:
            self._sync_train_job(
                sync_info=HubSyncLocalTraining(
                    status=ModelStatus.TRAINING_RUNNING,  # type: ignore
                    iterations=self.iteration,
                    training_info=self.model_info.training_info,
                )
            )

        # NOTE(review): the `+ 3` offsets the weight upload a few iterations past
        # each evaluation — presumably so the checkpoint written during evaluation
        # exists on disk before upload; confirm against the eval/checkpoint hooks.
        # Because this is an elif, iterations divisible by both periods only get
        # the status-only sync above (no weight upload that iteration).
        elif (self.iteration % (self.eval_period + 3) == 0) and self.iteration > 0:
            self._sync_train_job(
                sync_info=HubSyncLocalTraining(
                    status=ModelStatus.TRAINING_RUNNING,  # type: ignore
                    iterations=self.iteration,
                    training_info=self.model_info.training_info,
                ),
                upload_artifacts=[ArtifactName.WEIGHTS],
            )

    def after_train(self):
        """On failure, record the error in model_info and sync a final TRAINING_ERROR state.

        Relies on sys.exc_info() still reporting the in-flight exception when the
        hook runs (i.e. the trainer invokes after_train while handling the error).
        On a clean run (no active exception) this method does nothing — final
        weights are synced by the main trainer function instead.
        """
        # Catch exception and sync training info, final weights will be synced in main trainer fn
        exc_type, exc_value, exc_traceback = sys.exc_info()
        if exc_type is not None:
            logger.error(
                f"Exception during training, status set to TRAINING_ERROR: {str(exc_type.__name__)} {str(exc_value)}"
            )
            status = ModelStatus.TRAINING_ERROR
            self.model_info.status = status
            if self.model_info.training_info is not None:
                # Record failure details and append a status transition entry.
                self.model_info.training_info.main_status = status
                self.model_info.training_info.failure_reason = str(exc_value)
                self.model_info.training_info.end_time = datetime.now().isoformat()
                if self.model_info.training_info.status_transitions is None:
                    self.model_info.training_info.status_transitions = []
                self.model_info.training_info.status_transitions.append(
                    dict(
                        status=status,
                        timestamp=datetime.now().isoformat(),
                        detail=f"{str(exc_type.__name__)}:  {str(exc_value)}",
                    )
                )
            # Persist the updated model info before uploading it as an artifact.
            self.model_info.dump_json(os.path.join(self.output_dir, ArtifactName.INFO))
            self._sync_train_job(
                sync_info=HubSyncLocalTraining(
                    status=status,
                    iterations=self.iteration,
                    training_info=self.model_info.training_info,
                ),
                upload_artifacts=[
                    ArtifactName.WEIGHTS,
                    ArtifactName.LOGS,
                    ArtifactName.INFO,
                    ArtifactName.METRICS,
                ],
            )

    def _sync_train_job(self, sync_info: HubSyncLocalTraining, upload_artifacts: Optional[List[ArtifactName]] = None):
        # Best-effort sync: network/remote failures must never interrupt training,
        # so any exception is logged and swallowed.
        try:
            self.remote_model.sync_local_training_job(sync_info, self.output_dir, upload_artifacts)
            # logger.debug(f"Sync: {self.iteration} {self.model_info.name} ref: {self.model_info.ref}")
        except Exception as e:
            logger.error(f"[sync_train_job] failed to sync train job: {str(e)}")

before_train() #

Called before the first iteration.

Source code in focoos/trainer/hooks/sync_to_hub.py
37
38
39
40
41
42
43
44
45
46
47
def before_train(self):
    """Report an initial TRAINING_RUNNING status (zero iterations, no metrics) before the first iteration."""
    initial_sync = HubSyncLocalTraining(
        status=ModelStatus.TRAINING_RUNNING,  # type: ignore
        iterations=0,
        metrics=None,
    )
    self._sync_train_job(initial_sync)