Skip to content

Commit

Permalink
Training hang detection based on XPU Timer metric. (#1288)
Browse files Browse the repository at this point in the history
* done hang check basic impl and test

* optimized

* optimized

* add strategy params

* deve

* revert

* lint

* refactor training hang impl

* fix

* fix ut

* fix comments
  • Loading branch information
BalaBalaYi authored Nov 13, 2024
1 parent 3e3ab46 commit 07b18ac
Show file tree
Hide file tree
Showing 30 changed files with 1,637 additions and 368 deletions.
4 changes: 4 additions & 0 deletions dlrover/python/common/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,8 +340,11 @@ class AscendConstants(object):

class ErrorMonitorConstants(object):
TYPE_INFO = "info"
TYPE_WARN = "warn"
TYPE_ERROR = "error"

JOB_INSTANCE = "job"

ACTION_WORKER_CREATE = "worker_create"
ACTION_STATUS_UPDATE = "status_update"
ACTION_EARLY_STOP = "early_stop"
Expand All @@ -354,3 +357,4 @@ class ErrorMonitorConstants(object):
ACTION_RDZV_TIMEOUT = "rendezvous_timeout"
ACTION_TRAINING_START = "training_start"
ACTION_RESTART_TRAINING = "restart_training"
ACTION_HANG_WARN = "hang_warning"
4 changes: 4 additions & 0 deletions dlrover/python/common/global_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ class DefaultValues(object):
SEC_TO_CHANGE_PS = 3600 # 1h
SEC_TO_WAIT_FAILED_PS = 600 # 10min
HANG_CPU_USAGE_RATE = 0.05
HANG_DETECTION = 1


class Context(Singleton):
Expand Down Expand Up @@ -95,6 +96,9 @@ def __init__(self):
self.is_tfv1_ps = False
self.master_port = None
self.relaunch_always = False
# The strategy of 'hang detection':
# 0: log only; 1: notify; 2: with fault tolerance
self.hang_detection = DefaultValues.HANG_DETECTION

def set_params_from_brain(self):
self.train_speed_record_num = self.get_param_value_from_brain(
Expand Down
2 changes: 1 addition & 1 deletion dlrover/python/diagnosis/common/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ class InferenceConfigKey(object):
class DiagnosisConstant(object):
MASTER_DIAGNOSIS_OBSERVING_INTERVAL_SECS = 180
AGENT_PERIODICALLY_DIAGNOSIS_INTERVAL_SECS = 60
MASTER = -1
MASTER_INSTANCE = -1
ANY_INSTANCE = -2
LOCAL_INSTANCE = -3
ACTION_EXPIRED_TIME_PERIOD_DEFAULT = 60 * 5
Expand Down
3 changes: 3 additions & 0 deletions dlrover/python/diagnosis/common/inference_chain.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ class InferenceName:
TRAINING = "training"
NODE = "node"
WORKER = "worker"
ACTION = "action"


class InferenceAttribute:
Expand All @@ -31,9 +32,11 @@ class InferenceAttribute:


class InferenceDescription:
    """String constants used as the ``description`` of an inference."""

    NONE = "n/a"
    HANG = "hang"
    FAILURE = "failure"
    METRICS = "metrics"
    EVENT = "event"


@dataclass
Expand Down
50 changes: 45 additions & 5 deletions dlrover/python/diagnosis/inferencechain/coordinator.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,52 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
from typing import List

from dlrover.python.diagnosis.common.diagnosis_action import DiagnosisAction
from dlrover.python.diagnosis.common.inference_chain import Inference
from dlrover.python.common.global_context import Context
from dlrover.python.diagnosis.common.diagnosis_action import (
DiagnosisAction,
EventAction,
NoAction,
)
from dlrover.python.diagnosis.common.inference_chain import (
Inference,
InferenceAttribute,
InferenceDescription,
InferenceName,
is_same_inference,
)

_dlrover_ctx = Context.singleton_instance()


def coordinate_solutions(solutions: List[Inference]) -> DiagnosisAction:
"""
Transform solutions (of Inference) to an executable diagnosis action.

The solutions are scanned in order. The first one for which
is_same_inference() reports a match with the "action is event" inference
is converted into an EventAction built from that solution's configs.

Args:
solutions: solutions of Inference
Return:
diagnosis action
"""

# Reference inference that event-type solutions are compared against.
event_solution = Inference(
name=InferenceName.ACTION,
attribution=InferenceAttribute.IS,
description=InferenceDescription.EVENT,
)

for solution in solutions:
# deal with event
if is_same_inference(solution, event_solution):
# The configs mapping must contain every "event_*" key read below;
# a missing key raises KeyError.
event_payload = solution.configs
return EventAction(
event_payload["event_type"],
event_payload["event_instance"],
event_payload["event_action"],
event_payload["event_msg"],
# The labels are decoded from their JSON text form.
json.loads(event_payload["event_labels"]),
)

# NOTE(review): this text was extracted from a diff view that lost its +/-
# markers and indentation. The next two lines look like the removed
# pre-change stub, and `return NoAction()` looks like the fall-through return
# of coordinate_solutions when no event solution matched - confirm against
# the repository before relying on this.
def coordinate_inferences(observations: List[Inference]) -> DiagnosisAction:
return DiagnosisAction()
return NoAction()

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -10,20 +10,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List

from dlrover.python.diagnosis.common.diagnosis_action import DiagnosisAction
from dlrover.python.diagnosis.common.inference_chain import Inference


def coordinate_solutions(solutions: List[Inference]) -> DiagnosisAction:
    """
    Transform solutions (of Inference) to an executable diagnosis action.

    This version is a placeholder: the given solutions are not inspected and
    a default-constructed DiagnosisAction is always returned.

    Args:
        solutions: solutions of Inference
    Return:
        diagnosis action
    """
    return DiagnosisAction()
Loading

0 comments on commit 07b18ac

Please sign in to comment.