Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Training hang detection based on XPU Timer metric. #1288

Merged
4 changes: 4 additions & 0 deletions dlrover/python/common/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -339,8 +339,11 @@ class AscendConstants(object):

class ErrorMonitorConstants(object):
TYPE_INFO = "info"
TYPE_WARN = "warn"
TYPE_ERROR = "error"

JOB_INSTANCE = "job"

ACTION_WORKER_CREATE = "worker_create"
ACTION_STATUS_UPDATE = "status_update"
ACTION_EARLY_STOP = "early_stop"
Expand All @@ -353,3 +356,4 @@ class ErrorMonitorConstants(object):
ACTION_RDZV_TIMEOUT = "rendezvous_timeout"
ACTION_TRAINING_START = "training_start"
ACTION_RESTART_TRAINING = "restart_training"
ACTION_HANG_WARN = "hang_warning"
4 changes: 4 additions & 0 deletions dlrover/python/common/global_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ class DefaultValues(object):
SEC_TO_CHANGE_PS = 3600 # 1h
SEC_TO_WAIT_FAILED_PS = 600 # 10min
HANG_CPU_USAGE_RATE = 0.05
HANG_DETECTION = 1


class Context(Singleton):
Expand Down Expand Up @@ -95,6 +96,9 @@ def __init__(self):
self.is_tfv1_ps = False
self.master_port = None
self.relaunch_always = False
# The strategy of 'hang detection':
# 0: log only; 1: notify; 2: with fault tolerance
self.hang_detection = DefaultValues.HANG_DETECTION

def set_params_from_brain(self):
self.train_speed_record_num = self.get_param_value_from_brain(
Expand Down
2 changes: 1 addition & 1 deletion dlrover/python/diagnosis/common/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ class InferenceConfigKey(object):
class DiagnosisConstant(object):
MASTER_DIAGNOSIS_OBSERVING_INTERVAL_SECS = 180
AGENT_PERIODICALLY_DIAGNOSIS_INTERVAL_SECS = 60
MASTER = -1
MASTER_INSTANCE = -1
ANY_INSTANCE = -2
LOCAL_INSTANCE = -3
ACTION_EXPIRED_TIME_PERIOD_DEFAULT = 60 * 5
Expand Down
3 changes: 3 additions & 0 deletions dlrover/python/diagnosis/common/inference_chain.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ class InferenceName:
TRAINING = "training"
NODE = "node"
WORKER = "worker"
ACTION = "action"


class InferenceAttribute:
Expand All @@ -31,9 +32,11 @@ class InferenceAttribute:


class InferenceDescription:
    """String constants used as the ``description`` field of an Inference."""

    NONE = "n/a"
    HANG = "hang"
    FAILURE = "failure"
    METRICS = "metrics"
    # Paired with InferenceName.ACTION to mark a solution that should be
    # turned into an event-reporting action.
    EVENT = "event"


@dataclass
Expand Down
50 changes: 45 additions & 5 deletions dlrover/python/diagnosis/inferencechain/coordinator.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,52 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
from typing import List

from dlrover.python.diagnosis.common.diagnosis_action import DiagnosisAction
from dlrover.python.diagnosis.common.inference_chain import Inference
from dlrover.python.common.global_context import Context
from dlrover.python.diagnosis.common.diagnosis_action import (
DiagnosisAction,
EventAction,
NoAction,
)
from dlrover.python.diagnosis.common.inference_chain import (
Inference,
InferenceAttribute,
InferenceDescription,
InferenceName,
is_same_inference,
)

_dlrover_ctx = Context.singleton_instance()


def coordinate_solutions(solutions: List[Inference]) -> DiagnosisAction:
    """
    Transform solutions (of Inference) to an executable diagnosis action.

    The solutions are scanned in order. The first one that
    ``is_same_inference`` reports as matching the "action is event"
    inference is converted into an ``EventAction`` built from that
    solution's ``configs``; later solutions are not looked at.

    Args:
        solutions: solutions of Inference
    Return:
        An ``EventAction`` for the first event solution found, otherwise
        a ``NoAction``.
    Raises:
        KeyError: presumably, when a matching solution's ``configs`` lacks
            one of the ``event_*`` keys read below (assumes ``configs`` is
            a dict -- TODO confirm).
        json.JSONDecodeError: when ``event_labels`` is not valid JSON.
    """

    # Reference inference that a solution must match to be treated as an
    # event.
    event_solution = Inference(
        name=InferenceName.ACTION,
        attribution=InferenceAttribute.IS,
        description=InferenceDescription.EVENT,
    )

    for solution in solutions:
        # deal with event
        if is_same_inference(solution, event_solution):
            event_payload = solution.configs
            return EventAction(
                event_payload["event_type"],
                event_payload["event_instance"],
                event_payload["event_action"],
                event_payload["event_msg"],
                # The labels are carried as a JSON-encoded string.
                json.loads(event_payload["event_labels"]),
            )

    # No solution asked for an event.
    return NoAction()


def coordinate_inferences(observations: List[Inference]) -> DiagnosisAction:
    # NOTE(review): this looks like the earlier stub that
    # coordinate_solutions supersedes; it ignores its argument. Confirm
    # whether any caller still needs it before removing.
    return DiagnosisAction()

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -10,20 +10,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List

from dlrover.python.diagnosis.common.diagnosis_action import DiagnosisAction
from dlrover.python.diagnosis.common.inference_chain import Inference


def coordinate_solutions(solutions: List[Inference]) -> DiagnosisAction:
    """
    Transform solutions (of Inference) to an executable diagnosis action.

    This implementation is a placeholder: it does not inspect the
    solutions.

    Args:
        solutions: solutions of Inference
    Return:
        A default-constructed ``DiagnosisAction``.
    """
    return DiagnosisAction()
Loading
Loading