Training hang detection based on XPU Timer metric. (#1288)

* done hang check basic impl and test
* optimized
* optimized
* add strategy params
* deve
* revert
* lint
* refactor training hang impl
* fix
* fix ut
* fix comments
intelligent-machine-learning · Nov 13, 2024 · 07b18ac · 07b18ac
1 parent 3e3ab46
commit 07b18ac
Show file tree

Hide file tree

Showing 30 changed files with 1,637 additions and 368 deletions.
diff --git a/dlrover/python/common/constants.py b/dlrover/python/common/constants.py
@@ -340,8 +340,11 @@ class AscendConstants(object):
 
 class ErrorMonitorConstants(object):
     TYPE_INFO = "info"
+    TYPE_WARN = "warn"
     TYPE_ERROR = "error"
 
+    JOB_INSTANCE = "job"
+
     ACTION_WORKER_CREATE = "worker_create"
     ACTION_STATUS_UPDATE = "status_update"
     ACTION_EARLY_STOP = "early_stop"
@@ -354,3 +357,4 @@ class ErrorMonitorConstants(object):
     ACTION_RDZV_TIMEOUT = "rendezvous_timeout"
     ACTION_TRAINING_START = "training_start"
     ACTION_RESTART_TRAINING = "restart_training"
+    ACTION_HANG_WARN = "hang_warning"
diff --git a/dlrover/python/common/global_context.py b/dlrover/python/common/global_context.py
@@ -53,6 +53,7 @@ class DefaultValues(object):
     SEC_TO_CHANGE_PS = 3600  # 1h
     SEC_TO_WAIT_FAILED_PS = 600  # 10min
     HANG_CPU_USAGE_RATE = 0.05
+    HANG_DETECTION = 1
 
 
 class Context(Singleton):
@@ -95,6 +96,9 @@ def __init__(self):
         self.is_tfv1_ps = False
         self.master_port = None
         self.relaunch_always = False
+        # The strategy of 'hang detection':
+        # 0: log only; 1: notify; 2: with fault tolerance
+        self.hang_detection = DefaultValues.HANG_DETECTION
 
     def set_params_from_brain(self):
         self.train_speed_record_num = self.get_param_value_from_brain(

diff --git a/dlrover/python/diagnosis/common/constants.py b/dlrover/python/diagnosis/common/constants.py
@@ -24,7 +24,7 @@ class InferenceConfigKey(object):
 class DiagnosisConstant(object):
     MASTER_DIAGNOSIS_OBSERVING_INTERVAL_SECS = 180
     AGENT_PERIODICALLY_DIAGNOSIS_INTERVAL_SECS = 60
-    MASTER = -1
+    MASTER_INSTANCE = -1
     ANY_INSTANCE = -2
     LOCAL_INSTANCE = -3
     ACTION_EXPIRED_TIME_PERIOD_DEFAULT = 60 * 5

diff --git a/dlrover/python/diagnosis/common/inference_chain.py b/dlrover/python/diagnosis/common/inference_chain.py
@@ -21,6 +21,7 @@ class InferenceName:
     TRAINING = "training"
     NODE = "node"
     WORKER = "worker"
+    ACTION = "action"
 
 
 class InferenceAttribute:
@@ -31,9 +32,11 @@ class InferenceAttribute:
 
 
 class InferenceDescription:
+    NONE = "n/a"
     HANG = "hang"
     FAILURE = "failure"
     METRICS = "metrics"
+    EVENT = "event"
 
 
 @dataclass

diff --git a/dlrover/python/diagnosis/inferencechain/coordinator.py b/dlrover/python/diagnosis/inferencechain/coordinator.py
@@ -10,12 +10,52 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+import json
 from typing import List
 
-from dlrover.python.diagnosis.common.diagnosis_action import DiagnosisAction
-from dlrover.python.diagnosis.common.inference_chain import Inference
+from dlrover.python.common.global_context import Context
+from dlrover.python.diagnosis.common.diagnosis_action import (
+    DiagnosisAction,
+    EventAction,
+    NoAction,
+)
+from dlrover.python.diagnosis.common.inference_chain import (
+    Inference,
+    InferenceAttribute,
+    InferenceDescription,
+    InferenceName,
+    is_same_inference,
+)
+
+_dlrover_ctx = Context.singleton_instance()
+
+
+def coordinate_solutions(solutions: List[Inference]) -> DiagnosisAction:
+    """
+    Transform solutions (of Inference) to executable diagnosis action
+
+    Args:
+        solutions: solutions of Inference
+    Return:
+        diagnosis action
+    """
+
+    event_solution = Inference(
+        name=InferenceName.ACTION,
+        attribution=InferenceAttribute.IS,
+        description=InferenceDescription.EVENT,
+    )
 
+    for solution in solutions:
+        # deal with event
+        if is_same_inference(solution, event_solution):
+            event_payload = solution.configs
+            return EventAction(
+                event_payload["event_type"],
+                event_payload["event_instance"],
+                event_payload["event_action"],
+                event_payload["event_msg"],
+                json.loads(event_payload["event_labels"]),
+            )
 
-def coordinate_inferences(observations: List[Inference]) -> DiagnosisAction:
-    return DiagnosisAction()
+    return NoAction()
diff --git a/dlrover/python/diagnosis/inferencechain/inferenceoperator/check_training_hang_operator.py b/dlrover/python/diagnosis/inferencechain/inferenceoperator/check_training_hang_operator.py
diff --git a/...is/inferencechain/coordinate_solutions.py → ...in/inferenceoperator/observer/__init__.py b/...is/inferencechain/coordinate_solutions.py → ...in/inferenceoperator/observer/__init__.py
@@ -10,20 +10,3 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-from typing import List
-
-from dlrover.python.diagnosis.common.diagnosis_action import DiagnosisAction
-from dlrover.python.diagnosis.common.inference_chain import Inference
-
-
-def coordinate_solutions(solutions: List[Inference]) -> DiagnosisAction:
-    """
-    Transform solutions (of Inference) to executable diagnosis action
-
-    Args:
-        solutions: solutions of Inference
-    Return:
-        diagnosis action
-    """
-    return DiagnosisAction()
diff --git a/...ceoperator/check_failure_node_operator.py → ...r/observer/check_failure_node_operator.py b/...ceoperator/check_failure_node_operator.py → ...r/observer/check_failure_node_operator.py