exec DM model with gpu (#33609)

* half old-commit-hash: 9f72eca * optimized old-commit-hash: 6e36e2a * thneed old-commit-hash: 419a06c * exec old-commit-hash: 0059c27 * runner old-commit-hash: 34232ad * runs but old-commit-hash: 3db37c0 * it is 01 old-commit-hash: a160d81 * np old-commit-hash: c1caff6 * module url old-commit-hash: 6f4902c * new old-commit-hash: 779ae79 * ds fast * is this work * correction * real timing * no reg * interim gather * 0e4a9c7b * fa69be0, and halve * list * cleanup * slightly faster * setproctitle * expected * replay ref * more power * reluctantly * bump tg * 8 * less * less * bump tg * better than exp * closer * cc * see diff * commits * was right * to 32 cast * remove dlc file * support both * dspExecutionTime -> gpuExecutionTime * ignore * time ref * ref commit * last --------- Co-authored-by: Comma Device <[email protected]>
commaai · Sep 26, 2024 · 876f192 · 876f192
1 parent e2f9942
commit 876f192
Show file tree

Hide file tree

Showing 17 changed files with 53 additions and 35 deletions.
diff --git a/.gitattributes b/.gitattributes
@@ -2,7 +2,6 @@
 
 # to move existing files into LFS:
 # git add --renormalize .
-*.dlc filter=lfs diff=lfs merge=lfs -text
 *.onnx filter=lfs diff=lfs merge=lfs -text
 *.svg filter=lfs diff=lfs merge=lfs -text
 *.png filter=lfs diff=lfs merge=lfs -text

diff --git a/cereal/log.capnp b/cereal/log.capnp
@@ -2012,7 +2012,8 @@ struct Joystick {
 struct DriverStateV2 {
   frameId @0 :UInt32;
   modelExecutionTime @1 :Float32;
-  dspExecutionTime @2 :Float32;
+  dspExecutionTimeDEPRECATED @2 :Float32;
+  gpuExecutionTime @8 :Float32;
   rawPredictions @3 :Data;
 
   poorVisionProb @4 :Float32;

diff --git a/selfdrive/modeld/SConscript b/selfdrive/modeld/SConscript
@@ -69,6 +69,10 @@ if arch == "larch64" or GetOption('pc_thneed'):
 
   lenv.Command(fn + ".thneed", [fn + ".onnx"] + tinygrad_files, cmd)
 
+  fn_dm = File("models/dmonitoring_model").abspath
+  cmd = f"cd {Dir('#').abspath}/tinygrad_repo && " + ' '.join(tinygrad_opts) + f" python3 openpilot/compile2.py {fn_dm}.onnx {fn_dm}.thneed"
+  lenv.Command(fn_dm + ".thneed", [fn_dm + ".onnx"] + tinygrad_files, cmd)
+
   thneed_lib = env.SharedLibrary('thneed', thneed_src, LIBS=[gpucommon, common, 'OpenCL', 'dl'])
   thneedmodel_lib = env.Library('thneedmodel', ['runners/thneedmodel.cc'])
   lenvCython.Program('runners/thneedmodel_pyx.so', 'runners/thneedmodel_pyx.pyx', LIBS=envCython["LIBS"]+[thneedmodel_lib, thneed_lib, gpucommon, common, 'dl', 'OpenCL'])
diff --git a/selfdrive/modeld/dmonitoringmodeld b/selfdrive/modeld/dmonitoringmodeld
@@ -0,0 +1,10 @@
+#!/usr/bin/env bash
+
+DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null && pwd)"
+cd "$DIR/../../"
+
+if [ -f "$DIR/libthneed.so" ]; then
+  export LD_PRELOAD="$DIR/libthneed.so"
+fi
+
+exec "$DIR/dmonitoringmodeld.py" "$@"
diff --git a/selfdrive/modeld/dmonitoringmodeld.py b/selfdrive/modeld/dmonitoringmodeld.py
@@ -6,6 +6,7 @@
 import ctypes
 import numpy as np
 from pathlib import Path
+from setproctitle import setproctitle
 
 from cereal import messaging
 from cereal.messaging import PubMaster, SubMaster
@@ -14,16 +15,19 @@
 from openpilot.common.params import Params
 from openpilot.common.realtime import set_realtime_priority
 from openpilot.selfdrive.modeld.runners import ModelRunner, Runtime
+from openpilot.selfdrive.modeld.models.commonmodel_pyx import CLContext
 from openpilot.selfdrive.modeld.parse_model_outputs import sigmoid
 
 CALIB_LEN = 3
-REG_SCALE = 0.25
 MODEL_WIDTH = 1440
 MODEL_HEIGHT = 960
-OUTPUT_SIZE = 84
+FEATURE_LEN = 512
+OUTPUT_SIZE = 84 + FEATURE_LEN
+
+PROCESS_NAME = "selfdrive.modeld.dmonitoringmodeld"
 SEND_RAW_PRED = os.getenv('SEND_RAW_PRED')
 MODEL_PATHS = {
-  ModelRunner.SNPE: Path(__file__).parent / 'models/dmonitoring_model_q.dlc',
+  ModelRunner.THNEED: Path(__file__).parent / 'models/dmonitoring_model.thneed',
   ModelRunner.ONNX: Path(__file__).parent / 'models/dmonitoring_model.onnx'}
 
 class DriverStateResult(ctypes.Structure):
@@ -49,21 +53,22 @@ class DMonitoringModelResult(ctypes.Structure):
     ("driver_state_lhd", DriverStateResult),
     ("driver_state_rhd", DriverStateResult),
     ("poor_vision_prob", ctypes.c_float),
-    ("wheel_on_right_prob", ctypes.c_float)]
+    ("wheel_on_right_prob", ctypes.c_float),
+    ("features", ctypes.c_float*FEATURE_LEN)]
 
 class ModelState:
   inputs: dict[str, np.ndarray]
   output: np.ndarray
   model: ModelRunner
 
-  def __init__(self):
+  def __init__(self, cl_ctx):
     assert ctypes.sizeof(DMonitoringModelResult) == OUTPUT_SIZE * ctypes.sizeof(ctypes.c_float)
     self.output = np.zeros(OUTPUT_SIZE, dtype=np.float32)
     self.inputs = {
       'input_img': np.zeros(MODEL_HEIGHT * MODEL_WIDTH, dtype=np.uint8),
       'calib': np.zeros(CALIB_LEN, dtype=np.float32)}
 
-    self.model = ModelRunner(MODEL_PATHS, self.output, Runtime.DSP, True, None)
+    self.model = ModelRunner(MODEL_PATHS, self.output, Runtime.GPU, False, cl_ctx)
     self.model.addInput("input_img", None)
     self.model.addInput("calib", self.inputs['calib'])
 
@@ -76,17 +81,17 @@ def run(self, buf:VisionBuf, calib:np.ndarray) -> tuple[np.ndarray, float]:
     input_data = self.inputs['input_img'].reshape(MODEL_HEIGHT, MODEL_WIDTH)
     input_data[:] = buf_data[v_offset:v_offset+MODEL_HEIGHT, h_offset:h_offset+MODEL_WIDTH]
 
-    t1 = time.perf_counter()
     self.model.setInputBuffer("input_img", self.inputs['input_img'].view(np.float32))
+    t1 = time.perf_counter()
     self.model.execute()
     t2 = time.perf_counter()
     return self.output, t2 - t1
 
 
 def fill_driver_state(msg, ds_result: DriverStateResult):
-  msg.faceOrientation = [x * REG_SCALE for x in ds_result.face_orientation]
+  msg.faceOrientation = list(ds_result.face_orientation)
   msg.faceOrientationStd = [math.exp(x) for x in ds_result.face_orientation_std]
-  msg.facePosition = [x * REG_SCALE for x in ds_result.face_position[:2]]
+  msg.facePosition = list(ds_result.face_position[:2])
   msg.facePositionStd = [math.exp(x) for x in ds_result.face_position_std[:2]]
   msg.faceProb = float(sigmoid(ds_result.face_prob))
   msg.leftEyeProb = float(sigmoid(ds_result.left_eye_prob))
@@ -98,13 +103,13 @@ def fill_driver_state(msg, ds_result: DriverStateResult):
   msg.readyProb = [float(sigmoid(x)) for x in ds_result.ready_prob]
   msg.notReadyProb = [float(sigmoid(x)) for x in ds_result.not_ready_prob]
 
-def get_driverstate_packet(model_output: np.ndarray, frame_id: int, location_ts: int, execution_time: float, dsp_execution_time: float):
+def get_driverstate_packet(model_output: np.ndarray, frame_id: int, location_ts: int, execution_time: float, gpu_execution_time: float):
   model_result = ctypes.cast(model_output.ctypes.data, ctypes.POINTER(DMonitoringModelResult)).contents
   msg = messaging.new_message('driverStateV2', valid=True)
   ds = msg.driverStateV2
   ds.frameId = frame_id
   ds.modelExecutionTime = execution_time
-  ds.dspExecutionTime = dsp_execution_time
+  ds.gpuExecutionTime = gpu_execution_time
   ds.poorVisionProb = float(sigmoid(model_result.poor_vision_prob))
   ds.wheelOnRightProb = float(sigmoid(model_result.wheel_on_right_prob))
   ds.rawPredictions = model_output.tobytes() if SEND_RAW_PRED else b''
@@ -115,14 +120,16 @@ def get_driverstate_packet(model_output: np.ndarray, frame_id: int, location_ts:
 
 def main():
   gc.disable()
+  setproctitle(PROCESS_NAME)
   set_realtime_priority(1)
 
-  model = ModelState()
+  cl_context = CLContext()
+  model = ModelState(cl_context)
   cloudlog.warning("models loaded, dmonitoringmodeld starting")
   Params().put_bool("DmModelInitialized", True)
 
   cloudlog.warning("connecting to driver stream")
-  vipc_client = VisionIpcClient("camerad", VisionStreamType.VISION_STREAM_DRIVER, True)
+  vipc_client = VisionIpcClient("camerad", VisionStreamType.VISION_STREAM_DRIVER, True, cl_context)
   while not vipc_client.connect(False):
     time.sleep(0.1)
   assert vipc_client.is_connected()
@@ -144,10 +151,10 @@ def main():
       calib[:] = np.array(sm["liveCalibration"].rpyCalib)
 
     t1 = time.perf_counter()
-    model_output, dsp_execution_time = model.run(buf, calib)
+    model_output, gpu_execution_time = model.run(buf, calib)
     t2 = time.perf_counter()
 
-    pm.send("driverStateV2", get_driverstate_packet(model_output, vipc_client.frame_id, vipc_client.timestamp_sof, t2 - t1, dsp_execution_time))
+    pm.send("driverStateV2", get_driverstate_packet(model_output, vipc_client.frame_id, vipc_client.timestamp_sof, t2 - t1, gpu_execution_time))
     # print("dmonitoring process: %.2fms, from last %.2fms\n" % (t2 - t1, t1 - last))
     # last = t1
 

diff --git a/selfdrive/modeld/models/dmonitoring_model.current b/selfdrive/modeld/models/dmonitoring_model.current
@@ -1,2 +1,2 @@
-5ec97a39-0095-4cea-adfa-6d72b1966cc1
-26cac7a9757a27c783a365403040a1bd27ccdaea
+fa69be01-b430-4504-9d72-7dcb058eb6dd
+d9fb22d1c4fa3ca3d201dbc8edf1d0f0918e53e6
diff --git a/selfdrive/modeld/models/dmonitoring_model.onnx b/selfdrive/modeld/models/dmonitoring_model.onnx
diff --git a/selfdrive/modeld/models/dmonitoring_model_q.dlc b/selfdrive/modeld/models/dmonitoring_model_q.dlc
diff --git a/selfdrive/modeld/runners/onnxmodel.py b/selfdrive/modeld/runners/onnxmodel.py
@@ -67,7 +67,6 @@ class ONNXModel(RunModel):
   def __init__(self, path, output, runtime, use_tf8, cl_context):
     self.inputs = {}
     self.output = output
-    self.use_tf8 = use_tf8
 
     self.session = create_ort_session(path, fp16_to_fp32=True)
     self.input_names = [x.name for x in self.session.get_inputs()]
@@ -91,7 +90,7 @@ def getCLBuffer(self, name):
     return None
 
   def execute(self):
-    inputs = {k: (v.view(np.uint8) / 255. if self.use_tf8 and k == 'input_img' else v) for k,v in self.inputs.items()}
+    inputs = {k: v.view(self.input_dtypes[k]) for k,v in self.inputs.items()}
     inputs = {k: v.reshape(self.input_shapes[k]).astype(self.input_dtypes[k]) for k,v in inputs.items()}
     outputs = self.session.run(None, inputs)
     assert len(outputs) == 1, "Only single model outputs are supported"

diff --git a/selfdrive/monitoring/helpers.py b/selfdrive/monitoring/helpers.py
@@ -33,8 +33,8 @@ def __init__(self):
     self._SG_THRESHOLD = 0.9
     self._BLINK_THRESHOLD = 0.865
 
-    self._EE_THRESH11 = 0.25
-    self._EE_THRESH12 = 7.5
+    self._EE_THRESH11 = 0.4
+    self._EE_THRESH12 = 15.0
     self._EE_MAX_OFFSET1 = 0.06
     self._EE_MIN_OFFSET1 = 0.025
     self._EE_THRESH21 = 0.01

diff --git a/selfdrive/test/process_replay/model_replay.py b/selfdrive/test/process_replay/model_replay.py
@@ -109,7 +109,7 @@ def model_replay(lr, frs):
         'modelV2.frameDropPerc',
         'modelV2.modelExecutionTime',
         'driverStateV2.modelExecutionTime',
-        'driverStateV2.dspExecutionTime'
+        'driverStateV2.gpuExecutionTime'
       ]
       if PC:
         # TODO We ignore whole bunch so we can compare important stuff

diff --git a/selfdrive/test/process_replay/model_replay_ref_commit b/selfdrive/test/process_replay/model_replay_ref_commit
@@ -1 +1 @@
-666448fce191e196aac68d06e29a0745e6620db9
+7cd64f431b814adfa11118643efe3822c496922b
diff --git a/selfdrive/test/process_replay/process_replay.py b/selfdrive/test/process_replay/process_replay.py
@@ -585,7 +585,7 @@ def selfdrived_config_callback(params, cfg, lr):
     proc_name="dmonitoringmodeld",
     pubs=["liveCalibration", "driverCameraState"],
     subs=["driverStateV2"],
-    ignore=["logMonoTime", "driverStateV2.modelExecutionTime", "driverStateV2.dspExecutionTime"],
+    ignore=["logMonoTime", "driverStateV2.modelExecutionTime", "driverStateV2.gpuExecutionTime"],
     should_recv_callback=dmonitoringmodeld_rcv_callback,
     tolerance=NUMPY_TOLERANCE,
     processing_time=0.020,

diff --git a/selfdrive/test/test_onroad.py b/selfdrive/test/test_onroad.py
@@ -32,6 +32,7 @@
 * total CPU usage of openpilot (sum(PROCS.values())
   should not exceed MAX_TOTAL_CPU
 """
+
 MAX_TOTAL_CPU = 265.  # total for all 8 cores
 PROCS = {
   # Baseline CPU usage by process
@@ -312,7 +313,7 @@ def test_memory_usage(self):
     assert max(mems) - min(mems) <= 3.0
 
   def test_gpu_usage(self):
-    assert self.gpu_procs == {"weston", "ui", "camerad", "selfdrive.modeld.modeld"}
+    assert self.gpu_procs == {"weston", "ui", "camerad", "selfdrive.modeld.modeld", "selfdrive.modeld.dmonitoringmodeld"}
 
   def test_camera_processing_time(self):
     result = "\n"

diff --git a/system/hardware/tici/tests/test_power_draw.py b/system/hardware/tici/tests/test_power_draw.py
@@ -34,7 +34,7 @@ def name(self):
 PROCS = [
   Proc(['camerad'], 2.1, msgs=['roadCameraState', 'wideRoadCameraState', 'driverCameraState']),
   Proc(['modeld'], 1.12, atol=0.2, msgs=['modelV2']),
-  Proc(['dmonitoringmodeld'], 0.4, msgs=['driverStateV2']),
+  Proc(['dmonitoringmodeld'], 0.5, msgs=['driverStateV2']),
   Proc(['encoderd'], 0.23, msgs=[]),
 ]
 

diff --git a/system/manager/process_config.py b/system/manager/process_config.py
@@ -70,7 +70,7 @@ def and_(*fns):
   PythonProcess("micd", "system.micd", iscar),
   PythonProcess("timed", "system.timed", always_run, enabled=not PC),
 
-  PythonProcess("dmonitoringmodeld", "selfdrive.modeld.dmonitoringmodeld", driverview, enabled=(not PC or WEBCAM)),
+  NativeProcess("dmonitoringmodeld", "selfdrive/modeld", ["./dmonitoringmodeld"], driverview, enabled=(not PC or WEBCAM)),
   NativeProcess("encoderd", "system/loggerd", ["./encoderd"], only_onroad),
   NativeProcess("stream_encoderd", "system/loggerd", ["./encoderd", "--stream"], notcar),
   NativeProcess("loggerd", "system/loggerd", ["./loggerd"], logging),

diff --git a/tinygrad_repo b/tinygrad_repo