squash

commaai · Sep 25, 2024 · 4cc0969 · 4cc0969
1 parent b4865dc
commit 4cc0969
Show file tree

Hide file tree

Showing 8 changed files with 40 additions and 41 deletions.
diff --git a/selfdrive/modeld/modeld.py b/selfdrive/modeld/modeld.py
@@ -34,6 +34,10 @@
 
 METADATA_PATH = Path(__file__).parent / 'models/supercombo_metadata.pkl'
 
+MODEL_WIDTH = 512
+MODEL_HEIGHT = 256
+MODEL_FRAME_SIZE = MODEL_WIDTH * MODEL_HEIGHT * 3 // 2
+
 class FrameMeta:
   frame_id: int = 0
   timestamp_sof: int = 0
@@ -63,6 +67,9 @@ def __init__(self, context: CLContext):
       'features_buffer': np.zeros(ModelConstants.HISTORY_BUFFER_LEN * ModelConstants.FEATURE_LEN, dtype=np.float32),
     }
 
+    self.input_imgs = np.zeros(MODEL_FRAME_SIZE*2, dtype=np.uint8)
+    self.big_input_imgs = np.zeros(MODEL_FRAME_SIZE*2, dtype=np.uint8)
+
     with open(METADATA_PATH, 'rb') as f:
       model_metadata = pickle.load(f)
 
@@ -94,10 +101,16 @@ def run(self, buf: VisionBuf, wbuf: VisionBuf, transform: np.ndarray, transform_
     self.inputs['traffic_convention'][:] = inputs['traffic_convention']
     self.inputs['lateral_control_params'][:] = inputs['lateral_control_params']
 
-    # if getCLBuffer is not None, frame will be None
-    self.model.setInputBuffer("input_imgs", self.frame.prepare(buf, transform.flatten(), self.model.getCLBuffer("input_imgs")))
+    new_img = self.frame.prepare(buf, transform.flatten(), self.model.getCLBuffer("input_imgs"))
+    self.input_imgs[:MODEL_FRAME_SIZE] = self.input_imgs[-MODEL_FRAME_SIZE:]
+    self.input_imgs[MODEL_FRAME_SIZE:] = new_img[:]
+    self.model.setInputBuffer("input_imgs", self.input_imgs.view(np.float32))
     if wbuf is not None:
-      self.model.setInputBuffer("big_input_imgs", self.wide_frame.prepare(wbuf, transform_wide.flatten(), self.model.getCLBuffer("big_input_imgs")))
+      new_big_img = self.wide_frame.prepare(wbuf, transform_wide.flatten(), self.model.getCLBuffer("big_input_imgs"))
+      self.big_input_imgs[:MODEL_FRAME_SIZE] = self.big_input_imgs[-MODEL_FRAME_SIZE:]
+      self.big_input_imgs[MODEL_FRAME_SIZE:] = new_big_img[:]
+      self.model.setInputBuffer("big_input_imgs", self.big_input_imgs.view(np.float32))
+
 
     if prepare_only:
       return None
@@ -174,7 +187,6 @@ def main(demo=False):
     CP = convert_to_capnp(get_demo_car_params())
   else:
     CP = messaging.log_from_bytes(params.get("CarParams", block=True), car.CarParams)
-
   cloudlog.info("modeld got CarParams: %s", CP.carName)
 
   # TODO this needs more thought, use .2s extra for now to estimate other delays

diff --git a/selfdrive/modeld/models/commonmodel.cc b/selfdrive/modeld/models/commonmodel.cc
@@ -7,36 +7,26 @@
 #include "common/clutil.h"
 
 ModelFrame::ModelFrame(cl_device_id device_id, cl_context context) {
-  input_frames = std::make_unique<float[]>(buf_size);
+  frame = std::make_unique<uint8_t[]>(MODEL_FRAME_SIZE);
 
   q = CL_CHECK_ERR(clCreateCommandQueue(context, device_id, 0, &err));
   y_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, MODEL_WIDTH * MODEL_HEIGHT, NULL, &err));
   u_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, (MODEL_WIDTH / 2) * (MODEL_HEIGHT / 2), NULL, &err));
   v_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, (MODEL_WIDTH / 2) * (MODEL_HEIGHT / 2), NULL, &err));
-  net_input_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, MODEL_FRAME_SIZE * sizeof(float), NULL, &err));
+  net_input_cl = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, MODEL_FRAME_SIZE * sizeof(uint8_t), NULL, &err));
 
   transform_init(&transform, context, device_id);
   loadyuv_init(&loadyuv, context, device_id, MODEL_WIDTH, MODEL_HEIGHT);
 }
 
-float* ModelFrame::prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3 &projection, cl_mem *output) {
+uint8_t* ModelFrame::prepare(cl_mem yuv_cl, int frame_width, int frame_height, int frame_stride, int frame_uv_offset, const mat3 &projection, cl_mem *output) {
   transform_queue(&this->transform, q,
                   yuv_cl, frame_width, frame_height, frame_stride, frame_uv_offset,
                   y_cl, u_cl, v_cl, MODEL_WIDTH, MODEL_HEIGHT, projection);
-
-  if (output == NULL) {
-    loadyuv_queue(&loadyuv, q, y_cl, u_cl, v_cl, net_input_cl);
-
-    std::memmove(&input_frames[0], &input_frames[MODEL_FRAME_SIZE], sizeof(float) * MODEL_FRAME_SIZE);
-    CL_CHECK(clEnqueueReadBuffer(q, net_input_cl, CL_TRUE, 0, MODEL_FRAME_SIZE * sizeof(float), &input_frames[MODEL_FRAME_SIZE], 0, nullptr, nullptr));
-    clFinish(q);
-    return &input_frames[0];
-  } else {
-    loadyuv_queue(&loadyuv, q, y_cl, u_cl, v_cl, *output, true);
-    // NOTE: Since thneed is using a different command queue, this clFinish is needed to ensure the image is ready.
-    clFinish(q);
-    return NULL;
-  }
+  loadyuv_queue(&loadyuv, q, y_cl, u_cl, v_cl, net_input_cl);
+  CL_CHECK(clEnqueueReadBuffer(q, net_input_cl, CL_TRUE, 0, MODEL_FRAME_SIZE * sizeof(uint8_t), &frame[0], 0, nullptr, nullptr));
+  clFinish(q);
+  return &frame[0];
 }
 
 ModelFrame::~ModelFrame() {

diff --git a/selfdrive/modeld/models/commonmodel.h b/selfdrive/modeld/models/commonmodel.h
@@ -20,17 +20,16 @@ class ModelFrame {
 public:
   ModelFrame(cl_device_id device_id, cl_context context);
   ~ModelFrame();
-  float* prepare(cl_mem yuv_cl, int width, int height, int frame_stride, int frame_uv_offset, const mat3& transform, cl_mem *output);
+  uint8_t* prepare(cl_mem yuv_cl, int width, int height, int frame_stride, int frame_uv_offset, const mat3& transform, cl_mem *output);
 
   const int MODEL_WIDTH = 512;
   const int MODEL_HEIGHT = 256;
   const int MODEL_FRAME_SIZE = MODEL_WIDTH * MODEL_HEIGHT * 3 / 2;
-  const int buf_size = MODEL_FRAME_SIZE * 2;
 
 private:
   Transform transform;
   LoadYUVState loadyuv;
   cl_command_queue q;
   cl_mem y_cl, u_cl, v_cl, net_input_cl;
-  std::unique_ptr<float[]> input_frames;
+  std::unique_ptr<uint8_t[]> frame;
 };
diff --git a/selfdrive/modeld/models/commonmodel.pxd b/selfdrive/modeld/models/commonmodel.pxd
@@ -13,6 +13,6 @@ cdef extern from "common/clutil.h":
 
 cdef extern from "selfdrive/modeld/models/commonmodel.h":
   cppclass ModelFrame:
-    int buf_size
+    int MODEL_FRAME_SIZE
     ModelFrame(cl_device_id, cl_context)
-    float * prepare(cl_mem, int, int, int, int, mat3, cl_mem*)
+    unsigned char * prepare(cl_mem, int, int, int, int, mat3, cl_mem*)
diff --git a/selfdrive/modeld/models/commonmodel_pyx.pyx b/selfdrive/modeld/models/commonmodel_pyx.pyx
@@ -35,11 +35,11 @@ cdef class ModelFrame:
   def prepare(self, VisionBuf buf, float[:] projection, CLMem output):
     cdef mat3 cprojection
     memcpy(cprojection.v, &projection[0], 9*sizeof(float))
-    cdef float * data
+    cdef unsigned char * data
     if output is None:
       data = self.frame.prepare(buf.buf.buf_cl, buf.width, buf.height, buf.stride, buf.uv_offset, cprojection, NULL)
     else:
       data = self.frame.prepare(buf.buf.buf_cl, buf.width, buf.height, buf.stride, buf.uv_offset, cprojection, output.mem)
     if not data:
       return None
-    return np.asarray(<cnp.float32_t[:self.frame.buf_size]> data)
+    return np.asarray(<cnp.uint8_t[:self.frame.MODEL_FRAME_SIZE]> data)
diff --git a/selfdrive/modeld/models/supercombo.onnx b/selfdrive/modeld/models/supercombo.onnx
diff --git a/selfdrive/modeld/transforms/loadyuv.cl b/selfdrive/modeld/transforms/loadyuv.cl
@@ -1,7 +1,7 @@
 #define UV_SIZE ((TRANSFORMED_WIDTH/2)*(TRANSFORMED_HEIGHT/2))
 
 __kernel void loadys(__global uchar8 const * const Y,
-                     __global float * out,
+                     __global uchar * out,
                      int out_offset)
 {
     const int gid = get_global_id(0);
@@ -10,13 +10,12 @@ __kernel void loadys(__global uchar8 const * const Y,
     const int ox = ois % TRANSFORMED_WIDTH;
 
     const uchar8 ys = Y[gid];
-    const float8 ysf = convert_float8(ys);
 
     // 02
     // 13
 
-    __global float* outy0;
-    __global float* outy1;
+    __global uchar* outy0;
+    __global uchar* outy1;
     if ((oy & 1) == 0) {
       outy0 = out + out_offset; //y0
       outy1 = out + out_offset + UV_SIZE*2; //y2
@@ -25,21 +24,20 @@ __kernel void loadys(__global uchar8 const * const Y,
       outy1 = out + out_offset + UV_SIZE*3; //y3
     }
 
-    vstore4(ysf.s0246, 0, outy0 + (oy/2) * (TRANSFORMED_WIDTH/2) + ox/2);
-    vstore4(ysf.s1357, 0, outy1 + (oy/2) * (TRANSFORMED_WIDTH/2) + ox/2);
+    vstore4(ys.s0246, 0, outy0 + (oy/2) * (TRANSFORMED_WIDTH/2) + ox/2);
+    vstore4(ys.s1357, 0, outy1 + (oy/2) * (TRANSFORMED_WIDTH/2) + ox/2);
 }
 
 __kernel void loaduv(__global uchar8 const * const in,
-                     __global float8 * out,
+                     __global uchar8 * out,
                      int out_offset)
 {
   const int gid = get_global_id(0);
   const uchar8 inv = in[gid];
-  const float8 outv  = convert_float8(inv);
-  out[gid + out_offset / 8] = outv;
+  out[gid + out_offset / 8] = inv;
 }
 
-__kernel void copy(__global float8 * inout,
+__kernel void copy(__global uchar8 * inout,
                    int in_offset)
 {
   const int gid = get_global_id(0);

diff --git a/tinygrad_repo b/tinygrad_repo