Updates recognize.py to recognize gestures for 2 hands (#274)

google-ai-edge · Nov 23, 2023 · aee6ad1
1 parent 3a32886
commit aee6ad1
Showing 1 changed file with 56 additions and 52 deletions.
diff --git a/examples/gesture_recognizer/raspberry_pi/recognize.py b/examples/gesture_recognizer/raspberry_pi/recognize.py
@@ -67,11 +67,9 @@ def run(model: str, num_hands: int,
   fps_avg_frame_count = 10
 
   # Label box parameters
-  label_text_color = (0, 0, 0)  # red
-  label_background_color = (255, 255, 255)  # white
+  label_text_color = (255, 255, 255)  # white
   label_font_size = 1
   label_thickness = 2
-  label_padding_width = 100  # pixels
 
   recognition_frame = None
   recognition_result_list = []
@@ -123,54 +121,60 @@ def save_result(result: vision.GestureRecognizerResult,
     cv2.putText(current_frame, fps_text, text_location, cv2.FONT_HERSHEY_DUPLEX,
                 font_size, text_color, font_thickness, cv2.LINE_AA)
 
-    # Draw the hand landmarks.
     if recognition_result_list:
-        # Draw landmarks.
-        for hand_landmarks in recognition_result_list[0].hand_landmarks:
-            hand_landmarks_proto = landmark_pb2.NormalizedLandmarkList()
-            hand_landmarks_proto.landmark.extend([
-                landmark_pb2.NormalizedLandmark(x=landmark.x, y=landmark.y,
-                                                z=landmark.z) for landmark in
-                hand_landmarks
-            ])
-            mp_drawing.draw_landmarks(
-                current_frame,
-                hand_landmarks_proto,
-                mp_hands.HAND_CONNECTIONS,
-                mp_drawing_styles.get_default_hand_landmarks_style(),
-                mp_drawing_styles.get_default_hand_connections_style())
-
-    # Expand the frame to show the labels.
-    current_frame = cv2.copyMakeBorder(current_frame, 0, label_padding_width,
-                                       0, 0,
-                                       cv2.BORDER_CONSTANT, None,
-                                       label_background_color)
-
-    if recognition_result_list:
-      # Show top gesture classification.
-      gestures = recognition_result_list[0].gestures
-
-      if gestures:
-        # print(gestures)
-        category_name = gestures[0][0].category_name
-        score = round(gestures[0][0].score, 2)
-        result_text = category_name + ' (' + str(score) + ')'
-
-        # Compute text size
-        text_size = \
-        cv2.getTextSize(result_text, cv2.FONT_HERSHEY_DUPLEX, label_font_size,
-                        label_thickness)[0]
-        text_width, text_height = text_size
-
-        # Compute centered x, y coordinates
-        legend_x = (current_frame.shape[1] - text_width) // 2
-        legend_y = current_frame.shape[0] - (
-                    label_padding_width - text_height) // 2
-
-        # Draw the text
-        cv2.putText(current_frame, result_text, (legend_x, legend_y),
-                    cv2.FONT_HERSHEY_DUPLEX, label_font_size,
-                    label_text_color, label_thickness, cv2.LINE_AA)
+      # Draw landmarks and write the text for each hand.
+      for hand_index, hand_landmarks in enumerate(
+          recognition_result_list[0].hand_landmarks):
+        # Calculate the bounding box of the hand
+        x_min = min([landmark.x for landmark in hand_landmarks])
+        y_min = min([landmark.y for landmark in hand_landmarks])
+        y_max = max([landmark.y for landmark in hand_landmarks])
+
+        # Convert normalized coordinates to pixel values
+        frame_height, frame_width = current_frame.shape[:2]
+        x_min_px = int(x_min * frame_width)
+        y_min_px = int(y_min * frame_height)
+        y_max_px = int(y_max * frame_height)
+
+        # Get gesture classification results
+        if recognition_result_list[0].gestures:
+          gesture = recognition_result_list[0].gestures[hand_index]
+          category_name = gesture[0].category_name
+          score = round(gesture[0].score, 2)
+          result_text = f'{category_name} ({score})'
+
+          # Compute text size
+          text_size = \
+          cv2.getTextSize(result_text, cv2.FONT_HERSHEY_DUPLEX, label_font_size,
+                          label_thickness)[0]
+          text_width, text_height = text_size
+
+          # Calculate text position (above the hand)
+          text_x = x_min_px
+          text_y = y_min_px - 10  # Adjust this value as needed
+
+          # If the text baseline would fall above the top of the frame, draw the label below the hand instead
+          if text_y < 0:
+            text_y = y_max_px + text_height
+
+          # Draw the text
+          cv2.putText(current_frame, result_text, (text_x, text_y),
+                      cv2.FONT_HERSHEY_DUPLEX, label_font_size,
+                      label_text_color, label_thickness, cv2.LINE_AA)
+
+        # Draw hand landmarks on the frame
+        hand_landmarks_proto = landmark_pb2.NormalizedLandmarkList()
+        hand_landmarks_proto.landmark.extend([
+          landmark_pb2.NormalizedLandmark(x=landmark.x, y=landmark.y,
+                                          z=landmark.z) for landmark in
+          hand_landmarks
+        ])
+        mp_drawing.draw_landmarks(
+          current_frame,
+          hand_landmarks_proto,
+          mp_hands.HAND_CONNECTIONS,
+          mp_drawing_styles.get_default_hand_landmarks_style(),
+          mp_drawing_styles.get_default_hand_connections_style())
 
       recognition_frame = current_frame
       recognition_result_list.clear()
@@ -218,8 +222,8 @@ def main():
            'considered successful.',
       required=False,
       default=0.5)
-  # Finding the camera ID can be very reliant on platform-dependent methods. 
-  # One common approach is to use the fact that camera IDs are usually indexed sequentially by the OS, starting from 0. 
+  # Finding the camera ID can be very reliant on platform-dependent methods.
+  # One common approach is to use the fact that camera IDs are usually indexed sequentially by the OS, starting from 0.
   # Here, we use OpenCV and create a VideoCapture object for each potential ID with 'cap = cv2.VideoCapture(i)'.
   # If 'cap' is None or not 'cap.isOpened()', it indicates the camera ID is not available.
   parser.add_argument(