|
1 | 1 | Sumber dokumentasi utama: [AI Google Dev](https://ai.google.dev/edge/mediapipe/solutions/vision/gesture_recognizer/python#video_1) |
2 | 2 |
|
3 | | -# Gesture recognition |
| 3 | +# Holistic recognition |
4 | 4 | MediaPipe versi terbaru cukup berbeda dari versi sebelumnya. Perbedaan utamanya adalah versi terbaru menggunakan Tasks API dan memanfaatkan model siap pakai yang disediakan oleh Google. |
5 | 5 |
|
6 | | -Model [HandGestureClassifier](https://storage.googleapis.com/mediapipe-models/gesture_recognizer/gesture_recognizer/float16/latest/gesture_recognizer.task) |
7 | | -Contoh kode [Google Colab](https://colab.research.google.com/github/googlesamples/mediapipe/blob/main/examples/gesture_recognizer/python/gesture_recognizer.ipynb#scrollTo=Iy4r2_ePylIa) |
| 6 | +Model (task by mediapipe) |
| 7 | +- [Hand Landmarker](https://storage.googleapis.com/mediapipe-models/hand_landmarker/hand_landmarker/float16/latest/hand_landmarker.task) |
| 8 | +- [Face Landmarker](https://storage.googleapis.com/mediapipe-models/face_landmarker/face_landmarker/float16/latest/face_landmarker.task) |
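| 9 | +- [Pose Landmarker](https://storage.googleapis.com/mediapipe-models/pose_landmarker/pose_landmarker_full/float16/latest/pose_landmarker_full.task) |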
8 | 10 |
|
| 11 | +Contoh kode [Google Colab](https://colab.research.google.com/github/googlesamples/mediapipe/blob/main/examples/gesture_recognizer/python/gesture_recognizer.ipynb#scrollTo=Iy4r2_ePylIa) |
9 | 12 | [Dokumentasi Holistic](https://github.com/google/mediapipe/blob/master/docs/solutions/holistic.md) |
| 13 | + |
| 14 | +# Code Idea Analysis |
| 15 | +Untuk mengekstraksi landmark holistik (yang terdiri dari tangan, pose, dan wajah), kita bisa mengikuti praktik yang sesuai dengan dokumentasi resmi dari Google. |
| 16 | +## Install Library |
| 17 | +```sh |
| 18 | +pip install mediapipe opencv-python |
| 19 | +``` |
| 20 | +- MediaPipe versi 0.10.32 |
| 21 | +- opencv-python versi 4.13.0.92 |
| 22 | +## Load Video |
| 23 | +```py |
import cv2

# Open the input video. VIDEO_PATH must be defined beforehand.
cap = cv2.VideoCapture(VIDEO_PATH)

# VideoCapture does not raise on a missing file or unsupported codec; without
# this check the frame loop would simply read nothing and exit silently.
if not cap.isOpened():
    raise IOError(f"Cannot open video: {VIDEO_PATH}")
| 27 | +``` |
| 28 | +## Initialize Tasks Model |
| 29 | +```py |
import mediapipe as mp

# Option objects for the three MediaPipe Tasks landmarkers (pose, face, hand).
# All of them run in VIDEO mode and share the same 0.5 confidence thresholds.
# POSE_MODEL_PATH, FACE_MODEL_PATH and HAND_MODEL_PATH must be defined beforehand.

# Pose: a single person per frame.
pose_options = mp.tasks.vision.PoseLandmarkerOptions(
    base_options=mp.tasks.BaseOptions(model_asset_path=POSE_MODEL_PATH),
    running_mode=mp.tasks.vision.RunningMode.VIDEO,
    num_poses=1,
    min_pose_detection_confidence=0.5,
    min_pose_presence_confidence=0.5,
    min_tracking_confidence=0.5,
)

# Face: a single face per frame; only landmarks are requested, so blendshapes
# and transformation matrices are switched off.
face_options = mp.tasks.vision.FaceLandmarkerOptions(
    base_options=mp.tasks.BaseOptions(model_asset_path=FACE_MODEL_PATH),
    running_mode=mp.tasks.vision.RunningMode.VIDEO,
    num_faces=1,
    min_face_detection_confidence=0.5,
    min_face_presence_confidence=0.5,
    min_tracking_confidence=0.5,
    output_face_blendshapes=False,
    output_facial_transformation_matrixes=False,
)

# Hands: up to two hands per frame.
hand_options = mp.tasks.vision.HandLandmarkerOptions(
    base_options=mp.tasks.BaseOptions(model_asset_path=HAND_MODEL_PATH),
    running_mode=mp.tasks.vision.RunningMode.VIDEO,
    num_hands=2,
    min_hand_detection_confidence=0.5,
    min_hand_presence_confidence=0.5,
    min_tracking_confidence=0.5,
)
| 60 | +``` |
| 61 | +## Helper: Landmark to Array |
| 62 | +```py |
import numpy as np


def landmarks_to_array(list_lm, extra_fields=None):
    """Convert a sequence of landmark objects into a 2-D float32 array.

    Args:
        list_lm: Sequence of objects exposing ``x``, ``y`` and ``z``
            attributes (plus any attribute named in ``extra_fields``).
        extra_fields: Optional iterable of additional attribute names
            (e.g. ``["visibility"]``) appended as columns after x, y, z.
            A single string is treated as one field name.

    Returns:
        ``np.ndarray`` of shape ``(len(list_lm), 3 + number of extra fields)``
        with dtype ``float32``. An attribute that is missing or ``None`` is
        stored as NaN.
    """
    # A bare string would otherwise be unpacked character by character.
    if isinstance(extra_fields, str):
        extra_fields = [extra_fields]
    # Unpacking accepts any iterable (list, tuple, ...), not only lists.
    cols = ["x", "y", "z", *(extra_fields or ())]

    arr = np.full((len(list_lm), len(cols)), np.nan, dtype=np.float32)
    for i, lm in enumerate(list_lm):
        for j, name in enumerate(cols):
            value = getattr(lm, name, None)
            if value is not None:  # leave the NaN placeholder otherwise
                arr[i, j] = value
    return arr
| 71 | +``` |
| 72 | +## Extract Landmark from Video |
| 73 | +```py |
# N_POSE, N_FACE and N_HAND (landmarks per pose / face / hand) must be defined
# beforehand; they size the NaN placeholders used when nothing is detected.
# NOTE(review): the stock models presumably emit 33 / 478 / 21 landmarks - confirm.

# The frame rate drives the per-frame timestamps. Some containers report 0,
# so fall back to a nominal rate instead of dividing by zero.
fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
frame_idx = 0

# One entry per decoded frame, so the extracted landmarks are not discarded.
pose_frames, face_frames, hand_frames = [], [], []

vision = mp.tasks.vision

try:
    with vision.PoseLandmarker.create_from_options(pose_options) as pose_landmarker, \
            vision.FaceLandmarker.create_from_options(face_options) as face_landmarker, \
            vision.HandLandmarker.create_from_options(hand_options) as hand_landmarker:

        while True:
            ok, frame_bgr = cap.read()
            if not ok:
                break

            # Convert BGR -> RGB (the mp.Image below is declared as SRGB).
            frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
            mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=frame_rgb)

            # VIDEO running mode needs a timestamp that grows with every frame.
            timestamp_ms = int(frame_idx * 1000.0 / fps)

            pose_result = pose_landmarker.detect_for_video(mp_image, timestamp_ms)
            face_result = face_landmarker.detect_for_video(mp_image, timestamp_ms)
            hand_result = hand_landmarker.detect_for_video(mp_image, timestamp_ms)

            # ====== pose landmarks (first pose only): x, y, z, visibility ======
            if pose_result.pose_landmarks:
                pose_arr = landmarks_to_array(
                    pose_result.pose_landmarks[0], extra_fields=["visibility"]
                )
            else:
                pose_arr = np.full((N_POSE, 4), np.nan, dtype=np.float32)

            # ====== face landmarks (first face only): x, y, z ======
            if face_result.face_landmarks:
                face_arr = landmarks_to_array(face_result.face_landmarks[0])
            else:
                face_arr = np.full((N_FACE, 3), np.nan, dtype=np.float32)

            # ====== hand landmarks (0, 1 or 2 hands): x, y, z ======
            # Slots for undetected hands stay NaN.
            frame_hands = np.full((2, N_HAND, 3), np.nan, dtype=np.float32)
            # hand_result.hand_landmarks is a list indexed [hand][landmark].
            for hand_i, lm_list in enumerate(hand_result.hand_landmarks[:2]):
                hand_arr = landmarks_to_array(lm_list)
                # If fewer than N_HAND landmarks come back, fill only that part.
                n = min(len(hand_arr), N_HAND)
                frame_hands[hand_i, :n, :] = hand_arr[:n, :]

            pose_frames.append(pose_arr)
            face_frames.append(face_arr)
            hand_frames.append(frame_hands)

            # Advance the counter so the next timestamp is strictly larger.
            frame_idx += 1
finally:
    # Release the capture handle even if detection raises part-way through.
    cap.release()
| 119 | +``` |
0 commit comments