17 changes: 11 additions & 6 deletions docs/musicalgestures/_pose.md
@@ -18,7 +18,7 @@ Helper function to automatically download model (.caffemodel) files.

## pose

[[find in source code]](https://github.com/fourMs/MGT-python/blob/master/musicalgestures/_pose.py#L14)
[[find in source code]](https://github.com/fourMs/MGT-python/blob/master/musicalgestures/_pose.py#L30)

```python
def pose(
@@ -37,15 +37,20 @@ def pose(
```

Renders a video with the pose estimation (aka. "keypoint detection" or "skeleton tracking") overlaid on it.
Outputs the predictions in a text file containing the normalized x and y coordinates of each keypoints
(default format is csv). Uses models from the [openpose](https://github.com/CMU-Perceptual-Computing-Lab/openpose) project.
Outputs the predictions in a text file containing the normalized x and y coordinates of each keypoint
(default format is csv).

Supports two backends:

- **MediaPipe** (`model='mediapipe'`): Uses Google's MediaPipe Pose which detects 33 landmarks entirely on CPU. Requires the optional `mediapipe` package (`pip install musicalgestures[pose]`). The model file (~8–28 MB) is auto-downloaded on first use and cached in `musicalgestures/models/`.
- **OpenPose** (`model='body_25'`, `'coco'`, or `'mpi'`): Uses Caffe-based OpenPose models. Model weights (~200 MB) are downloaded on first use.
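
A minimal usage sketch covering both backends (the input file `dance.avi` here is a hypothetical example):

```python
import musicalgestures

mg = musicalgestures.MgVideo('dance.avi')

# MediaPipe backend: 33 landmarks, runs entirely on CPU
mg.pose(model='mediapipe')

# OpenPose backend: BODY_25 model, using the GPU where available
mg.pose(model='body_25', device='gpu')
```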

#### Arguments

- `model` *str, optional* - 'body_25' loads the model trained on the BODY_25 dataset, 'mpi' loads the model trained on the Multi-Person Dataset (MPII), 'coco' loads one trained on the COCO dataset. The BODY_25 model outputs 25 points, the MPII model outputs 15 points, while the COCO model produces 18 points. Defaults to 'body_25'.
- `device` *str, optional* - Sets the backend to use for the neural network ('cpu' or 'gpu'). Defaults to 'gpu'.
- `model` *str, optional* - Pose model to use. `'mediapipe'` uses MediaPipe Pose (33 landmarks, model auto-downloaded on first use). `'body_25'` loads the OpenPose BODY_25 model (25 keypoints), `'mpi'` loads the MPII model (15 keypoints), `'coco'` loads the COCO model (18 keypoints). Defaults to 'body_25'.
- `device` *str, optional* - Sets the backend to use for the neural network ('cpu' or 'gpu'). Ignored when `model='mediapipe'` (MediaPipe always runs on CPU). Defaults to 'gpu'.
- `threshold` *float, optional* - The normalized confidence threshold that decides whether we keep or discard a predicted point. Discarded points get substituted with (0, 0) in the output data. Defaults to 0.1.
- `downsampling_factor` *int, optional* - Decides how much we downsample the video before we pass it to the neural network. For example `downsampling_factor=4` means that the input to the network is one-fourth the resolution of the source video. Heaviver downsampling reduces rendering time but produces lower quality pose estimation. Defaults to 2.
- `downsampling_factor` *int, optional* - Decides how much we downsample the video before we pass it to the neural network. Ignored when `model='mediapipe'`. Defaults to 2.
- `save_data` *bool, optional* - Whether we save the predicted pose data to a file. Defaults to True.
- `data_format` *str, optional* - Specifies the format of the pose data. Accepted values are 'csv', 'tsv' and 'txt'. For multiple output formats, use a list, e.g. ['csv', 'txt']. Defaults to 'csv'.
- `save_video` *bool, optional* - Whether we save the video with the estimated pose overlaid on it. Defaults to True.
18 changes: 9 additions & 9 deletions docs/musicalgestures/_pose_estimator.md
@@ -26,7 +26,7 @@ This module provides:
* class `PoseEstimator` – an abstract base class (ABC) defining the common
interface that all pose backends must implement.
* class `MediaPipePoseEstimator` – a concrete backend powered by Google
MediaPipe Pose (33 landmarks, CPU-friendly, zero model download).
MediaPipe Pose (33 landmarks, CPU-friendly, auto-downloads model on first use).
* class `OpenPosePoseEstimator` – a thin wrapper around the legacy OpenPose /
Caffe-model implementation already present in [_pose](_pose.md#pose).

@@ -56,30 +56,30 @@ class MediaPipePoseEstimator(PoseEstimator):
model_complexity: int = 1,
min_detection_confidence: float = 0.5,
min_tracking_confidence: float = 0.5,
static_image_mode: bool = False,
) -> None:
```

Pose estimator backed by Google MediaPipe Pose.
Pose estimator backed by Google MediaPipe Pose (Tasks API).

Requires the optional ``mediapipe`` package
Requires the optional ``mediapipe>=0.10`` package:

```python
pip install musicalgestures[pose]
```

The first time you use a given complexity level the corresponding
`.task` model file (~8–28 MB) is downloaded from Google's model
storage and cached in `musicalgestures/models/`.
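
A minimal construction sketch (`predict_frame` and `close` are the methods this backend exposes to `_pose.py`; the all-zero frame is just a placeholder):

```python
import numpy as np
from musicalgestures._pose_estimator import MediaPipePoseEstimator

estimator = MediaPipePoseEstimator(model_complexity=1)

frame = np.zeros((480, 640, 3), dtype=np.uint8)  # placeholder BGR frame
result = estimator.predict_frame(frame)
print(result.keypoints.shape)  # expected (33, 3): normalized x, y, visibility

estimator.close()
```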

Parameters
----------
model_complexity:
MediaPipe model complexity (0, 1, or 2). Higher = more accurate
but slower. Default: 1.
MediaPipe model complexity (0 = lite, 1 = full, 2 = heavy).
Higher values are more accurate but slower. Default: 1.
min_detection_confidence:
Minimum confidence for initial body detection. Default: 0.5.
min_tracking_confidence:
Minimum confidence for landmark tracking. Default: 0.5.
static_image_mode:
If *True*, treat every frame as a static image (no tracking).
Default: False.

Examples
--------
241 changes: 229 additions & 12 deletions musicalgestures/_pose.py
@@ -10,6 +10,22 @@

# implementation mainly inspired by: https://github.com/spmallick/learnopencv/blob/master/OpenPose/OpenPoseVideo.py

# MediaPipe Pose skeleton connections (pairs of landmark indices)
MEDIAPIPE_POSE_CONNECTIONS = [
    # face: nose -> eye -> ear on each side
    (0, 1), (1, 2), (2, 3), (3, 7),
    (0, 4), (4, 5), (5, 6), (6, 8),
    # mouth corners
    (9, 10),
    # shoulders
    (11, 12),
    # left arm (shoulder -> elbow -> wrist)
    (11, 13), (13, 15),
    # right arm
    (12, 14), (14, 16),
    # left hand (wrist -> pinky / index / thumb)
    (15, 17), (15, 19), (15, 21), (17, 19),
    # right hand
    (16, 18), (16, 20), (16, 22), (18, 20),
    # torso sides
    (11, 23), (12, 24),
    # hips
    (23, 24),
    # left leg and foot (hip -> knee -> ankle -> heel / foot index)
    (23, 25), (25, 27), (27, 29), (27, 31), (29, 31),
    # right leg and foot
    (24, 26), (26, 28), (28, 30), (28, 32), (30, 32),
]


def pose(
self,
Expand All @@ -24,26 +40,61 @@ def pose(
target_name_data=None,
overwrite=False):
"""
Renders a video with the pose estimation (aka. "keypoint detection" or "skeleton tracking") overlaid on it.
Outputs the predictions in a text file containing the normalized x and y coordinates of each keypoints
(default format is csv). Uses models from the [openpose](https://github.com/CMU-Perceptual-Computing-Lab/openpose) project.
Renders a video with the pose estimation (aka. "keypoint detection" or "skeleton tracking") overlaid on it.
Outputs the predictions in a text file containing the normalized x and y coordinates of each keypoint
(default format is csv).

Supports two backends:

* **MediaPipe** (``model='mediapipe'``): Uses Google's MediaPipe Pose which detects 33
landmarks entirely on CPU. Requires the optional ``mediapipe`` package
(``pip install musicalgestures[pose]``). On first use, the model file
(~8–28 MB) is downloaded automatically and cached in ``musicalgestures/models/``.
* **OpenPose** (``model='body_25'``, ``'coco'``, or ``'mpi'``): Uses Caffe-based OpenPose
models. Model weights (~200 MB) are downloaded on first use.

Args:
model (str, optional): 'body_25' loads the model trained on the BODY_25 dataset, 'mpi' loads the model trained on the Multi-Person Dataset (MPII), 'coco' loads one trained on the COCO dataset. The BODY_25 model outputs 25 points, the MPII model outputs 15 points, while the COCO model produces 18 points. Defaults to 'body_25'.
device (str, optional): Sets the backend to use for the neural network ('cpu' or 'gpu'). Defaults to 'gpu'.
threshold (float, optional): The normalized confidence threshold that decides whether we keep or discard a predicted point. Discarded points get substituted with (0, 0) in the output data. Defaults to 0.1.
downsampling_factor (int, optional): Decides how much we downsample the video before we pass it to the neural network. For example `downsampling_factor=4` means that the input to the network is one-fourth the resolution of the source video. Heaviver downsampling reduces rendering time but produces lower quality pose estimation. Defaults to 2.
model (str, optional): Pose model to use. ``'mediapipe'`` uses MediaPipe Pose (33
landmarks, model auto-downloaded on first use). ``'body_25'`` loads the OpenPose BODY_25 model
(25 keypoints), ``'mpi'`` loads the MPII model (15 keypoints), ``'coco'`` loads
the COCO model (18 keypoints). Defaults to 'body_25'.
device (str, optional): Sets the backend to use for the neural network ('cpu' or 'gpu').
Ignored when ``model='mediapipe'`` (MediaPipe always runs on CPU). Defaults to 'gpu'.
threshold (float, optional): The normalized confidence threshold that decides whether we
keep or discard a predicted point. Discarded points get substituted with (0, 0) in the
output data. Defaults to 0.1.
downsampling_factor (int, optional): Decides how much we downsample the video before we
pass it to the neural network. Ignored when ``model='mediapipe'``. Defaults to 2.
save_data (bool, optional): Whether we save the predicted pose data to a file. Defaults to True.
data_format (str, optional): Specifies format of pose-data. Accepted values are 'csv', 'tsv' and 'txt'. For multiple output formats, use list, eg. ['csv', 'txt']. Defaults to 'csv'.
save_video (bool, optional): Whether we save the video with the estimated pose overlaid on it. Defaults to True.
target_name_video (str, optional): Target output name for the video. Defaults to None (which assumes that the input filename with the suffix "_pose" should be used).
target_name_data (str, optional): Target output name for the data. Defaults to None (which assumes that the input filename with the suffix "_pose" should be used).
overwrite (bool, optional): Whether to allow overwriting existing files or to automatically increment target filenames to avoid overwriting. Defaults to False.
data_format (str, optional): Specifies the format of the pose data. Accepted values are 'csv', 'tsv'
and 'txt'. For multiple output formats, use a list, e.g. ['csv', 'txt']. Defaults to 'csv'.
save_video (bool, optional): Whether we save the video with the estimated pose overlaid on it.
Defaults to True.
target_name_video (str, optional): Target output name for the video. Defaults to None (which
assumes that the input filename with the suffix "_pose" should be used).
target_name_data (str, optional): Target output name for the data. Defaults to None (which
assumes that the input filename with the suffix "_pose" should be used).
overwrite (bool, optional): Whether to allow overwriting existing files or to automatically
increment target filenames to avoid overwriting. Defaults to False.

Returns:
MgVideo: An MgVideo pointing to the output video.
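
Example:
    A minimal sketch of both backends (``dance.avi`` is a hypothetical input file)::

        mg = musicalgestures.MgVideo('dance.avi')
        mg.pose(model='mediapipe')              # MediaPipe backend, CPU
        mg.pose(model='body_25', device='gpu')  # OpenPose BODY_25 backend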
"""

# --- MediaPipe backend ---------------------------------------------------
if model.lower() == 'mediapipe':
return _pose_mediapipe(
self,
threshold=threshold,
save_data=save_data,
data_format=data_format,
save_video=save_video,
target_name_video=target_name_video,
target_name_data=target_name_data,
overwrite=overwrite,
)
# -------------------------------------------------------------------------

module_path = os.path.abspath(os.path.dirname(musicalgestures.__file__))

if model.lower() == 'mpi':
Expand Down Expand Up @@ -367,6 +418,172 @@ def save_single_file(of, width, height, model, data, data_format, target_name_da
return self


def _pose_mediapipe(
self,
threshold=0.1,
save_data=True,
data_format='csv',
save_video=True,
target_name_video=None,
target_name_data=None,
overwrite=False):
"""
Internal helper: run MediaPipe Pose on a video and render/save the output.
Called by :func:`pose` when ``model='mediapipe'``.
"""
from musicalgestures._pose_estimator import MediaPipePoseEstimator, MEDIAPIPE_LANDMARK_NAMES

of, fex = os.path.splitext(self.filename)

if fex != '.avi':
if "as_avi" not in self.__dict__.keys():
file_as_avi = convert_to_avi(of + fex, overwrite=overwrite)
self.as_avi = musicalgestures.MgVideo(file_as_avi)
of, fex = self.as_avi.of, self.as_avi.fex
filename = of + fex
else:
filename = self.filename

pb = MgProgressbar(total=self.length, prefix='Rendering MediaPipe pose estimation:')

if save_video:
if target_name_video is None:
target_name_video = of + '_pose' + fex
else:
target_name_video = os.path.splitext(target_name_video)[0] + fex
if not overwrite:
target_name_video = generate_outfilename(target_name_video)

# Pipe the video through FFmpeg to read it frame by frame
cmd = ['ffmpeg', '-y', '-i', filename]
process = ffmpeg_cmd(cmd, total_time=self.length, pipe='read')
video_out = None

ii = 0
data = []

estimator = MediaPipePoseEstimator()

while True:
out = process.stdout.read(self.width * self.height * 3)

if out == b'':
pb.progress(self.length)
break

frame = np.frombuffer(out, dtype=np.uint8).reshape([self.height, self.width, 3]).copy()

result = estimator.predict_frame(frame)
keypoints = result.keypoints # shape (33, 3): x, y, visibility

# Collect data row: time + normalized (x, y) for every landmark
if save_data:
time_ms = frame2ms(ii, self.fps)
row = [time_ms]
for i in range(len(MEDIAPIPE_LANDMARK_NAMES)):
x, y, vis = keypoints[i]
if vis >= threshold:
row += [float(x), float(y)]
else:
row += [0.0, 0.0]
data.append(row)

# Draw skeleton connections
for (a, b) in MEDIAPIPE_POSE_CONNECTIONS:
xa, ya, va = keypoints[a]
xb, yb, vb = keypoints[b]
if va >= threshold and vb >= threshold:
pt_a = (int(xa * self.width), int(ya * self.height))
pt_b = (int(xb * self.width), int(yb * self.height))
cv2.line(frame, pt_a, pt_b, (0, 255, 255), 2, lineType=cv2.LINE_AA)

# Draw landmark circles
for i in range(len(MEDIAPIPE_LANDMARK_NAMES)):
x, y, vis = keypoints[i]
if vis >= threshold:
pt = (int(x * self.width), int(y * self.height))
cv2.circle(frame, pt, 4, (0, 0, 255), thickness=-1, lineType=cv2.FILLED)

if save_video:
if video_out is None:
cmd = ['ffmpeg', '-y', '-s', '{}x{}'.format(frame.shape[1], frame.shape[0]),
'-r', str(self.fps), '-f', 'rawvideo', '-pix_fmt', 'bgr24',
'-vcodec', 'rawvideo', '-i', '-', '-vcodec', 'libx264',
'-pix_fmt', 'yuv420p', target_name_video]
video_out = ffmpeg_cmd(cmd, total_time=self.length, pipe='write')
video_out.stdin.write(frame.astype(np.uint8))

process.stdout.flush()
pb.progress(ii)
ii += 1

estimator.close()

if save_video:
video_out.stdin.close()
video_out.wait()
if self.has_audio:
source_audio = extract_wav(of + fex)
embed_audio_in_video(source_audio, target_name_video)
os.remove(source_audio)

process.terminate()

if save_data:
# Build column headers from landmark names
headers = ['Time']
for name in MEDIAPIPE_LANDMARK_NAMES:
headers.append(name.replace('_', ' ').title() + ' X')
headers.append(name.replace('_', ' ').title() + ' Y')
_save_pose_txt(of, data, headers, data_format, target_name_data, overwrite)

if save_video:
self.pose_video = musicalgestures.MgVideo(target_name_video, color=self.color, returned_by_process=True)
return self.pose_video
else:
return self


def _save_pose_txt(of, data, headers, data_format, target_name_data, overwrite):
"""Save pose data to one or more text files (csv / tsv / txt)."""

def _save_single(data_format):
ext = '.' + data_format.lower()
if target_name_data is None:
out_path = of + '_pose' + ext
else:
out_path = os.path.splitext(target_name_data)[0] + ext
if not overwrite:
out_path = generate_outfilename(out_path)

df = pd.DataFrame(data=data, columns=headers)

if data_format.lower() == 'csv':
df.to_csv(out_path, index=None)
elif data_format.lower() in ('tsv', 'txt'):
delimiter = '\t' if data_format.lower() == 'tsv' else ' '
with open(out_path, 'wb') as f:
head_str = delimiter.join(headers) + '\n'
f.write(head_str.encode())
fmt_list = ['%d'] + ['%.15f'] * (len(headers) - 1)
np.savetxt(f, df.values, delimiter=delimiter, fmt=fmt_list)
else:
print(f"Invalid data format: '{data_format}'.\nFalling back to '.csv'.")
_save_single('csv')

if isinstance(data_format, str):
_save_single(data_format)
elif isinstance(data_format, list):
valid = [f for f in data_format if f.lower() in ('csv', 'tsv', 'txt')]
if len(valid) != len(data_format):
invalid = [f for f in data_format if f.lower() not in ('csv', 'tsv', 'txt')]
print(f"Unsupported formats {invalid}.\nFalling back to '.csv'.")
_save_single('csv')
else:
for fmt in list(set(valid)):
_save_single(fmt)
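
# A hypothetical standalone call to _save_pose_txt, with made-up illustration
# data (the toolbox normally calls it from _pose_mediapipe above):
#
#   headers = ['Time', 'Nose X', 'Nose Y']
#   rows = [[0, 0.51, 0.32], [40, 0.52, 0.33]]
#   _save_pose_txt('dance', rows, headers, ['csv', 'tsv'],
#                  target_name_data=None, overwrite=False)
#   # -> writes 'dance_pose.csv' and 'dance_pose.tsv'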


def download_model(modeltype):
"""
Helper function to automatically download model (.caffemodel) files.