Source code for pyheadtracker.cam

"""
Access position and orientation data via webcam-based head tracking

Classes
- `MPFaceLandmarker`: Webcam-based head tracker using OpenCV and MediaPipe Face Landmarker
"""

from importlib import resources
from pathlib import Path
from typing import Optional
import cv2
import numpy as np
import mediapipe as mp
from mediapipe.tasks import python
from mediapipe.tasks.python import vision
from .dtypes import Quaternion, YPR, Position, HTBase
from .utils import ypr2quat



[docs]
class MPFaceLandmarker(HTBase):
    """Webcam-based head tracker using OpenCV and MediaPipe Face Landmarker

    This class uses MediaPipe's Face Landmarker [1] to track head position and orientation using a (web-)cam. A mesh of 478 face landmarks is fitted to the detected face, from which 6 key points are used to estimate head pose via solvePnP. Orientation is derived from MediaPipe's facial transformation matrix. Tracking of the position is still experimental and might be inaccurate. Parts of the computation are offloaded to the GPU if available. By default, the built-in MediaPipe Face Landmarker model is used, but custom model weights can be provided. The supplied model weights (09/15/2022) are licensed under the Apache 2.0 license.

    Attributes
    ----------
    cam_index : int
        The index of the webcam to use.
    landmark_points_idx : list[int], optional
        Indices of the facial landmarks to use for pose estimation.
    landmark_points_3d : np.ndarray, optional
        Corresponding 3D model points for the selected landmarks. Note, that mediapipe's coordinate system is used (right-handed, x: right, y: down, z: forward).
    orient_format : str
        The format for orientation data. Possible values are "q" (Quaternion) or "ypr" (Yaw, Pitch, Roll).
    reset_orientation : bool
        Flag to indicate whether to reset orientation zeroing on the next read.
    reset_position : bool
        Flag to indicate whether to reset position zeroing on the next read.
    is_opened : bool
        Indicates whether the camera is opened.
    __FaceLandmarkerOptions : vision.FaceLandmarkerOptions
        MediaPipe Face Landmarker options.
    __landmarker : vision.FaceLandmarker
        Internal MediaPipe Face Landmarker instance.
    __cap : cv2.VideoCapture
        Internal OpenCV VideoCapture instance.
    __frame_count : int
        Frame counter for MediaPipe processing.
    __frame_width : int
        Width of the video frames.
    __frame_height : int
        Height of the video frames.
    __camera_matrix : np.ndarray
        Camera intrinsic matrix.
    __dist_coeffs : np.ndarray
        Camera distortion coefficients.
    __initial_rotation_matrix : np.ndarray
        Initial rotation matrix for orientation zeroing.
    __initial_R : np.ndarray
        Initial rotation matrix for position zeroing.
    __initial_tvec : np.ndarray
        Initial translation vector for position zeroing.

    Methods
    -------
    open()
        Initializes the webcam and MediaPipe Face Landmarker.
    close()
        Releases the webcam and MediaPipe resources.
    list_available_cameras(max_index: int = 20) -> list[int]
        Lists all available camera indices on the system.
    zero()
        Use the current position and orientation as reference (zero) values.
    zero_orientation()
        Uses the current orientation as reference (zero) value.
    zero_position()
        Uses the current position as reference (zero) value.
    read_orientation() -> Quaternion or YPR or None
        Returns if available the current head orientation as a Quaternion or YPR object.
    read_position() -> Position or None
        Returns if available the current head position in meters as a Position object.
    read_pose() -> dict or None
        Returns if available the current head position and orientation as a dictionary with Position and Quaternion or YPR objects.

    References
    ----------
    [1] MediaPipe Face Landmarker: https://ai.google.dev/edge/mediapipe/solutions/vision/face_landmarker
    """


[docs]
    def __init__(
        self,
        cam_index: int = 0,
        model_weights: Optional[str] = None,
        min_face_detection_confidence: float = 0.8,
        min_face_presence_confidence: float = 0.8,
        min_tracking_confidence: float = 0.8,
        landmark_points_idx: Optional[list[int]] = None,
        landmark_points_3d: Optional[np.ndarray] = None,
        orient_format: str = "q",
    ):
        """
        Parameters
        ----------
        cam_index : int
            The index of the webcam to use.
        model_weights : str, optional
            Path to custom model weights. If None, the default built-in model is used.
        min_face_detection_confidence : float, optional
            Minimum confidence value ([0.0, 1.0]) for face detection to be considered successful. Defaults to 0.8.
        min_face_presence_confidence : float, optional
            Minimum confidence value ([0.0, 1.0]) for the face presence to be considered detected successfully. Defaults to 0.8.
        min_tracking_confidence : float, optional
            Minimum confidence value ([0.0, 1.0]) for the face landmarks to be considered tracked successfully. Defaults to 0.8.
        landmark_points_idx : list[int], optional
            Indices of the facial landmarks to use for pose estimation.
        landmark_points_3d : np.ndarray, optional
            Corresponding 3D model points for the selected landmarks. Note, that mediapipe's coordinate system is used (right-handed, x: right, y: down, z: forward).
        orient_format : str
            The format for orientation data. Possible values are "q" (Quaternion) or "ypr" (Yaw, Pitch, Roll).
        """
        self.orient_format = orient_format
        self.cam_index = cam_index

        # Load default model weights if not provided
        if model_weights is None:
            try:
                # Use importlib.resources for deployed packages
                if hasattr(resources, "files"):
                    # Python 3.9+
                    data_path = resources.files("pyheadtracker").joinpath(
                        "data/mediapipe-facelandmarker/face_landmarker_v2_with_blendshapes.task"
                    )
                    model_weights = str(data_path)
                else:
                    # Fallback for older Python versions
                    model_weights = str(
                        Path(__file__).parent
                        / "data/mediapipe-facelandmarker/face_landmarker_v2_with_blendshapes.task"
                    )
            except Exception:
                # Final fallback to relative path
                model_weights = "data/mediapipe-facelandmarker/face_landmarker_v2_with_blendshapes.task"

        assert orient_format in [
            "q",
            "ypr",
        ], 'orient_format must be "q" (Quaternion) or "ypr" (Yaw, Pitch, Roll)'

        assert (
            min_face_detection_confidence >= 0.0
            and min_face_detection_confidence <= 1.0
        ), "min_face_detection_confidence must be in [0.0, 1.0]"
        assert (
            min_face_presence_confidence >= 0.0 and min_face_presence_confidence <= 1.0
        ), "min_face_presence_confidence must be in [0.0, 1.0]"
        assert (
            min_tracking_confidence >= 0.0 and min_tracking_confidence <= 1.0
        ), "min_tracking_confidence must be in [0.0, 1.0]"

        if landmark_points_idx is None:
            self.landmark_points_idx = [1, 152, 33, 263, 61, 291]
        else:
            self.landmark_points_idx = landmark_points_idx

        if landmark_points_3d is None:
            self.landmark_points_3d = np.array(
                [
                    [0.0, 0.0, 0.0],  # Nose tip (1)
                    [0.0, -0.063, -0.033],  # Chin (152)
                    [-0.043, 0.032, -0.026],  # Left eye outer corner (33)
                    [0.043, 0.032, -0.026],  # Right eye outer corner (263)
                    [-0.028, -0.028, -0.025],  # Left mouth corner (61)
                    [0.028, -0.028, -0.025],  # Right mouth corner (291)
                ],
                dtype=np.float64,
            )
        else:
            self.landmark_points_3d = landmark_points_3d

        assert (
            len(self.landmark_points_idx) == self.landmark_points_3d.shape[0]
        ), "Length of landmark_points_idx must match number of rows in landmark_points_3d"

        # Initialize MediaPipe Face Landmarker
        BaseOptions = python.BaseOptions(
            model_asset_path=model_weights,
        )
        self.__FaceLandmarkerOptions = vision.FaceLandmarkerOptions(
            base_options=BaseOptions,  # uses built-in model
            min_face_detection_confidence=min_face_detection_confidence,
            min_face_presence_confidence=min_face_presence_confidence,
            min_tracking_confidence=min_tracking_confidence,
            output_face_blendshapes=False,
            output_facial_transformation_matrixes=True,
            num_faces=1,
            running_mode=vision.RunningMode.VIDEO,
        )

        # Internal variables
        self.__landmarker = None
        self.__cap = None
        self.__frame_count = 0
        self.__frame_width = 0
        self.__frame_height = 0

        self.__camera_matrix = None
        self.__dist_coeffs = None

        self.__initial_rotation_matrix = None
        self.__initial_R = None
        self.__initial_tvec = None

        self.reset_orientation = True
        self.reset_position = True

        self.is_opened = False



[docs]
    def open(self):
        """Initializes the webcam and MediaPipe Face Landmarker."""

        self.__landmarker = vision.FaceLandmarker.create_from_options(
            self.__FaceLandmarkerOptions
        )

        self.__cap = cv2.VideoCapture(self.cam_index)

        # Double-check camera can actually read frames
        if not self.__cap.isOpened():
            raise RuntimeError(
                f"Failed to open camera with index {self.cam_index}. Use pyheadtracker.cam.list_available_cameras() to see available cameras."
            )

        self.__frame_count = 0

        self.__frame_width = int(self.__cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        self.__frame_height = int(self.__cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        focal_length = self.__frame_width

        self.__camera_matrix = np.array(
            [
                [focal_length, 0, self.__frame_width / 2],
                [0, focal_length, self.__frame_height / 2],
                [0, 0, 1],
            ],
            dtype=np.float64,
        )

        self.__dist_coeffs = np.zeros((4, 1))

        self.is_opened = True



[docs]
    @staticmethod
    def list_available_cameras(max_index: int = 20) -> list[int]:
        """
        List all available camera indices on the system.

        Parameters
        ----------
        max_index : int
            Maximum camera index to check (default 10).

        Returns
        -------
        list[int]
            List of available camera indices.
        """

        # Suppress OpenCV logging
        cv2.setLogLevel(0)

        available_cameras = []
        for i in range(max_index):
            cap = cv2.VideoCapture(i)
            if cap.isOpened():
                ret, _ = cap.read()
                cap.release()
                if ret:
                    available_cameras.append(i)

        # Restore OpenCV logging level to default
        cv2.setLogLevel(1)

        return available_cameras



[docs]
    def close(self):
        """Releases the webcam and MediaPipe resources."""
        if self.__cap is not None:
            self.__cap.release()
            self.__cap = None
            self.__landmarker = None
            self.is_opened = False
        cv2.destroyAllWindows()



[docs]
    def read_orientation(self) -> YPR | Quaternion | None:
        """Returns if available the current head orientation as a Quaternion or YPR object.

        Returns
        -------
        Quaternion or YPR or None
            The current head orientation, or None if not available.
        """
        pose = self.__read_pose_internal(calculate_orientation=True)
        if pose is None or pose["orientation"] is None:
            return None
        orientation = pose["orientation"]
        if isinstance(orientation, (YPR, Quaternion)):
            return orientation
        return None



[docs]
    def read_position(self) -> Position | None:
        """Returns if available the current head position as a Position object.

        Returns
        -------
        Position or None
            The current head position, or None if not available.
        """
        pose = self.__read_pose_internal(calculate_position=True)
        if pose is None or pose["position"] is None:
            return None
        position = pose["position"]
        return position if isinstance(position, Position) else None



[docs]
    def read_pose(self) -> dict[str, Position | Quaternion | YPR | None] | None:
        """Returns if available the current head position and orientation as a dictionary with Position and Quaternion or YPR objects.

        Returns
        -------
        dict[str, Position | Quaternion | YPR] or None
            The current head position and orientation, or None if not available.
        """
        pose = self.__read_pose_internal(
            calculate_orientation=True, calculate_position=True
        )
        if pose is None:
            return None
        return pose



[docs]
    def zero(self):
        """Use the current position and orientation as reference (zero) values."""
        self.reset_orientation = True
        self.reset_position = True



[docs]
    def zero_orientation(self):
        """Uses the current orientation as reference (zero) value."""
        self.reset_orientation = True



[docs]
    def zero_position(self):
        """Uses the current position as reference (zero) value."""
        self.reset_position = True


    def __read_pose_internal(
        self, calculate_orientation: bool = False, calculate_position: bool = False
    ) -> dict[str, Position | Quaternion | YPR | None] | None:
        """Internal method to calculate position and orientation

        Parameters
        ----------
        calculate_orientation : bool, optional
            Flag to calculate orientation, by default False
        calculate_position : bool, optional
            Flag to calculate position, by default False

        Returns
        -------
        dict[str, Position | Quaternion | YPR] or None
            The current head position and orientation, or None if not available.

        Raises
        ------
        RuntimeError
            If the camera is not opened when attempting to read orientation or position.
        """

        if not self.is_opened or self.__cap is None or self.__landmarker is None:
            raise RuntimeError(
                "Camera is not opened. Call 'open()' before reading orientation."
            )

        # Read frame from webcam
        ret, frame = self.__cap.read()
        if not ret:
            return None

        mp_image = mp.Image(
            image_format=mp.ImageFormat.SRGB,
            data=cv2.cvtColor(frame, cv2.COLOR_BGR2RGB),
        )

        # Process frame with MediaPipe Face Landmarker
        result = self.__landmarker.detect_for_video(mp_image, self.__frame_count)
        self.__frame_count += 1

        tmp_orientation = None
        tmp_position = None

        # Calculate orientation if requested
        if calculate_orientation and result.facial_transformation_matrixes:
            # Extract transformation matrix
            transformation_matrix = np.array(
                result.facial_transformation_matrixes[0].data
            ).reshape(4, 4)

            # Extract rotation matrix and translation vector
            rotation_matrix = transformation_matrix[:3, :3]

            # Initialize initial rotation matrix if needed or zeroed
            if self.__initial_rotation_matrix is None or self.reset_orientation:
                self.__initial_rotation_matrix = rotation_matrix.copy()
                self.reset_orientation = False

            # Calculate relative rotation matrix
            relative_rotation_matrix = (
                rotation_matrix @ self.__initial_rotation_matrix.T
            )

            # Convert rotation matrix to yaw, pitch, roll
            sy = np.sqrt(
                relative_rotation_matrix[0, 0] ** 2
                + relative_rotation_matrix[1, 0] ** 2
            )
            singular = sy < 1e-6

            if not singular:
                pitch = np.arctan2(
                    relative_rotation_matrix[2, 1], relative_rotation_matrix[2, 2]
                )
                yaw = np.arctan2(-relative_rotation_matrix[2, 0], sy)
                roll = np.arctan2(
                    relative_rotation_matrix[1, 0], relative_rotation_matrix[0, 0]
                )
            else:
                pitch = np.arctan2(
                    -relative_rotation_matrix[1, 2], relative_rotation_matrix[1, 1]
                )
                yaw = 0
                roll = np.arctan2(
                    relative_rotation_matrix[0, 1], relative_rotation_matrix[1, 1]
                )

            # Provide orientation in the requested format
            tmp_orientation = (
                YPR(yaw, pitch, roll)
                if self.orient_format == "ypr"
                else ypr2quat(YPR(yaw, pitch, roll))
            )

        # Calculate position if requested
        if calculate_position and result.face_landmarks:
            # Extract 2D image points
            face_landmarks = result.face_landmarks[0]
            image_points = np.array(
                [
                    [
                        face_landmarks[idx].x * self.__frame_width,
                        face_landmarks[idx].y * self.__frame_height,
                    ]
                    for idx in self.landmark_points_idx
                ],
                dtype=np.float64,
            )

            if self.__camera_matrix is None or self.__dist_coeffs is None:
                raise RuntimeError(
                    "Camera intrinsic parameters are not initialized. Call 'open()' before reading position."
                )

            # Solve PnP for absolute position in meters
            success, rvec, tvec = cv2.solvePnP(
                self.landmark_points_3d,
                image_points,
                self.__camera_matrix,
                self.__dist_coeffs,
                flags=cv2.SOLVEPNP_ITERATIVE,
            )

            # Calculate relative position if successful
            if success:

                # Initialize initial position if needed or zeroed
                if (
                    self.__initial_R is None
                    or self.__initial_tvec is None
                    or self.reset_position
                ):
                    self.__initial_tvec = tvec.copy()
                    self.__initial_R, _ = cv2.Rodrigues(rvec)
                    self.reset_position = False

                # Compute rotation matrices
                R_current, _ = cv2.Rodrigues(rvec)

                # Translation relative to initial camera frame (still metric)
                relative_tvec_camera = tvec - self.__initial_tvec

                # Rotate translation into the initial head frame
                relative_tvec_head = self.__initial_R.T @ (
                    R_current @ relative_tvec_camera
                )

                # Convert to Position object (invert y to match right-handed coordinate system)
                y, z, x = relative_tvec_head.flatten()
                y = -y

                tmp_position = Position(x, y, z)

        return {"position": tmp_position, "orientation": tmp_orientation}