"""
Access position and orientation data via webcam-based head tracking
Classes
- `MPFaceLandmarker`: Webcam-based head tracker using OpenCV and MediaPipe Face Landmarker
"""
from importlib import resources
from pathlib import Path
from typing import Optional
import cv2
import numpy as np
import mediapipe as mp
from mediapipe.tasks import python
from mediapipe.tasks.python import vision
from .dtypes import Quaternion, YPR, Position, HTBase
from .utils import ypr2quat
[docs]
class MPFaceLandmarker(HTBase):
"""Webcam-based head tracker using OpenCV and MediaPipe Face Landmarker
This class uses MediaPipe's Face Landmarker [1] to track head position and orientation using a (web-)cam. A mesh of 478 face landmarks is fitted to the detected face, from which 6 key points are used to estimate head pose via solvePnP. Orientation is derived from MediaPipe's facial transformation matrix. Tracking of the position is still experimental and might be inaccurate. Parts of the computation are offloaded to the GPU if available. By default, the built-in MediaPipe Face Landmarker model is used, but custom model weights can be provided. The supplied model weights (09/15/2022) are licensed under the Apache 2.0 license.
Attributes
----------
cam_index : int
The index of the webcam to use.
landmark_points_idx : list[int], optional
Indices of the facial landmarks to use for pose estimation.
landmark_points_3d : np.ndarray, optional
Corresponding 3D model points for the selected landmarks. Note, that mediapipe's coordinate system is used (right-handed, x: right, y: down, z: forward).
orient_format : str
The format for orientation data. Possible values are "q" (Quaternion) or "ypr" (Yaw, Pitch, Roll).
reset_orientation : bool
Flag to indicate whether to reset orientation zeroing on the next read.
reset_position : bool
Flag to indicate whether to reset position zeroing on the next read.
is_opened : bool
Indicates whether the camera is opened.
__FaceLandmarkerOptions : vision.FaceLandmarkerOptions
MediaPipe Face Landmarker options.
__landmarker : vision.FaceLandmarker
Internal MediaPipe Face Landmarker instance.
__cap : cv2.VideoCapture
Internal OpenCV VideoCapture instance.
__frame_count : int
Frame counter for MediaPipe processing.
__frame_width : int
Width of the video frames.
__frame_height : int
Height of the video frames.
__camera_matrix : np.ndarray
Camera intrinsic matrix.
__dist_coeffs : np.ndarray
Camera distortion coefficients.
__initial_rotation_matrix : np.ndarray
Initial rotation matrix for orientation zeroing.
__initial_R : np.ndarray
Initial rotation matrix for position zeroing.
__initial_tvec : np.ndarray
Initial translation vector for position zeroing.
Methods
-------
open()
Initializes the webcam and MediaPipe Face Landmarker.
close()
Releases the webcam and MediaPipe resources.
list_available_cameras(max_index: int = 20) -> list[int]
Lists all available camera indices on the system.
zero()
Use the current position and orientation as reference (zero) values.
zero_orientation()
Uses the current orientation as reference (zero) value.
zero_position()
Uses the current position as reference (zero) value.
read_orientation() -> Quaternion or YPR or None
Returns if available the current head orientation as a Quaternion or YPR object.
read_position() -> Position or None
Returns if available the current head position in meters as a Position object.
read_pose() -> dict or None
Returns if available the current head position and orientation as a dictionary with Position and Quaternion or YPR objects.
References
----------
[1] MediaPipe Face Landmarker: https://ai.google.dev/edge/mediapipe/solutions/vision/face_landmarker
"""
[docs]
def __init__(
self,
cam_index: int = 0,
model_weights: Optional[str] = None,
min_face_detection_confidence: float = 0.8,
min_face_presence_confidence: float = 0.8,
min_tracking_confidence: float = 0.8,
landmark_points_idx: Optional[list[int]] = None,
landmark_points_3d: Optional[np.ndarray] = None,
orient_format: str = "q",
):
"""
Parameters
----------
cam_index : int
The index of the webcam to use.
model_weights : str, optional
Path to custom model weights. If None, the default built-in model is used.
min_face_detection_confidence : float, optional
Minimum confidence value ([0.0, 1.0]) for face detection to be considered successful. Defaults to 0.8.
min_face_presence_confidence : float, optional
Minimum confidence value ([0.0, 1.0]) for the face presence to be considered detected successfully. Defaults to 0.8.
min_tracking_confidence : float, optional
Minimum confidence value ([0.0, 1.0]) for the face landmarks to be considered tracked successfully. Defaults to 0.8.
landmark_points_idx : list[int], optional
Indices of the facial landmarks to use for pose estimation.
landmark_points_3d : np.ndarray, optional
Corresponding 3D model points for the selected landmarks. Note, that mediapipe's coordinate system is used (right-handed, x: right, y: down, z: forward).
orient_format : str
The format for orientation data. Possible values are "q" (Quaternion) or "ypr" (Yaw, Pitch, Roll).
"""
self.orient_format = orient_format
self.cam_index = cam_index
# Load default model weights if not provided
if model_weights is None:
try:
# Use importlib.resources for deployed packages
if hasattr(resources, "files"):
# Python 3.9+
data_path = resources.files("pyheadtracker").joinpath(
"data/mediapipe-facelandmarker/face_landmarker_v2_with_blendshapes.task"
)
model_weights = str(data_path)
else:
# Fallback for older Python versions
model_weights = str(
Path(__file__).parent
/ "data/mediapipe-facelandmarker/face_landmarker_v2_with_blendshapes.task"
)
except Exception:
# Final fallback to relative path
model_weights = "data/mediapipe-facelandmarker/face_landmarker_v2_with_blendshapes.task"
assert orient_format in [
"q",
"ypr",
], 'orient_format must be "q" (Quaternion) or "ypr" (Yaw, Pitch, Roll)'
assert (
min_face_detection_confidence >= 0.0
and min_face_detection_confidence <= 1.0
), "min_face_detection_confidence must be in [0.0, 1.0]"
assert (
min_face_presence_confidence >= 0.0 and min_face_presence_confidence <= 1.0
), "min_face_presence_confidence must be in [0.0, 1.0]"
assert (
min_tracking_confidence >= 0.0 and min_tracking_confidence <= 1.0
), "min_tracking_confidence must be in [0.0, 1.0]"
if landmark_points_idx is None:
self.landmark_points_idx = [1, 152, 33, 263, 61, 291]
else:
self.landmark_points_idx = landmark_points_idx
if landmark_points_3d is None:
self.landmark_points_3d = np.array(
[
[0.0, 0.0, 0.0], # Nose tip (1)
[0.0, -0.063, -0.033], # Chin (152)
[-0.043, 0.032, -0.026], # Left eye outer corner (33)
[0.043, 0.032, -0.026], # Right eye outer corner (263)
[-0.028, -0.028, -0.025], # Left mouth corner (61)
[0.028, -0.028, -0.025], # Right mouth corner (291)
],
dtype=np.float64,
)
else:
self.landmark_points_3d = landmark_points_3d
assert (
len(self.landmark_points_idx) == self.landmark_points_3d.shape[0]
), "Length of landmark_points_idx must match number of rows in landmark_points_3d"
# Initialize MediaPipe Face Landmarker
BaseOptions = python.BaseOptions(
model_asset_path=model_weights,
)
self.__FaceLandmarkerOptions = vision.FaceLandmarkerOptions(
base_options=BaseOptions, # uses built-in model
min_face_detection_confidence=min_face_detection_confidence,
min_face_presence_confidence=min_face_presence_confidence,
min_tracking_confidence=min_tracking_confidence,
output_face_blendshapes=False,
output_facial_transformation_matrixes=True,
num_faces=1,
running_mode=vision.RunningMode.VIDEO,
)
# Internal variables
self.__landmarker = None
self.__cap = None
self.__frame_count = 0
self.__frame_width = 0
self.__frame_height = 0
self.__camera_matrix = None
self.__dist_coeffs = None
self.__initial_rotation_matrix = None
self.__initial_R = None
self.__initial_tvec = None
self.reset_orientation = True
self.reset_position = True
self.is_opened = False
[docs]
def open(self):
"""Initializes the webcam and MediaPipe Face Landmarker."""
self.__landmarker = vision.FaceLandmarker.create_from_options(
self.__FaceLandmarkerOptions
)
self.__cap = cv2.VideoCapture(self.cam_index)
# Double-check camera can actually read frames
if not self.__cap.isOpened():
raise RuntimeError(
f"Failed to open camera with index {self.cam_index}. Use pyheadtracker.cam.list_available_cameras() to see available cameras."
)
self.__frame_count = 0
self.__frame_width = int(self.__cap.get(cv2.CAP_PROP_FRAME_WIDTH))
self.__frame_height = int(self.__cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
focal_length = self.__frame_width
self.__camera_matrix = np.array(
[
[focal_length, 0, self.__frame_width / 2],
[0, focal_length, self.__frame_height / 2],
[0, 0, 1],
],
dtype=np.float64,
)
self.__dist_coeffs = np.zeros((4, 1))
self.is_opened = True
[docs]
@staticmethod
def list_available_cameras(max_index: int = 20) -> list[int]:
"""
List all available camera indices on the system.
Parameters
----------
max_index : int
Maximum camera index to check (default 10).
Returns
-------
list[int]
List of available camera indices.
"""
# Suppress OpenCV logging
cv2.setLogLevel(0)
available_cameras = []
for i in range(max_index):
cap = cv2.VideoCapture(i)
if cap.isOpened():
ret, _ = cap.read()
cap.release()
if ret:
available_cameras.append(i)
# Restore OpenCV logging level to default
cv2.setLogLevel(1)
return available_cameras
[docs]
def close(self):
"""Releases the webcam and MediaPipe resources."""
if self.__cap is not None:
self.__cap.release()
self.__cap = None
self.__landmarker = None
self.is_opened = False
cv2.destroyAllWindows()
[docs]
def read_orientation(self) -> YPR | Quaternion | None:
"""Returns if available the current head orientation as a Quaternion or YPR object.
Returns
-------
Quaternion or YPR or None
The current head orientation, or None if not available.
"""
pose = self.__read_pose_internal(calculate_orientation=True)
if pose is None or pose["orientation"] is None:
return None
orientation = pose["orientation"]
if isinstance(orientation, (YPR, Quaternion)):
return orientation
return None
[docs]
def read_position(self) -> Position | None:
"""Returns if available the current head position as a Position object.
Returns
-------
Position or None
The current head position, or None if not available.
"""
pose = self.__read_pose_internal(calculate_position=True)
if pose is None or pose["position"] is None:
return None
position = pose["position"]
return position if isinstance(position, Position) else None
[docs]
def read_pose(self) -> dict[str, Position | Quaternion | YPR | None] | None:
"""Returns if available the current head position and orientation as a dictionary with Position and Quaternion or YPR objects.
Returns
-------
dict[str, Position | Quaternion | YPR] or None
The current head position and orientation, or None if not available.
"""
pose = self.__read_pose_internal(
calculate_orientation=True, calculate_position=True
)
if pose is None:
return None
return pose
[docs]
def zero(self):
"""Use the current position and orientation as reference (zero) values."""
self.reset_orientation = True
self.reset_position = True
[docs]
def zero_orientation(self):
"""Uses the current orientation as reference (zero) value."""
self.reset_orientation = True
[docs]
def zero_position(self):
"""Uses the current position as reference (zero) value."""
self.reset_position = True
def __read_pose_internal(
self, calculate_orientation: bool = False, calculate_position: bool = False
) -> dict[str, Position | Quaternion | YPR | None] | None:
"""Internal method to calculate position and orientation
Parameters
----------
calculate_orientation : bool, optional
Flag to calculate orientation, by default False
calculate_position : bool, optional
Flag to calculate position, by default False
Returns
-------
dict[str, Position | Quaternion | YPR] or None
The current head position and orientation, or None if not available.
Raises
------
RuntimeError
If the camera is not opened when attempting to read orientation or position.
"""
if not self.is_opened or self.__cap is None or self.__landmarker is None:
raise RuntimeError(
"Camera is not opened. Call 'open()' before reading orientation."
)
# Read frame from webcam
ret, frame = self.__cap.read()
if not ret:
return None
mp_image = mp.Image(
image_format=mp.ImageFormat.SRGB,
data=cv2.cvtColor(frame, cv2.COLOR_BGR2RGB),
)
# Process frame with MediaPipe Face Landmarker
result = self.__landmarker.detect_for_video(mp_image, self.__frame_count)
self.__frame_count += 1
tmp_orientation = None
tmp_position = None
# Calculate orientation if requested
if calculate_orientation and result.facial_transformation_matrixes:
# Extract transformation matrix
transformation_matrix = np.array(
result.facial_transformation_matrixes[0].data
).reshape(4, 4)
# Extract rotation matrix and translation vector
rotation_matrix = transformation_matrix[:3, :3]
# Initialize initial rotation matrix if needed or zeroed
if self.__initial_rotation_matrix is None or self.reset_orientation:
self.__initial_rotation_matrix = rotation_matrix.copy()
self.reset_orientation = False
# Calculate relative rotation matrix
relative_rotation_matrix = (
rotation_matrix @ self.__initial_rotation_matrix.T
)
# Convert rotation matrix to yaw, pitch, roll
sy = np.sqrt(
relative_rotation_matrix[0, 0] ** 2
+ relative_rotation_matrix[1, 0] ** 2
)
singular = sy < 1e-6
if not singular:
pitch = np.arctan2(
relative_rotation_matrix[2, 1], relative_rotation_matrix[2, 2]
)
yaw = np.arctan2(-relative_rotation_matrix[2, 0], sy)
roll = np.arctan2(
relative_rotation_matrix[1, 0], relative_rotation_matrix[0, 0]
)
else:
pitch = np.arctan2(
-relative_rotation_matrix[1, 2], relative_rotation_matrix[1, 1]
)
yaw = 0
roll = np.arctan2(
relative_rotation_matrix[0, 1], relative_rotation_matrix[1, 1]
)
# Provide orientation in the requested format
tmp_orientation = (
YPR(yaw, pitch, roll)
if self.orient_format == "ypr"
else ypr2quat(YPR(yaw, pitch, roll))
)
# Calculate position if requested
if calculate_position and result.face_landmarks:
# Extract 2D image points
face_landmarks = result.face_landmarks[0]
image_points = np.array(
[
[
face_landmarks[idx].x * self.__frame_width,
face_landmarks[idx].y * self.__frame_height,
]
for idx in self.landmark_points_idx
],
dtype=np.float64,
)
if self.__camera_matrix is None or self.__dist_coeffs is None:
raise RuntimeError(
"Camera intrinsic parameters are not initialized. Call 'open()' before reading position."
)
# Solve PnP for absolute position in meters
success, rvec, tvec = cv2.solvePnP(
self.landmark_points_3d,
image_points,
self.__camera_matrix,
self.__dist_coeffs,
flags=cv2.SOLVEPNP_ITERATIVE,
)
# Calculate relative position if successful
if success:
# Initialize initial position if needed or zeroed
if (
self.__initial_R is None
or self.__initial_tvec is None
or self.reset_position
):
self.__initial_tvec = tvec.copy()
self.__initial_R, _ = cv2.Rodrigues(rvec)
self.reset_position = False
# Compute rotation matrices
R_current, _ = cv2.Rodrigues(rvec)
# Translation relative to initial camera frame (still metric)
relative_tvec_camera = tvec - self.__initial_tvec
# Rotate translation into the initial head frame
relative_tvec_head = self.__initial_R.T @ (
R_current @ relative_tvec_camera
)
# Convert to Position object (invert y to match right-handed coordinate system)
y, z, x = relative_tvec_head.flatten()
y = -y
tmp_position = Position(x, y, z)
return {"position": tmp_position, "orientation": tmp_orientation}