Intégration MediaPipe et YOLOv8 : Application de Vision par Ordinateur Avancée

Guide complet pour créer une application qui combine la détection de pose avec MediaPipe et la détection d’objets avec YOLOv8.

1. Configuration Initiale

# Installation des dépendances (commande à exécuter dans un terminal, pas dans l'interpréteur Python)
pip install mediapipe ultralytics opencv-python numpy

# Import des bibliothèques
import time
from queue import Empty, Full, Queue
from threading import Thread

import cv2
import mediapipe as mp
import numpy as np
from ultralytics import YOLO

2. Architecture de l’Application

2.1 Classes de Base

class PoseDetector:
    """Run MediaPipe Pose on BGR frames and draw the detected skeleton."""

    def __init__(self):
        """Create the Pose estimator with 0.5 detection/tracking confidence."""
        pose_module = mp.solutions.pose
        self.mp_pose = pose_module
        self.pose = pose_module.Pose(
            min_detection_confidence=0.5,
            min_tracking_confidence=0.5,
        )
        self.mp_draw = mp.solutions.drawing_utils

    def detect(self, image):
        """Detect a pose in *image* and draw its landmarks onto it.

        Args:
            image: Frame in BGR channel order; it is drawn on in place.

        Returns:
            A ``(image, pose_landmarks)`` tuple. ``pose_landmarks`` is whatever
            MediaPipe reported for the frame (falsy when nothing was found).
        """
        # MediaPipe is fed RGB, whereas the incoming frame is BGR.
        rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        landmarks = self.pose.process(rgb_frame).pose_landmarks

        if landmarks:
            self.mp_draw.draw_landmarks(
                image, landmarks, self.mp_pose.POSE_CONNECTIONS
            )

        return image, landmarks

class ObjectDetector:
    """Run a YOLOv8 model on frames and draw labelled bounding boxes."""

    def __init__(self, model_path='yolov8n.pt'):
        """Load the YOLO weights.

        Args:
            model_path: Weights file passed to ``YOLO``; defaults to the
                nano model used originally.
        """
        self.model = YOLO(model_path)

    def detect(self, image):
        """Detect objects in *image* and annotate it in place.

        Each detection gets a green rectangle and a ``"<class> <confidence>"``
        label.

        Args:
            image: Frame to analyse; it is drawn on in place.

        Returns:
            The same ``image`` object, annotated.
        """
        results = self.model(image)

        for result in results:
            for box in result.boxes:
                # Box corners as integer coordinates for the drawing calls.
                x1, y1, x2, y2 = (int(value) for value in box.xyxy[0])

                confidence = float(box.conf[0])
                class_name = self.model.names[int(box.cls[0])]

                cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), 2)

                label = f"{class_name} {confidence:.2f}"
                # The label normally sits just above the box. For boxes that
                # touch the top of the frame that position is off-screen, so
                # the label is moved inside the box instead.
                label_y = y1 - 10 if y1 - 10 >= 15 else y1 + 20
                cv2.putText(image, label, (x1, label_y),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

        return image

3. Traitement Multi-thread

3.1 Gestionnaire de Frames

class FrameManager:
    """Bounded, non-blocking hand-off of frames and results between threads.

    Producers drop items when a queue is full and consumers get ``None`` when
    a queue is empty; no method ever blocks.
    """

    def __init__(self, max_queue_size=2):
        """Create the frame and result queues.

        Args:
            max_queue_size: Capacity of each queue.
        """
        self.frame_queue = Queue(maxsize=max_queue_size)
        self.result_queue = Queue(maxsize=max_queue_size)

    @staticmethod
    def _offer(target_queue, item):
        """Put *item* on *target_queue*, dropping it if the queue is full."""
        # put_nowait makes the "is there room?" check and the insert one atomic
        # step; a separate full() check followed by put() can block if another
        # thread fills the queue in between.
        try:
            target_queue.put_nowait(item)
        except Full:
            # Dropping is deliberate: a stale item is worthless in a live feed.
            pass

    @staticmethod
    def _poll(source_queue):
        """Return the next item of *source_queue*, or ``None`` if it is empty."""
        # get_nowait avoids blocking forever when another consumer takes the
        # last item between an empty() check and the get().
        try:
            return source_queue.get_nowait()
        except Empty:
            return None

    def add_frame(self, frame):
        """Queue *frame* for processing; drop it if the frame queue is full."""
        self._offer(self.frame_queue, frame)

    def get_frame(self):
        """Return the oldest queued frame, or ``None`` if none is waiting."""
        return self._poll(self.frame_queue)

    def add_result(self, result):
        """Queue *result* for display; drop it if the result queue is full."""
        self._offer(self.result_queue, result)

    def get_result(self):
        """Return the oldest queued result, or ``None`` if none is waiting."""
        return self._poll(self.result_queue)

3.2 Workers

class PoseWorker(Thread):
    """Background thread that runs pose detection on queued frames."""

    # Pause between polls while no frame is waiting, in seconds. Without it
    # the loop spins at full speed and starves the other threads.
    IDLE_SLEEP = 0.001

    def __init__(self, frame_manager, detector):
        """Store the collaborators.

        Args:
            frame_manager: Object providing ``get_frame()`` and
                ``add_result(result)``.
            detector: Object whose ``detect(frame)`` returns
                ``(processed_frame, landmarks)``.
        """
        # Daemon thread: a crash in the main loop must not leave the process
        # hanging on a worker that nobody will ever stop.
        super().__init__(daemon=True)
        self.frame_manager = frame_manager
        self.detector = detector
        self.running = True

    def run(self):
        """Process frames until ``self.running`` is set to ``False``."""
        while self.running:
            frame = self.frame_manager.get_frame()
            if frame is None:
                time.sleep(self.IDLE_SLEEP)
                continue

            processed_frame, landmarks = self.detector.detect(frame)
            self.frame_manager.add_result((processed_frame, landmarks))

class ObjectWorker(Thread):
    """Background thread that runs object detection on queued frames."""

    # Pause between polls while no frame is waiting, in seconds. Without it
    # the loop spins at full speed and starves the other threads.
    IDLE_SLEEP = 0.001

    def __init__(self, frame_manager, detector):
        """Store the collaborators.

        Args:
            frame_manager: Object providing ``get_frame()`` and
                ``add_result(result)``.
            detector: Object whose ``detect(frame)`` returns the processed
                frame.
        """
        # Daemon thread: a crash in the main loop must not leave the process
        # hanging on a worker that nobody will ever stop.
        super().__init__(daemon=True)
        self.frame_manager = frame_manager
        self.detector = detector
        self.running = True

    def run(self):
        """Process frames until ``self.running`` is set to ``False``."""
        while self.running:
            frame = self.frame_manager.get_frame()
            if frame is None:
                time.sleep(self.IDLE_SLEEP)
                continue

            processed_frame = self.detector.detect(frame)
            # Results are (frame, landmarks) pairs; this worker has no
            # landmarks to report.
            self.frame_manager.add_result((processed_frame, None))

4. Application Principale

4.1 Configuration

class Application:
    """Wire the detectors, worker threads and camera into one application."""

    def __init__(self, camera_index=0):
        """Create the detectors, the frame manager, the workers and the camera.

        Args:
            camera_index: Device index passed to ``cv2.VideoCapture``;
                defaults to 0 as in the original setup.
        """
        # Detectors
        self.pose_detector = PoseDetector()
        self.object_detector = ObjectDetector()

        # Frame/result hand-off shared by both workers
        self.frame_manager = FrameManager()

        # Workers (created here, started elsewhere)
        self.pose_worker = PoseWorker(self.frame_manager, self.pose_detector)
        self.object_worker = ObjectWorker(self.frame_manager, self.object_detector)

        # Camera
        self.cap = cv2.VideoCapture(camera_index)

        # Control state
        self.running = True
        self.current_mode = 'both'  # 'pose', 'object', 'both'

4.2 Boucle Principale (méthode de la classe Application)

    def run(self):
        """Run the capture/display loop until the user quits or the camera fails.

        Starts both workers, then repeatedly reads a camera frame, queues a
        copy of it, shows the latest pair of results according to
        ``self.current_mode`` and handles the keyboard: ``q`` quits; ``p``,
        ``o`` and ``b`` select the 'pose', 'object' and 'both' modes.
        ``cleanup()`` is always called on the way out, including when an
        exception escapes the loop.
        """
        mode_keys = {ord('p'): 'pose', ord('o'): 'object', ord('b'): 'both'}

        self.pose_worker.start()
        self.object_worker.start()

        # First half of a result pair, kept across iterations so that a result
        # arriving alone is not thrown away while waiting for its partner.
        first_result = None

        try:
            while self.running:
                success, frame = self.cap.read()
                if not success:
                    # A failed read means no frame could be grabbed (camera
                    # missing/disconnected or stream ended). Retrying in a
                    # tight loop would spin forever without ever reaching the
                    # key handling below, so stop instead.
                    self.running = False
                    break

                self.frame_manager.add_frame(frame.copy())

                # NOTE(review): both workers feed the same result queue and
                # results carry no tag, so "pose" and "object" are assigned
                # purely by arrival order here. Separate queues per worker
                # would be needed to tell them apart reliably.
                if first_result is None:
                    first_result = self.frame_manager.get_result()
                second_result = None
                if first_result is not None:
                    second_result = self.frame_manager.get_result()

                if second_result is not None:
                    pose_frame, _ = first_result
                    object_frame, _ = second_result
                    first_result = None

                    if self.current_mode == 'both':
                        # Blend the two annotated frames 50/50.
                        combined_frame = cv2.addWeighted(
                            pose_frame, 0.5, object_frame, 0.5, 0
                        )
                        cv2.imshow('Combined Detection', combined_frame)
                    elif self.current_mode == 'pose':
                        cv2.imshow('Pose Detection', pose_frame)
                    else:
                        cv2.imshow('Object Detection', object_frame)

                key = cv2.waitKey(1) & 0xFF
                if key == ord('q'):
                    self.running = False
                elif key in mode_keys:
                    self.current_mode = mode_keys[key]
        finally:
            self.cleanup()

4.3 Nettoyage (méthode de la classe Application)

    def cleanup(self):
        """Stop the workers and release the camera and the display windows.

        The camera and windows are released even if stopping a worker fails.
        """
        workers = (self.pose_worker, self.object_worker)
        try:
            # Signal both workers first so they wind down concurrently.
            for worker in workers:
                worker.running = False
            for worker in workers:
                # join() raises RuntimeError on a thread that was never
                # started; is_alive() is False for both unstarted and
                # finished threads, so only running workers are waited for.
                if worker.is_alive():
                    worker.join()
        finally:
            self.cap.release()
            cv2.destroyAllWindows()

5. Fonctionnalités Avancées

5.1 Analyse des Interactions

def analyze_interaction(pose_landmarks, object_detections, frame_shape=None,
                        max_distance=0.1):
    """Find detected objects that lie close to either wrist.

    Args:
        pose_landmarks: MediaPipe pose landmarks (object with a ``landmark``
            sequence whose items expose ``x`` and ``y``), or ``None``.
        object_detections: Iterable of detections indexable as
            ``(x1, y1, x2, y2, label, ...)``, or ``None``.
        frame_shape: Optional ``(height, width, ...)`` of the frame. When
            given, box coordinates are treated as pixels and divided by the
            frame size before being compared with the landmarks. When omitted,
            boxes are used as-is, i.e. they must already be in the same
            coordinate space as the landmarks.
        max_distance: Largest wrist-to-box-centre distance that still counts
            as an interaction.

    Returns:
        ``None`` if either input is ``None``; otherwise a list of dicts with
        the keys ``'object'`` (the detection's label), ``'hand'`` (``'left'``
        or ``'right'``, whichever wrist is closer) and ``'distance'``.
    """
    if pose_landmarks is None or object_detections is None:
        return None

    landmark_ids = mp.solutions.pose.PoseLandmark
    left_wrist = pose_landmarks.landmark[landmark_ids.LEFT_WRIST.value]
    right_wrist = pose_landmarks.landmark[landmark_ids.RIGHT_WRIST.value]
    left_hand = np.array([left_wrist.x, left_wrist.y])
    right_hand = np.array([right_wrist.x, right_wrist.y])

    frame_size = None
    if frame_shape is not None:
        height, width = frame_shape[:2]
        frame_size = np.array([width, height], dtype=float)

    interactions = []
    for detection in object_detections:
        box_center = np.array([
            (detection[0] + detection[2]) / 2,
            (detection[1] + detection[3]) / 2,
        ])
        if frame_size is not None:
            # Bring pixel coordinates into the landmarks' coordinate space.
            box_center = box_center / frame_size

        left_distance = np.linalg.norm(left_hand - box_center)
        right_distance = np.linalg.norm(right_hand - box_center)

        if left_distance < max_distance or right_distance < max_distance:
            interactions.append({
                'object': detection[4],
                'hand': 'left' if left_distance < right_distance else 'right',
                'distance': min(left_distance, right_distance),
            })

    return interactions

5.2 Visualisation Avancée

def visualize_results(frame, pose_landmarks, object_detections, interactions):
    """Draw the pose, the object boxes and the hand/object links onto *frame*.

    Args:
        frame: Image array to draw on, in place.
        pose_landmarks: MediaPipe pose landmarks, or a falsy value when no
            pose is available.
        object_detections: Iterable of detections indexable as
            ``(x1, y1, x2, y2, label, ...)``; ``None`` is treated as empty.
            The box coordinates are passed straight to the drawing calls, so
            they are assumed to be pixel coordinates.
        interactions: Iterable of dicts with ``'object'`` and ``'hand'`` keys
            (as built by ``analyze_interaction``), or a falsy value.

    Returns:
        The same ``frame`` object, annotated.
    """
    detections = object_detections if object_detections is not None else []

    if pose_landmarks:
        mp.solutions.drawing_utils.draw_landmarks(
            frame,
            pose_landmarks,
            mp.solutions.pose.POSE_CONNECTIONS,
        )

    for detection in detections:
        x1, y1, x2, y2 = detection[:4]
        cv2.rectangle(frame, (int(x1), int(y1)), (int(x2), int(y2)), (0, 255, 0), 2)

    # Linking a hand to an object needs the wrist positions, which only exist
    # when a pose was detected.
    if interactions and pose_landmarks:
        height, width = frame.shape[:2]
        landmark_ids = mp.solutions.pose.PoseLandmark

        for interaction in interactions:
            if interaction['hand'] == 'left':
                wrist_id = landmark_ids.LEFT_WRIST
            else:
                wrist_id = landmark_ids.RIGHT_WRIST
            wrist = pose_landmarks.landmark[wrist_id.value]
            # Landmark coordinates are scaled by the frame size to get pixels.
            hand_px = (int(wrist.x * width), int(wrist.y * height))

            # An interaction only records the object's label, so the matching
            # box is recovered as the same-label detection nearest the hand.
            centers = [
                ((det[0] + det[2]) / 2, (det[1] + det[3]) / 2)
                for det in detections
                if det[4] == interaction['object']
            ]
            if not centers:
                continue
            center_x, center_y = min(
                centers,
                key=lambda c: (c[0] - hand_px[0]) ** 2 + (c[1] - hand_px[1]) ** 2,
            )

            cv2.line(
                frame,
                hand_px,
                (int(center_x), int(center_y)),
                (0, 0, 255),
                2,
            )

    return frame

6. Optimisation des Performances

6.1 Gestion de la Mémoire

def optimize_memory(frame_manager=None):
    """Build the detectors and the memory-saving helpers.

    Args:
        frame_manager: Optional object exposing ``frame_queue`` and
            ``result_queue``; it is the target of ``cleanup_queues``.

    Returns:
        A dict with the keys ``'pose_detector'``, ``'object_detector'``,
        ``'resize_frame'`` (callable that scales a frame to 640x480) and
        ``'cleanup_queues'`` (callable that empties both queues of
        *frame_manager*, or does nothing when none was given).
    """
    from queue import Empty

    pose_detector = PoseDetector()
    object_detector = ObjectDetector()

    def resize_frame(frame):
        """Return *frame* scaled to 640x480."""
        return cv2.resize(frame, (640, 480))

    def cleanup_queues():
        """Discard everything currently waiting in both queues."""
        if frame_manager is None:
            return
        for pending in (frame_manager.frame_queue, frame_manager.result_queue):
            # get_nowait cannot block, unlike an empty() check followed by
            # get() when another thread takes the last item in between.
            while True:
                try:
                    pending.get_nowait()
                except Empty:
                    break

    # Returned so callers can actually use what was built here.
    return {
        'pose_detector': pose_detector,
        'object_detector': object_detector,
        'resize_frame': resize_frame,
        'cleanup_queues': cleanup_queues,
    }

6.2 Accélération GPU

def enable_gpu_acceleration(object_detector=None, pose_detector=None):
    """Move the detectors to the GPU when CUDA is available.

    Args:
        object_detector: Optional object whose ``model`` attribute supports
            ``.to('cuda')``; skipped when ``None``.
        pose_detector: Optional object whose ``pose`` attribute is replaced by
            a freshly configured MediaPipe ``Pose``; skipped when ``None``.

    Returns:
        ``True`` if CUDA is available (and the given detectors were
        reconfigured), ``False`` otherwise.
    """
    import torch

    if not torch.cuda.is_available():
        return False

    if object_detector is not None:
        object_detector.model.to('cuda')

    if pose_detector is not None:
        # NOTE(review): none of these options selects a GPU; this only
        # rebuilds the estimator with explicit settings. Confirm against the
        # MediaPipe documentation whether GPU inference is available here.
        pose_detector.pose = mp.solutions.pose.Pose(
            min_detection_confidence=0.5,
            min_tracking_confidence=0.5,
            model_complexity=1,
            enable_segmentation=False,
        )

    return True

7. Bonnes Pratiques

Gestion des Ressources
- Utiliser le multi-threading pour les détections
- Implémenter un système de cache
- Nettoyer régulièrement la mémoire
Optimisation
- Réduire la taille des frames
- Utiliser l’accélération GPU
- Ajuster les fréquences de détection
Interface Utilisateur
- Fournir des contrôles intuitifs
- Afficher les métriques de performance
- Permettre le changement de mode en temps réel

Conclusion

Points clés :

- Intégration efficace de MediaPipe et YOLOv8
- Traitement multi-thread pour de meilleures performances
- Analyse des interactions entre pose et objets
- Interface utilisateur flexible

Recommandations :

- Optimiser l’utilisation de la mémoire
- Utiliser l’accélération GPU
- Implémenter une gestion robuste des erreurs
- Ajouter des fonctionnalités de personnalisation

À propos d'InSkillCoach

Expert en formation et technologies

Coach spécialisé dans les technologies avancées et l'IA, porté par GNeurone Inc.

GitHub Voir tous les articles

Certifications:

AWS Certified Solutions Architect – Professional
Certifications Google Cloud
Microsoft Certified: DevOps Engineer Expert
Certified Kubernetes Administrator (CKA)
CompTIA Security+

1.4k