Hand Tracking in Real Time
Identifying the shape and motion of hands can improve the user experience across a wide range of technological domains and platforms. It can, for example, serve as the foundation for sign-language understanding and hand-gesture control, and it enables digital content and information to be overlaid on the physical world in augmented reality. Robust real-time hand perception is a difficult computer vision problem, because hands frequently occlude themselves or one another (e.g. finger/palm occlusions and handshakes) and lack high-contrast patterns.
The framework used here is MediaPipe, developed by Google. It is a high-fidelity hand and finger tracking solution that uses machine learning (ML) to infer 21 3D landmarks of a hand from a single frame, delivering real-time performance on a mobile phone and even scaling to several hands, whereas most existing state-of-the-art systems rely on powerful desktop hardware for inference. Google also ships ready-made models that let us get started quickly on fundamental AI problems such as face detection, face recognition, facial landmarks, hand tracking, object detection and quite a bit more. The model I will be working with today is "Hand Tracking". It uses two main modules at the backend: "Palm Detection" and "Hand Landmarks".
Palm detection runs on the complete image and provides a cropped image of the hand. From there, the hand landmark module finds 21 landmarks on the cropped hand image, as shown in figure 2. Best of all, we can use the hand landmark model without diving deep into a sea of configurations and installations; within just a few clicks we'll be all set to run it.
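If you'd like to see the index-to-name mapping of those 21 landmarks for yourself, the mediapipe Python package exposes them as an enum. A quick sketch (assuming the packages from Step 2 below are already installed):

import mediapipe as mp

# Print the 21 landmark ids and names used throughout this post
# (0 = WRIST, 4 = THUMB_TIP, 8 = INDEX_FINGER_TIP, ..., 20 = PINKY_TIP)
for lm in mp.solutions.hands.HandLandmark:
    print(lm.value, lm.name)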
Step 1:
Inside the PyCharm platform, first create a new project (pythonProject1), then add a new Python file: File --> New --> Python File.
Step 2:
Go to File --> Settings --> Project: pythonProject1 --> Python Interpreter and install the packages: type opencv-python and then mediapipe and hit Install. Within two clicks we are all set.
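(If you prefer the terminal over PyCharm's UI, the equivalent command is pip install opencv-python mediapipe.)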
Now we're all poised to start coding. We'll first write a bare-minimum script to get things running and then convert it into a module so that we don't have to rewrite it for other, similar projects. We won't dive into the thousands of tunable parameters. Within just 15 to 20 minutes we will have our model working perfectly!
import cv2
import mediapipe as mp
import time

cap = cv2.VideoCapture(0)

mpHands = mp.solutions.hands
hands = mpHands.Hands()  # defaults: video mode, up to 2 hands
mpDraw = mp.solutions.drawing_utils

pTime = 0  # previous frame timestamp, used for the FPS counter
cTime = 0

while True:
    success, img = cap.read()
    imgRGB = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # MediaPipe expects RGB, OpenCV delivers BGR
    results = hands.process(imgRGB)
    # print(results.multi_hand_landmarks)

    if results.multi_hand_landmarks:
        for handLms in results.multi_hand_landmarks:
            for id, lm in enumerate(handLms.landmark):
                # print(id, lm)
                # lm.x and lm.y are normalized (0..1); scale them to pixel coordinates
                h, w, c = img.shape
                cx, cy = int(lm.x * w), int(lm.y * h)
                print(id, cx, cy)
                if id == 0:  # landmark 0 is the wrist
                    cv2.circle(img, (cx, cy), 5, (255, 0, 255), cv2.FILLED)
            mpDraw.draw_landmarks(img, handLms, mpHands.HAND_CONNECTIONS)

    cTime = time.time()
    fps = 1 / (cTime - pTime)
    pTime = cTime
    cv2.putText(img, str(int(fps)), (10, 70), cv2.FONT_HERSHEY_PLAIN, 3, (255, 0, 255), 3)

    cv2.imshow("image", img)
    cv2.waitKey(1)
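That's the bare-minimum version. Run it and you should see your webcam feed with 21 landmarks drawn on any detected hand and an FPS counter in the corner. Next, we convert the same logic into a reusable class, so other projects can simply import it instead of repeating this boilerplate: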
import cv2
import mediapipe as mp
import time
import math


class handDetector():
    def __init__(self, mode=False, maxHands=2, detectionCon=0.5, trackCon=0.5):
        self.mode = mode
        self.maxHands = maxHands
        self.detectionCon = detectionCon
        self.trackCon = trackCon
        self.mpHands = mp.solutions.hands
        # Pass the options as keyword arguments: newer mediapipe versions added a
        # model_complexity parameter, so positional arguments no longer line up.
        self.hands = self.mpHands.Hands(static_image_mode=self.mode,
                                        max_num_hands=self.maxHands,
                                        min_detection_confidence=self.detectionCon,
                                        min_tracking_confidence=self.trackCon)
        self.mpDraw = mp.solutions.drawing_utils
        self.tipIds = [4, 8, 12, 16, 20]  # landmark ids of the five fingertips

    def findHands(self, img, draw=True):
        imgRGB = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        self.results = self.hands.process(imgRGB)
        # print(self.results.multi_hand_landmarks)
        if self.results.multi_hand_landmarks:
            for handLms in self.results.multi_hand_landmarks:
                if draw:
                    self.mpDraw.draw_landmarks(img, handLms, self.mpHands.HAND_CONNECTIONS)
        return img

    def findPosition(self, img, handNo=0, draw=True):
        xList = []
        yList = []
        bbox = []
        self.lmList = []
        if self.results.multi_hand_landmarks:
            myHand = self.results.multi_hand_landmarks[handNo]
            for id, lm in enumerate(myHand.landmark):
                h, w, c = img.shape
                cx, cy = int(lm.x * w), int(lm.y * h)  # normalized -> pixel coordinates
                xList.append(cx)
                yList.append(cy)
                # print(id, cx, cy)
                self.lmList.append([id, cx, cy])
                if draw:
                    cv2.circle(img, (cx, cy), 5, (255, 0, 255), cv2.FILLED)
            xmin, xmax = min(xList), max(xList)
            ymin, ymax = min(yList), max(yList)
            bbox = xmin, ymin, xmax, ymax
            if draw:
                cv2.rectangle(img, (xmin - 20, ymin - 20), (xmax + 20, ymax + 20),
                              (0, 255, 0), 2)
        return self.lmList, bbox

    def fingersUp(self):
        fingers = []
        # Thumb: compare x coordinates of tip and joint below it
        # (assumes a right hand facing the camera)
        if self.lmList[self.tipIds[0]][1] > self.lmList[self.tipIds[0] - 1][1]:
            fingers.append(1)
        else:
            fingers.append(0)
        # Other four fingers: tip above the joint two below it means finger is up
        for id in range(1, 5):
            if self.lmList[self.tipIds[id]][2] < self.lmList[self.tipIds[id] - 2][2]:
                fingers.append(1)
            else:
                fingers.append(0)
        return fingers

    def findDistance(self, p1, p2, img, draw=True, r=15, t=3):
        x1, y1 = self.lmList[p1][1:]
        x2, y2 = self.lmList[p2][1:]
        cx, cy = (x1 + x2) // 2, (y1 + y2) // 2  # midpoint between the two landmarks
        if draw:
            cv2.line(img, (x1, y1), (x2, y2), (255, 0, 255), t)
            cv2.circle(img, (x1, y1), r, (255, 0, 255), cv2.FILLED)
            cv2.circle(img, (x2, y2), r, (255, 0, 255), cv2.FILLED)
            cv2.circle(img, (cx, cy), r, (0, 0, 255), cv2.FILLED)
        length = math.hypot(x2 - x1, y2 - y1)
        return length, img, [x1, y1, x2, y2, cx, cy]


def main():
    pTime = 0
    cTime = 0
    cap = cv2.VideoCapture(0)
    detector = handDetector()
    while True:
        success, img = cap.read()
        img = detector.findHands(img)
        lmList, bbox = detector.findPosition(img)  # findPosition returns (lmList, bbox)
        if len(lmList) != 0:
            print(lmList[4])  # landmark 4 is the thumb tip
        cTime = time.time()
        fps = 1 / (cTime - pTime)
        pTime = cTime
        cv2.putText(img, str(int(fps)), (10, 70), cv2.FONT_HERSHEY_SIMPLEX, 3, (255, 0, 255), 3)
        cv2.imshow("Image", img)
        cv2.waitKey(1)


if __name__ == "__main__":
    main()
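Save the file above as HandTrackingModule.py in the project folder. Any new script can now get hand tracking in a handful of lines simply by importing it, as this dummy project shows: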
import cv2
import time
import HandTrackingModule as htm

pTime = 0
cTime = 0
cap = cv2.VideoCapture(0)
detector = htm.handDetector()
while True:
    success, img = cap.read()
    img = detector.findHands(img)
    lmList, bbox = detector.findPosition(img)  # unpack the (lmList, bbox) tuple
    if len(lmList) != 0:
        print(lmList)
    cTime = time.time()
    fps = 1 / (cTime - pTime)
    pTime = cTime
    cv2.putText(img, str(int(fps)), (10, 70), cv2.FONT_HERSHEY_SIMPLEX, 3, (255, 0, 0), 3)
    cv2.imshow("Image", img)
    cv2.waitKey(1)
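The module also provides fingersUp() and findDistance(), which the dummy script above doesn't exercise. A minimal sketch of how they could slot into the same loop, right after the call to findPosition() (both read from lmList, so only call them when a hand was actually detected):

if len(lmList) != 0:
    # Which fingers are raised, e.g. [1, 1, 0, 0, 0] = thumb and index up
    fingers = detector.fingersUp()
    print("Fingers up:", fingers.count(1))
    # Pixel distance between thumb tip (id 4) and index tip (id 8),
    # with the connecting line and endpoints drawn on the frame
    length, img, lineInfo = detector.findDistance(4, 8, img)
    print("Thumb-index distance:", int(length))

This is the basis for many of the similar projects mentioned earlier, such as gesture-based volume control or a virtual mouse.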