location code: https://github.com/WimPouw/EnvisionBootcamp2021/tree/main/Python/MediaBodyTracking
citation: Pouw, W., & Trujillo, J.P. (2021-11-18). MultiParty Tracking with MediaPipe: Top-View Hand Tracking [day you visited the site]. Retrieved from: https://github.com/WimPouw/EnvisionBootcamp2021/tree/main/Python/MediaBodyTracking
from IPython.display import HTML
HTML('<iframe width="935" height="584" src="https://www.youtube.com/embed/mw8RymohMp0?start=7442" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>')
%config Completer.use_jedi = False
import cv2
import sys
import mediapipe
import pandas as pd
import numpy as np
import csv
from os import listdir
from os.path import isfile, join
#initialize modules
drawingModule = mediapipe.solutions.drawing_utils #the module(s) used from the mediapipe package
handsModule = mediapipe.solutions.hands #the module(s) used from the mediapipe package
#list all videos in the media folder
mypath = "./MediaToAnalyze/"
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))] #get all files in the MediaToAnalyze folder
#time series output folder
foldtime = "./Timeseries_Output/"
################################some preparatory functions and lists for saving the data
#convert a google classification object into a list of strings
def makegoginto_str(gogobj):
    gogobj = str(gogobj).strip("[]")
    gogobj = gogobj.split("\n")
    return(gogobj[:-1]) #ignore the last element, as it is empty
#Hand landmarks
markers = ['WRIST', 'THUMB_CMC', 'THUMB_MCP', 'THUMB_IP', 'THUMB_TIP',
           'INDEX_MCP', 'INDEX_PIP', 'INDEX_DIP', 'INDEX_TIP',
           'MIDDLE_MCP', 'MIDDLE_PIP', 'MIDDLE_DIP', 'MIDDLE_TIP',
           'RING_MCP', 'RING_PIP', 'RING_DIP', 'RING_TIP',
           'PINKY_MCP', 'PINKY_PIP', 'PINKY_DIP', 'PINKY_TIP']
#make the stringified position traces into clean values
def listpostions(newsamplemarks):
    tracking_p = []
    for value in newsamplemarks:
        stripped = value.split(':', 1)[1] #keep only what follows the "x:", "y:", or "z:" prefix
        stripped = stripped.strip() #remove spaces in the string if present
        tracking_p.append(stripped) #add to this list
    return(tracking_p)
#a function that only retrieves the numerical info in a string
def only_numerics(seq):
    seq_type = type(seq)
    return seq_type().join(filter(seq_type.isdigit, seq))
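To check what these helpers do, here is a quick illustration on made-up input (the strings below are hypothetical, not real tracking output). MediaPipe prints each landmark as a small text block of "x: ...", "y: ...", "z: ..." lines, and listpostions() strips the prefixes so only the values remain.
#a hypothetical landmark printout, as produced by makegoginto_str()
example = ['x: 0.51', 'y: 0.32', 'z: -0.04']
print(listpostions(example)) #gives ['0.51', '0.32', '-0.04']
print(only_numerics('hand_2')) #gives '2'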
Now we'll perform the actual tracking. This block goes through each video file in your directory, reads the video frames (images) using cv2, creates an output video file, and then collects the tracked points. The keypoint coordinates are drawn onto a copy of each video frame to visualize the tracking, and are also saved to a .csv file for later analysis.
#loop through the frames of the video
for ff in onlyfiles:
    #capture the video and save some video properties
    capture = cv2.VideoCapture(mypath+ff)
    frameWidth = capture.get(cv2.CAP_PROP_FRAME_WIDTH)
    frameHeight = capture.get(cv2.CAP_PROP_FRAME_HEIGHT)
    fps = capture.get(cv2.CAP_PROP_FPS)
    print(frameWidth, frameHeight, fps) #print some video info to the console
    #make a video file where we will project keypoints on
    samplerate = fps #make the same as the current video
    fourcc = cv2.VideoWriter_fourcc(*'XVID')
    out = cv2.VideoWriter('Videotracking_output/'+ff[:-4]+'.avi', fourcc, fps=samplerate, frameSize=(int(frameWidth), int(frameHeight))) #make sure that frame height/width is the same as the original
    #make a variable list with x, y, z info where data is appended to
    markerxyz = []
    for mark in markers:
        for pos in ['X', 'Y', 'Z']:
            nm = pos + "_" + mark
            markerxyz.append(nm)
    addvariable = ['index', 'confidence', 'hand', 'time']
    addvariable.extend(markerxyz)
    time = 0
    fr = 1
    timeseries = [addvariable]
    #MAIN ROUTINE
    #for finetuning the tracking, check: https://google.github.io/mediapipe/solutions/hands.html
    with handsModule.Hands(static_image_mode=False, min_detection_confidence=0.5, min_tracking_confidence=0.75, max_num_hands=6) as hands:
        while (True):
            ret, frame = capture.read()
            if ret == True:
                results = hands.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                # results.multi_hand_landmarks contains sets of x, y, z values for each landmark
                # However, they have no label or ID, just raw coordinates.
                # We do know which set of coordinates corresponds to which joint:
                # see https://google.github.io/mediapipe/solutions/hands.html and figure 2.21 on that page
                if results.multi_hand_landmarks != None:
                    #attach an id based on location
                    for handLandmarks, handinfo in zip(results.multi_hand_landmarks, results.multi_handedness):
                        # these first few lines just convert the results output into something more workable
                        newsamplelmarks = makegoginto_str(handLandmarks.landmark)
                        newsamplelmarks = listpostions(newsamplelmarks)
                        newsampleinfo = makegoginto_str(handinfo) #get info about the hand (index, confidence, handedness)
                        # now we compile the data into a complete row (frame, confidence, handedness, time, positions) and add it to our list of rows
                        fuldataslice = [fr, newsampleinfo[2], newsampleinfo[3]]
                        fuldataslice.extend([str(time)]) #add time
                        fuldataslice.extend(newsamplelmarks) #add positions
                        timeseries.append(fuldataslice)
                        for point in handsModule.HandLandmark:
                            normalizedLandmark = handLandmarks.landmark[point]
                            # now draw the landmark onto the video frame
                            pixelCoordinatesLandmark = drawingModule._normalized_to_pixel_coordinates(normalizedLandmark.x, normalizedLandmark.y, int(frameWidth), int(frameHeight))
                            if pixelCoordinatesLandmark is not None: #landmarks that fall outside the frame cannot be converted to pixels
                                cv2.circle(frame, pixelCoordinatesLandmark, 5, (0, 255, 0), -1)
                if results.multi_hand_landmarks == None:
                    timeseries.append(["NA"]) #add a row of NAs
                cv2.imshow('Test hand', frame)
                out.write(frame) #comment this out if you don't want to save the tracked video
                time = round(time+1000/samplerate)
                fr = fr+1
                if cv2.waitKey(1) == 27: #press escape to stop early
                    break
            if ret == False:
                break
    out.release()
    capture.release()
    cv2.destroyAllWindows()
    ####################################################### data to be written row-wise in the csv file
    data = timeseries
    # opening the csv file in 'w+' mode
    file = open(foldtime+ff[:-4]+'.csv', 'w+', newline='')
    #write it
    with file:
        write = csv.writer(file)
        write.writerows(data)
Let's take a first look at the data to see what kind of output we get.
print(foldtime+ff[:-4]+'.csv')
df = pd.read_csv(foldtime+ff[:-4]+'.csv')
df.head()
Above we have the first 5 rows of our output data. The first named column, "index", provides the frame number. Note that each frame may have multiple rows if multiple hands are tracked in that frame. We also get a handedness label, right or left, and x, y, z coordinates (scaled to 0-1; see the sketch below) for each keypoint.
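As a quick sketch of how to work with this output (a suggestion, not part of the original pipeline; it reuses the foldtime, ff, frameWidth, and frameHeight variables from the tracking loop above), we can count hands per frame and convert the normalized wrist coordinates back to pixels:
df = pd.read_csv(foldtime+ff[:-4]+'.csv')
df = df[df['index'] != 'NA'] #drop the placeholder rows for frames without a detected hand
print(df.groupby('index').size().head()) #number of hands tracked per frame
#coordinates are normalized to 0-1, so scaling by the frame size recovers pixel positions
wrist_px = df[['X_WRIST', 'Y_WRIST']].astype(float).to_numpy() * [frameWidth, frameHeight]
print(wrist_px[:5]) #wrist positions in pixels, first five rows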
However, especially for multi-party data, we don't know whether the first row in frame 1 is the same hand as the first row in frame 2. Likewise, we don't know which left and right hands belong together, since there are multiple persons in view! We'll cover a potential solution to this in the module on linking and pairing hands; a naive sketch of the idea follows below. This is easier when there is just one person, as MediaPipe does differentiate between the left and right hand.
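To give a flavor of that module, below is a minimal, naive sketch (not the bootcamp's actual linking method; match_hands() and its inputs are hypothetical) that propagates hand identities by matching each hand in the current frame to the nearest hand in the previous frame, based on wrist position:
#a naive nearest-neighbor linking sketch (hypothetical helper, for illustration only)
def match_hands(prev_wrists, cur_wrists):
    #prev_wrists, cur_wrists: arrays of shape (n_hands, 2) with x, y wrist positions
    assignments = []
    for cur in cur_wrists:
        dists = np.linalg.norm(prev_wrists - cur, axis=1) #distance to every hand in the previous frame
        assignments.append(int(np.argmin(dists))) #index of the closest previous hand
    return assignments
#example: two hands whose row order swapped between frames
prev = np.array([[0.2, 0.5], [0.8, 0.5]])
cur = np.array([[0.79, 0.52], [0.21, 0.49]])
print(match_hands(prev, cur)) #gives [1, 0]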