diff --git a/README.md b/README.md
index bd83a11..72c2968 100644
--- a/README.md
+++ b/README.md
@@ -417,6 +417,26 @@ Only has motion module, no referencenet, requiring less gpu memory.
 python scripts/inference/text2video.py --sd_model_name majicmixRealv6Fp16 --unet_model_name musev -test_data_path ./configs/tasks/example.yaml --output_dir ./output --n_batch 1 --target_datas yongen --time_size 12 --fps 12
 ```
 #### video2video
+##### pose align
+```bash
+python ./pose_align/pose_align.py --max_frame 200 --vidfn ./data/source_video/dance.mp4 --imgfn_refer ./data/images/man.jpg --outfn_ref_img_pose ./data/pose_align_results/ref_img_pose.jpg --outfn_align_pose_video ./data/pose_align_results/align_pose_video.mp4 --outfn ./data/pose_align_results/align_demo.mp4
+```
+- `max_frame`: number of frames to align (counted from the first frame)
+- `vidfn`: path of the source dance video (RGB)
+- `imgfn_refer`: path of the reference image
+- `outfn_ref_img_pose`: output path for the detected pose of the reference image
+- `outfn_align_pose_video`: output path for the aligned pose video
+- `outfn`: output path for the alignment visualization
+
+
+
+https://github.com/TMElyralab/MuseV/assets/47803475/787d7193-ec69-43f4-a0e5-73986a808f51
+
+
+
+
+Then you can use the aligned pose video `outfn_align_pose_video` for pose-guided generation. You may need to modify the corresponding example in the config file `./configs/tasks/example.yaml`.
+##### generation
 ```bash
 python scripts/inference/video2video.py --sd_model_name fantasticmix_v10 --unet_model_name musev -test_data_path ./configs/tasks/example.yaml --output_dir ./output --n_batch 1 --controlnet_name dwpose_body_hand --which2video "video_middle" --target_datas dance1 --fps 12 --time_size 12
 ```
diff --git a/data/images/man.jpg b/data/images/man.jpg
new file mode 100644
index 0000000..3b3150b
Binary files /dev/null and b/data/images/man.jpg differ
diff --git a/data/pose_align_results/align_demo.mp4 b/data/pose_align_results/align_demo.mp4
new file mode 100644
index 0000000..ba92e4c
Binary files /dev/null and b/data/pose_align_results/align_demo.mp4 differ
diff --git a/data/pose_align_results/align_pose_video.mp4 b/data/pose_align_results/align_pose_video.mp4
new file mode 100644
index 0000000..6eac0b6
Binary files /dev/null and b/data/pose_align_results/align_pose_video.mp4 differ
diff --git a/data/pose_align_results/ref_img_pose.jpg b/data/pose_align_results/ref_img_pose.jpg
new file mode 100644
index 0000000..9b22ea7
Binary files /dev/null and b/data/pose_align_results/ref_img_pose.jpg differ
diff --git a/data/source_video/dance.mp4 b/data/source_video/dance.mp4
new file mode 100644
index 0000000..15024ec
Binary files /dev/null and b/data/source_video/dance.mp4 differ
diff --git a/pose_align/pose_align.py b/pose_align/pose_align.py
new file mode 100644
index 0000000..cba5a01
--- /dev/null
+++ b/pose_align/pose_align.py
@@ -0,0 +1,552 @@
+import torch
+import cv2
+import numpy as np
+import ffmpeg
+import argparse
+import time
+import traceback
+import scipy.signal as signal
+import copy
+from controlnet_aux import DWposeDetector
+from controlnet_aux.dwpose import pose2map
+from pprint import pprint
+
+from pose_utils import get_video_meta_info, size_calculate, warpAffine_kps, Reader, Writer
+
+
+
+'''
+    Detect dwpose from a frame of the pose video, then align it by the scale parameters.
+    img: frame from the pose video
+    pose_ori: dwpose detected from that frame
+    scales: per-body-part scale parameters (reference / first frame)
+'''
+def align_img(img, pose_ori, scales, detect_resolution, image_resolution):
+
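+    # Assumed keypoint layout (DWpose follows the OpenPose 18-keypoint body format):
+    # 0 nose, 1 neck, 2-4 one shoulder/elbow/wrist chain, 5-7 the other, 8-10 and 11-13
+    # the two hip/knee/ankle chains, 14-17 eyes/ears. The index arithmetic below relies on this ordering.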
+    body_pose = copy.deepcopy(pose_ori['bodies']['candidate'])
+    hands = copy.deepcopy(pose_ori['hands'])
+    faces = copy.deepcopy(pose_ori['faces'])
+
+    '''
+    Computation logic:
+    0. All transforms in this function are absolute and always keep the body centre point body_pose[1] fixed.
+    1. First resize ref and pose to the same height, each keeping its original aspect ratio.
+    2. Compute with the actual coordinates of the points in the image.
+    3. In practice, normalize the h coordinate to [0, 1] and w to [0, W/H].
+    4. Since dwpose already outputs normalized coordinates, h stays unchanged and w is multiplied by W/H.
+    Note: dwpose outputs are (w, h).
+    '''
+
+    # keep h unchanged, rescale w to the original aspect ratio
+    H_in, W_in, C_in = img.shape
+    video_ratio = W_in / H_in
+    body_pose[:, 0] = body_pose[:, 0] * video_ratio
+    hands[:, :, 0] = hands[:, :, 0] * video_ratio
+    faces[:, :, 0] = faces[:, :, 0] * video_ratio
+
+    # scales of the body parts
+    scale_neck = scales["scale_neck"]
+    scale_face = scales["scale_face"]
+    scale_shoulder = scales["scale_shoulder"]
+    scale_arm_upper = scales["scale_arm_upper"]
+    scale_arm_lower = scales["scale_arm_lower"]
+    scale_hand = scales["scale_hand"]
+    scale_body_len = scales["scale_body_len"]
+    scale_leg_upper = scales["scale_leg_upper"]
+    scale_leg_lower = scales["scale_leg_lower"]
+
+    # replace infinite scales (zero-length part in the first frame) with the mean of the finite ones
+    scale_sum = 0
+    count = 0
+    scale_list = [scale_neck, scale_face, scale_shoulder, scale_arm_upper, scale_arm_lower, scale_hand, scale_body_len, scale_leg_upper, scale_leg_lower]
+    for i in range(len(scale_list)):
+        if not np.isinf(scale_list[i]):
+            scale_sum = scale_sum + scale_list[i]
+            count = count + 1
+    for i in range(len(scale_list)):
+        if np.isinf(scale_list[i]):
+            scale_list[i] = scale_sum/count
+    # write the adjusted scales back to the variables used below
+    scale_neck, scale_face, scale_shoulder, scale_arm_upper, scale_arm_lower, scale_hand, scale_body_len, scale_leg_upper, scale_leg_lower = scale_list
+
+
+    # offsets of each part
+    offset = dict()
+    offset["14_15_16_17_to_0"] = body_pose[[14,15,16,17], :] - body_pose[[0], :]
+    offset["3_to_2"] = body_pose[[3], :] - body_pose[[2], :]
+    offset["4_to_3"] = body_pose[[4], :] - body_pose[[3], :]
+    offset["6_to_5"] = body_pose[[6], :] - body_pose[[5], :]
+    offset["7_to_6"] = body_pose[[7], :] - body_pose[[6], :]
+    offset["9_to_8"] = body_pose[[9], :] - body_pose[[8], :]
+    offset["10_to_9"] = body_pose[[10], :] - body_pose[[9], :]
+    offset["12_to_11"] = body_pose[[12], :] - body_pose[[11], :]
+    offset["13_to_12"] = body_pose[[13], :] - body_pose[[12], :]
+    offset["hand_left_to_4"] = hands[1, :, :] - body_pose[[4], :]
+    offset["hand_right_to_7"] = hands[0, :, :] - body_pose[[7], :]
+
+    # neck
+    c_ = body_pose[1]
+    cx = c_[0]
+    cy = c_[1]
+    M = cv2.getRotationMatrix2D((cx,cy), 0, scale_neck)
+
+    neck = body_pose[[0], :]
+    neck = warpAffine_kps(neck, M)
+    body_pose[[0], :] = neck
+
+    # body_pose_up_shoulder
+    c_ = body_pose[0]
+    cx = c_[0]
+    cy = c_[1]
+    M = cv2.getRotationMatrix2D((cx,cy), 0, scale_face)
+
+    body_pose_up_shoulder = offset["14_15_16_17_to_0"] + body_pose[[0], :]
+    body_pose_up_shoulder = warpAffine_kps(body_pose_up_shoulder, M)
+    body_pose[[14,15,16,17], :] = body_pose_up_shoulder
+
+    # shoulder
+    c_ = body_pose[1]
+    cx = c_[0]
+    cy = c_[1]
+    M = cv2.getRotationMatrix2D((cx,cy), 0, scale_shoulder)
+
+    body_pose_shoulder = body_pose[[2,5], :]
+    body_pose_shoulder = warpAffine_kps(body_pose_shoulder, M)
+    body_pose[[2,5], :] = body_pose_shoulder
+
+    # arm upper left
+    c_ = body_pose[2]
+    cx = c_[0]
+    cy = c_[1]
+    M = cv2.getRotationMatrix2D((cx,cy), 0, scale_arm_upper)
+
+    elbow = offset["3_to_2"] + body_pose[[2], :]
+    elbow = warpAffine_kps(elbow, M)
+    body_pose[[3], :] = elbow
+
+    # arm lower left
+    c_ = body_pose[3]
+    cx = c_[0]
+    cy = c_[1]
+    M = cv2.getRotationMatrix2D((cx,cy), 0, scale_arm_lower)
+
+    wrist = offset["4_to_3"] + body_pose[[3], :]
+    wrist = warpAffine_kps(wrist, M)
+    body_pose[[4], :] = wrist
+
+    # hand left
+    c_ = body_pose[4]
+    cx = c_[0]
+    cy = c_[1]
+    M = cv2.getRotationMatrix2D((cx,cy), 0, scale_hand)
+
+    hand = offset["hand_left_to_4"] + body_pose[[4], :]
+    hand = warpAffine_kps(hand, M)
+    hands[1, :, :] = hand
+
+    # arm upper right
+    c_ = body_pose[5]
+    cx = c_[0]
+    cy = c_[1]
+    M = cv2.getRotationMatrix2D((cx,cy), 0, scale_arm_upper)
+
+    elbow = offset["6_to_5"] + body_pose[[5], :]
+    elbow = warpAffine_kps(elbow, M)
+    body_pose[[6], :] = elbow
+
+    # arm lower right
+    c_ = body_pose[6]
+    cx = c_[0]
+    cy = c_[1]
+    M = cv2.getRotationMatrix2D((cx,cy), 0, scale_arm_lower)
+
+    wrist = offset["7_to_6"] + body_pose[[6], :]
+    wrist = warpAffine_kps(wrist, M)
+    body_pose[[7], :] = wrist
+
+    # hand right
+    c_ = body_pose[7]
+    cx = c_[0]
+    cy = c_[1]
+    M = cv2.getRotationMatrix2D((cx,cy), 0, scale_hand)
+
+    hand = offset["hand_right_to_7"] + body_pose[[7], :]
+    hand = warpAffine_kps(hand, M)
+    hands[0, :, :] = hand
+
+    # body len
+    c_ = body_pose[1]
+    cx = c_[0]
+    cy = c_[1]
+    M = cv2.getRotationMatrix2D((cx,cy), 0, scale_body_len)
+
+    body_len = body_pose[[8,11], :]
+    body_len = warpAffine_kps(body_len, M)
+    body_pose[[8,11], :] = body_len
+
+    # leg upper left
+    c_ = body_pose[8]
+    cx = c_[0]
+    cy = c_[1]
+    M = cv2.getRotationMatrix2D((cx,cy), 0, scale_leg_upper)
+
+    knee = offset["9_to_8"] + body_pose[[8], :]
+    knee = warpAffine_kps(knee, M)
+    body_pose[[9], :] = knee
+
+    # leg lower left
+    c_ = body_pose[9]
+    cx = c_[0]
+    cy = c_[1]
+    M = cv2.getRotationMatrix2D((cx,cy), 0, scale_leg_lower)
+
+    ankle = offset["10_to_9"] + body_pose[[9], :]
+    ankle = warpAffine_kps(ankle, M)
+    body_pose[[10], :] = ankle
+
+    # leg upper right
+    c_ = body_pose[11]
+    cx = c_[0]
+    cy = c_[1]
+    M = cv2.getRotationMatrix2D((cx,cy), 0, scale_leg_upper)
+
+    knee = offset["12_to_11"] + body_pose[[11], :]
+    knee = warpAffine_kps(knee, M)
+    body_pose[[12], :] = knee
+
+    # leg lower right
+    c_ = body_pose[12]
+    cx = c_[0]
+    cy = c_[1]
+    M = cv2.getRotationMatrix2D((cx,cy), 0, scale_leg_lower)
+
+    ankle = offset["13_to_12"] + body_pose[[12], :]
+    ankle = warpAffine_kps(ankle, M)
+    body_pose[[13], :] = ankle
+
+    # keep undetected keypoints (marked as -1 by dwpose) at -1
+    body_pose_none = pose_ori['bodies']['candidate'] == -1.
+    hands_none = pose_ori['hands'] == -1.
+    faces_none = pose_ori['faces'] == -1.
+
+    body_pose[body_pose_none] = -1.
+    hands[hands_none] = -1.
+    if np.isnan(hands).any():
+        print('warning: nan detected in hand keypoints')
+    faces[faces_none] = -1.
+
+    # last check: nan -> -1.
+    body_pose = np.nan_to_num(body_pose, nan=-1.)
+    hands = np.nan_to_num(hands, nan=-1.)
+    faces = np.nan_to_num(faces, nan=-1.)
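+    # Note: the keypoints returned below are still in the stretched (w * W_in/H_in, h) convention used above;
+    # run_align_video_with_filterPose_translate_smooth shifts them by the centre offset and rescales w back
+    # to [0, 1] (dividing by the reference image's aspect ratio) before rendering.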
+
+    # return
+    pose_align = copy.deepcopy(pose_ori)
+    pose_align['bodies']['candidate'] = body_pose
+    pose_align['hands'] = hands
+    pose_align['faces'] = faces
+
+    return pose_align
+
+
+
+def run_align_video_with_filterPose_translate_smooth(args):
+
+    vidfn=args.vidfn
+    imgfn_refer=args.imgfn_refer
+    outfn_ref_img_pose=args.outfn_ref_img_pose
+    outfn=args.outfn
+
+    reader = Reader(args, vidfn)
+    audio = reader.get_audio()
+    height, width = reader.get_resolution()
+    fps = reader.get_fps()
+    print(f'audio: {audio}, height: {height}, width: {width}, fps: {fps}')
+
+    H_in, W_in = height, width
+    H_out, W_out = size_calculate(H_in,W_in,args.detect_resolution)
+    H_out, W_out = size_calculate(H_out,W_out,args.image_resolution)
+    WH_out = np.array([W_out,H_out])
+
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+    detector = DWposeDetector(device=device)
+
+    refer_img = cv2.imread(imgfn_refer)
+    output_refer, pose_refer = detector(refer_img,detect_resolution=args.detect_resolution, image_resolution=args.image_resolution, output_type='cv2',return_pose_dict=True)
+    body_ref_img = pose_refer['bodies']['candidate']
+    hands_ref_img = pose_refer['hands']
+    faces_ref_img = pose_refer['faces']
+    output_refer = cv2.cvtColor(output_refer, cv2.COLOR_RGB2BGR)
+    cv2.imwrite(outfn_ref_img_pose,output_refer)
+
+
+    skip_frames = 0
+    max_frame = args.max_frame
+    pose_list, video_frame_buffer, video_pose_buffer = [], [], []
+    for i in range(max_frame):
+        img = reader.get_frame()
+        if img is None:
+            break
+        else:
+            video_frame_buffer.append(img)
+
+        if i < skip_frames:
+            continue
+
+        # estimate scale parameters by the 1st frame in the video
+        if i==skip_frames:
+            output_1st_img, pose_1st_img = detector(img, args.detect_resolution, args.image_resolution, output_type='cv2', return_pose_dict=True)
+            body_1st_img = pose_1st_img['bodies']['candidate']
+            hands_1st_img = pose_1st_img['hands']
+            faces_1st_img = pose_1st_img['faces']
+
+            '''
+            Computation logic:
+            1. First resize ref and pose to the same height, each keeping its original aspect ratio.
+            2. Compute with the actual coordinates of the points in the image.
+            3. In practice, normalize the h coordinate to [0, 1] and w to [0, W/H].
+            4. Since dwpose already outputs normalized coordinates, h stays unchanged and w is multiplied by W/H.
+            Note: dwpose outputs are (w, h).
+            '''
+
+            # keep h unchanged, rescale w to the original aspect ratio
+            ref_H, ref_W = refer_img.shape[0], refer_img.shape[1]
+            ref_ratio = ref_W / ref_H
+            body_ref_img[:, 0] = body_ref_img[:, 0] * ref_ratio
+            hands_ref_img[:, :, 0] = hands_ref_img[:, :, 0] * ref_ratio
+            faces_ref_img[:, :, 0] = faces_ref_img[:, :, 0] * ref_ratio
+
+            video_ratio = width / height
+            body_1st_img[:, 0] = body_1st_img[:, 0] * video_ratio
+            hands_1st_img[:, :, 0] = hands_1st_img[:, :, 0] * video_ratio
+            faces_1st_img[:, :, 0] = faces_1st_img[:, :, 0] * video_ratio
+
+            # scale: align / pose = ref / 1st
+            align_args = dict()
+
+            dist_1st_img = np.linalg.norm(body_1st_img[0]-body_1st_img[1])
+            dist_ref_img = np.linalg.norm(body_ref_img[0]-body_ref_img[1])
+            align_args["scale_neck"] = dist_ref_img / dist_1st_img
+
+            dist_1st_img = np.linalg.norm(body_1st_img[16]-body_1st_img[17])
+            dist_ref_img = np.linalg.norm(body_ref_img[16]-body_ref_img[17])
+            align_args["scale_face"] = dist_ref_img / dist_1st_img
+
+            dist_1st_img = np.linalg.norm(body_1st_img[2]-body_1st_img[5])
+            dist_ref_img = np.linalg.norm(body_ref_img[2]-body_ref_img[5])
+            align_args["scale_shoulder"] = dist_ref_img / dist_1st_img
+
+            dist_1st_img = np.linalg.norm(body_1st_img[2]-body_1st_img[3])
+            dist_ref_img = np.linalg.norm(body_ref_img[2]-body_ref_img[3])
+            s1 = dist_ref_img / dist_1st_img
+            dist_1st_img = np.linalg.norm(body_1st_img[5]-body_1st_img[6])
+            dist_ref_img = np.linalg.norm(body_ref_img[5]-body_ref_img[6])
+            s2 = dist_ref_img / dist_1st_img
+            align_args["scale_arm_upper"] = (s1+s2)/2
+
+            dist_1st_img = np.linalg.norm(body_1st_img[3]-body_1st_img[4])
+            dist_ref_img = np.linalg.norm(body_ref_img[3]-body_ref_img[4])
+            s1 = dist_ref_img / dist_1st_img
+            dist_1st_img = np.linalg.norm(body_1st_img[6]-body_1st_img[7])
+            dist_ref_img = np.linalg.norm(body_ref_img[6]-body_ref_img[7])
+            s2 = dist_ref_img / dist_1st_img
+            align_args["scale_arm_lower"] = (s1+s2)/2
+
+            # hand
+            dist_1st_img = np.zeros(10)
+            dist_ref_img = np.zeros(10)
+
+            dist_1st_img[0] = np.linalg.norm(hands_1st_img[0,0]-hands_1st_img[0,1])
+            dist_1st_img[1] = np.linalg.norm(hands_1st_img[0,0]-hands_1st_img[0,5])
+            dist_1st_img[2] = np.linalg.norm(hands_1st_img[0,0]-hands_1st_img[0,9])
+            dist_1st_img[3] = np.linalg.norm(hands_1st_img[0,0]-hands_1st_img[0,13])
+            dist_1st_img[4] = np.linalg.norm(hands_1st_img[0,0]-hands_1st_img[0,17])
+            dist_1st_img[5] = np.linalg.norm(hands_1st_img[1,0]-hands_1st_img[1,1])
+            dist_1st_img[6] = np.linalg.norm(hands_1st_img[1,0]-hands_1st_img[1,5])
+            dist_1st_img[7] = np.linalg.norm(hands_1st_img[1,0]-hands_1st_img[1,9])
+            dist_1st_img[8] = np.linalg.norm(hands_1st_img[1,0]-hands_1st_img[1,13])
+            dist_1st_img[9] = np.linalg.norm(hands_1st_img[1,0]-hands_1st_img[1,17])
+
+            dist_ref_img[0] = np.linalg.norm(hands_ref_img[0,0]-hands_ref_img[0,1])
+            dist_ref_img[1] = np.linalg.norm(hands_ref_img[0,0]-hands_ref_img[0,5])
+            dist_ref_img[2] = np.linalg.norm(hands_ref_img[0,0]-hands_ref_img[0,9])
+            dist_ref_img[3] = np.linalg.norm(hands_ref_img[0,0]-hands_ref_img[0,13])
+            dist_ref_img[4] = np.linalg.norm(hands_ref_img[0,0]-hands_ref_img[0,17])
+            dist_ref_img[5] = np.linalg.norm(hands_ref_img[1,0]-hands_ref_img[1,1])
+            dist_ref_img[6] = np.linalg.norm(hands_ref_img[1,0]-hands_ref_img[1,5])
+            dist_ref_img[7] = np.linalg.norm(hands_ref_img[1,0]-hands_ref_img[1,9])
+            dist_ref_img[8] = np.linalg.norm(hands_ref_img[1,0]-hands_ref_img[1,13])
+            dist_ref_img[9] = np.linalg.norm(hands_ref_img[1,0]-hands_ref_img[1,17])
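+            # The hand scale below averages the ref/first-frame ratios of the wrist-to-knuckle
+            # distances and blends them with the arm scales, so a hand missed by the detector
+            # does not produce a degenerate scale.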
+
+            ratio = 0
+            count = 0
+            for i in range(10):
+                if dist_1st_img[i] != 0:
+                    ratio = ratio + dist_ref_img[i]/dist_1st_img[i]
+                    count = count + 1
+            if count!=0:
+                align_args["scale_hand"] = (ratio/count+align_args["scale_arm_upper"]+align_args["scale_arm_lower"])/3
+            else:
+                align_args["scale_hand"] = (align_args["scale_arm_upper"]+align_args["scale_arm_lower"])/2
+
+            # body
+            dist_1st_img = np.linalg.norm(body_1st_img[1] - (body_1st_img[8] + body_1st_img[11])/2 )
+            dist_ref_img = np.linalg.norm(body_ref_img[1] - (body_ref_img[8] + body_ref_img[11])/2 )
+            align_args["scale_body_len"]=dist_ref_img / dist_1st_img
+
+            dist_1st_img = np.linalg.norm(body_1st_img[8]-body_1st_img[9])
+            dist_ref_img = np.linalg.norm(body_ref_img[8]-body_ref_img[9])
+            s1 = dist_ref_img / dist_1st_img
+            dist_1st_img = np.linalg.norm(body_1st_img[11]-body_1st_img[12])
+            dist_ref_img = np.linalg.norm(body_ref_img[11]-body_ref_img[12])
+            s2 = dist_ref_img / dist_1st_img
+            align_args["scale_leg_upper"] = (s1+s2)/2
+
+            dist_1st_img = np.linalg.norm(body_1st_img[9]-body_1st_img[10])
+            dist_ref_img = np.linalg.norm(body_ref_img[9]-body_ref_img[10])
+            s1 = dist_ref_img / dist_1st_img
+            dist_1st_img = np.linalg.norm(body_1st_img[12]-body_1st_img[13])
+            dist_ref_img = np.linalg.norm(body_ref_img[12]-body_ref_img[13])
+            s2 = dist_ref_img / dist_1st_img
+            align_args["scale_leg_lower"] = (s1+s2)/2
+
+            ####################
+            ####################
+            # replace nan scales (missing keypoints) with 1
+            for k,v in align_args.items():
+                if np.isnan(v):
+                    align_args[k]=1
+
+            # centre offset (the offset of key point 1)
+            offset = body_ref_img[1] - body_1st_img[1]
+
+
+        # pose align
+        pose_img, pose_ori = detector(img, args.detect_resolution, args.image_resolution, output_type='cv2', return_pose_dict=True)
+        video_pose_buffer.append(pose_img)
+        pose_align = align_img(img, pose_ori, align_args, args.detect_resolution, args.image_resolution)
+
+
+        # add centre offset
+        pose = pose_align
+        pose['bodies']['candidate'] = pose['bodies']['candidate'] + offset
+        pose['hands'] = pose['hands'] + offset
+        pose['faces'] = pose['faces'] + offset
+
+
+        # keep h unchanged, scale w back from absolute coordinates to [0, 1]; note this goes back to the ref coordinate system
+        pose['bodies']['candidate'][:, 0] = pose['bodies']['candidate'][:, 0] / ref_ratio
+        pose['hands'][:, :, 0] = pose['hands'][:, :, 0] / ref_ratio
+        pose['faces'][:, :, 0] = pose['faces'][:, :, 0] / ref_ratio
+        pose_list.append(pose)
+
+    # stack
+    body_list = [pose['bodies']['candidate'][:18] for pose in pose_list]
+    body_list_subset = [pose['bodies']['subset'][:1] for pose in pose_list]
+    hands_list = [pose['hands'][:2] for pose in pose_list]
+    faces_list = [pose['faces'][:1] for pose in pose_list]
+
+    body_seq = np.stack(body_list , axis=0)
+    body_seq_subset = np.stack(body_list_subset, axis=0)
+    hands_seq = np.stack(hands_list , axis=0)
+    faces_seq = np.stack(faces_list , axis=0)
+
+    # smooth
+    if args.smooth_method=='savgol':
+        winlen=args.winlen
+        polyorder=args.polyorder
+        body_seq = signal.savgol_filter(body_seq, window_length=winlen, polyorder=polyorder, mode='nearest', axis=0)
+        hands_seq = signal.savgol_filter(hands_seq, window_length=winlen, polyorder=polyorder, mode='nearest', axis=0)
+        faces_seq = signal.savgol_filter(faces_seq, window_length=winlen, polyorder=polyorder, mode='nearest', axis=0)
+
+
+    # concatenate and paint results
+    H = 512  # paint height
+    W1 = int((H/ref_H * ref_W)//2 *2)
+    W2 = int((H/height * width)//2 *2)
+    writer = Writer(args, None, H, 3*W1+2*W2, outfn, fps)
+    writer_pose_only = Writer(args, None, H, W1, args.outfn_align_pose_video, fps)
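+    # The demo video concatenates five panels side by side:
+    # reference image | reference pose | aligned pose | source frame | source pose,
+    # while writer_pose_only saves only the aligned pose panel.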
+    for i in range(len(body_seq)):
+        pose_t={}
+        pose_t["bodies"]={}
+        pose_t["bodies"]["candidate"]=body_seq[i]
+        pose_t["bodies"]["subset"]=body_seq_subset[i]
+        pose_t["hands"]=hands_seq[i]
+        pose_t["faces"]=faces_seq[i]
+
+        ref_img = cv2.cvtColor(refer_img, cv2.COLOR_RGB2BGR)
+        ref_img = cv2.resize(ref_img, (W1, H))
+        ref_pose = cv2.resize(output_refer, (W1, H))
+
+        output_transformed = pose2map(
+            pose_t,
+            H_in, W_in,
+            args.detect_resolution,
+            args.image_resolution,
+            include_body=True,
+            include_face=False,
+            include_hand=True,
+            include_eye=False
+        )
+        output_transformed = cv2.resize(output_transformed, (W1, H))
+
+        video_frame = cv2.resize(video_frame_buffer[i], (W2, H))
+        video_pose = cv2.resize(video_pose_buffer[i], (W2, H))
+
+        res = np.concatenate([ref_img, ref_pose, output_transformed, video_frame, video_pose], axis=1)
+        writer.write_frame(res)
+        writer_pose_only.write_frame(output_transformed)
+
+    writer.close()
+    writer_pose_only.close()
+    reader.close()
+    print(f"pose_list len: {len(pose_list)}")
+
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--input', type=str, default='inputs', help='Input video, image or folder')
+    parser.add_argument('--output', type=str, default='results', help='Output folder')
+    parser.add_argument('--inputscale', type=float, default=1, help='The input scale of the video reader')
+    parser.add_argument('--suffix', type=str, default='', help='Suffix of the restored video')
+    parser.add_argument('--fps', type=float, default=None, help='FPS of the output video')
+    parser.add_argument('--ffmpeg_bin', type=str, default='ffmpeg', help='The path to ffmpeg')
+    parser.add_argument('--vcodec', type=str, default='libx264', help='vcodec')
+    parser.add_argument('--input_pix_fmt', type=str, default='rgb24', help='ffmpeg input pix_fmt')
+    parser.add_argument('--crf', type=str, default='18', help='crf')
+    parser.add_argument('--detect_resolution', type=int, default=512, help='detect_resolution')
+    parser.add_argument('--image_resolution', type=int, default=720, help='image_resolution')
+    parser.add_argument('--dtype', type=str, default='fp16', help='dtype')
+    parser.add_argument('--smooth_method', type=str, default=None, help='smoothing method for the pose sequence, e.g. savgol')
+    parser.add_argument('--winlen', type=int, default=11, help='window length of the savgol filter')
+    parser.add_argument('--polyorder', type=int, default=1, help='polyorder of the savgol filter')
+
+    parser.add_argument('--max_frame', type=int, default=100, help='maximum number of frames of the video to align')
+    parser.add_argument('--vidfn', type=str, default="./data/source_video/dance.mp4", help='Input video path')
+    parser.add_argument('--imgfn_refer', type=str, default="./data/images/man.jpg", help='path of the reference image')
+    parser.add_argument('--outfn_ref_img_pose', type=str, default="./data/pose_align_results/ref_img_pose.jpg", help='output path of the detected pose of the reference image')
+    parser.add_argument('--outfn_align_pose_video', type=str, default="./data/pose_align_results/align_pose_video.mp4", help='output path of the aligned pose video')
+    parser.add_argument('--outfn', type=str, default="./data/pose_align_results/align_demo.mp4", help='Output path of the alignment visualization')
+    args = parser.parse_args()
+
+    run_align_video_with_filterPose_translate_smooth(args)
+
+
+
+if __name__ == '__main__':
+    main()
diff --git a/pose_align/pose_align.sh b/pose_align/pose_align.sh
new file mode 100644
index 0000000..961df2e
--- /dev/null
+++ b/pose_align/pose_align.sh
@@ -0,0 +1,7 @@
+python pose_align/pose_align.py \
+--max_frame 200 \
+--vidfn ./data/source_video/dance.mp4 \
+--imgfn_refer ./data/images/man.jpg \
+--outfn_ref_img_pose ./data/pose_align_results/ref_img_pose.jpg \
+--outfn_align_pose_video ./data/pose_align_results/align_pose_video.mp4 \
+--outfn ./data/pose_align_results/align_demo.mp4
\ No newline at end of file
diff --git a/pose_align/pose_utils.py b/pose_align/pose_utils.py
new file mode 100644
index 0000000..1628b09
--- /dev/null
+++ b/pose_align/pose_utils.py
@@ -0,0 +1,164 @@
+import torch
+import cv2
+import numpy as np
+from pprint import pprint
+import ffmpeg
+import argparse
+import time
+import traceback
+import scipy.signal as signal
+
+
+
+def get_video_meta_info(video_path):
+    print(video_path)
+    ret = {}
+    probe = ffmpeg.probe(video_path)
+    video_streams = [stream for stream in probe['streams'] if stream['codec_type'] == 'video']
+    has_audio = any(stream['codec_type'] == 'audio' for stream in probe['streams'])
+    ret['width'] = video_streams[0]['width']
+    ret['height'] = video_streams[0]['height']
+    ret['fps'] = eval(video_streams[0]['avg_frame_rate'])
+    ret['audio'] = ffmpeg.input(video_path).audio if has_audio else None
+    if 'nb_frames' in video_streams[0].keys():
+        ret['nb_frames'] = int(video_streams[0]['nb_frames'])
+    else:
+        cap = cv2.VideoCapture(video_path)
+        ret['nb_frames']=int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+        cap.release()
+    return ret
+
+
+
+# Calculate the output resolution
+def size_calculate(h, w, resolution):
+
+    H = float(h)
+    W = float(w)
+
+    # resize the short edge to the resolution
+    k = float(resolution) / min(H, W)  # short edge
+    H *= k
+    W *= k
+
+    # resize to the nearest integer multiple of 64
+    H = int(np.round(H / 64.0)) * 64
+    W = int(np.round(W / 64.0)) * 64
+    return H, W
+
+
+
+def warpAffine_kps(kps, M):
+    a = M[:,:2]
+    t = M[:,2]
+    kps = np.dot(kps, a.T) + t
+    return kps
+
+
+
+class Reader:
+
+    def __init__(self, args, video_path):
+        self.args = args
+        self.audio = None
+        self.input_fps = None
+        self.input_type = 'video'
+
+
+        meta = get_video_meta_info(video_path)
+        self.width = meta['width']
+        self.height = meta['height']
+        self.input_fps = meta['fps']
+        self.audio = meta['audio']
+        self.nb_frames = meta['nb_frames']
+
+        self.height = int(self.height) // 2 * 2
+        self.width = int(self.width) // 2 * 2
+        if args.fps is not None:
+            self.stream_reader = (
+                # ffmpeg.input(video_path, ss=0,t=2).filter('fps',fps=args.fps).filter('scale', self.width , self.height).output('pipe:', format='rawvideo', pix_fmt='rgb24',
+                ffmpeg.input(video_path).filter('fps',fps=args.fps).filter('scale', self.width , self.height).output('pipe:', format='rawvideo', pix_fmt='rgb24',
+                                                                                                                     loglevel='error').run_async(
+                    pipe_stdin=True, pipe_stdout=True, cmd=args.ffmpeg_bin))
+        else:
+            self.stream_reader = (
+                # ffmpeg.input(video_path, ss=0,t=2).filter('scale', self.width , self.height).output('pipe:', format='rawvideo', pix_fmt='rgb24',
+                ffmpeg.input(video_path).filter('scale', self.width , self.height).output('pipe:', format='rawvideo', pix_fmt='rgb24',
+                                                                                          loglevel='error').run_async(
+                    pipe_stdin=True, pipe_stdout=True, cmd=args.ffmpeg_bin))
+
+
+    def get_resolution(self):
+        return self.height, self.width
+
+    def get_fps(self):
+        if self.args.fps is not None:
+            return self.args.fps
+        elif self.input_fps is not None:
+            return self.input_fps
+        return 24
+
+    def get_audio(self):
+        return self.audio
+
+    def __len__(self):
+        return self.nb_frames
+
+    def get_frame_from_stream(self):
+        img_bytes = self.stream_reader.stdout.read(self.width * self.height * 3)  # 3 bytes for one pixel
+        if not img_bytes:
+            return None
+        img = np.frombuffer(img_bytes, np.uint8).reshape([self.height, self.width, 3])
+        return img
+
+    def get_frame(self):
+        if self.input_type.startswith('video'):
+            return self.get_frame_from_stream()
+
+    def close(self):
+        if self.input_type.startswith('video'):
+            self.stream_reader.stdin.close()
+            # self.stream_reader.wait()
+
+
+
+class Writer:
+
+    def __init__(self, args, audio, height, width, video_save_path, fps):
+        out_width, out_height = int(width), int(height)
+        if out_height > 2160:
+            print('The output video is larger than 4K, which will be very slow due to IO speed; '
+                  'consider reducing the output resolution.')
+
+        if audio is not None:
+            self.stream_writer = (
+                ffmpeg.input('pipe:', format='rawvideo', pix_fmt=args.input_pix_fmt, s=f'{out_width}x{out_height}',
+                             framerate=fps).output(
+                                 audio,
+                                 video_save_path,
+                                 pix_fmt='yuv420p',
+                                 vcodec=args.vcodec,
+                                 crf=args.crf,
+                                 loglevel='error',
+                                 acodec='copy').overwrite_output().run_async(
+                                     # acodec='copy').run_async(
+                                     pipe_stdin=True, pipe_stdout=True, cmd=args.ffmpeg_bin))
+        else:
+            self.stream_writer = (
+                ffmpeg.input('pipe:', format='rawvideo', pix_fmt=args.input_pix_fmt, s=f'{out_width}x{out_height}',
+                             framerate=fps).output(
+                                 video_save_path,
+                                 pix_fmt='yuv420p',
+                                 vcodec=args.vcodec,
+                                 crf=args.crf,
+                                 loglevel='error').overwrite_output().run_async(
+                                     # loglevel='error').run_async(
+                                     pipe_stdin=True, pipe_stdout=True, cmd=args.ffmpeg_bin))
+
+    def write_frame(self, frame):
+        frame = frame.astype(np.uint8).tobytes()
+        self.stream_writer.stdin.write(frame)
+
+    def close(self):
+        self.stream_writer.stdin.close()
+        self.stream_writer.wait()
\ No newline at end of file