Commit a4df14b

Merge pull request #12 from deruyter92/jaap/minor_refactors
Minor refactors
2 parents d76b239 + 6b5d354

11 files changed: 68 additions & 62 deletions


.gitignore

Lines changed: 5 additions & 0 deletions

@@ -45,3 +45,8 @@ htmlcov/
 *.pkl
 *.h5
 *.ckpt
+
+# Excluded directories
+pre_trained_models/
+demo/predictions/
+demo/images/

README.md

Lines changed: 25 additions & 17 deletions

@@ -1,12 +1,12 @@
 # FMPose3D: monocular 3D pose estimation via flow matching
 
 ![Version](https://img.shields.io/badge/python_version-3.10-purple)
-[![PyPI version](https://badge.fury.io/py/fmpose3d.svg)](https://badge.fury.io/py/fmpose3d)
-[![License: LApache 2.0](https://img.shields.io/badge/License-Apache2.0-blue.svg)](https://www.gnu.org/licenses/apach2.0)
+[![PyPI version](https://badge.fury.io/py/fmpose3d.svg?icon=si%3Apython)](https://badge.fury.io/py/fmpose3d)
+[![License: Apache 2.0](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://www.apache.org/licenses/LICENSE-2.0)
 
-This is the official implementation of the approach described in the paper:
+This is the official implementation of the approach described in the preprint:
 
-[**FMPose3D: monocular 3D Pose Estimation via Flow Matching**](xxx)
+[**FMPose3D: monocular 3D pose estimation via flow matching**](https://arxiv.org/abs/2602.05755)
 Ti Wang, Xiaohang Yu, Mackenzie Weygandt Mathis
 
 <!-- <p align="center"><img src="./images/Frame 4.jpg" width="50%" alt="" /></p> -->
@@ -15,13 +15,13 @@ Ti Wang, Xiaohang Yu, Mackenzie Weygandt Mathis
 
 ## 🚀 TL;DR
 
-FMPose3D replaces slow diffusion models for monocular 3D pose estimation with fast Flow Matching, generating multiple plausible 3D poses via an ODE in just a few steps, then aggregates them using a reprojection-based Bayesian module (RPEA) for accurate predictions, achieving state-of-the-art results on human and animal 3D pose benchmarks.
+FMPose3D creates a 3D pose from a single 2D image. It leverages fast Flow Matching, generating multiple plausible 3D poses via an ODE in just a few steps, then aggregates them using a reprojection-based Bayesian module (RPEA) for accurate predictions, achieving state-of-the-art results on human and animal 3D pose benchmarks.
 
 
 
 ## News!
 
-- [X] Feb 2026: FMPose3D code and arXiv paper is released - check out the demos here or on our [project page](https://xiu-cs.github.io/FMPose3D/)
+- [X] Feb 2026: the FMPose3D code and our arXiv paper is released - check out the demos here or on our [project page](https://xiu-cs.github.io/FMPose3D/)
 - [ ] Planned: This method will be integrated into [DeepLabCut](https://www.mackenziemathislab.org/deeplabcut)
 
 ## Installation
@@ -32,17 +32,11 @@ Make sure you have Python 3.10+. You can set this up with:
 ```bash
 conda create -n fmpose_3d python=3.10
 conda activate fmpose_3d
-```
-<!-- test version -->
-```bash
-git clone https://github.com/AdaptiveMotorControlLab/FMPose3D.git
-# TestPyPI (pre-release/testing build)
-pip install --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple/ fmpose3d==0.0.7
-# Future Official PyPI release
-# pip install fmpose3d
+
+pip install fmpose3d
 ```
 
-## Demo
+## Demos
 
 ### Testing on in-the-wild images (humans)
 
@@ -85,7 +79,7 @@ The training logs, checkpoints, and related files of each training time will be
 
 For training on Human3.6M:
 ```bash
-sh /scripts/FMPose3D_train.sh
+sh ./scripts/FMPose3D_train.sh
 ```
 
 ### Inference
@@ -98,10 +92,24 @@ To run inference on Human3.6M:
 sh ./scripts/FMPose3D_test.sh
 ```
 
-## Experiments Animals
+## Experiments on non-human animals
 
 For animal training/testing and demo scripts, see [animals/README.md](animals/README.md).
 
+## Citation
+
+```
+@misc{wang2026fmpose3dmonocular3dpose,
+      title={FMPose3D: monocular 3D pose estimation via flow matching},
+      author={Ti Wang and Xiaohang Yu and Mackenzie Weygandt Mathis},
+      year={2026},
+      eprint={2602.05755},
+      archivePrefix={arXiv},
+      primaryClass={cs.CV},
+      url={https://arxiv.org/abs/2602.05755},
+}
+```
+
 ## Acknowledgements
 
 We thank the Swiss National Science Foundation (SNSF Project # 320030-227871) and the Kavli Foundation for providing financial support for this project.
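The README's TL;DR describes generating 3D poses by integrating an ODE for just a few Euler steps. As a rough illustration of what such a fixed-step sampler looks like, here is a minimal sketch with a toy velocity field standing in for the actual CFM network (this commit does not touch the sampler itself):

```python
import torch

def euler_sample(velocity_model, y0, cond, steps):
    """Integrate dy/dt = v(y, t, cond) from t=0 to t=1 with fixed Euler steps."""
    y = y0
    dt = 1.0 / steps
    for i in range(steps):
        t = torch.full((y.shape[0],), i * dt)  # current time, one value per batch item
        y = y + dt * velocity_model(y, t, cond)
    return y

# Toy velocity field: pull samples toward the conditioning tensor.
# (The real model predicts a learned velocity from 2D keypoints.)
def toy_velocity(y, t, cond):
    return cond - y

noise = torch.zeros(2, 17, 3)   # batch of 2 "poses", 17 joints, xyz
target = torch.ones(2, 17, 3)
out = euler_sample(toy_velocity, noise, target, steps=10)
# With this linear field, 10 Euler steps from 0 land at 1 - 0.9**10 of the target.
```

Names and shapes here are illustrative only; the repo's own `euler_sample` is conditioned on 2D keypoints and wraps the CFM model.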

animals/demo/vis_animals.py

Lines changed: 7 additions & 7 deletions

@@ -8,7 +8,6 @@
 """
 
 # SuperAnimal Demo: https://github.com/DeepLabCut/DeepLabCut/blob/main/examples/COLAB/COLAB_YOURDATA_SuperAnimal.ipynb
-import sys
 import os
 import numpy as np
 import glob
@@ -25,8 +24,6 @@
 from fmpose3d.animals.common.arguments import opts as parse_args
 from fmpose3d.common.camera import normalize_screen_coordinates, camera_to_world
 
-sys.path.append(os.getcwd())
-
 args = parse_args().parse()
 os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
 
@@ -334,13 +331,15 @@ def get_pose3D(path, output_dir, type='image'):
     print(f"args.n_joints: {args.n_joints}, args.out_joints: {args.out_joints}")
 
     ## Reload model
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
     model = {}
-    model['CFM'] = CFM(args).cuda()
+    model['CFM'] = CFM(args).to(device)
 
     model_dict = model['CFM'].state_dict()
     model_path = args.saved_model_path
     print(f"Loading model from: {model_path}")
-    pre_dict = torch.load(model_path)
+    pre_dict = torch.load(model_path, map_location=device, weights_only=True)
     for name, key in model_dict.items():
         model_dict[name] = pre_dict[name]
     model['CFM'].load_state_dict(model_dict)
@@ -400,7 +399,8 @@ def get_3D_pose_from_image(args, keypoints, i, img, model, output_dir):
     input_2D = np.expand_dims(input_2D, axis=0)  # (1, J, 2)
 
     # Convert to tensor format matching visualize_animal_poses.py
-    input_2D = torch.from_numpy(input_2D.astype('float32')).cuda()  # (1, J, 2)
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    input_2D = torch.from_numpy(input_2D.astype('float32')).to(device)  # (1, J, 2)
     input_2D = input_2D.unsqueeze(0)  # (1, 1, J, 2)
 
     # Euler sampler for CFM
@@ -418,7 +418,7 @@ def euler_sample(c_2d, y_local, steps, model_3d):
 
     # Single inference without flip augmentation
     # Create 3D random noise with shape (1, 1, J, 3)
-    y = torch.randn(input_2D.size(0), input_2D.size(1), input_2D.size(2), 3).cuda()
+    y = torch.randn(input_2D.size(0), input_2D.size(1), input_2D.size(2), 3, device=device)
    output_3D = euler_sample(input_2D, y, steps=args.sample_steps, model_3d=model)
 
    output_3D = output_3D[0:, args.pad].unsqueeze(1)
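The recurring pattern in this commit swaps hard-coded `.cuda()` calls for an explicit `device`, so the scripts also run on CPU-only machines, and hardens `torch.load` with `map_location` and `weights_only=True`. A self-contained sketch of that checkpoint round-trip, with a stand-in `nn.Linear` in place of the real CFM network:

```python
import os
import tempfile

import torch
import torch.nn as nn

# Pick CUDA when available; otherwise everything stays on CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = nn.Linear(4, 2).to(device)  # stand-in for the real CFM model

# Round-trip a checkpoint. map_location re-maps GPU-saved tensors onto the
# current device, and weights_only=True (available since PyTorch 1.13)
# refuses to unpickle arbitrary Python objects from the file.
ckpt_path = os.path.join(tempfile.mkdtemp(), "demo_ckpt.pt")
torch.save(model.state_dict(), ckpt_path)
state = torch.load(ckpt_path, map_location=device, weights_only=True)
model.load_state_dict(state)
```

The same three-line recipe (device selection, `.to(device)`, guarded `torch.load`) is what each changed file below applies in place.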

animals/scripts/main_animal3d.py

Lines changed: 5 additions & 3 deletions

@@ -75,7 +75,7 @@ def step(split, args, actions, dataLoader, model, optimizer=None, epoch=None, st
     # gt_3D shape: torch.Size([B, J, 4]) (x,y,z + homogeneous coordinate)
     gt_3D = gt_3D[:,:,:3]  # only use x,y,z for 3D ground truth
 
-    # [input_2D, gt_3D, batch_cam, vis_3D] = get_varialbe(split, [input_2D, gt_3D, batch_cam, vis_3D])
+    # [input_2D, gt_3D, batch_cam, vis_3D] = get_variable(split, [input_2D, gt_3D, batch_cam, vis_3D])
 
     # unsqueeze frame dimension
     input_2D = input_2D.unsqueeze(1)  # (B,F,J,C)
@@ -264,15 +264,17 @@ def get_parameter_number(net):
     test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=args.batch_size,
                                                   shuffle=False, num_workers=int(args.workers), pin_memory=True)
 
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
     model = {}
-    model['CFM'] = CFM(args).cuda()
+    model['CFM'] = CFM(args).to(device)
 
     if args.reload:
         model_dict = model['CFM'].state_dict()
         # Prefer explicit saved_model_path; otherwise fallback to previous_dir glob
         model_path = args.saved_model_path
         print(model_path)
-        pre_dict = torch.load(model_path)
+        pre_dict = torch.load(model_path, weights_only=True, map_location=device)
         for name, key in model_dict.items():
             model_dict[name] = pre_dict[name]
         model['CFM'].load_state_dict(model_dict)

demo/vis_in_the_wild.py

Lines changed: 8 additions & 8 deletions

@@ -7,7 +7,6 @@
 Licensed under Apache 2.0
 """
 
-import sys
 import cv2
 import os
 import numpy as np
@@ -16,8 +15,6 @@
 from tqdm import tqdm
 import copy
 
-sys.path.append(os.getcwd())
-
 # Auto-download checkpoint files if missing
 from fmpose3d.lib.checkpoint.download_checkpoints import ensure_checkpoints
 ensure_checkpoints()
@@ -213,7 +210,8 @@ def get_3D_pose_from_image(args, keypoints, i, img, model, output_dir):
 
     input_2D = input_2D[np.newaxis, :, :, :, :]
 
-    input_2D = torch.from_numpy(input_2D.astype('float32')).cuda()
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    input_2D = torch.from_numpy(input_2D.astype('float32')).to(device)
 
     N = input_2D.size(0)
 
@@ -229,10 +227,10 @@ def euler_sample(c_2d, y_local, steps, model_3d):
 
     ## estimation
 
-    y = torch.randn(input_2D.size(0), input_2D.size(2), input_2D.size(3), 3).cuda()
+    y = torch.randn(input_2D.size(0), input_2D.size(2), input_2D.size(3), 3, device=device)
     output_3D_non_flip = euler_sample(input_2D[:, 0], y, steps=args.sample_steps, model_3d=model)
 
-    y_flip = torch.randn(input_2D.size(0), input_2D.size(2), input_2D.size(3), 3).cuda()
+    y_flip = torch.randn(input_2D.size(0), input_2D.size(2), input_2D.size(3), 3, device=device)
     output_3D_flip = euler_sample(input_2D[:, 1], y_flip, steps=args.sample_steps, model_3d=model)
 
     output_3D_flip[:, :, :, 0] *= -1
@@ -280,14 +278,16 @@ def get_pose3D(path, output_dir, type='image'):
     # args.type = type
 
     ## Reload
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
     model = {}
-    model['CFM'] = CFM(args).cuda()
+    model['CFM'] = CFM(args).to(device)
 
     # if args.reload:
     model_dict = model['CFM'].state_dict()
     model_path = args.model_weights_path
     print(model_path)
-    pre_dict = torch.load(model_path)
+    pre_dict = torch.load(model_path, map_location=device, weights_only=True)
     for name, key in model_dict.items():
         model_dict[name] = pre_dict[name]
     model['CFM'].load_state_dict(model_dict)
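This demo averages a direct prediction with one obtained from horizontally flipped 2D input, un-mirroring the flipped output by negating x (the `output_3D_flip[:, :, :, 0] *= -1` line above). A stripped-down sketch of that merge; note the real script also swaps left/right joint indices before averaging, which is omitted here:

```python
import torch

def flip_average(pred, pred_flip):
    """Merge a prediction with its flipped-input counterpart:
    undo the horizontal mirror (negate x) and average the two."""
    unflipped = pred_flip.clone()
    unflipped[..., 0] *= -1  # x axis was mirrored; y and z are unaffected
    return (pred + unflipped) / 2

pred = torch.tensor([[1.0, 2.0, 3.0]])        # one joint, xyz
pred_flip = torch.tensor([[-1.0, 2.0, 3.0]])  # mirrored x, same y/z
merged = flip_average(pred, pred_flip)        # -> [[1.0, 2.0, 3.0]]
```

Averaging the two samples is a cheap test-time augmentation: left/right symmetry of the skeleton means both passes should agree after un-mirroring, so their mean reduces sampling noise.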

fmpose3d/animals/common/arber_dataset.py

Lines changed: 2 additions & 5 deletions

@@ -12,7 +12,6 @@
 import glob
 import os
 import random
-import sys
 
 import cv2
 import matplotlib.pyplot as plt
@@ -23,10 +22,8 @@
 from torch.utils.data import Dataset
 from tqdm import tqdm
 
-sys.path.append(os.path.dirname(sys.path[0]))
-
-from common.camera import normalize_screen_coordinates
-from common.lifter3d import load_camera_params, load_h5_keypoints
+from fmpose3d.common.camera import normalize_screen_coordinates
+from fmpose3d.animals.common.lifter3d import load_camera_params, load_h5_keypoints
 
 
 class ArberDataset(Dataset):

fmpose3d/animals/common/utils.py

Lines changed: 4 additions & 6 deletions

@@ -15,7 +15,6 @@
 
 import numpy as np
 import torch
-from torch.autograd import Variable
 
 
 def mpjpe_cal(predicted, target):
@@ -220,18 +219,17 @@ def update(self, val, n=1):
         self.avg = self.sum / self.count
 
 
-def get_varialbe(split, target):
+def get_variable(split, target):
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     num = len(target)
     var = []
     if split == "train":
         for i in range(num):
-            temp = (
-                Variable(target[i], requires_grad=False).contiguous().type(torch.cuda.FloatTensor)
-            )
+            temp = target[i].requires_grad_(False).contiguous().float().to(device)
             var.append(temp)
     else:
         for i in range(num):
-            temp = Variable(target[i]).contiguous().cuda().type(torch.cuda.FloatTensor)
+            temp = target[i].contiguous().float().to(device)
             var.append(temp)
 
     return var

fmpose3d/common/__init__.py

Lines changed: 2 additions & 2 deletions

@@ -22,7 +22,7 @@
     save_top_N_models,
     test_calculation,
     print_error,
-    get_varialbe,
+    get_variable,
 )
 
 __all__ = [
@@ -36,6 +36,6 @@
     "save_top_N_models",
     "test_calculation",
     "print_error",
-    "get_varialbe",
+    "get_variable",
 ]

fmpose3d/common/utils.py

Lines changed: 4 additions & 8 deletions

@@ -15,7 +15,6 @@
 
 import numpy as np
 import torch
-from torch.autograd import Variable
 
 def deterministic_random(min_value, max_value, data):
     digest = hashlib.sha256(data.encode()).digest()
@@ -186,20 +185,17 @@ def update(self, val, n=1):
         self.avg = self.sum / self.count
 
 
-def get_varialbe(split, target):
+def get_variable(split, target):
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     num = len(target)
     var = []
     if split == "train":
         for i in range(num):
-            temp = (
-                Variable(target[i], requires_grad=False)
-                .contiguous()
-                .type(torch.cuda.FloatTensor)
-            )
+            temp = target[i].requires_grad_(False).contiguous().float().to(device)
             var.append(temp)
     else:
         for i in range(num):
-            temp = Variable(target[i]).contiguous().cuda().type(torch.cuda.FloatTensor)
+            temp = target[i].contiguous().float().to(device)
             var.append(temp)
 
     return var
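The refactor above (and the identical one in `fmpose3d/animals/common/utils.py`) drops `torch.autograd.Variable`, which has been a deprecated no-op wrapper since PyTorch 0.4: tensors themselves carry autograd state now, and `torch.cuda.FloatTensor` fails outright on CPU-only machines. The rewritten helper can be exercised standalone:

```python
import torch

def get_variable(split, target):
    """Move a list of tensors to the available device as contiguous float32."""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    var = []
    for t in target:
        if split == "train":
            t = t.requires_grad_(False)  # explicit: no grads needed on inputs
        var.append(t.contiguous().float().to(device))
    return var

# Mixed-dtype inputs come out as float32 on whatever device is available.
batch = [torch.ones(2, 3, dtype=torch.float64), torch.zeros(4)]
out = get_variable("train", batch)
```

This sketch merges the two near-identical loops of the original into one; the committed version keeps the train/eval branches separate but is otherwise equivalent.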

fmpose3d/models/model_GAMLP.py

Lines changed: 0 additions & 2 deletions

@@ -7,8 +7,6 @@
 Licensed under Apache 2.0
 """
 
-import sys
-sys.path.append("..")
 import torch
 import torch.nn as nn
 import math
