Skip to content

Commit 0ab8559

Browse files
committed
docs(slurm): update the slurm scripts so that readers can easily check how the training is set up.
1 parent 39b6a18 commit 0ab8559

3 files changed

Lines changed: 24 additions & 38 deletions

File tree

assets/slurm/0_process.sh

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -7,29 +7,31 @@
77
#SBATCH -t 1-00:00:00
88
#SBATCH --mail-type=END,FAIL
99
#SBATCH --mail-user=qingwen@kth.se
10-
#SBATCH --output /proj/berzelius-2023-154/users/x_qinzh/workspace/SeFlow/logs/slurm/%J_data.out
11-
#SBATCH --error /proj/berzelius-2023-154/users/x_qinzh/workspace/SeFlow/logs/slurm/%J_data.err
10+
#SBATCH --output /proj/berzelius-2023-154/users/x_qinzh/workspace/OpenSceneFlow/logs/slurm/%J_data.out
11+
#SBATCH --error /proj/berzelius-2023-154/users/x_qinzh/workspace/OpenSceneFlow/logs/slurm/%J_data.err
1212

13-
cd /proj/berzelius-2023-154/users/x_qinzh/workspace/SeFlow
13+
PYTHON=/proj/berzelius-2023-154/users/x_qinzh/mambaforge/envs/sftool/bin/python
1414
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/proj/berzelius-2023-154/users/x_qinzh/mambaforge/lib
15+
cd /proj/berzelius-2023-364/users/x_qinzh/workspace/OpenSceneFlow
1516
# export HYDRA_FULL_ERROR=1
1617

17-
/proj/berzelius-2023-154/users/x_qinzh/mambaforge/envs/dataprocess/bin/python dataprocess/extract_av2.py --nproc 64 \
18+
19+
$PYTHON dataprocess/extract_av2.py --nproc 64 \
1820
--av2_type sensor \
1921
--data_mode train \
2022
--argo_dir /proj/berzelius-2023-154/users/x_qinzh/av2 \
21-
--output_dir /proj/berzelius-2023-364/users/x_qinzh/data/av2/preprocess_v2
23+
--output_dir /proj/berzelius-2023-364/users/x_qinzh/data/av2/h5py
2224

23-
/proj/berzelius-2023-154/users/x_qinzh/mambaforge/envs/dataprocess/bin/python dataprocess/extract_av2.py --nproc 64 \
25+
$PYTHON dataprocess/extract_av2.py --nproc 64 \
2426
--av2_type sensor \
2527
--data_mode val \
2628
--argo_dir /proj/berzelius-2023-154/users/x_qinzh/av2 \
27-
--output_dir /proj/berzelius-2023-364/users/x_qinzh/data/av2/preprocess_v2 \
29+
--output_dir /proj/berzelius-2023-364/users/x_qinzh/data/av2/h5py \
2830
--mask_dir /proj/berzelius-2023-154/users/x_qinzh/av2/3d_scene_flow
2931

30-
/proj/berzelius-2023-154/users/x_qinzh/mambaforge/envs/dataprocess/bin/python dataprocess/extract_av2.py --nproc 64 \
32+
$PYTHON dataprocess/extract_av2.py --nproc 64 \
3133
--av2_type sensor \
3234
--data_mode test \
3335
--argo_dir /proj/berzelius-2023-154/users/x_qinzh/av2 \
34-
--output_dir /proj/berzelius-2023-364/users/x_qinzh/data/av2/preprocess_v2 \
36+
--output_dir /proj/berzelius-2023-364/users/x_qinzh/data/av2/h5py \
3537
--mask_dir /proj/berzelius-2023-154/users/x_qinzh/av2/3d_scene_flow

assets/slurm/1_train.sh

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,13 @@
77
#SBATCH --output /proj/berzelius-2023-154/users/x_qinzh/seflow/logs/slurm/%J_seflow.out
88
#SBATCH --error /proj/berzelius-2023-154/users/x_qinzh/seflow/logs/slurm/%J_seflow.err
99

10-
cd /proj/berzelius-2023-154/users/x_qinzh/seflow
10+
PYTHON=/proj/berzelius-2023-154/users/x_qinzh/mambaforge/envs/opensf/bin/python
11+
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/proj/berzelius-2023-154/users/x_qinzh/mambaforge/lib
12+
cd /proj/berzelius-2023-364/users/x_qinzh/workspace/OpenSceneFlow
1113

12-
SOURCE="/proj/berzelius-2023-154/users/x_qinzh/data/av2/preprocess_v2"
14+
15+
# ===> Optional: copy the data to the node-local disk; this step can be skipped. <===
16+
SOURCE="/proj/berzelius-2023-364/users/x_qinzh/data/av2/autolabel"
1317
DEST="/scratch/local/av2"
1418
SUBDIRS=("sensor/train" "sensor/val")
1519

@@ -24,14 +28,7 @@ elapsed=$((end_time - start_time))
2428
echo "Copy ${SOURCE} to ${DEST} Total time: ${elapsed} seconds"
2529
echo "Start training..."
2630

27-
# ====> paper model = seflow_official
28-
# /proj/berzelius-2023-154/users/x_qinzh/mambaforge/envs/seflow/bin/python train.py \
29-
# slurm_id=$SLURM_JOB_ID wandb_mode=online train_data=/scratch/local/av2/sensor/train val_data=/scratch/local/av2/sensor/val \
30-
# num_workers=16 model=deflow lr=2e-6 epochs=50 batch_size=20 "model.target.num_iters=2" "model.val_monitor=val/Dynamic/Mean" \
31-
# loss_fn=seflowLoss "add_seloss={chamfer_dis: 1.0, static_flow_loss: 1.0, dynamic_chamfer_dis: 1.0, cluster_based_pc0pc1: 1.0}"
32-
3331
# ====> leaderboard model = seflow_best
34-
/proj/berzelius-2023-154/users/x_qinzh/mambaforge/envs/seflow/bin/python train.py \
35-
slurm_id=$SLURM_JOB_ID wandb_mode=online train_data=/scratch/local/av2/sensor/train val_data=/scratch/local/av2/sensor/val \
32+
$PYTHON train.py slurm_id=$SLURM_JOB_ID wandb_mode=online train_data=/scratch/local/av2/sensor/train val_data=/scratch/local/av2/sensor/val \
3633
num_workers=16 model=deflow lr=2e-4 epochs=9 batch_size=16 "model.target.num_iters=2" "model.val_monitor=val/Dynamic/Mean" \
3734
loss_fn=seflowLoss "add_seloss={chamfer_dis: 1.0, static_flow_loss: 1.0, dynamic_chamfer_dis: 1.0, cluster_based_pc0pc1: 1.0}"

assets/slurm/2_eval.sh

Lines changed: 6 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -5,29 +5,16 @@
55
#SBATCH --output /proj/berzelius-2023-154/users/x_qinzh/seflow/logs/slurm/%J_eval.out
66
#SBATCH --error /proj/berzelius-2023-154/users/x_qinzh/seflow/logs/slurm/%J_eval.err
77

8-
cd /proj/berzelius-2023-154/users/x_qinzh/seflow
98

10-
SOURCE="/proj/berzelius-2023-154/users/x_qinzh/av2/preprocess_v2"
11-
DEST="/scratch/local/av2"
12-
SUBDIRS=("sensor/val")
9+
PYTHON=/proj/berzelius-2023-154/users/x_qinzh/mambaforge/envs/opensf/bin/python
10+
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/proj/berzelius-2023-154/users/x_qinzh/mambaforge/lib
11+
cd /proj/berzelius-2023-364/users/x_qinzh/workspace/OpenSceneFlow
1312

14-
start_time=$(date +%s)
15-
for dir in "${SUBDIRS[@]}"; do
16-
mkdir -p "${DEST}/${dir}"
17-
find "${SOURCE}/${dir}" -type f -print0 | xargs -0 -n1 -P16 cp -t "${DEST}/${dir}" &
18-
done
19-
wait
20-
end_time=$(date +%s)
21-
elapsed=$((end_time - start_time))
22-
echo "Copy ${SOURCE} to ${DEST} Total time: ${elapsed} seconds"
23-
echo "Start training..."
2413

2514
# ====> leaderboard model
26-
# /proj/berzelius-2023-154/users/x_qinzh/mambaforge/envs/seflow/bin/python eval.py \
27-
# wandb_mode=online dataset_path=/scratch/local/av2/sensor \
15+
# $PYTHON eval.py wandb_mode=online dataset_path=/proj/berzelius-2023-364/users/x_qinzh/data/av2/autolabel av2_mode=test \
2816
# checkpoint=/proj/berzelius-2023-154/users/x_qinzh/seflow/logs/wandb/seflow-10086990/checkpoints/epoch_19_seflow.ckpt \
29-
# av2_mode=test save_res=True
17+
# save_res=True
3018

31-
/proj/berzelius-2023-154/users/x_qinzh/mambaforge/envs/seflow/bin/python eval.py \
32-
wandb_mode=online dataset_path=/scratch/local/av2/sensor av2_mode=val \
19+
$PYTHON eval.py wandb_mode=online dataset_path=/proj/berzelius-2023-364/users/x_qinzh/data/av2/autolabel av2_mode=val \
3320
checkpoint=/proj/berzelius-2023-154/users/x_qinzh/seflow/logs/wandb/seflow-10086990/checkpoints/epoch_19_seflow.ckpt

0 commit comments

Comments
 (0)