Merging mass-mapping to main #3
base: master
@@ -1,72 +1,25 @@
# A Regularized Conditional GAN for Posterior Sampling in Inverse Problems [[arXiv]](https://arxiv.org/abs/2210.13389)
# Generative modelling for mass-mapping with fast uncertainty quantification [[arXiv]](https://arxiv.org/abs/2410.24197)

## Setup
See ```docs/setup.md``` for basic environment setup instructions.

**Review comment:** This file does not exist

**Review comment:** You need to add a reference to the
## Reproducing our Results
### MRI
See ```docs/mri.md``` for instructions on how to set up and reproduce our MRI results.

## Extending the Code
See ```docs/new_applications.md``` for basic instructions on how to extend the code to your application.

## Reproducing the our Results

**Review comment:** typo

###
See ```docs/mass_mapping.md``` for instructions on how to set up and reproduce our COSMOS results.

**Review comment:** Could you coordinate with Matthijs so that the radio application of the GAN is also updated on this main branch. It'd be good to have everything to reproduce the results of both papers in a single repo.
## Questions and Concerns
If you have any questions, or run into any issues, don't hesitate to reach out at [email protected].

## TODO
- [x] Migrate to PyTorch Lightning
- [x] Reimplement MRI rcGAN
- [x] Update MRI experiment to R=8
- [ ] Reimplement inpainting rcGAN
- [ ] Extend to super resolution

If you have any questions, or run into any issues, don't hesitate to reach out at [email protected].
## References
This repository contains code from the following works, which should be cited:

```
@article{zbontar2018fastmri,
  title={fastMRI: An open dataset and benchmarks for accelerated MRI},
  author={Zbontar, Jure and Knoll, Florian and Sriram, Anuroop and Murrell, Tullie and Huang, Zhengnan and Muckley, Matthew J and Defazio, Aaron and Stern, Ruben and Johnson, Patricia and Bruno, Mary and others},
  journal={arXiv preprint arXiv:1811.08839},
  year={2018}
}

@article{devries2019evaluation,
  title={On the evaluation of conditional GANs},
  author={DeVries, Terrance and Romero, Adriana and Pineda, Luis and Taylor, Graham W and Drozdzal, Michal},
  journal={arXiv preprint arXiv:1907.08175},
  year={2019}
}

@inproceedings{Karras2020ada,
  title={Training Generative Adversarial Networks with Limited Data},
  author={Tero Karras and Miika Aittala and Janne Hellsten and Samuli Laine and Jaakko Lehtinen and Timo Aila},
  booktitle={Proc. NeurIPS},
  year={2020}
}

@inproceedings{zhao2021comodgan,
  title={Large Scale Image Completion via Co-Modulated Generative Adversarial Networks},
  author={Zhao, Shengyu and Cui, Jonathan and Sheng, Yilun and Dong, Yue and Liang, Xiao and Chang, Eric I and Xu, Yan},
  booktitle={International Conference on Learning Representations (ICLR)},
  year={2021}
}

@misc{zeng2022github,
  howpublished = {Downloaded from \url{https://github.com/zengxianyu/co-mod-gan-pytorch}},
  month = sep,
  author={Yu Zeng},
  title = {co-mod-gan-pytorch},
  year = 2022
}
```

This repository was forked from rcGAN by Bendel et al., with significant changes and modifications made by Whitney et al.

**Review comment:** Add link to original Bendel paper as well as the original rcGAN repo

**Review comment:** It'd be good to add a reproducibility section and links to Zenodo. It'd be good to find there the trained GAN used to generate the results. Also, some samples and reconstructions of the GAN. Should we include the simulations in a Zenodo? It depends on the size of the sims. If they are too big, you could just upload some of them to have a representative set.
## Citation
If you find this code helpful, please cite our paper:
```
@article{bendel2022arxiv,
  author = {Bendel, Matthew and Ahmad, Rizwan and Schniter, Philip},
  title = {A Regularized Conditional {GAN} for Posterior Sampling in Inverse Problems},
  year = {2022},
  journal = {arXiv:2210.13389}
}

@article{2024arxiv,
  author = {Whitney, Jessica and Liaudat, Tobías and Price, Matthew and Mars, Matthijs and McEwen, Jason},
  title = {Generative modelling for mass-mapping with fast uncertainty quantification},
  year = {2024},
  journal = {arXiv:2410.24197}
}
```
@@ -1,6 +1,14 @@
# rcGAN development version

# Installation

If in the Hypatia cluster, first run:

**Review comment:** We should not mention Hypatia, but be general for a computer cluster.

``` bash
source /share/apps/anaconda/3-2022.05/etc/profile.d/conda.sh
```
First, install the conda dependencies, setting the correct channels:
``` bash
conda create --name cGAN --file conda_requirements.txt --channel pytorch --channel nvidia --channel conda-forge --channel defaults
```

@@ -26,10 +34,56 @@ configs -> `~/.config/wandb` -> `WANDB_CONFIG_DIR`
# Set the variables
``` diff
-export WANDB_DIR=/share/gpu0/tl3/wandb/logs
-export WANDB_CACHE_DIR=/share/gpu0/tl3/wandb/.cache/wandb
-export WANDB_CONFIG_DIR=/share/gpu0/tl3/wandb/.config/wandb
+export WANDB_DIR=/share/gpu0/jjwhit/wandb/logs
+export WANDB_CACHE_DIR=/share/gpu0/jjwhit/wandb/.cache/wandb
+export WANDB_CONFIG_DIR=/share/gpu0/jjwhit/wandb/.config/wandb
```
# Training the model

Training is as simple as running the following command:
``` bash
python train.py --config ./configs/mass_map.yml --exp-name rcgan_test --num-gpus X
```
where ```X``` is the number of GPUs you plan to use. Note that this project uses Weights and Biases (wandb) for logging. See [their documentation](https://docs.wandb.ai/quickstart) for instructions on how to set up environment variables. Alternatively, you may use a different logger. See PyTorch Lightning's [documentation](https://lightning.ai/docs/pytorch/stable/extensions/logging.html) for options.

If you need to resume training, use the following command:
``` bash
python train.py --config ./configs/mass_map.yml --exp-name rcgan_test --num-gpus X --resume --resume-epoch Y
```
where ```Y``` is the epoch to resume from.

By default, we save the previous 50 epochs. Ensure that your checkpoint path points to a location with sufficient disk space; if disk space is a concern, 50 can be reduced to 25. This is important for the next step, validation.
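The 50-epoch retention policy can also be enforced by hand when disk space runs low. The sketch below is illustrative only: the `epoch=N.ckpt` naming scheme and the `prune_checkpoints` helper are assumptions, not code from this repository.

```python
import re

def prune_checkpoints(filenames, keep=50):
    """Return the checkpoint filenames to delete, keeping the `keep` most recent epochs.

    Assumes Lightning-style names such as 'epoch=12.ckpt'; anything that
    does not match the pattern is left untouched.
    """
    pattern = re.compile(r"epoch=(\d+)\.ckpt$")
    matched = [(int(m.group(1)), f) for f in filenames if (m := pattern.search(f))]
    matched.sort(reverse=True)             # newest epochs first
    return [f for _, f in matched[keep:]]  # everything older than the newest `keep`
```

For example, with 60 saved epochs and `keep=50`, the helper would flag the 10 oldest files for deletion.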
## Multi-GPU Runs
To make the lightning module work on multiple GPUs (and on multiple nodes) when using the SLURM workload manager, we need to be careful in setting up the SLURM job script. An example of how to do this can be found here: https://pytorch-lightning.readthedocs.io/en/1.2.10/clouds/slurm.html.

In particular, if we want to run on 4 GPUs on one node, we need to make sure that we ask for 4 GPUs as well as 4 tasks (since Lightning will create 1 task per GPU) per node:

```
#SBATCH --gres=gpu:4         # n_gpus
#SBATCH --ntasks-per-node=4  # ntasks needs to be the same as n_gpus
```

An example of a job script for training using multiple GPUs can be found in [examples/example_multi_gpu.sh](https://github.com/astro-informatics/rcGAN/blob/dev-multiGPU/examples/example_multi_gpu_train.sh).
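Putting the two `#SBATCH` directives together, a minimal job script might look as follows. This is a sketch, not the repository's example script: the job name, time limit, and conda environment name (`cGAN`, from the installation step) are assumptions you should adapt to your cluster.

``` shell
#!/bin/bash
#SBATCH --job-name=rcgan_train
#SBATCH --nodes=1
#SBATCH --gres=gpu:4         # request 4 GPUs on the node
#SBATCH --ntasks-per-node=4  # must equal the number of GPUs (1 task per GPU)
#SBATCH --time=24:00:00      # assumed time limit

# Activate the environment created during installation
conda activate cGAN

# Lightning reads the SLURM environment and runs one process per task/GPU
srun python train.py --config ./configs/mass_map.yml --exp-name rcgan_test --num-gpus 4
```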
## Batch size tuning
Additionally, I have created a script, [find_batch_size.py](https://github.com/astro-informatics/rcGAN/blob/dev-multiGPU/find_batch_size.py), that finds the largest batch size you can run per GPU. This depends on the VRAM available on the GPU and can therefore vary across machines/nodes. An example job file can be found in [examples/example_find_batch_size.sh](https://github.com/astro-informatics/rcGAN/blob/dev-multiGPU/examples/example_find_batch_size.sh). Usage is:

```
python find_batch_size.py --config [config_file.yml]
```

Finally, to support larger batch sizes, we can accumulate the gradients over several batches. To enable this and set the amount of accumulation, add the following to your config file:

```
batch_size: 8               # batch size per GPU (because of DDP)
accumulate_grad_batches: 2  # updates the model after 2 batches per GPU
```

When using the distributed data parallel (DDP) training strategy, the model is copied exactly on each GPU and each copy sees only a part of the data during the epoch. After processing 1 batch on each of the GPUs, the gradients from each of the GPUs are averaged and the models are updated. If we use gradient accumulation, the gradients are instead averaged over several such steps. The effective batch size of the model is therefore: n_gpus * batch_size * accumulate_grad_batches.
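The effective batch size formula is worth sanity-checking before launching a long run; a one-line helper (the function name is just for illustration):

```python
def effective_batch_size(n_gpus, batch_size, accumulate_grad_batches):
    """Samples contributing to a single optimizer step under DDP with gradient accumulation."""
    return n_gpus * batch_size * accumulate_grad_batches

# e.g. 4 GPUs with batch_size: 8 and accumulate_grad_batches: 2
print(effective_batch_size(4, 8, 2))  # → 64
```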
@@ -0,0 +1,36 @@
# Change checkpoint and data paths
checkpoint_dir: /share/gpu0/jjwhit/mass_map/mm_models/  # Where the model will save checkpoints
data_path: /share/gpu0/jjwhit/kappa_cosmos_simulations/cropped_dataset/  # Path to simulation dataset
cosmo_dir_path: /home/jjwhit/rcGAN/mass_map_utils/cosmos/  # Path to cosmos information such as the mask
save_path: /share/gpu0/jjwhit/samples/real_output/  # Where figures and samples will be saved

# Define the experience
experience: mass_mapping
kappa_mean: 0.00015744006243248638  # Value calculated during preprocessing
kappa_std: 0.02968584954283938  # Value calculated during preprocessing

# Number of code vectors for each phase
num_z_test: 32
num_z_valid: 8
num_z_train: 2

# Data
in_chans: 4  # Real+Imag parts from observation + Kaiser-Squires map
out_chans: 1  # A real convergence map
im_size: 300  # Pixel width/height (square images)

# Optimizer:
lr: 0.001
beta_1: 0
beta_2: 0.99

# Loss weights
gp_weight: 10
adv_weight: 1e-5

# Training
batch_size: 9
num_epochs: 100
psnr_gain_tol: 0.25

num_workers: 4
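The `kappa_mean`/`kappa_std` values in this config are the statistics used to standardize convergence maps. A sketch of the transform and its inverse follows; the function names are illustrative, not taken from the repository.

```python
KAPPA_MEAN = 0.00015744006243248638  # from the config, computed during preprocessing
KAPPA_STD = 0.02968584954283938

def normalize(kappa):
    """Standardize a convergence value (or array) to zero mean, unit variance."""
    return (kappa - KAPPA_MEAN) / KAPPA_STD

def unnormalize(z):
    """Invert the standardization to recover physical convergence values."""
    return z * KAPPA_STD + KAPPA_MEAN
```

Network inputs and outputs would live in the normalized space, with `unnormalize` applied before computing any physical summary of the posterior samples.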
@@ -0,0 +1,34 @@
# Change checkpoint and data paths
checkpoint_dir: /share/gpu0/mars/TNG_data/rcGAN/models/
data_path: /share/gpu0/mars/TNG_data/rcGAN/fourier/

# Define the experience
experience: radio

# Number of code vectors for each phase
num_z_test: 32
num_z_valid: 8
num_z_train: 2

# Data
in_chans: 3  # Real+Imag parts from obs
out_chans: 2
im_size: 360  # 360x360 pixel images

# Optimizer:
lr: 0.001
beta_1: 0
beta_2: 0.99

# Loss weights
gp_weight: 10
adv_weight: 1e-5

# Training
batch_size: 1
# Remember to increase this for full training
num_epochs: 10
psnr_gain_tol: 0.25

num_workers: 4
@@ -0,0 +1,34 @@
# Change checkpoint and data paths
checkpoint_dir: /share/gpu0/mars/TNG_data/rcGAN/models/varying/
data_path: /share/gpu0/mars/TNG_data/rcGAN/image_psfs/

# Define the experience
experience: radio

# Number of code vectors for each phase
num_z_test: 32
num_z_valid: 8
num_z_train: 2

# Data
in_chans: 3  # Real+Imag parts from obs
out_chans: 2
im_size: 256  # 256x256 pixel images

# Optimizer:
lr: 0.001
beta_1: 0
beta_2: 0.99

# Loss weights
gp_weight: 10
adv_weight: 1e-5

# Training
batch_size: 8
# Remember to increase this for full training
num_epochs: 100
psnr_gain_tol: 0.25

num_workers: 4
@@ -0,0 +1,35 @@
# Change checkpoint and data paths
checkpoint_dir: /share/gpu0/tl3/cGAN/radio/trained_model/
data_path: /share/gpu0/mars/TNG_data/rcGAN/image_psfs/

# Define the experience
experience: radio

# Number of code vectors for each phase
num_z_test: 32
num_z_valid: 8
num_z_train: 2

# Data
in_chans: 3  # Real+Imag parts from obs
out_chans: 2
im_size: 256  # 256x256 pixel images

# Optimizer:
lr: 0.001
beta_1: 0
beta_2: 0.99

# Loss weights
gp_weight: 10
adv_weight: 1e-5

# Training
batch_size: 4
accumulate_grad_batches: 2
# Remember to increase this for full training
num_epochs: 1
psnr_gain_tol: 0.25

num_workers: 1
**Review comment:** It'd be good to add a small description of the method (something like the abstract used for the paper but more synthetic). Also select one figure from the paper to illustrate the method (I'd suggest the one from the COSMOS field showing the reconstruction and std dev).