#make_warped_noise.py

#Ryan Burgert 2024

#Setup:
#    Run this in a Jupyter Notebook on a computer with at least one GPU
#        `sudo apt install ffmpeg git`
#        `pip install rp`
#    The first time you run this it might be a bit slow (it will download necessary models)
#    The `rp` package will take care of installing the rest of the python packages for you

import rp

rp.r._pip_import_autoyes=True #Automatically install missing packages (presumably without asking for confirmation - the flag is a private rp setting)

rp.pip_import('fire') #Called before the plain `import fire` below so that the package can be installed first if it is missing
rp.git_import('CommonSource') #If missing, installs code from https://github.com/RyannDaGreat/CommonSource
import rp.git.CommonSource.noise_warp as nw #Keep this after the git_import above, which is what makes rp.git.CommonSource available
import fire

def main(video:str, output_folder:str):
    """
    Calculates warped noise for a video and saves the results into a brand new folder

    video: A video URL or filepath (it is loaded with rp.load_video)
    output_folder: Where the results are written. This folder must not exist yet.

    The video is stretched or squashed to 49 frames, then resized and center-cropped to height=480, width=720 (CogVideoX's dimensions)
    Warped noise with 16 channels is then calculated at latent resolution (i.e. 1/8 of the width and height)
    The warped noise, optical flows, and related preview videos and images are saved to the output folder
    The preprocessed input is saved there too, as first_frame.png and input.mp4
    The main file you need is <output_folder>/noises.npy which is the gaussian noises in (H,W,C) form

    Raises a RuntimeError if output_folder already exists
    """

    #Bail out before doing any work if the destination is already there
    if rp.folder_exists(output_folder):
        raise RuntimeError(f"The given output_folder={repr(output_folder)} already exists! To avoid clobbering what might be in there, please specify a folder that doesn't exist so I can create one for you. Alternatively, you could delete that folder if you don't care whats in it.")

    #Target clip dimensions (CogVideoX's length and resolution)
    num_frames   = 49
    frame_height = 480
    frame_width  = 720

    frame_scale = 2**-1 #The input frames are resized by this factor right away, before optical flow is calculated
                        #So the flow is calculated at (input size) × frame_scale resolution
                        #Higher frame_scale values result in slower optical flow calculation and higher intermediate noise resolution
                        #Larger is not always better - watch the preview in Jupyter to see if it looks good!

    flow_scale = 2**3   #The flow is then upscaled by this factor using bilinear interpolation
                        #The noise is warped at (input size) × frame_scale × flow_scale resolution
                        #and afterwards downsampled back to (input size)
                        #Higher flow_scale values result in more temporally consistent noise warping at the cost of higher VRAM usage and slower inference time

    latent_stride = 8   #The outputs are downsampled further by this amount - because 8 pixels wide corresponds to one latent wide in Stable Diffusion
                        #The final output size is (input size) ÷ latent_stride regardless of frame_scale and flow_scale

    #latent_stride = 1   #Uncomment this line for a prettier visualization! But for latent diffusion models, use latent_stride=8

    #Undo the frame_scale × flow_scale enlargement, then shrink by latent_stride on top of that
    total_downscale = round(frame_scale * flow_scale) * latent_stride

    #The video argument can be a video file or a URL, for example:
    # video = "https://www.shutterstock.com/shutterstock/videos/1100085499/preview/stock-footage-bremen-germany-october-old-style-carousel-moving-on-square-in-city-horses-on-traditional.webm"
    # output_folder = "NoiseWarpOutputFolder"

    #Strings get loaded into frames; anything else is used as the frames directly
    frames = rp.load_video(video) if isinstance(video,str) else video

    #Preprocess the video
    frames = rp.resize_list(frames, length=num_frames) #Stretch or squash video to CogVideoX's length
    frames = rp.resize_images_to_hold(frames, height=frame_height, width=frame_width)
    frames = rp.crop_images(frames, height=frame_height, width=frame_width, origin='center') #Make the resolution 480x720 (CogVideoX's resolution)
    frames = rp.as_numpy_array(frames)

    #See nw.get_noise_from_video's docstring for more information!
    result = nw.get_noise_from_video(
        frames,
        remove_background=False, #True would matte the foreground - and force the background to have no flow
        visualize=True,          #Generates nice visualization videos and previews in Jupyter notebook
        save_files=True,         #False would give you just the noises without saving to a numpy file

        noise_channels=16,
        output_folder=output_folder,
        resize_frames=frame_scale,
        resize_flow=flow_scale,
        downscale_factor=total_downscale,
    )

    #Keep the preprocessed input next to the results: its first frame as an image...
    first_frame_file = rp.path_join(output_folder,'first_frame.png')
    result.first_frame_path = rp.save_image(frames[0], first_frame_file)

    #...and the whole preprocessed clip as a video
    input_video_file = rp.path_join(output_folder, 'input.mp4')
    rp.save_video_mp4(frames, input_video_file, framerate=12, video_bitrate='max')

    #Summary of what was made
    print("Noise shape:"  ,result.numpy_noises.shape)
    print("Flow shape:"   ,result.numpy_flows .shape)
    print("Output folder:",result.output_folder)

#Command-line entry point: only runs when this file is executed as a script, not when it is imported
#main is handed to fire.Fire, so its parameters (video, output_folder) are supplied from the command line
if __name__ == "__main__":
    fire.Fire(main)