visualize.py (forked from msieg/deep-music-visualizer)
import librosa
import argparse
import numpy as np
import moviepy.editor as mpy
import random
import torch
from tqdm import tqdm
from pytorch_pretrained_biggan import BigGAN, truncated_noise_sample
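
#overview: this script turns an audio file into a BigGAN music video. mean spectrogram
#power and its gradient drive the 128-dim noise vector (motion), the chromagram drives
#the 1000-dim class vector (content), and the generated frames are muxed with the audio.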
#get input arguments
parser = argparse.ArgumentParser()
parser.add_argument("--song",required=True)
parser.add_argument("--resolution", default='512')
parser.add_argument("--duration", type=int)
parser.add_argument("--pitch_sensitivity", type=int, default=220)
parser.add_argument("--tempo_sensitivity", type=float, default=0.25)
parser.add_argument("--depth", type=float, default=1)
parser.add_argument("--classes", nargs='+', type=int)
parser.add_argument("--num_classes", type=int, default=12)
parser.add_argument("--sort_classes_by_power", type=int, default=0)
parser.add_argument("--jitter", type=float, default=0.5)
parser.add_argument("--frame_length", type=int, default=512)
parser.add_argument("--truncation", type=float, default=1)
parser.add_argument("--smooth_factor", type=int, default=20)
parser.add_argument("--batch_size", type=int, default=30)
parser.add_argument("--use_previous_classes", type=int, default=0)
parser.add_argument("--use_previous_vectors", type=int, default=0)
parser.add_argument("--output_file", default="output.mp4")
args = parser.parse_args()
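#example invocation (file names here are placeholders):
#  python visualize.py --song my_song.mp3 --resolution 256 --duration 45 \
#      --num_classes 4 --sort_classes_by_power 1 --output_file my_video.mp4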
#read song
if args.song:
    song=args.song
    print('\nReading audio \n')
    y, sr = librosa.load(song)
else:
    raise ValueError("you must enter an audio file name in the --song argument")
#set model name based on resolution
model_name='biggan-deep-' + args.resolution
frame_length=args.frame_length
#set pitch sensitivity
pitch_sensitivity=(300-args.pitch_sensitivity) * 512 / frame_length
#set tempo sensitivity
tempo_sensitivity=args.tempo_sensitivity * frame_length / 512
#set depth
depth=args.depth
#set number of classes
num_classes=args.num_classes
#set sort_classes_by_power
sort_classes_by_power=args.sort_classes_by_power
#set jitter
jitter=args.jitter
#set truncation
truncation=args.truncation
#set batch size
batch_size=args.batch_size
#set use_previous_vectors
use_previous_vectors=args.use_previous_vectors
#set use_previous_classes
use_previous_classes=args.use_previous_classes
#set output name
outname=args.output_file
#set smooth factor
if args.smooth_factor > 1:
    smooth_factor=int(args.smooth_factor * 512 / frame_length)
else:
    smooth_factor=args.smooth_factor
#set duration
if args.duration:
    seconds=args.duration
    frame_lim=int(np.floor(seconds*22050/frame_length/batch_size))
else:
    frame_lim=int(np.floor(len(y)/sr*22050/frame_length/batch_size))
# Load pre-trained model
model = BigGAN.from_pretrained(model_name)
#set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
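#note: on the first run, from_pretrained should download the pretrained BigGAN weights
#and cache them locally; later runs load from the cache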
########################################
########################################
########################################
########################################
########################################
#create spectrogram
spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128,fmax=8000, hop_length=frame_length)
#get mean power at each time point
specm=np.mean(spec,axis=0)
#compute power gradient across time points
gradm=np.gradient(specm)
#set max to 1
gradm=gradm/np.max(gradm)
#set negative gradient time points to zero
gradm = gradm.clip(min=0)
#normalize mean power between 0-1
specm=(specm-np.min(specm))/np.ptp(specm)
#create chromagram of pitches X time points
chroma = librosa.feature.chroma_cqt(y=y, sr=sr, hop_length=frame_length)
#sort pitches by overall power
chromasort=np.argsort(np.mean(chroma,axis=1))[::-1]
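#shapes for reference: spec is (128 mel bands, T frames), specm and gradm have length T,
#and chroma is (12 pitch classes, T frames), where T ≈ len(y)/frame_length since hop_length=frame_length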
########################################
########################################
########################################
########################################
########################################
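#class selection: BigGAN is conditioned on the 1000 ImageNet categories, so valid class indices are 0-999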
if args.classes:
    classes=args.classes
    if len(classes) not in [12,num_classes]:
        raise ValueError("The number of classes entered in the --classes argument must equal 12, or [num_classes] if specified")
elif args.use_previous_classes==1:
    cvs=np.load('class_vectors.npy')
    classes=list(np.where(cvs[0]>0)[0])
else: #select 12 random classes
    cls1000=list(range(1000))
    random.shuffle(cls1000)
    classes=cls1000[:12]
if sort_classes_by_power==1:
    classes=[classes[s] for s in np.argsort(chromasort[:num_classes])]
#initialize first class vector
cv1=np.zeros(1000)
for pi,p in enumerate(chromasort[:num_classes]):
    if num_classes < 12:
        cv1[classes[pi]] = chroma[p][np.min([np.where(chrow>0)[0][0] for chrow in chroma])]
    else:
        cv1[classes[p]] = chroma[p][np.min([np.where(chrow>0)[0][0] for chrow in chroma])]
#initialize first noise vector
nv1 = truncated_noise_sample(truncation=truncation)[0]
#initialize list of class and noise vectors
class_vectors=[cv1]
noise_vectors=[nv1]
#initialize previous vectors (will be used to track the previous frame)
cvlast=cv1
nvlast=nv1
#initialize the direction of noise vector unit updates
update_dir=np.zeros(128)
for ni,n in enumerate(nv1):
    if n<0:
        update_dir[ni] = 1
    else:
        update_dir[ni] = -1
#initialize noise unit update
update_last=np.zeros(128)
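#design note: each noise unit walks in a fixed direction and reflects near the truncation
#boundary (see new_update_dir below); louder, rising passages (specm+gradm) take bigger
#steps, and jitters randomly damps about half of the units every 200 frames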
########################################
########################################
########################################
########################################
########################################
#get new jitters: each of the 128 noise dims keeps full sensitivity (1) or is damped to 1-jitter, at random
def new_jitters(jitter):
    jitters=np.zeros(128)
    for j in range(128):
        if random.uniform(0,1)<0.5:
            jitters[j]=1
        else:
            jitters[j]=1-jitter
    return jitters
#get new update directions: reverse a unit's drift when it nears the truncation boundary
def new_update_dir(nv2,update_dir):
    for ni,n in enumerate(nv2):
        if n >= 2*truncation - tempo_sensitivity:
            update_dir[ni] = -1
        elif n < -2*truncation + tempo_sensitivity:
            update_dir[ni] = 1
    return update_dir
#smooth class vectors by linearly interpolating between the averages of consecutive bins of size [smooth_factor]
def smooth(class_vectors,smooth_factor):
    if smooth_factor==1:
        return class_vectors
    class_vectors_terp=[]
    for c in range(int(np.floor(len(class_vectors)/smooth_factor)-1)):
        ci=c*smooth_factor
        cva=np.mean(class_vectors[int(ci):int(ci)+smooth_factor],axis=0)
        cvb=np.mean(class_vectors[int(ci)+smooth_factor:int(ci)+smooth_factor*2],axis=0)
        for j in range(smooth_factor):
            cvc = cva*(1-j/(smooth_factor-1)) + cvb*(j/(smooth_factor-1))
            class_vectors_terp.append(cvc)
    return np.array(class_vectors_terp)
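#e.g. with smooth_factor=20 and 1000 input vectors, the loop yields floor(1000/20)-1 = 49
#bins of 20 interpolated frames each, i.e. 980 output frames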
#normalize class vector between 0-1 (zeros are first raised to the smallest nonzero value)
def normalize_cv(cv2):
    min_class_val = min(i for i in cv2 if i != 0)
    for ci,c in enumerate(cv2):
        if c==0:
            cv2[ci]=min_class_val
    cv2=(cv2-min_class_val)/np.ptp(cv2)
    return cv2
print('\nGenerating input vectors \n')
for i in tqdm(range(len(gradm))):
    #update jitter vector every 200 frames by setting ~half of noise vector units to lower sensitivity
    if i%200==0:
        jitters=new_jitters(jitter)
    #get last noise vector
    nv1=nvlast
    #set noise vector update based on direction, sensitivity, jitter, and combination of overall power and gradient of power
    update = np.full(128, tempo_sensitivity) * (gradm[i]+specm[i]) * update_dir * jitters
    #smooth the update with the previous update (to avoid overly sharp frame transitions)
    update=(update+update_last*3)/4
    #set last update
    update_last=update
    #update noise vector
    nv2=nv1+update
    #append to noise vectors
    noise_vectors.append(nv2)
    #set last noise vector
    nvlast=nv2
    #update the direction of noise units
    update_dir=new_update_dir(nv2,update_dir)
    #get last class vector
    cv1=cvlast
    #generate new class vector
    cv2=np.zeros(1000)
    for j in range(num_classes):
        cv2[classes[j]] = (cvlast[classes[j]] + ((chroma[chromasort[j]][i])/(pitch_sensitivity)))/(1+(1/((pitch_sensitivity))))
    #if more than 6 classes, normalize new class vector between 0 and 1, else simply set max class val to 1
    if num_classes > 6:
        cv2=normalize_cv(cv2)
    else:
        cv2=cv2/np.max(cv2)
    #adjust depth
    cv2=cv2*depth
    #this prevents rare bugs where all classes are the same value
    if np.std(cv2[np.where(cv2!=0)]) < 0.0000001:
        cv2[classes[0]]=cv2[classes[0]]+0.01
    #append new class vector
    class_vectors.append(cv2)
    #set last class vector
    cvlast=cv2
#interpolate between class vectors of bin size [smooth_factor] to smooth frames
class_vectors=smooth(class_vectors,smooth_factor)
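#note: smoothing shortens the sequence (see the example above smooth), so the generation
#loop below stops early once a full batch no longer fits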
#check whether to use vectors from last run
if use_previous_vectors==1:
    #load vectors from previous run
    class_vectors=np.load('class_vectors.npy')
    noise_vectors=np.load('noise_vectors.npy')
else:
    #save record of vectors for current video
    np.save('class_vectors.npy',class_vectors)
    np.save('noise_vectors.npy',noise_vectors)
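#these saved .npy files are what --use_previous_classes and --use_previous_vectors load on later runs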
########################################
########################################
########################################
########################################
########################################
#convert to Tensor
noise_vectors = torch.Tensor(np.array(noise_vectors))
class_vectors = torch.Tensor(np.array(class_vectors))
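#for reference, noise_vectors now has shape (num frames, 128) and class_vectors (num frames, 1000)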
#Generate frames in batches of batch_size
print('\n\nGenerating frames \n')
#send to CUDA if running on GPU
model=model.to(device)
noise_vectors=noise_vectors.to(device)
class_vectors=class_vectors.to(device)
frames = []
for i in tqdm(range(frame_lim)):
    #stop when the (smoothed) vector sequence no longer fills a full batch
    if (i+1)*batch_size > len(class_vectors):
        torch.cuda.empty_cache()
        break
    #get batch
    noise_vector=noise_vectors[i*batch_size:(i+1)*batch_size]
    class_vector=class_vectors[i*batch_size:(i+1)*batch_size]
    # Generate images
    with torch.no_grad():
        output = model(noise_vector, class_vector, truncation)
    output_cpu=output.cpu().numpy()
    #convert to image array and add to frames
    #(BigGAN outputs CHW floats in [-1,1]; scipy.misc.toimage is gone from modern SciPy,
    #so rescale to HWC uint8 by hand for moviepy)
    for out in output_cpu:
        im=np.transpose((out+1)/2*255,(1,2,0)).clip(0,255).astype(np.uint8)
        frames.append(im)
    #empty cuda cache
    torch.cuda.empty_cache()
#Save video
aud = mpy.AudioFileClip(song, fps = 44100)
if args.duration:
    aud = aud.set_duration(args.duration)
clip = mpy.ImageSequenceClip(frames, fps=22050/frame_length)
clip = clip.set_audio(aud)
clip.write_videofile(outname,audio_codec='aac')
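#with the default frame_length of 512 and librosa's default 22050 Hz resampling,
#the video plays at 22050/512 ≈ 43 frames per second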