import matplotlib.pyplot as plt
import soundfile as sf
import librosa
import librosa.display
import numpy as np
from moviepy.editor import VideoClip, AudioFileClip
from PIL import Image


def make_mel(offset_load, duration_load, pps, height, fmax, audio_path, mel_path):
    """
    Make a mel spectrogram image.
    pps: pixels along the x axis per second of audio
    fmax: max Mel frequency (Hz)
    """
    y, sr = librosa.load(audio_path, sr=None, offset=offset_load, duration=duration_load)
    if offset_load != 0 or duration_load is not None:
        # A sub-segment was loaded: write it out so the video's audio
        # track matches the trimmed spectrogram.
        print(y.shape)
        audio_path = audio_path + '.cut.wav'
        sf.write(audio_path, y, sr, 'PCM_24')
    duration = librosa.get_duration(y=y, sr=sr)
    print("y.shape:" + str(y.shape))
    print("sample rate:" + str(sr))
    print("duration:%.4f s" % duration)
    # With dpi=1, figsize is interpreted directly in pixels.
    plt.figure(figsize=(duration * pps, height), dpi=1)
    plt.axis('off')
    spect = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=2048, hop_length=512)
    mel_spect = librosa.power_to_db(spect, ref=np.max)
    librosa.display.specshow(mel_spect, sr=sr, y_axis='mel', fmax=fmax, x_axis='time')
    plt.savefig(mel_path, bbox_inches='tight', dpi=1)
    return duration, audio_path


def make_video(audio_path, mel_path, video_path, duration, window, fps):
    img = Image.open(mel_path)
    img = np.array(img.convert('RGB'))
    print(img.shape)
    screen_h, total_width, _ = img.shape
    # Recalculate pixels per second from the actual image width, since
    # bbox_inches='tight' changes the saved figure's dimensions.
    pps = total_width / duration
    print("total width:", total_width)
    print("duration:", duration)
    print("pps:", pps)
    screen_w = int(2 * window * pps)
    print(f"Generating a {screen_w} x {screen_h} video")
    filler = np.zeros((screen_h, screen_w, 3))
    # A 4-pixel-wide cursor down the middle of the frame marking "now":
    # added to the image, it shifts pixels toward red.
    superimpose = np.zeros((screen_h, screen_w, 3))
    superimpose[:, screen_w // 2 - 2:screen_w // 2 + 2, :] = np.array([100.0, -100.0, -100.0])

    def make_frame(t):
        if t < window:
            # Lead-in: pad the left side with black until t reaches `window`.
            idx = int((t + window) * pps)
            ret = np.concatenate((filler[:, 0:screen_w - idx, :], img[:, :idx, :]), axis=1)
        elif t > duration - window:
            # Tail: pad the right side with black once the end scrolls in.
            idx = int((t - window) * pps)
            final = int(duration * pps)
            idx = min(idx + 1, final)  # round up and cap to avoid screen_w-(final-idx) < 0
            ret = np.concatenate((img[:, idx:final, :], filler[:, 0:screen_w - (final - idx), :]), axis=1)
        else:
            # Steady state: a sliding window over the spectrogram.
            idx = int((t - window) * pps)
            ret = img[:, idx:idx + screen_w, :]
        # Clip to the valid pixel range and cast back to uint8 so MoviePy
        # gets a well-formed RGB frame (the cursor overlay would otherwise
        # push values outside 0-255).
        return np.clip(ret + superimpose, 0, 255).astype(np.uint8)

    # Create the audio and video clips and mux them together.
    audio_clip = AudioFileClip(audio_path)
    video_clip = VideoClip(make_frame, duration=audio_clip.duration)
    video_clip = video_clip.set_audio(audio_clip)
    video_clip.write_videofile(video_path, fps=fps)


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description='Make a video showing a horizontally '
                                     'rolling mel spectrogram of the audio input')
    parser.add_argument('audio_path')
    parser.add_argument('video_path')
    parser.add_argument('--pps', type=int, default=200, help="horizontal pixels per second")
    parser.add_argument('--fmax', type=float, default=8000, help='max Mel frequency (Hz)')
    parser.add_argument('--window', type=float, default=5,
                        help='2*window is the duration (seconds) of the audio segment shown in every frame')
    parser.add_argument('--height', type=int, default=1000,
                        help='height of the mel spectrogram and the video')
    parser.add_argument('--fps', type=int, default=24, help="fps of the output video")
    parser.add_argument('--offset', type=float, default=0,
                        help="start reading after this time (seconds)")
    parser.add_argument('--duration', type=float, nargs="?",
                        help="only load up to 'duration' seconds of the original audio")
    args = parser.parse_args()
    # Bare filenames (no '/') are resolved under ./media/.
    audio_path = args.audio_path if '/' in args.audio_path else ('./media/' + args.audio_path)
    video_path = args.video_path if '/' in args.video_path else ('./media/' + args.video_path)
    mel_path = './media/' + args.audio_path.split('/')[-1] + '.png'
    duration, audio_path = make_mel(args.offset, args.duration, args.pps, args.height,
                                    args.fmax, audio_path, mel_path)
    make_video(audio_path, mel_path, video_path, duration, args.window, args.fps)
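# Example invocation (a sketch: 'clip.wav', 'clip.mp4', and the script name
# 'mel_video.py' are hypothetical; bare filenames resolve under ./media/):
#
#   python mel_video.py clip.wav clip.mp4 --pps 200 --window 5 --offset 10 --duration 30
#
# Each frame of the resulting 24 fps video shows the spectrogram from t-5 s
# to t+5 s, with the red cursor overlay marking the current playback instant.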