2025-04-20 13:05:48 -07:00

92 lines
4.2 KiB
Python

import matplotlib.pyplot as plt
import soundfile as sf
import librosa
import librosa.display
import numpy as np
from moviepy.editor import VideoClip, AudioFileClip
from PIL import Image
def make_mel(offset_load, duration_load, pps, height, fmax, audio_path, mel_path):
    """Render a mel spectrogram of an audio file to an image.

    Parameters
    ----------
    offset_load : float
        Start reading the audio after this many seconds.
    duration_load : float or None
        Load at most this many seconds of audio (None = load to the end).
    pps : int
        Horizontal pixels of the image per second of audio.
    fmax : float
        Max mel frequency (Hz) shown on the y axis.
    audio_path : str
        Path of the input audio file.
    mel_path : str
        Path where the spectrogram image is written.

    Returns
    -------
    tuple
        (duration_seconds, audio_path) — where audio_path is the path of a
        trimmed copy when offset/duration cut the original, so the video's
        audio track matches the spectrogram exactly.
    """
    y, sr = librosa.load(audio_path, sr=None, offset=offset_load, duration=duration_load)
    # If the audio was trimmed, persist the cut segment so make_video can
    # attach an audio track of the same length as the spectrogram.
    if offset_load != 0 or duration_load is not None:
        print(y.shape)
        audio_path = audio_path + '.cut.wav'
        sf.write(audio_path, y, sr, 'PCM_24')
    duration = librosa.get_duration(y=y, sr=sr)
    print("y.shape:" + str(y.shape))
    print("sample rate:" + str(sr))
    print("duration:%.4f s" % duration)
    # dpi=1 makes figsize express the pixel dimensions of the image directly.
    plt.figure(figsize=(duration * pps, height), dpi=1)
    plt.axis('off')
    spect = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=2048, hop_length=512)
    mel_spect = librosa.power_to_db(spect, ref=np.max)
    librosa.display.specshow(mel_spect, sr=sr, y_axis='mel', fmax=fmax, x_axis='time')
    plt.savefig(mel_path, bbox_inches='tight', dpi=1)
    plt.close()  # release the very large figure — avoids a memory leak
    return duration, audio_path
def make_video(audio_path, mel_path, video_path, duration, window, fps):
    """Write a video that scrolls the mel spectrogram image horizontally,
    keeping the currently-playing instant at the center of the frame.

    Parameters
    ----------
    audio_path : str
        Audio file used as the video's sound track.
    mel_path : str
        Path of the spectrogram image produced by make_mel.
    video_path : str
        Output video path.
    duration : float
        Duration of the audio in seconds.
    window : float
        Each frame shows 2*window seconds of spectrogram.
    fps : int
        Frames per second of the output video.
    """
    img = Image.open(mel_path)
    img = np.array(img.convert('RGB'))
    print(img.shape)
    screen_h, total_width, _ = img.shape
    # Recompute pixels-per-second from the actual image width, since
    # bbox_inches='tight' in make_mel may have altered the nominal width.
    pps = total_width / duration
    print("totalwidth:", total_width)
    print("duration:", duration)
    print("pps:", pps)
    screen_w = int(2 * window * pps)
    print(f"Generating a {screen_w} x {screen_h} video")
    filler = np.zeros((screen_h, screen_w, 3))
    # Overlay: a 4-pixel-wide vertical marker at the frame center — red is
    # brightened, green/blue darkened, to show the "now playing" position.
    superimpose = np.zeros((screen_h, screen_w, 3))
    superimpose[:, screen_w // 2 - 2:screen_w // 2 + 2, :] = np.array([100.0, -100.0, -100.0])

    def make_frame(t):
        """Return the RGB frame for playback time t (seconds)."""
        if t < window:
            # Leading edge: not enough spectrogram to the left — pad with black.
            idx = int((t + window) * pps)
            ret = np.concatenate((filler[:, 0:screen_w - idx, :], img[:, :idx, :]), axis=1)
        elif t > duration - window:
            # Trailing edge: pad with black on the right.
            idx = int((t - window) * pps)
            final = int(duration * pps)
            idx = min(idx + 1, final)  # round up and cut to avoid screen_w-(final-idx)<0
            ret = np.concatenate((img[:, idx:final, :], filler[:, 0:screen_w - (final - idx), :]), axis=1)
        else:
            idx = int((t - window) * pps)
            ret = img[:, idx:idx + screen_w, :]
        # Clip so the marker overlay cannot push pixel values below 0 or
        # above 255 — out-of-range floats produce artifacts in the encoder.
        return np.clip(ret + superimpose, 0, 255)

    # Create the audio and video clip
    audio_clip = AudioFileClip(audio_path)
    video_clip = VideoClip(make_frame, duration=audio_clip.duration)
    video_clip = video_clip.set_audio(audio_clip)
    video_clip.write_videofile(video_path, fps=fps)
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description='Make a video showing a horizontally rolling mel spectrogram\
 of the audio input')
    parser.add_argument('audio_path')
    parser.add_argument('video_path')
    # Fixed typo in user-facing help text: "horizondal" -> "horizontal".
    parser.add_argument('--pps', type=int, default=200, help="horizontal pixels per second")
    parser.add_argument('--fmax', type=float, default=8000, help='max Mel frequency (Hz)')
    parser.add_argument('--window', type=float, default=5, help='2*window size is the duration (seconds)\
 of the audio segment represented by every frame')
    parser.add_argument('--height', type=int, default=1000, help='height of the mel spectrogram and the video')
    parser.add_argument('--fps', type=int, default=24, help="fps of the output video")
    parser.add_argument('--offset', type=float, default=0, help="start reading after this time (seconds)")
    parser.add_argument('--duration', type=float, nargs="?", help="only load up to 'duration' seconds \
of the original audio")
    args = parser.parse_args()
    # Bare filenames (no '/') are resolved against the ./media/ directory.
    audio_path = args.audio_path if '/' in args.audio_path else ('./media/' + args.audio_path)
    video_path = args.video_path if '/' in args.video_path else ('./media/' + args.video_path)
    # The intermediate spectrogram image always lands in ./media/.
    mel_path = './media/' + args.audio_path.split('/')[-1] + '.png'
    duration, audio_path = make_mel(args.offset, args.duration, args.pps, args.height, args.fmax, audio_path, mel_path)
    make_video(audio_path, mel_path, video_path, duration, args.window, args.fps)