def extract_features_file(fn, bands=60, frames=41):
    """Extract log-mel spectrogram and delta features from one audio file.

    The clip is loaded with ``librosa.load`` and cut into windows of
    ``512 * (frames - 1)`` samples using the ``_windows`` helper. Windows
    that are shorter than that (e.g. a truncated tail) are skipped.

    Args:
        fn: Path of the audio file to load (concatenated into the progress
            message, so it is expected to be a ``str``).
        bands: Number of mel bands requested from the mel spectrogram.
        frames: Number of spectrogram frames per segment; also determines
            the window length in samples.

    Returns:
        A NumPy array of shape ``(n_segments, bands, frames, 2)``. Channel 0
        holds the log-scaled mel spectrogram of each segment, channel 1 the
        ``librosa.feature.delta`` of channel 0.
    """
    print('extract features: ' + fn)
    # Presumably chosen so that librosa's default hop length of 512 yields
    # exactly `frames` spectrogram frames per window -- TODO confirm.
    window_size = 512 * (frames - 1)

    sound_clip, sr = librosa.load(fn)

    log_specgrams = []
    for start, end in _windows(sound_clip, window_size):
        signal = sound_clip[start:end]
        if len(signal) != window_size:
            # Incomplete window; it would not produce `frames` frames.
            continue
        # `y` must be passed by keyword: it is keyword-only in
        # librosa >= 0.10 and the keyword form also works on older versions.
        melspec = librosa.feature.melspectrogram(y=signal, sr=sr, n_mels=bands)
        # NOTE(review): melspectrogram returns a power spectrogram by
        # default, for which librosa.power_to_db looks like the intended
        # conversion. Left unchanged so feature values stay compatible with
        # anything trained on the current output -- confirm before changing.
        logspec = librosa.amplitude_to_db(melspec)
        log_specgrams.append(logspec.T.flatten())

    # NOTE(review): each spectrogram is flattened from its transposed
    # (frames, bands) layout but reshaped below as (bands, frames), which
    # does not restore the original time/frequency arrangement. Kept as-is
    # because consumers may depend on this exact layout -- confirm.
    static = np.asarray(log_specgrams).reshape(
        len(log_specgrams), bands, frames, 1)

    # Append a zero-filled second channel, then fill it with the deltas.
    segment_features = np.concatenate(
        (static, np.zeros(np.shape(static))), axis=3)
    for segment in segment_features:
        segment[:, :, 1] = librosa.feature.delta(segment[:, :, 0])
    return segment_features