Source code for decibel.audio_tab_aligner.feature_extractor

from typing import Tuple, List

import librosa
import numpy as np
import mir_eval

from decibel.music_objects.song import Song
from decibel.import_export import filehandler


def get_audio_features(audio_path: str, sampling_rate: int, hop_length: int) -> Tuple[np.ndarray, np.ndarray]:
    """
    Extract beat-synchronized chroma features from an audio file.

    :param audio_path: Path to the audio file
    :param sampling_rate: Sampling rate with which the audio is loaded
    :param hop_length: Hop length (in samples) used for beat tracking and chroma extraction
    :return: Beat times in seconds and the beat-synchronized chroma features (one 12D vector per beat interval)
    """
    # Load audio with small sampling rate and convert to mono. Audio is an array with a value per *sample*
    audio, _ = librosa.load(audio_path, sr=sampling_rate, mono=True)

    # Separate harmonics and percussives into two waveforms. We get two arrays, each with one value per *sample*
    audio_harmonic, audio_percussive = librosa.effects.hpss(audio)

    # Beat track on the percussive signal. The result is an array of *frames* which are on a beat
    _, beat_frames = librosa.beat.beat_track(y=audio_percussive, sr=sampling_rate, hop_length=hop_length, trim=False)

    # Compute chroma features from the harmonic signal. We get a 12D array of chroma for each *frame*
    chromagram = librosa.feature.chroma_cqt(y=audio_harmonic, sr=sampling_rate, hop_length=hop_length)

    # Make sure the last beat is not longer than the length of the chromagram
    beat_frames = librosa.util.fix_frames(beat_frames, x_max=chromagram.shape[1])

    # Aggregate chroma features between *beat events*. We use the mean value of each feature between beat frames
    beat_chroma = librosa.util.sync(chromagram, beat_frames)
    beat_chroma = np.transpose(beat_chroma)

    # Translate beats from frames to time domain
    beat_times = librosa.frames_to_time(beat_frames, sr=sampling_rate, hop_length=hop_length)

    return beat_times, beat_chroma
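
A minimal usage sketch (illustrative, not part of the module): the audio path is a hypothetical placeholder, and the parameter values match those used by get_feature_ground_truth_matrix below.

beat_times, beat_chroma = get_audio_features('/path/to/song.wav', sampling_rate=22050, hop_length=256)
print(beat_times.shape)   # (number of beat boundaries,), in seconds
print(beat_chroma.shape)  # (number of beat intervals, 12): one mean chroma vector per interval between boundaries
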
def beat_align_ground_truth_labels(ground_truth_labels_path: str, beat_times: np.ndarray) -> List[str]:
    """
    Beat-synchronize the reference chord annotations, by assigning the chord with the longest duration within that
    beat

    :param ground_truth_labels_path: Path to the ground truth file
    :param beat_times: Array of beats, measured in seconds
    :return: List of chords within each beat
    """
    # Load chords from ground truth file
    (ref_intervals, ref_labels) = mir_eval.io.load_labeled_intervals(ground_truth_labels_path)

    # Find start and end locations of each beat
    beat_starts = beat_times[:-1]
    beat_ends = beat_times[1:]

    # Create the longest_chords list, which we will fill in the for loop
    longest_chords_per_beat = []

    for i in range(beat_starts.size):
        # Iterate over the beats in this song, keeping the chord with the longest duration
        b_s = beat_starts[i]
        b_e = beat_ends[i]
        longest_chord_duration = 0
        longest_chord = 'N'
        for j in range(ref_intervals.shape[0]):
            # Iterate over the intervals in the reference chord annotations
            r_s = ref_intervals[j][0]  # Start time of reference interval
            r_e = ref_intervals[j][1]  # End time of reference interval
            if r_s < b_e and r_e > b_s:
                # This reference interval overlaps with the current beat
                start_inside_beat = max(r_s, b_s)
                end_inside_beat = min(r_e, b_e)
                duration_inside_beat = end_inside_beat - start_inside_beat
                if duration_inside_beat > longest_chord_duration:
                    longest_chord_duration = duration_inside_beat
                    longest_chord = ref_labels[j]
        # Add the chord with the longest duration to our list
        longest_chords_per_beat.append(longest_chord)

    return longest_chords_per_beat
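
The following self-contained sketch illustrates the alignment logic on made-up data; the two-chord annotation file and the one-beat-per-second grid are purely hypothetical.

import tempfile

# Hypothetical annotation: C major for the first two seconds, then G major for two seconds.
with tempfile.NamedTemporaryFile('w', suffix='.lab', delete=False) as lab_file:
    lab_file.write('0.000 2.000 C:maj\n2.000 4.000 G:maj\n')

# One beat per second; each beat interval is assigned the chord covering most of it.
beats = np.array([0.0, 1.0, 2.0, 3.0, 4.0])
print(beat_align_ground_truth_labels(lab_file.name, beats))
# Expected output: ['C:maj', 'C:maj', 'G:maj', 'G:maj']
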
def get_feature_ground_truth_matrix(full_audio_path: str, ground_truth_labs_path: str) -> np.ndarray:
    """
    Combine the beat times, beat-synchronized chroma features and beat-aligned chord labels of a song into one matrix.

    :param full_audio_path: Path to the audio file
    :param ground_truth_labs_path: Path to the ground truth chord annotation file
    :return: Matrix with one row per beat and 14 columns (beat time, 12 chroma values, chord label)
    """
    # First obtain the audio features per beat using librosa.
    beat_times, beat_chroma = get_audio_features(full_audio_path, sampling_rate=22050, hop_length=256)

    # Align the ground truth annotations to the beats.
    longest_chords_per_beat = beat_align_ground_truth_labels(ground_truth_labs_path, beat_times)

    # Combine the beat times, chroma values and chord labels into a matrix with 14 columns and |beats| rows.
    times_features_class = np.c_[beat_times[:-1], beat_chroma, longest_chords_per_beat]

    return times_features_class
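
Because np.c_ mixes floating-point columns with string chord labels, the resulting matrix is stored with a string dtype. A sketch with hypothetical paths of how the columns can be recovered:

times_features_class = get_feature_ground_truth_matrix('/path/to/song.wav', '/path/to/song.lab')
beat_times = times_features_class[:, 0].astype(float)      # column 0: beat start time in seconds
beat_chroma = times_features_class[:, 1:13].astype(float)  # columns 1-12: beat-synchronized chroma
chord_labels = times_features_class[:, 13]                 # column 13: chord label per beat
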
def export_audio_features_for_song(song: Song) -> None:
    """
    Export the audio features of this song to a file.

    For this purpose, we use the Python package librosa. First, we convert the audio file to mono. Then, we use the
    HPSS function to separate the harmonic and percussive elements of the audio. Then, we extract chroma from the
    harmonic part, using the constant-Q transform with a sampling rate of 22050 and a hop length of 256 samples.
    Now we have chroma features for each frame, but we expect that the great majority of chord changes occur on a
    beat. Therefore, we beat-synchronize the features: we run a beat-extraction function on the percussive part of
    the audio and average the chroma features between consecutive beat positions. The chord annotations need to be
    beat-synchronized as well. We do this by taking the chord label with the longest duration between two
    consecutive beats. Each mean feature vector with the corresponding beat-synchronized chord label is regarded as
    one frame.

    :param song: Song for which we export the audio features
    """
    if song.full_ground_truth_chord_labs_path != '':
        # There are chord labels for this song
        write_path = filehandler.get_full_audio_features_path(song.key)
        if filehandler.file_exists(write_path):
            # We already extracted the audio features
            song.audio_features_path = write_path
        else:
            # We still need to extract the audio features.
            times_features_class = get_feature_ground_truth_matrix(song.full_audio_path,
                                                                   song.full_ground_truth_chord_labs_path)
            # Export the beat, feature and class matrix to the write_path (a binary .npy file)
            song.audio_features_path = write_path
            np.save(write_path, times_features_class)
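
A round-trip sketch, assuming song is a Song object with a ground-truth chord annotation and that filehandler points to a writable audio-features directory; the exported .npy file can then be read back with np.load.

export_audio_features_for_song(song)
times_features_class = np.load(song.audio_features_path)
# One row per beat: [beat time, 12 chroma values, chord label], all stored as strings (see the note above).
print(times_features_class.shape)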