From f301ea894b7ff810a3af029e11b3688cd0400ab4 Mon Sep 17 00:00:00 2001 From: wghezaiel Date: Thu, 30 Mar 2023 15:11:33 +0200 Subject: [PATCH 01/12] add NME_SC method for clustering --- main.py | 75 ++++++++++++++ simple_diarizer/Spectral_clustering.py | 135 +++++++++++++++++++++++++ simple_diarizer/cluster.py | 63 ++++++++++-- simple_diarizer/diarizer.py | 25 +++-- 4 files changed, 283 insertions(+), 15 deletions(-) create mode 100644 main.py create mode 100644 simple_diarizer/Spectral_clustering.py diff --git a/main.py b/main.py new file mode 100644 index 0000000..5471bc0 --- /dev/null +++ b/main.py @@ -0,0 +1,75 @@ +import soundfile as sf +import matplotlib.pyplot as plt +import os,sys,time + + +from simple_diarizer.diarizer import Diarizer +from simple_diarizer.utils import combined_waveplot + +t0 = time.time() + + +diar = Diarizer( + embed_model='ecapa', # 'xvec' and 'ecapa' supported + cluster_method='NME-sc' # 'ahc' 'sc' and 'NME-sc' supported + ) + +WAV_FILE,NUM_SPEAKERS,max_spk= sys.argv[1:] + + +if NUM_SPEAKERS == 'None': + print('None') + segments = diar.diarize(WAV_FILE, num_speakers=None,max_speakers=int(max_spk)) +else: + segments = diar.diarize(WAV_FILE, num_speakers=int(NUM_SPEAKERS)) + + +t1 = time.time() +feature_t = t1 - t0 +print("Time used for extracting features:", feature_t) + + + +json = {} +_segments = [] +_speakers = {} +seg_id = 1 +spk_i = 1 +spk_i_dict = {} + +for seg in segments: + + segment = {} + segment["seg_id"] = seg_id + + # Ensure speaker id continuity and numbers speaker by order of appearance. + if seg['label'] not in spk_i_dict.keys(): + spk_i_dict[seg['label']] = spk_i + spk_i += 1 + + spk_id = "spk" + str(spk_i_dict[seg['label']]) + segment["spk_id"] = spk_id + segment["seg_begin"] = round(seg['start']) + segment["seg_end"] = round(seg['end']) + + if spk_id not in _speakers: + _speakers[spk_id] = {} + _speakers[spk_id]["spk_id"] = spk_id + _speakers[spk_id]["duration"] = seg['end']-seg['start'] + _speakers[spk_id]["nbr_seg"] = 1 + else: + _speakers[spk_id]["duration"] += seg['end']-seg['start'] + _speakers[spk_id]["nbr_seg"] += 1 + + _segments.append(segment) + seg_id += 1 + +for spkstat in _speakers.values(): + spkstat["duration"] = round(spkstat["duration"]) + +json["speakers"] = list(_speakers.values()) +json["segments"] = _segments + + +print(json["speakers"] ) + diff --git a/simple_diarizer/Spectral_clustering.py b/simple_diarizer/Spectral_clustering.py new file mode 100644 index 0000000..c0a1e37 --- /dev/null +++ b/simple_diarizer/Spectral_clustering.py @@ -0,0 +1,135 @@ +import numpy as np +import scipy +from sklearn.cluster import SpectralClustering + +# NME low-level operations +# These functions are taken from the Kaldi scripts. 
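+#
+# Overview of the NME-SC pipeline implemented by these helpers:
+#   1. binarize the affinity matrix, keeping the p highest-similarity neighbours per row;
+#   2. symmetrize the result and build its (unnormalized) graph Laplacian;
+#   3. take the smallest eigenvalues of the Laplacian and compute their eigengaps;
+#   4. the ratio r = p / g, where g is the largest eigengap normalized by the largest
+#      eigenvalue, is minimized over p to select both the binarization count and the
+#      estimated number of clusters (k + 1).
+#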
+ +# Prepares binarized(0/1) affinity matrix with p_neighbors non-zero elements in each row +def get_kneighbors_conn(X_dist, p_neighbors): + X_dist_out = np.zeros_like(X_dist) + for i, line in enumerate(X_dist): + sorted_idx = np.argsort(line) + sorted_idx = sorted_idx[::-1] + indices = sorted_idx[:p_neighbors] + X_dist_out[indices, i] = 1 + return X_dist_out + + +# Thresolds affinity matrix to leave p maximum non-zero elements in each row +def Threshold(A, p): + N = A.shape[0] + Ap = np.zeros((N, N)) + for i in range(N): + thr = sorted(A[i, :], reverse=True)[p] + Ap[i, A[i, :] > thr] = A[i, A[i, :] > thr] + return Ap + + +# Computes Laplacian of a matrix +def Laplacian(A): + d = np.sum(A, axis=1) - np.diag(A) + D = np.diag(d) + return D - A + + +# Calculates eigengaps (differences between adjacent eigenvalues sorted in descending order) +def Eigengap(S): + S = sorted(S) + return np.diff(S) + + +# Computes parameters of normalized eigenmaps for automatic thresholding selection +def ComputeNMEParameters(A, p, max_num_clusters): + # p-Neighbour binarization + Ap = get_kneighbors_conn(A, p) + # Symmetrization + Ap = (Ap + np.transpose(Ap)) / 2 + # Laplacian matrix computation + Lp = Laplacian(Ap) + # Get max_num_clusters+1 smallest eigenvalues + S = scipy.sparse.linalg.eigsh( + Lp, + k=max_num_clusters + 1, + which="SA", + tol=1e-6, + return_eigenvectors=False, + mode="buckling", + ) + # Get largest eigenvalue + Smax = scipy.sparse.linalg.eigsh( + Lp, k=1, which="LA", tol=1e-6, return_eigenvectors=False, mode="buckling" + ) + # Eigengap computation + e = Eigengap(S) + g = np.max(e[:max_num_clusters]) / (Smax + 1e-10) + r = p / g + k = np.argmax(e[:max_num_clusters]) + return (e, g, k, r) + + +""" +Performs spectral clustering with Normalized Maximum Eigengap (NME) +Parameters: + A: affinity matrix (matrix of pairwise cosine similarities or PLDA scores between speaker embeddings) + num_clusters: number of clusters to generate (if None, determined automatically) + max_num_clusters: maximum allowed number of clusters to generate + pmax: maximum count for matrix binarization (should be at least 2) + pbest: best count for matrix binarization (if 0, determined automatically) +Returns: cluster assignments for every speaker embedding +""" + + +def NME_SpectralClustering( + A, num_clusters=None, max_num_clusters=10, pbest=0, pmin=3, pmax=20 +): + print(num_clusters,max_num_clusters) + if pbest == 0: + print("Selecting best number of neighbors for affinity matrix thresolding:") + rbest = None + kbest = None + for p in range(pmin, pmax + 1): + e, g, k, r = ComputeNMEParameters(A, p, max_num_clusters) + print("p={}, g={}, k={}, r={}, e={}".format(p, g, k, r, e)) + if rbest is None or rbest > r: + rbest = r + pbest = p + kbest = k + print("Best number of neighbors is {}".format(pbest)) + num_clusters = num_clusters if num_clusters is not None else (kbest + 1) + # Handle some edge cases in AMI SDM + num_clusters = 4 if num_clusters == 1 else num_clusters + return NME_SpectralClustering_sklearn( + A, num_clusters, pbest + ) + if num_clusters is None: + print("Compute number of clusters to generate:") + e, g, k, r = ComputeNMEParameters(A, pbest, max_num_clusters) + print("Number of clusters to generate is {}".format(k + 1)) + return NME_SpectralClustering_sklearn(A, k + 1, pbest) + return NME_SpectralClustering_sklearn(A, num_clusters, pbest) + + +""" +Performs spectral clustering with Normalized Maximum Eigengap (NME) with fixed threshold and number of clusters +Parameters: + A: affinity matrix (matrix of 
pairwise cosine similarities or PLDA scores between speaker embeddings) + OLVec: 0/1 vector denoting which segments are overlap segments + num_clusters: number of clusters to generate + pbest: best count for matrix binarization +Returns: cluster assignments for every speaker embedding +""" + + +def NME_SpectralClustering_sklearn(A, num_clusters, pbest): + print("Number of speakers is {}".format(num_clusters)) + # Ap = Threshold(A, pbest) + Ap = get_kneighbors_conn(A, pbest) # thresholded and binarized + Ap = (Ap + np.transpose(Ap)) / 2 + + + model = SpectralClustering( + n_clusters=num_clusters, affinity="precomputed", random_state=0 + ) + labels = model.fit_predict(Ap) + return labels diff --git a/simple_diarizer/cluster.py b/simple_diarizer/cluster.py index 737e5a7..3680fc9 100644 --- a/simple_diarizer/cluster.py +++ b/simple_diarizer/cluster.py @@ -5,7 +5,7 @@ from scipy.ndimage import gaussian_filter from sklearn.cluster import AgglomerativeClustering, KMeans, SpectralClustering from sklearn.metrics import pairwise_distances - +from .Spectral_clustering import NME_SpectralClustering def similarity_matrix(embeds, metric="cosine"): return pairwise_distances(embeds, metric=metric) @@ -43,9 +43,7 @@ def cluster_AHC(embeds, n_clusters=None, threshold=None, metric="cosine", **kwar # A lot of these methods are lifted from # https://github.com/wq2012/SpectralCluster ########################################## - - -def cluster_SC(embeds, n_clusters=None, threshold=None, enhance_sim=True, **kwargs): +def cluster_SC(embeds, n_clusters=None, max_speakers= None, threshold=None, enhance_sim=True, **kwargs): """ Cluster embeds using Spectral Clustering """ @@ -59,7 +57,7 @@ def cluster_SC(embeds, n_clusters=None, threshold=None, enhance_sim=True, **kwar if n_clusters is None: (eigenvalues, eigenvectors) = compute_sorted_eigenvectors(S) # Get number of clusters. - k = compute_number_of_clusters(eigenvalues, 100, threshold) + k = compute_number_of_clusters(eigenvalues, max_speakers, threshold) # Get spectral embeddings. 
spectral_embeddings = eigenvectors[:, :k] @@ -82,6 +80,34 @@ def cluster_SC(embeds, n_clusters=None, threshold=None, enhance_sim=True, **kwar return cluster_model.fit_predict(S) +def cluster_NME_SC(embeds, n_clusters=None, max_speakers= None, threshold=None, enhance_sim=True, **kwargs): + """ + Cluster embeds using NME-Spectral Clustering + + if n_clusters is None: + assert threshold, "If num_clusters is not defined, threshold must be defined" + """ + + S = cos_similarity(embeds) + if n_clusters is None: + labels = NME_SpectralClustering( + S, + num_clusters=n_clusters, + max_num_clusters=max_speakers + + ) + else: + labels = NME_SpectralClustering( + S, + num_clusters=n_clusters, + + + ) + + + return labels + + def diagonal_fill(A): """ Sets the diagonal elemnts of the matrix to the max of each row @@ -134,7 +160,7 @@ def row_max_norm(A): def sim_enhancement(A): func_order = [ diagonal_fill, - gaussian_blur, + row_threshold_mult, symmetrization, diffusion, @@ -144,6 +170,31 @@ def sim_enhancement(A): A = f(A) return A +def cos_similarity(x): + """Compute cosine similarity matrix in CPU & memory sensitive way + + Args: + x (np.ndarray): embeddings, 2D array, embeddings are in rows + + Returns: + np.ndarray: cosine similarity matrix + + """ + assert x.ndim == 2, f"x has {x.ndim} dimensions, it must be matrix" + x = x / (np.sqrt(np.sum(np.square(x), axis=1, keepdims=True)) + 1.0e-32) + assert np.allclose(np.ones_like(x[:, 0]), np.sum(np.square(x), axis=1)) + max_n_elm = 200000000 + step = max(max_n_elm // (x.shape[0] * x.shape[0]), 1) + retval = np.zeros(shape=(x.shape[0], x.shape[0]), dtype=np.float64) + x0 = np.expand_dims(x, 0) + x1 = np.expand_dims(x, 1) + for i in range(0, x.shape[1], step): + product = x0[:, :, i : i + step] * x1[:, :, i : i + step] + retval += np.sum(product, axis=2, keepdims=False) + assert np.all(retval >= -1.0001), retval + assert np.all(retval <= 1.0001), retval + return retval + def compute_affinity_matrix(X): """Compute the affinity matrix from data. 
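
A minimal sketch of how the NME-SC entry point added above can be exercised in isolation, using synthetic unit-variance embeddings (the embedding dimensionality, group sizes and speaker bound below are arbitrary placeholders, not values taken from this patch):

    import numpy as np
    from simple_diarizer.cluster import cluster_NME_SC

    # Two synthetic "speakers": 10 noisy copies of each of two random centroids.
    rng = np.random.default_rng(0)
    centroids = rng.normal(size=(2, 192))
    embeds = np.concatenate([c + 0.05 * rng.normal(size=(10, 192)) for c in centroids])

    # n_clusters=None lets NME-SC estimate the speaker count, capped by max_speakers.
    labels = cluster_NME_SC(embeds, n_clusters=None, max_speakers=5)
    print(labels)  # expected to separate the two groups, e.g. [0 0 ... 1 1]
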
diff --git a/simple_diarizer/diarizer.py b/simple_diarizer/diarizer.py index c03659b..df89324 100644 --- a/simple_diarizer/diarizer.py +++ b/simple_diarizer/diarizer.py @@ -9,7 +9,7 @@ from speechbrain.pretrained import EncoderClassifier from tqdm.autonotebook import tqdm -from .cluster import cluster_AHC, cluster_SC +from .cluster import cluster_AHC, cluster_SC, cluster_NME_SC from .utils import check_wav_16khz_mono, convert_wavfile @@ -25,12 +25,16 @@ def __init__( assert cluster_method in [ "ahc", "sc", - ], "Only ahc and sc in the supported clustering options" + "NME-sc", + ], "Only ahc,sc and NME-sc in the supported clustering options" if cluster_method == "ahc": self.cluster = cluster_AHC if cluster_method == "sc": self.cluster = cluster_SC + if cluster_method == "NME-sc": + self.cluster = cluster_NME_SC + self.vad_model, self.get_speech_ts = self.setup_VAD() @@ -56,7 +60,7 @@ def __init__( def setup_VAD(self): model, utils = torch.hub.load( - repo_or_dir="snakers4/silero-vad", model="silero_vad" + repo_or_dir="snakers4/silero-vad", model="silero_vad", onnx=True ) # force_reload=True) @@ -182,7 +186,8 @@ def diarize( self, wav_file, num_speakers=2, - threshold=None, + max_speakers=None, + threshold=0.01, silence_tolerance=0.2, enhance_sim=True, extra_info=False, @@ -194,6 +199,7 @@ def diarize( Inputs: wav_file (path): Path to input audio file num_speakers (int) or NoneType: Number of speakers to cluster to + max_speakers (int) threshold (float) or NoneType: Threshold to cluster to if num_speakers is not defined silence_tolerance (float): Same speaker segments which are close enough together @@ -229,10 +235,10 @@ def diarize( 'cluster_labels': cluster_labels (list): cluster label for each embed in embeds } - Uses AHC/SC to cluster + Uses AHC/SC/NME-SC to cluster """ recname = os.path.splitext(os.path.basename(wav_file))[0] - + outfile=recname+'.rttm' if check_wav_16khz_mono(wav_file): signal, fs = torchaudio.load(wav_file) else: @@ -258,6 +264,7 @@ def diarize( cluster_labels = self.cluster( embeds, n_clusters=num_speakers, + max_speakers=max_speakers, threshold=threshold, enhance_sim=enhance_sim, ) @@ -269,8 +276,8 @@ def diarize( cleaned_segments, silence_tolerance=silence_tolerance ) print("Done!") - if outfile: - self.rttm_output(cleaned_segments, recname, outfile=outfile) + #if outfile: + self.rttm_output(cleaned_segments, recname, outfile=outfile) if not extra_info: return cleaned_segments @@ -283,7 +290,7 @@ def diarize( @staticmethod def rttm_output(segments, recname, outfile=None): assert outfile, "Please specify an outfile" - rttm_line = "SPEAKER {} 0 {} {} {} \n" + rttm_line = "SPEAKER {} 1 {} {} {} \n" with open(outfile, "w") as fp: for seg in segments: start = seg["start"] From d637251e2f879ec7253140da46388837afd84706 Mon Sep 17 00:00:00 2001 From: wghezaiel Date: Mon, 3 Apr 2023 13:43:05 +0200 Subject: [PATCH 02/12] fix version NME-SC --- main.py | 38 ++++++------ simple_diarizer/cluster.py | 14 ++--- simple_diarizer/diarizer.py | 62 +++++++++++-------- ...l_clustering.py => spectral_clustering.py} | 11 ++-- 4 files changed, 63 insertions(+), 62 deletions(-) rename simple_diarizer/{Spectral_clustering.py => spectral_clustering.py} (92%) diff --git a/main.py b/main.py index 5471bc0..c6011b2 100644 --- a/main.py +++ b/main.py @@ -1,35 +1,35 @@ -import soundfile as sf -import matplotlib.pyplot as plt import os,sys,time - - +import argparse from simple_diarizer.diarizer import Diarizer -from simple_diarizer.utils import combined_waveplot - -t0 = time.time() +import pprint 
+t0 = time.time() diar = Diarizer( embed_model='ecapa', # 'xvec' and 'ecapa' supported - cluster_method='NME-sc' # 'ahc' 'sc' and 'NME-sc' supported + cluster_method='nme-sc' # 'ahc' 'sc' and 'nme-sc' supported ) -WAV_FILE,NUM_SPEAKERS,max_spk= sys.argv[1:] +parser = argparse.ArgumentParser() +parser.add_argument(dest='audio_name', type=str) +parser.add_argument("--number_of_speakers", dest='number_of_speaker', default=None, type=int) +parser.add_argument("--max_speakers", dest='max_speakers', default=25, type=int) +parser.add_argument(dest='outputfile', nargs="?", default=None) +args = parser.parse_args() +WAV_FILE=args.audio_name +NUM_SPEAKERS=args.number_of_speaker if args.number_of_speaker != "None" else None +max_spk= args.max_speakers +output_file=args.outputfile + +segments = diar.diarize(WAV_FILE, num_speakers=None,max_speakers=max_spk,outfile=output_file) -if NUM_SPEAKERS == 'None': - print('None') - segments = diar.diarize(WAV_FILE, num_speakers=None,max_speakers=int(max_spk)) -else: - segments = diar.diarize(WAV_FILE, num_speakers=int(NUM_SPEAKERS)) t1 = time.time() feature_t = t1 - t0 print("Time used for extracting features:", feature_t) - - json = {} _segments = [] _speakers = {} @@ -41,8 +41,7 @@ segment = {} segment["seg_id"] = seg_id - - # Ensure speaker id continuity and numbers speaker by order of appearance. + if seg['label'] not in spk_i_dict.keys(): spk_i_dict[seg['label']] = spk_i spk_i += 1 @@ -70,6 +69,5 @@ json["speakers"] = list(_speakers.values()) json["segments"] = _segments - -print(json["speakers"] ) +pprint.pprint(json["speakers"] ) diff --git a/simple_diarizer/cluster.py b/simple_diarizer/cluster.py index 3680fc9..05f0f9e 100644 --- a/simple_diarizer/cluster.py +++ b/simple_diarizer/cluster.py @@ -5,7 +5,7 @@ from scipy.ndimage import gaussian_filter from sklearn.cluster import AgglomerativeClustering, KMeans, SpectralClustering from sklearn.metrics import pairwise_distances -from .Spectral_clustering import NME_SpectralClustering +from .spectral_clustering import NME_SpectralClustering def similarity_matrix(embeds, metric="cosine"): return pairwise_distances(embeds, metric=metric) @@ -93,18 +93,14 @@ def cluster_NME_SC(embeds, n_clusters=None, max_speakers= None, threshold=None, labels = NME_SpectralClustering( S, num_clusters=n_clusters, - max_num_clusters=max_speakers - + max_num_clusters=max_speakers ) else: labels = NME_SpectralClustering( S, - num_clusters=n_clusters, - - - ) - - + num_clusters=n_clusters + ) + return labels diff --git a/simple_diarizer/diarizer.py b/simple_diarizer/diarizer.py index df89324..e9b221a 100644 --- a/simple_diarizer/diarizer.py +++ b/simple_diarizer/diarizer.py @@ -25,14 +25,14 @@ def __init__( assert cluster_method in [ "ahc", "sc", - "NME-sc", - ], "Only ahc,sc and NME-sc in the supported clustering options" + "nme-sc", + ], "Only ahc,sc and nme-sc in the supported clustering options" if cluster_method == "ahc": self.cluster = cluster_AHC if cluster_method == "sc": self.cluster = cluster_SC - if cluster_method == "NME-sc": + if cluster_method == "nme-sc": self.cluster = cluster_NME_SC @@ -238,7 +238,7 @@ def diarize( Uses AHC/SC/NME-SC to cluster """ recname = os.path.splitext(os.path.basename(wav_file))[0] - outfile=recname+'.rttm' + if check_wav_16khz_mono(wav_file): signal, fs = torchaudio.load(wav_file) else: @@ -255,29 +255,37 @@ def diarize( print("Running VAD...") speech_ts = self.vad(signal[0]) print("Splitting by silence found {} utterances".format(len(speech_ts))) - assert len(speech_ts) >= 1, "Couldn't find 
any speech during VAD" - - print("Extracting embeddings...") - embeds, segments = self.recording_embeds(signal, fs, speech_ts) - - print("Clustering to {} speakers...".format(num_speakers)) - cluster_labels = self.cluster( - embeds, - n_clusters=num_speakers, - max_speakers=max_speakers, - threshold=threshold, - enhance_sim=enhance_sim, - ) - - print("Cleaning up output...") - cleaned_segments = self.join_segments(cluster_labels, segments) - cleaned_segments = self.make_output_seconds(cleaned_segments, fs) - cleaned_segments = self.join_samespeaker_segments( - cleaned_segments, silence_tolerance=silence_tolerance - ) + #assert len(speech_ts) >= 1, "Couldn't find any speech during VAD" + + if len(speech_ts) >= 1: + print("Extracting embeddings...") + embeds, segments = self.recording_embeds(signal, fs, speech_ts) + + [w,k]=embeds.shape + if w >= 2: + print('Clustering to {} speakers...'.format(num_speakers)) + cluster_labels = self.cluster(embeds, n_clusters=num_speakers,max_speakers=max_speakers, + threshold=threshold, enhance_sim=enhance_sim) + + + + cleaned_segments = self.join_segments(cluster_labels, segments) + cleaned_segments = self.make_output_seconds(cleaned_segments, fs) + cleaned_segments = self.join_samespeaker_segments(cleaned_segments, + silence_tolerance=silence_tolerance) + + + else: + cluster_labels =[ 1] + cleaned_segments = self.join_segments(cluster_labels, segments) + cleaned_segments = self.make_output_seconds(cleaned_segments, fs) + + else: + cleaned_segments = [] + print("Done!") - #if outfile: - self.rttm_output(cleaned_segments, recname, outfile=outfile) + if outfile: + self.rttm_output(cleaned_segments, recname, outfile=outfile) if not extra_info: return cleaned_segments @@ -290,7 +298,7 @@ def diarize( @staticmethod def rttm_output(segments, recname, outfile=None): assert outfile, "Please specify an outfile" - rttm_line = "SPEAKER {} 1 {} {} {} \n" + rttm_line = "SPEAKER {} 0 {} {} {} \n" with open(outfile, "w") as fp: for seg in segments: start = seg["start"] diff --git a/simple_diarizer/Spectral_clustering.py b/simple_diarizer/spectral_clustering.py similarity index 92% rename from simple_diarizer/Spectral_clustering.py rename to simple_diarizer/spectral_clustering.py index c0a1e37..359d622 100644 --- a/simple_diarizer/Spectral_clustering.py +++ b/simple_diarizer/spectral_clustering.py @@ -83,19 +83,18 @@ def ComputeNMEParameters(A, p, max_num_clusters): def NME_SpectralClustering( A, num_clusters=None, max_num_clusters=10, pbest=0, pmin=3, pmax=20 ): - print(num_clusters,max_num_clusters) + if pbest == 0: print("Selecting best number of neighbors for affinity matrix thresolding:") rbest = None kbest = None for p in range(pmin, pmax + 1): e, g, k, r = ComputeNMEParameters(A, p, max_num_clusters) - print("p={}, g={}, k={}, r={}, e={}".format(p, g, k, r, e)) if rbest is None or rbest > r: rbest = r pbest = p kbest = k - print("Best number of neighbors is {}".format(pbest)) + num_clusters = num_clusters if num_clusters is not None else (kbest + 1) # Handle some edge cases in AMI SDM num_clusters = 4 if num_clusters == 1 else num_clusters @@ -103,9 +102,9 @@ def NME_SpectralClustering( A, num_clusters, pbest ) if num_clusters is None: - print("Compute number of clusters to generate:") + e, g, k, r = ComputeNMEParameters(A, pbest, max_num_clusters) - print("Number of clusters to generate is {}".format(k + 1)) + return NME_SpectralClustering_sklearn(A, k + 1, pbest) return NME_SpectralClustering_sklearn(A, num_clusters, pbest) @@ -122,7 +121,7 @@ def 
NME_SpectralClustering( def NME_SpectralClustering_sklearn(A, num_clusters, pbest): - print("Number of speakers is {}".format(num_clusters)) + # Ap = Threshold(A, pbest) Ap = get_kneighbors_conn(A, pbest) # thresholded and binarized Ap = (Ap + np.transpose(Ap)) / 2 From 26992703c3877eaa8d2bc1ee483e16c8528e787b Mon Sep 17 00:00:00 2001 From: wghezaiel Date: Mon, 3 Apr 2023 15:24:48 +0200 Subject: [PATCH 03/12] correct main.py --- main.py | 4 ++-- simple_diarizer/cluster.py | 16 +++++----------- 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/main.py b/main.py index c6011b2..9be0ddf 100644 --- a/main.py +++ b/main.py @@ -18,11 +18,11 @@ args = parser.parse_args() WAV_FILE=args.audio_name -NUM_SPEAKERS=args.number_of_speaker if args.number_of_speaker != "None" else None +num_speakers=args.number_of_speaker if args.number_of_speaker != "None" else None max_spk= args.max_speakers output_file=args.outputfile -segments = diar.diarize(WAV_FILE, num_speakers=None,max_speakers=max_spk,outfile=output_file) +segments = diar.diarize(WAV_FILE, num_speakers=num_speakers,max_speakers=max_spk,outfile=output_file) diff --git a/simple_diarizer/cluster.py b/simple_diarizer/cluster.py index 05f0f9e..78cbb25 100644 --- a/simple_diarizer/cluster.py +++ b/simple_diarizer/cluster.py @@ -89,17 +89,11 @@ def cluster_NME_SC(embeds, n_clusters=None, max_speakers= None, threshold=None, """ S = cos_similarity(embeds) - if n_clusters is None: - labels = NME_SpectralClustering( - S, - num_clusters=n_clusters, - max_num_clusters=max_speakers - ) - else: - labels = NME_SpectralClustering( - S, - num_clusters=n_clusters - ) + labels = NME_SpectralClustering( + S, + num_clusters=n_clusters, + max_num_clusters=max_speakers + ) return labels From cafb95f7c0d400fe1b482c12cdb2259f574eabd1 Mon Sep 17 00:00:00 2001 From: Jeronymous Date: Thu, 27 Apr 2023 08:22:45 +0200 Subject: [PATCH 04/12] channel index as an option --- simple_diarizer/diarizer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/simple_diarizer/diarizer.py b/simple_diarizer/diarizer.py index e9b221a..b912ae3 100644 --- a/simple_diarizer/diarizer.py +++ b/simple_diarizer/diarizer.py @@ -285,7 +285,7 @@ def diarize( print("Done!") if outfile: - self.rttm_output(cleaned_segments, recname, outfile=outfile) + self.rttm_output(cleaned_segments, recname, outfile=outfile) if not extra_info: return cleaned_segments @@ -296,9 +296,9 @@ def diarize( "cluster_labels": cluster_labels} @staticmethod - def rttm_output(segments, recname, outfile=None): + def rttm_output(segments, recname, outfile=None, channel=0): assert outfile, "Please specify an outfile" - rttm_line = "SPEAKER {} 0 {} {} {} \n" + rttm_line = "SPEAKER {} "+str(channel)+" {} {} {} \n" with open(outfile, "w") as fp: for seg in segments: start = seg["start"] From 73926b8467b8f12ad8cab564115b0932e44ed2e4 Mon Sep 17 00:00:00 2001 From: Jeronymous Date: Thu, 27 Apr 2023 08:23:01 +0200 Subject: [PATCH 05/12] improve main --- main.py | 118 +++++++++++++++++++++++++++++--------------------------- 1 file changed, 61 insertions(+), 57 deletions(-) diff --git a/main.py b/main.py index 9be0ddf..18befae 100644 --- a/main.py +++ b/main.py @@ -3,71 +3,75 @@ from simple_diarizer.diarizer import Diarizer import pprint +parser = argparse.ArgumentParser( + description="Speaker diarization", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + +) +parser.add_argument(dest='audio_name', type=str, help="Input audio file") +parser.add_argument(dest='outputfile', nargs="?", default=None, 
help="Optional output file") +parser.add_argument("--number_of_speakers", dest='number_of_speaker', default=None, type=int, help="Number of speakers (if known)") +parser.add_argument("--max_speakers", dest='max_speakers', default=25, type=int, help="Maximum number of speakers (if number of speaker is unknown)") +parser.add_argument("--embed_model", dest='embed_model', default="ecapa", type=str, help="Name of embedding") +parser.add_argument("--cluster_method", dest='cluster_method', default="nme-sc", type=str, help="Clustering method") +args = parser.parse_args() -t0 = time.time() diar = Diarizer( - embed_model='ecapa', # 'xvec' and 'ecapa' supported - cluster_method='nme-sc' # 'ahc' 'sc' and 'nme-sc' supported - ) - -parser = argparse.ArgumentParser() -parser.add_argument(dest='audio_name', type=str) -parser.add_argument("--number_of_speakers", dest='number_of_speaker', default=None, type=int) -parser.add_argument("--max_speakers", dest='max_speakers', default=25, type=int) -parser.add_argument(dest='outputfile', nargs="?", default=None) -args = parser.parse_args() + embed_model=args.embed_model, # 'xvec' and 'ecapa' supported + cluster_method=args.cluster_method # 'ahc' 'sc' and 'nme-sc' supported +) WAV_FILE=args.audio_name num_speakers=args.number_of_speaker if args.number_of_speaker != "None" else None max_spk= args.max_speakers output_file=args.outputfile -segments = diar.diarize(WAV_FILE, num_speakers=num_speakers,max_speakers=max_spk,outfile=output_file) - +t0 = time.time() +segments = diar.diarize(WAV_FILE, num_speakers=num_speakers,max_speakers=max_spk,outfile=output_file) -t1 = time.time() -feature_t = t1 - t0 -print("Time used for extracting features:", feature_t) - -json = {} -_segments = [] -_speakers = {} -seg_id = 1 -spk_i = 1 -spk_i_dict = {} - -for seg in segments: - - segment = {} - segment["seg_id"] = seg_id - - if seg['label'] not in spk_i_dict.keys(): - spk_i_dict[seg['label']] = spk_i - spk_i += 1 - - spk_id = "spk" + str(spk_i_dict[seg['label']]) - segment["spk_id"] = spk_id - segment["seg_begin"] = round(seg['start']) - segment["seg_end"] = round(seg['end']) - - if spk_id not in _speakers: - _speakers[spk_id] = {} - _speakers[spk_id]["spk_id"] = spk_id - _speakers[spk_id]["duration"] = seg['end']-seg['start'] - _speakers[spk_id]["nbr_seg"] = 1 - else: - _speakers[spk_id]["duration"] += seg['end']-seg['start'] - _speakers[spk_id]["nbr_seg"] += 1 - - _segments.append(segment) - seg_id += 1 - -for spkstat in _speakers.values(): - spkstat["duration"] = round(spkstat["duration"]) - -json["speakers"] = list(_speakers.values()) -json["segments"] = _segments - -pprint.pprint(json["speakers"] ) +print("Time used for processing:", time.time() - t0) + +if not output_file: + + json = {} + _segments = [] + _speakers = {} + seg_id = 1 + spk_i = 1 + spk_i_dict = {} + + for seg in segments: + + segment = {} + segment["seg_id"] = seg_id + + if seg['label'] not in spk_i_dict.keys(): + spk_i_dict[seg['label']] = spk_i + spk_i += 1 + + spk_id = "spk" + str(spk_i_dict[seg['label']]) + segment["spk_id"] = spk_id + segment["seg_begin"] = round(seg['start']) + segment["seg_end"] = round(seg['end']) + + if spk_id not in _speakers: + _speakers[spk_id] = {} + _speakers[spk_id]["spk_id"] = spk_id + _speakers[spk_id]["duration"] = seg['end']-seg['start'] + _speakers[spk_id]["nbr_seg"] = 1 + else: + _speakers[spk_id]["duration"] += seg['end']-seg['start'] + _speakers[spk_id]["nbr_seg"] += 1 + + _segments.append(segment) + seg_id += 1 + + for spkstat in _speakers.values(): + spkstat["duration"] 
= round(spkstat["duration"]) + + json["speakers"] = list(_speakers.values()) + json["segments"] = _segments + + pprint.pprint(json) From 5e9670cefbc632fe29765eaf034161ed005adf11 Mon Sep 17 00:00:00 2001 From: Jeronymous Date: Thu, 27 Apr 2023 08:40:32 +0200 Subject: [PATCH 06/12] merge manually modifications from branch max_speaker --- requirements.txt | 6 ++++-- simple_diarizer/__init__.py | 2 +- simple_diarizer/diarizer.py | 2 +- simple_diarizer/utils.py | 7 ++++--- 4 files changed, 10 insertions(+), 7 deletions(-) diff --git a/requirements.txt b/requirements.txt index ad2b2dc..02f3975 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,8 @@ -ipython>=7.9.0 -matplotlib>=3.5.1 +# ipython>=7.9.0 +# matplotlib>=3.5.1 pandas>=1.3.5 scikit-learn>=1.0.2 speechbrain>=0.5.11 torchaudio>=0.10.1 +onnxruntime>=1.14.0 +scipy<=1.8.1 # newer version can provoke segmentation faults \ No newline at end of file diff --git a/simple_diarizer/__init__.py b/simple_diarizer/__init__.py index dc085ab..525e1ce 100644 --- a/simple_diarizer/__init__.py +++ b/simple_diarizer/__init__.py @@ -1,3 +1,3 @@ import os -__version__ = os.getenv("GITHUB_REF_NAME", "latest") +__version__ = os.getenv("GITHUB_REF_NAME", "1.0.2") diff --git a/simple_diarizer/diarizer.py b/simple_diarizer/diarizer.py index b912ae3..8138b21 100644 --- a/simple_diarizer/diarizer.py +++ b/simple_diarizer/diarizer.py @@ -187,7 +187,7 @@ def diarize( wav_file, num_speakers=2, max_speakers=None, - threshold=0.01, + threshold=None, silence_tolerance=0.2, enhance_sim=True, extra_info=False, diff --git a/simple_diarizer/utils.py b/simple_diarizer/utils.py index de70954..cf92ad4 100644 --- a/simple_diarizer/utils.py +++ b/simple_diarizer/utils.py @@ -2,11 +2,8 @@ import subprocess from pprint import pprint -import matplotlib.pyplot as plt import numpy as np import torchaudio -from IPython.display import Audio, display - ################## # Audio utils @@ -83,6 +80,7 @@ def waveplot(signal, fs, start_idx=0, figsize=(5, 3), color="tab:blue"): Outputs: - Returns the matplotlib figure """ + import matplotlib.pyplot as plt plt.figure(figsize=figsize) start_time = start_idx / fs end_time = start_time + (len(signal) / fs) @@ -113,6 +111,7 @@ def combined_waveplot(signal, fs, segments, figsize=(10, 3), tick_interval=60): Outputs: - The matplotlib figure """ + import matplotlib.pyplot as plt plt.figure(figsize=figsize) for seg in segments: start = seg["start_sample"] @@ -153,6 +152,8 @@ def waveplot_perspeaker(signal, fs, segments): Designed to be run in a jupyter notebook """ + import matplotlib.pyplot as plt + from IPython.display import Audio, display for seg in segments: start = seg["start_sample"] end = seg["end_sample"] From 10db72d509956a1871f7ef1f40710213d8855391 Mon Sep 17 00:00:00 2001 From: wghezaiel Date: Tue, 9 May 2023 16:35:38 +0200 Subject: [PATCH 07/12] correct cluster by NME --- simple_diarizer/cluster.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/simple_diarizer/cluster.py b/simple_diarizer/cluster.py index 78cbb25..dfec4bf 100644 --- a/simple_diarizer/cluster.py +++ b/simple_diarizer/cluster.py @@ -89,11 +89,21 @@ def cluster_NME_SC(embeds, n_clusters=None, max_speakers= None, threshold=None, """ S = cos_similarity(embeds) - labels = NME_SpectralClustering( - S, - num_clusters=n_clusters, - max_num_clusters=max_speakers - ) + + if n_clusters is None: + labels = NME_SpectralClustering( + S, + num_clusters=n_clusters, + max_num_clusters=max_speakers + + ) + else: + labels = 
NME_SpectralClustering( + S, + num_clusters=n_clusters, + + + ) return labels From eeac8f3c558f2653d6ef51d08e181a6c54a0ab81 Mon Sep 17 00:00:00 2001 From: wghezaiel Date: Tue, 9 May 2023 17:21:56 +0200 Subject: [PATCH 08/12] modify NME --- simple_diarizer/cluster.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/simple_diarizer/cluster.py b/simple_diarizer/cluster.py index dfec4bf..3232470 100644 --- a/simple_diarizer/cluster.py +++ b/simple_diarizer/cluster.py @@ -5,7 +5,7 @@ from scipy.ndimage import gaussian_filter from sklearn.cluster import AgglomerativeClustering, KMeans, SpectralClustering from sklearn.metrics import pairwise_distances -from .spectral_clustering import NME_SpectralClustering +from .spectral_clustering import NME_SpectralClustering,NME_SpectralClustering_sklearn def similarity_matrix(embeds, metric="cosine"): return pairwise_distances(embeds, metric=metric) @@ -98,7 +98,7 @@ def cluster_NME_SC(embeds, n_clusters=None, max_speakers= None, threshold=None, ) else: - labels = NME_SpectralClustering( + labels = NME_SpectralClustering_sklearn( S, num_clusters=n_clusters, From 04bbffb5927abf35a586561f3c872f06b4069a84 Mon Sep 17 00:00:00 2001 From: Jeronymous Date: Tue, 9 May 2023 18:26:36 +0200 Subject: [PATCH 09/12] fix max_num_clusters at the root --- simple_diarizer/cluster.py | 21 ++++++--------------- simple_diarizer/spectral_clustering.py | 6 ++++-- 2 files changed, 10 insertions(+), 17 deletions(-) diff --git a/simple_diarizer/cluster.py b/simple_diarizer/cluster.py index 3232470..7f67426 100644 --- a/simple_diarizer/cluster.py +++ b/simple_diarizer/cluster.py @@ -5,7 +5,7 @@ from scipy.ndimage import gaussian_filter from sklearn.cluster import AgglomerativeClustering, KMeans, SpectralClustering from sklearn.metrics import pairwise_distances -from .spectral_clustering import NME_SpectralClustering,NME_SpectralClustering_sklearn +from .spectral_clustering import NME_SpectralClustering def similarity_matrix(embeds, metric="cosine"): return pairwise_distances(embeds, metric=metric) @@ -90,20 +90,11 @@ def cluster_NME_SC(embeds, n_clusters=None, max_speakers= None, threshold=None, S = cos_similarity(embeds) - if n_clusters is None: - labels = NME_SpectralClustering( - S, - num_clusters=n_clusters, - max_num_clusters=max_speakers - - ) - else: - labels = NME_SpectralClustering_sklearn( - S, - num_clusters=n_clusters, - - - ) + labels = NME_SpectralClustering( + S, + num_clusters=n_clusters, + max_num_clusters=max_speakers + ) return labels diff --git a/simple_diarizer/spectral_clustering.py b/simple_diarizer/spectral_clustering.py index 359d622..d03dca3 100644 --- a/simple_diarizer/spectral_clustering.py +++ b/simple_diarizer/spectral_clustering.py @@ -81,9 +81,11 @@ def ComputeNMEParameters(A, p, max_num_clusters): def NME_SpectralClustering( - A, num_clusters=None, max_num_clusters=10, pbest=0, pmin=3, pmax=20 + A, num_clusters=None, max_num_clusters=None, pbest=0, pmin=3, pmax=20 ): - + if max_num_clusters is None and num_clusters is not None: + max_num_clusters = num_clusters + if pbest == 0: print("Selecting best number of neighbors for affinity matrix thresolding:") rbest = None From c6432874766ff2c89eb0ea0ff595dbc3d5e12cad Mon Sep 17 00:00:00 2001 From: Jeronymous Date: Wed, 10 May 2023 10:08:45 +0200 Subject: [PATCH 10/12] remove hack --- simple_diarizer/spectral_clustering.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/simple_diarizer/spectral_clustering.py b/simple_diarizer/spectral_clustering.py index 
d03dca3..d9fbd23 100644 --- a/simple_diarizer/spectral_clustering.py +++ b/simple_diarizer/spectral_clustering.py @@ -98,16 +98,14 @@ def NME_SpectralClustering( kbest = k num_clusters = num_clusters if num_clusters is not None else (kbest + 1) - # Handle some edge cases in AMI SDM - num_clusters = 4 if num_clusters == 1 else num_clusters return NME_SpectralClustering_sklearn( A, num_clusters, pbest ) + if num_clusters is None: - e, g, k, r = ComputeNMEParameters(A, pbest, max_num_clusters) - return NME_SpectralClustering_sklearn(A, k + 1, pbest) + return NME_SpectralClustering_sklearn(A, num_clusters, pbest) From 02fbb851bb820b9e179ce3c84a12ec8b65d304e2 Mon Sep 17 00:00:00 2001 From: Jeronymous Date: Wed, 10 May 2023 13:16:28 +0200 Subject: [PATCH 11/12] safety check --- simple_diarizer/spectral_clustering.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/simple_diarizer/spectral_clustering.py b/simple_diarizer/spectral_clustering.py index d9fbd23..8b3ac01 100644 --- a/simple_diarizer/spectral_clustering.py +++ b/simple_diarizer/spectral_clustering.py @@ -83,7 +83,8 @@ def ComputeNMEParameters(A, p, max_num_clusters): def NME_SpectralClustering( A, num_clusters=None, max_num_clusters=None, pbest=0, pmin=3, pmax=20 ): - if max_num_clusters is None and num_clusters is not None: + if max_num_clusters is None: + assert num_clusters is not None, "Cannot have both num_clusters and max_num_clusters be None" max_num_clusters = num_clusters if pbest == 0: From f190afe580662b5104e3683c7ed45cba4351e5e0 Mon Sep 17 00:00:00 2001 From: wghezaiel Date: Thu, 14 Mar 2024 13:24:20 +0100 Subject: [PATCH 12/12] speechbrain version 1.0.0 --- simple_diarizer/diarizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/simple_diarizer/diarizer.py b/simple_diarizer/diarizer.py index 8138b21..400e991 100644 --- a/simple_diarizer/diarizer.py +++ b/simple_diarizer/diarizer.py @@ -6,7 +6,7 @@ import pandas as pd import torch import torchaudio -from speechbrain.pretrained import EncoderClassifier +from speechbrain.inference.speaker import EncoderClassifier from tqdm.autonotebook import tqdm from .cluster import cluster_AHC, cluster_SC, cluster_NME_SC
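
Taken together, the end state of this series can be driven roughly as in the sketch below (the WAV path, speaker bound and output name are placeholders; the RTTM file is written only because outfile is passed):

    from simple_diarizer.diarizer import Diarizer

    # 'nme-sc' clustering with an automatically estimated speaker count,
    # bounded by max_speakers, as wired up by this patch series.
    diar = Diarizer(embed_model="ecapa", cluster_method="nme-sc")
    segments = diar.diarize(
        "meeting.wav",        # placeholder input file
        num_speakers=None,    # let NME-SC estimate the number of speakers
        max_speakers=10,
        outfile="meeting.rttm",
    )
    for seg in segments:
        print(seg["label"], round(seg["start"], 2), round(seg["end"], 2))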