# Source code for spectral_similarity

import numpy as np
from typing import Union

try:
    from . import math_distance, ms_distance
    from .tools import clean_spectrum, match_peaks_in_spectra, normalize_distance
except ImportError:
    # The relative imports only resolve when this module is loaded as part of
    # its package. The failure is tolerated so the module can still be
    # imported on its own; in that case the helper names above stay undefined
    # and the functions below that use them raise NameError when called.
    # Only ImportError is tolerated, so unrelated failures (errors raised
    # inside the helper modules, KeyboardInterrupt, ...) are no longer hidden.
    pass

# Maps each method key to its human-readable display name.
# all_distance() iterates over these keys in insertion order and looks up
# "<key>_distance" in math_distance, then ms_distance; a key with no matching
# function makes all_distance() raise RuntimeError.
methods_name = {
    "entropy": "Entropy distance",
    "unweighted_entropy": "Unweighted entropy distance",
    "euclidean": "Euclidean distance",
    "manhattan": "Manhattan distance",
    "chebyshev": "Chebyshev distance",
    "squared_euclidean": "Squared Euclidean distance",
    "fidelity": "Fidelity distance",
    "matusita": "Matusita distance",
    "squared_chord": "Squared-chord distance",
    "bhattacharya_1": "Bhattacharya 1 distance",
    "bhattacharya_2": "Bhattacharya 2 distance",
    "harmonic_mean": "Harmonic mean distance",
    "probabilistic_symmetric_chi_squared": "Probabilistic symmetric χ2 distance",
    "ruzicka": "Ruzicka distance",
    "roberts": "Roberts distance",
    "intersection": "Intersection distance",
    "motyka": "Motyka distance",
    "canberra": "Canberra distance",
    "baroni_urbani_buser": "Baroni-Urbani-Buser distance",
    "penrose_size": "Penrose size distance",
    "mean_character": "Mean character distance",
    "lorentzian": "Lorentzian distance",
    "penrose_shape": "Penrose shape distance",
    "clark": "Clark distance",
    "hellinger": "Hellinger distance",
    "whittaker_index_of_association": "Whittaker index of association distance",
    "symmetric_chi_squared": "Symmetric χ2 distance",
    "pearson_correlation": "Pearson/Spearman Correlation Coefficient",
    "improved_similarity": "Improved Similarity",
    "absolute_value": "Absolute Value Distance",
    "dot_product": "Dot product distance",
    "cosine": "Cosine distance",
    "dot_product_reverse": "Reverse dot product distance",
    "spectral_contrast_angle": "Spectral Contrast Angle",
    "wave_hedges": "Wave Hedges distance",
    "jaccard": "Jaccard distance",
    "dice": "Dice distance",
    "inner_product": "Inner product distance",
    "divergence": "Divergence distance",
    "avg_l": "Avg (L1, L∞) distance",
    "vicis_symmetric_chi_squared_3": "Vicis-Symmetric χ2 3 distance",
    "ms_for_id_v1": "MSforID distance version 1",
    "ms_for_id": "MSforID distance",
    "weighted_dot_product": "Weighted dot product distance",
}

# Per-method range handed to normalize_distance() when need_normalize_result
# is true. Methods without an entry here fall back to [0, 1] in distance() and
# all_distance().
# Presumably each value is [min, max] of the raw distance -- confirm against
# normalize_distance in tools.
# NOTE(review): "similarity_index" has no matching key in methods_name, so
# all_distance() never reads it -- confirm whether it is still needed.
methods_range = {
    "entropy": [0, np.log(4)],
    "unweighted_entropy": [0, np.log(4)],
    "absolute_value": [0, 2],
    "avg_l": [0, 1.5],
    "bhattacharya_1": [0, np.arccos(0) ** 2],
    "bhattacharya_2": [0, np.inf],
    "canberra": [0, np.inf],
    "clark": [0, np.inf],
    "divergence": [0, np.inf],
    "euclidean": [0, np.sqrt(2)],
    "hellinger": [0, np.inf],
    "improved_similarity": [0, np.inf],
    "lorentzian": [0, np.inf],
    "manhattan": [0, 2],
    "matusita": [0, np.sqrt(2)],
    "mean_character": [0, 2],
    "motyka": [-0.5, 0],
    "ms_for_id": [-np.inf, 0],
    "ms_for_id_v1": [0, np.inf],
    "pearson_correlation": [-1, 1],
    "penrose_shape": [0, np.sqrt(2)],
    "penrose_size": [0, np.inf],
    "probabilistic_symmetric_chi_squared": [0, 1],
    "similarity_index": [0, np.inf],
    "squared_chord": [0, 2],
    "squared_euclidean": [0, 2],
    "symmetric_chi_squared": [0, 0.5 * np.sqrt(2)],
    "vicis_symmetric_chi_squared_3": [0, 2],
    "wave_hedges": [0, np.inf],
    "whittaker_index_of_association": [0, np.inf]
}


def similarity(spectrum_query: Union[list, np.ndarray], spectrum_library: Union[list, np.ndarray],
               method: str, ms2_ppm: float = None, ms2_da: float = None,
               need_clean_spectra: bool = True, need_normalize_result: bool = True) -> float:
    """
    Calculate the similarity between two spectra, find common peaks.
    If both ms2_ppm and ms2_da is defined, ms2_da will be used.

    The value is derived from :func:`distance`, called with the same arguments:
    ``1 - distance`` when ``need_normalize_result`` is true, ``0 - distance``
    otherwise.

    :param spectrum_query: The query spectrum (list or numpy array), passed on to :func:`distance`.
    :param spectrum_library: The library spectrum (list or numpy array), passed on to :func:`distance`.
    :param method: Supported methods:
            "entropy", "unweighted_entropy", "euclidean", "manhattan", "chebyshev",
            "squared_euclidean", "fidelity", "matusita", "squared_chord", "bhattacharya_1",
            "bhattacharya_2", "harmonic_mean", "probabilistic_symmetric_chi_squared",
            "ruzicka", "roberts", "intersection", "motyka", "canberra", "baroni_urbani_buser",
            "penrose_size", "mean_character", "lorentzian", "penrose_shape", "clark",
            "hellinger", "whittaker_index_of_association", "symmetric_chi_squared",
            "pearson_correlation", "improved_similarity", "absolute_value", "dot_product",
            "dot_product_reverse", "spectral_contrast_angle", "wave_hedges", "jaccard", "dice",
            "inner_product", "divergence", "avg_l", "vicis_symmetric_chi_squared_3",
            "ms_for_id_v1", "ms_for_id", "weighted_dot_product"
    :param ms2_ppm: The MS/MS tolerance in ppm.
    :param ms2_da: The MS/MS tolerance in Da.
    :param need_clean_spectra: Normalize spectra before comparing, required for not normalized spectrum.
    :param need_normalize_result: Normalize the result into [0,1].
    :return: Similarity between two spectra
    """
    dist = distance(spectrum_query=spectrum_query, spectrum_library=spectrum_library, method=method,
                    need_clean_spectra=need_clean_spectra, need_normalize_result=need_normalize_result,
                    ms2_ppm=ms2_ppm, ms2_da=ms2_da)
    # A normalized distance is flipped around 1; a raw distance is only negated.
    if need_normalize_result:
        return 1 - dist
    return 0 - dist
def all_similarity(spectrum_query: Union[list, np.ndarray], spectrum_library: Union[list, np.ndarray],
                   ms2_ppm: float = None, ms2_da: float = None,
                   need_clean_spectra: bool = True, need_normalize_result: bool = True) -> dict:
    """
    Calculate all the similarity between two spectra, find common peaks.
    If both ms2_ppm and ms2_da is defined, ms2_da will be used.

    Every entry of the dict returned by :func:`all_distance` is replaced in
    place by ``1 - distance`` when ``need_normalize_result`` is true, or by
    ``0 - distance`` otherwise.

    :param spectrum_query: The query spectrum (list or numpy array), passed on to :func:`all_distance`.
    :param spectrum_library: The library spectrum (list or numpy array), passed on to :func:`all_distance`.
    :param ms2_ppm: The MS/MS tolerance in ppm.
    :param ms2_da: The MS/MS tolerance in Da.
    :param need_clean_spectra: Normalize spectra before comparing, required for not normalized spectrum.
    :param need_normalize_result: Normalize the result into [0,1].
    :return: A dict contains all similarity.
    """
    scores = all_distance(spectrum_query=spectrum_query, spectrum_library=spectrum_library,
                          need_clean_spectra=need_clean_spectra,
                          need_normalize_result=need_normalize_result,
                          ms2_ppm=ms2_ppm, ms2_da=ms2_da)
    # The flag does not change inside the loop, so pick the minuend once.
    minuend = 1 if need_normalize_result else 0
    # Only values of existing keys are reassigned, which is safe while iterating.
    for name, dist in scores.items():
        scores[name] = minuend - dist
    return scores
def multiple_similarity(spectrum_query: Union[list, np.ndarray], spectrum_library: Union[list, np.ndarray],
                        methods: list = None, ms2_ppm: float = None, ms2_da: float = None,
                        need_clean_spectra: bool = True, need_normalize_result: bool = True) -> dict:
    """
    Calculate multiple similarity between two spectra, find common peaks.
    If both ms2_ppm and ms2_da is defined, ms2_da will be used.

    :param spectrum_query: The query spectrum (list or numpy array).
    :param spectrum_library: The library spectrum (list or numpy array).
    :param methods: A list of method names. When it is ``None`` or empty, the
                    call is delegated to :func:`all_similarity`.
    :param ms2_ppm: The MS/MS tolerance in ppm.
    :param ms2_da: The MS/MS tolerance in Da.
    :param need_clean_spectra: Normalize spectra before comparing, required for not normalized spectrum.
    :param need_normalize_result: Normalize the result into [0,1].
    :return: A dict contains all similarity. For an explicit ``methods`` list,
             each value is converted with ``float``.
    """
    if not methods:
        return all_similarity(spectrum_query=spectrum_query, spectrum_library=spectrum_library,
                              need_clean_spectra=need_clean_spectra,
                              need_normalize_result=need_normalize_result,
                              ms2_ppm=ms2_ppm, ms2_da=ms2_da)

    # One similarity() call per requested method, keyed by the method name.
    return {
        m: float(similarity(spectrum_query=spectrum_query, spectrum_library=spectrum_library, method=m,
                            need_clean_spectra=need_clean_spectra,
                            need_normalize_result=need_normalize_result,
                            ms2_ppm=ms2_ppm, ms2_da=ms2_da))
        for m in methods
    }
def distance(spectrum_query: Union[list, np.ndarray], spectrum_library: Union[list, np.ndarray],
             method: str, ms2_ppm: float = None, ms2_da: float = None,
             need_clean_spectra: bool = True, need_normalize_result: bool = True) -> float:
    """
    Calculate the distance between two spectra, find common peaks.
    If both ms2_ppm and ms2_da is defined, ms2_da will be used.

    ``<method>_distance`` is looked up in ``math_distance`` first and called
    with columns 1 and 2 of the matched-peak array; if it is not found there,
    it is looked up in ``ms_distance`` and called with both spectra and the
    tolerances.

    :param spectrum_query: The query spectrum; converted with ``np.asarray(..., dtype=np.float32)``.
    :param spectrum_library: The library spectrum; converted with ``np.asarray(..., dtype=np.float32)``.
    :param method: Supported methods:
            "entropy", "unweighted_entropy", "euclidean", "manhattan", "chebyshev",
            "squared_euclidean", "fidelity", "matusita", "squared_chord", "bhattacharya_1",
            "bhattacharya_2", "harmonic_mean", "probabilistic_symmetric_chi_squared",
            "ruzicka", "roberts", "intersection", "motyka", "canberra", "baroni_urbani_buser",
            "penrose_size", "mean_character", "lorentzian", "penrose_shape", "clark",
            "hellinger", "whittaker_index_of_association", "symmetric_chi_squared",
            "pearson_correlation", "improved_similarity", "absolute_value", "dot_product",
            "dot_product_reverse", "spectral_contrast_angle", "wave_hedges", "jaccard", "dice",
            "inner_product", "divergence", "avg_l", "vicis_symmetric_chi_squared_3",
            "ms_for_id_v1", "ms_for_id", "weighted_dot_product"
    :param ms2_ppm: The MS/MS tolerance in ppm.
    :param ms2_da: The MS/MS tolerance in Da.
    :param need_clean_spectra: Normalize spectra before comparing, required for not normalized spectrum.
    :param need_normalize_result: Normalize the result into [0,1].
    :return: Distance between two spectra. When either spectrum has no rows
             (after the optional cleaning), ``1`` is returned if
             ``need_normalize_result`` is true and ``np.inf`` otherwise.
    :raises ValueError: If both ``ms2_ppm`` and ``ms2_da`` are ``None``.
    :raises RuntimeError: If neither ``math_distance`` nor ``ms_distance``
                          provides ``<method>_distance`` (only checked when
                          both spectra are non-empty).
    """
    if ms2_ppm is None and ms2_da is None:
        raise ValueError("MS2 tolerance need to be defined!")

    spectrum_query = np.asarray(spectrum_query, dtype=np.float32)
    spectrum_library = np.asarray(spectrum_library, dtype=np.float32)
    if need_clean_spectra:
        spectrum_query = clean_spectrum(spectrum_query, ms2_ppm=ms2_ppm, ms2_da=ms2_da)
        spectrum_library = clean_spectrum(spectrum_library, ms2_ppm=ms2_ppm, ms2_da=ms2_da)

    # Nothing to compare: report the largest possible distance.
    has_peaks = spectrum_query.shape[0] > 0 and spectrum_library.shape[0] > 0
    if not has_peaks:
        return 1 if need_normalize_result else np.inf

    # Calculate similarity
    function_name = method + "_distance"
    if hasattr(math_distance, function_name):
        # These functions work on the two intensity-like columns of the
        # matched-peak table rather than on the spectra themselves.
        spec_matched = match_peaks_in_spectra(spec_a=spectrum_query, spec_b=spectrum_library,
                                              ms2_ppm=ms2_ppm, ms2_da=ms2_da)
        dist = getattr(math_distance, function_name)(spec_matched[:, 1], spec_matched[:, 2])
    elif hasattr(ms_distance, function_name):
        dist = getattr(ms_distance, function_name)(spectrum_query, spectrum_library,
                                                   ms2_ppm=ms2_ppm, ms2_da=ms2_da)
    else:
        raise RuntimeError("Method name: {} error!".format(method))

    # Normalize result; methods without a registered range use [0, 1].
    if need_normalize_result:
        dist = normalize_distance(dist, methods_range.get(method, [0, 1]))
    return dist
def all_distance(spectrum_query: Union[list, np.ndarray], spectrum_library: Union[list, np.ndarray],
                 ms2_ppm: float = None, ms2_da: float = None,
                 need_clean_spectra: bool = True, need_normalize_result: bool = True) -> dict:
    """
    Calculate the distance between two spectra for every method in ``methods_name``, find common peaks.
    If both ms2_ppm and ms2_da is defined, ms2_da will be used.

    Peaks are matched once and the matched table is reused for every
    ``math_distance`` function; ``ms_distance`` functions receive the two
    spectra and the tolerances instead.

    :param spectrum_query: The query spectrum; converted with ``np.asarray(..., dtype=np.float32)``.
    :param spectrum_library: The library spectrum; converted with ``np.asarray(..., dtype=np.float32)``.
    :param ms2_ppm: The MS/MS tolerance in ppm.
    :param ms2_da: The MS/MS tolerance in Da.
    :param need_clean_spectra: Normalize spectra before comparing, required for not normalized spectrum.
    :param need_normalize_result: Normalize the result into [0,1].
    :return: A dict mapping every key of ``methods_name`` to its distance.
             When either spectrum has no rows (after the optional cleaning),
             every value is ``1`` if ``need_normalize_result`` is true and
             ``np.inf`` otherwise.
    :raises ValueError: If both ``ms2_ppm`` and ``ms2_da`` are ``None``.
    :raises RuntimeError: If a key of ``methods_name`` has no
                          ``<key>_distance`` function in ``math_distance`` or
                          ``ms_distance``.
    """
    if ms2_ppm is None and ms2_da is None:
        raise ValueError("MS2 tolerance need to be defined!")

    spectrum_query = np.asarray(spectrum_query, dtype=np.float32)
    spectrum_library = np.asarray(spectrum_library, dtype=np.float32)
    if need_clean_spectra:
        spectrum_query = clean_spectrum(spectrum_query, ms2_ppm=ms2_ppm, ms2_da=ms2_da)
        spectrum_library = clean_spectrum(spectrum_library, ms2_ppm=ms2_ppm, ms2_da=ms2_da)

    # Nothing to compare: every method reports the largest possible distance.
    has_peaks = spectrum_query.shape[0] > 0 and spectrum_library.shape[0] > 0
    if not has_peaks:
        return dict.fromkeys(methods_name, 1 if need_normalize_result else np.inf)

    # Calculate similarity
    result = {}
    spec_matched = match_peaks_in_spectra(spec_a=spectrum_query, spec_b=spectrum_library,
                                          ms2_ppm=ms2_ppm, ms2_da=ms2_da)
    for method in methods_name:
        function_name = method + "_distance"
        if hasattr(math_distance, function_name):
            dist = getattr(math_distance, function_name)(spec_matched[:, 1], spec_matched[:, 2])
        elif hasattr(ms_distance, function_name):
            dist = getattr(ms_distance, function_name)(spectrum_query, spectrum_library,
                                                       ms2_ppm=ms2_ppm, ms2_da=ms2_da)
        else:
            raise RuntimeError("Method name: {} error!".format(method))

        # Normalize result; methods without a registered range use [0, 1].
        if need_normalize_result:
            dist = normalize_distance(dist, methods_range.get(method, [0, 1]))
        result[method] = dist
    return result
def multiple_distance(spectrum_query: Union[list, np.ndarray], spectrum_library: Union[list, np.ndarray],
                      methods: list = None, ms2_ppm: float = None, ms2_da: float = None,
                      need_clean_spectra: bool = True, need_normalize_result: bool = True) -> dict:
    """
    Calculate multiple distance between two spectra, find common peaks.
    If both ms2_ppm and ms2_da is defined, ms2_da will be used.

    :param spectrum_query: The query spectrum (list or numpy array).
    :param spectrum_library: The library spectrum (list or numpy array).
    :param methods: A list of method names. When it is ``None`` or empty, the
                    call is delegated to :func:`all_distance`.
    :param ms2_ppm: The MS/MS tolerance in ppm.
    :param ms2_da: The MS/MS tolerance in Da.
    :param need_clean_spectra: Normalize spectra before comparing, required for not normalized spectrum.
    :param need_normalize_result: Normalize the result into [0,1].
    :return: A dict mapping each method name to the distance between the two
             spectra. For an explicit ``methods`` list, each value is
             converted with ``float``.
    """
    if not methods:
        return all_distance(spectrum_query=spectrum_query, spectrum_library=spectrum_library,
                            need_clean_spectra=need_clean_spectra,
                            need_normalize_result=need_normalize_result,
                            ms2_ppm=ms2_ppm, ms2_da=ms2_da)

    # One distance() call per requested method, keyed by the method name.
    return {
        m: float(distance(spectrum_query=spectrum_query, spectrum_library=spectrum_library, method=m,
                          need_clean_spectra=need_clean_spectra,
                          need_normalize_result=need_normalize_result,
                          ms2_ppm=ms2_ppm, ms2_da=ms2_da))
        for m in methods
    }