Source code for deimos.filters

import numpy as np
import pandas as pd
import scipy.ndimage as ndi
from ripser import ripser
from scipy import sparse
from scipy.spatial import KDTree
from sklearn.utils.sparsefuncs import _get_median

import deimos


[docs]def std(a, size):
    '''
    N-dimensional convolution of a standard deviation filter.

    Parameters
    ----------
    a : :obj:`~numpy.array`
        N-dimensional array of intensity data.
    size : int or list
        Size of the convolution kernel in each dimension.

    Returns
    -------
    :obj:`~numpy.array`
        Filtered intensity data.

    '''

    c1 = ndi.uniform_filter(a, size, mode='reflect')
    c2 = ndi.uniform_filter(np.square(a), size, mode='reflect')
    return np.abs(np.lib.scimath.sqrt(c2 - np.square(c1)))


[docs]def std_pdf(edges, a, size):
    '''
    N-dimensional convolution of a standard deviation probability density
    function filter.

    Parameters
    ----------
    edges : list of :obj:`~numpy.array`
        Edges coordinates along each grid axis.
    a : :obj:`~numpy.array`
        N-dimensional array of intensity data.
    size : int or list
        Size of the convolution kernel in each dimension.

    Returns
    -------
    list of :obj:`~numpy.array`
        Filtered edge data.

    '''

    edges = np.meshgrid(*edges, indexing='ij')
    f = ndi.uniform_filter(a, size=size, mode='constant')

    res = []
    for e in edges:
        wmu = ndi.uniform_filter(a * e, size=size, mode='constant')
        mu = wmu / f

        wvar = ndi.uniform_filter(
            a * (e - mu) ** 2, size=size, mode='constant')
        var = wvar / f
        res.append(np.sqrt(var))

    return res


[docs]def maximum(a, size):
    '''
    N-dimensional convolution of a maximum filter.

    Parameters
    ----------
    a : :obj:`~numpy.array`
        N-dimensional array of intensity data.
    size : int or list
        Size of the convolution kernel in each dimension.

    Returns
    -------
    :obj:`~numpy.array`
        Filtered intensity data.

    '''

    return ndi.maximum_filter(a, size=size, mode='constant', cval=-np.inf)


[docs]def minimum(a, size):
    '''
    N-dimensional convolution of a minimum filter.

    Parameters
    ----------
    a : :obj:`~numpy.array`
        N-dimensional array of intensity data.
    size : int or list
        Size of the convolution kernel in each dimension.

    Returns
    -------
    :obj:`~numpy.array`
        Filtered intensity data.

    '''

    return ndi.minimum_filter(a, size=size, mode='constant', cval=np.inf)


[docs]def sum(a, size):
    '''
    N-dimensional convolution of a sum filter.

    Parameters
    ----------
    a : :obj:`~numpy.array`
        N-dimensional array of intensity data.
    size : int or list
        Size of the convolution kernel in each dimension.

    Returns
    -------
    :obj:`~numpy.array`
        Filtered intensity data.

    '''

    size = deimos.utils.safelist(size)

    if len(size) == 1:
        size = size[0]
        ksize = size ** a.ndim
    else:
        ksize = np.prod(size)

    return ksize * ndi.uniform_filter(a, size, mode='constant', cval=0.0)


[docs]def mean(a, size):
    '''
    N-dimensional convolution of a mean filter.

    Parameters
    ----------
    a : :obj:`~numpy.array`
        N-dimensional array of intensity data.
    size : int or list
        Size of the convolution kernel in each dimension.

    Returns
    -------
    :obj:`~numpy.array`
        Filtered intensity data.

    '''

    return ndi.uniform_filter(a, size=size, mode='constant', cval=0.0)


[docs]def mean_pdf(edges, a, size):
    '''
    N-dimensional convolution of a mean probability density function filter.

    Parameters
    ----------
    edges : list of :obj:`~numpy.array`
        Edges coordinates along each grid axis.
    a : :obj:`~numpy.array`
        N-dimensional array of intensity data.
    size : int or list
        Size of the convolution kernel in each dimension.

    Returns
    -------
    list of :obj:`~numpy.array`
        Filtered edge data.

    '''

    edges = np.meshgrid(*edges, indexing='ij')
    f = ndi.uniform_filter(a, size=size, mode='constant')

    res = []
    for e in edges:
        w = ndi.uniform_filter(a * e, size=size, mode='constant')
        res.append(w / f)

    return res


[docs]def matched_gaussian(a, size):
    '''
    N-dimensional convolution of a matched Gaussian filter.

    Parameters
    ----------
    a : :obj:`~numpy.array`
        N-dimensional array of intensity data.
    size : int or list
        Size of the convolution kernel in each dimension.

    Returns
    -------
    :obj:`~numpy.array`
        Filtered intensity data.

    '''

    return np.square(ndi.gaussian_filter(a, size, mode='constant', cval=0.0))


[docs]def count(a, size, nonzero=False):
    '''
    N-dimensional convolution of a counting filter.

    Parameters
    ----------
    a : :obj:`~numpy.array`
        N-dimensional array of intensity data.
    size : int or list
        Size of the convolution kernel in each dimension.
    nonzero : bool
        Only count nonzero values.

    Returns
    -------
    :obj:`~numpy.array`
        Filtered intensity data.

    '''

    if nonzero is True:
        a = np.where(np.nan_to_num(a) > 0, 1.0, 0.0)
    else:
        a = np.where(np.isnan(a), 0.0, 1.0)

    return deimos.filters.sum(a, size)


[docs]def skew_pdf(edges, a, size):
    '''
    N-dimensional convolution of a skew probability density function filter.

    Parameters
    ----------
    edges : list of :obj:`~numpy.array`
        Edges coordinates along each grid axis.
    a : :obj:`~numpy.array`
        N-dimensional array of intensity data.
    size : int or list
        Size of the convolution kernel in each dimension.

    Returns
    -------
    list of :obj:`~numpy.array`
        Filtered edge data.

    '''

    edges = np.meshgrid(*edges, indexing='ij')
    f = ndi.uniform_filter(a, size=size, mode='constant')

    res = []
    for e in edges:
        wmu = ndi.uniform_filter(a * e, size=size, mode='constant')
        mu = wmu / f

        wvar = ndi.uniform_filter(
            a * (e - mu) ** 2, size=size, mode='constant')
        var = wvar / f
        sigma = np.sqrt(var)

        wskew = ndi.uniform_filter(
            a * ((e - mu) / sigma) ** 3, size=size, mode='constant')
        skew = wskew / f
        res.append(skew)

    return res


[docs]def kurtosis_pdf(edges, a, size):
    '''
    N-dimensional convolution of a kurtosis probability density function
    filter.

    Parameters
    ----------
    edges : list of :obj:`~numpy.array`
        Edges coordinates along each grid axis.
    a : :obj:`~numpy.array`
        N-dimensional array of intensity data.
    size : int or list
        Size of the convolution kernel in each dimension.

    Returns
    -------
    list of :obj:`~numpy.array`
        Filtered edge data.

    '''

    edges = np.meshgrid(*edges, indexing='ij')
    f = ndi.uniform_filter(a, size=size, mode='constant')

    res = []
    for e in edges:
        wmu = ndi.uniform_filter(a * e, size=size, mode='constant')
        mu = wmu / f

        wvar = ndi.uniform_filter(
            a * (e - mu) ** 2, size=size, mode='constant')
        var = wvar / f
        sigma = np.sqrt(var)

        wkurtosis = ndi.uniform_filter(
            a * ((e - mu) / sigma) ** 4, size=size, mode='constant')
        kurtosis = wkurtosis / f
        res.append(kurtosis - 3)

    return res


[docs]def embed_unique_indices(a):

    def count_tens(n):
        # Count tens
        ntens = (n - 1) // 10

        while True:
            ntens_test = (ntens + n - 1) // 10

            if ntens_test == ntens:
                return ntens
            else:
                ntens = ntens_test

    def arange_exclude_10s(n):
        # How many 10s will there be?
        ntens = count_tens(n)

        # Base array
        arr = np.arange(0, n + ntens)

        # Exclude 10s
        arr = arr[(arr == 0) | (arr % 10 != 0)][:n]

        return arr

    # Creates an array of indices, sorted by unique element
    idx_sort = np.argsort(a)
    idx_unsort = np.argsort(idx_sort)

    # Sorts records array so all unique elements are together
    sorted_a = a[idx_sort]

    # Returns the unique values, the index of the first occurrence,
    # and the count for each element
    vals, idx_start, count = np.unique(sorted_a, return_index=True, return_counts=True)

    # Splits the indices into separate arrays
    splits = np.split(idx_sort, idx_start[1:])

    # Creates unique indices for each split
    idx_unq = np.concatenate([arange_exclude_10s(len(x)) for x in splits])

    # Reorders according to input array
    idx_unq = idx_unq[idx_unsort]

    # Magnitude of each index
    exp = np.log10(idx_unq,
                   where=idx_unq > 0,
                   out=np.zeros_like(idx_unq, dtype=np.float64))
    idx_unq_mag = np.power(10, np.floor(exp) + 1)

    # Result
    return a + idx_unq / idx_unq_mag


[docs]def sparse_upper_star(idx, V):
    '''
    Sparse implementation of an upper star filtration.
    Parameters
    ----------
    idx : :obj:`~numpy.array`
        Edge indices for each dimension (MxN).
    V : :obj:`~numpy.array`
        Array of intensity data (Mx1).
    Returns
    -------
    idx : :obj:`~numpy.array`
        Index of filtered points (Mx1).
    persistence : :obj:`~numpy.array`
        Persistence of each filtered point (Mx1).
    '''

    # Invert
    V = -1 * V.copy().astype(int)

    # Embed indices
    V = embed_unique_indices(V)

    # Connectivity matrix
    cmat = KDTree(idx)
    cmat = cmat.sparse_distance_matrix(
        cmat, 1, p=np.inf, output_type='coo_matrix')
    cmat.setdiag(1)
    cmat = sparse.triu(cmat)

    # Pairwise minimums
    I, J = cmat.nonzero()
    d = np.maximum(V[I], V[J])

    # Delete connectiity matrix
    cmat_shape = cmat.shape
    del cmat

    # Sparse distance matrix
    sdm = sparse.coo_matrix((d, (I, J)), shape=cmat_shape)

    # Delete pairwise mins
    del d, I, J

    # Persistence homology
    ph = ripser(sdm, distance_matrix=True, maxdim=0)['dgms'][0]

    # Bound death values
    ph[ph[:, 1] == np.inf, 1] = np.max(V)

    # Construct tree to query against
    tree = KDTree(V.reshape((-1, 1)))

    # Get the indexes of the first nearest neighbor by birth
    _, nn = tree.query(ph[:, 0].reshape((-1, 1)), k=1, workers=-1)

    return nn, -(ph[:, 0] // 1 - ph[:, 1] // 1)


[docs]def sparse_mean_filter(idx, V, radius=[0, 1, 1]):
    '''
    Sparse implementation of a mean filter.

    Parameters
    ----------
    idx : :obj:`~numpy.array`
        Edge indices for each dimension (MxN).
    V : :obj:`~numpy.array`
        Array of intensity data (Mx1).
    radius : float or list
        Radius of the sparse filter in each dimension. Values less than
        zero indicate no connectivity in that dimension.

    Returns
    -------
    :obj:`~numpy.array`
        Filtered intensities (Mx1).

    '''

    # Copy indices
    idx = idx.copy().astype(V.dtype)

    # Scale
    for i, r in enumerate(radius):
        # Increase inter-index distance
        if r < 1:
            idx[:, i] *= 2

        # Do nothing
        elif r == 1:
            pass

        # Decrease inter-index distance
        else:
            idx[:, i] /= r

    # Connectivity matrix
    cmat = KDTree(idx)
    cmat = cmat.sparse_distance_matrix(
        cmat, 1, p=np.inf, output_type='coo_matrix')
    cmat.setdiag(1)

    # Pair indices
    I, J = cmat.nonzero()

    # Delete cmat
    cmat_shape = cmat.shape
    del cmat

    # Sum over columns
    V_sum = sparse.bsr_matrix((V[J], (I, I)),
                              shape=cmat_shape,
                              dtype=V.dtype).diagonal(0)

    # Count over columns
    V_count = sparse.bsr_matrix((np.ones_like(J), (I, I)),
                                shape=cmat_shape,
                                dtype=V.dtype).diagonal(0)

    return V_sum / V_count


[docs]def sparse_weighted_mean_filter(idx, V, w, radius=[1, 1, 1], pindex=None):
    '''
    Sparse implementation of a weighted mean filter.

    Parameters
    ----------
    idx : :obj:`~numpy.array`
        Edge indices for each dimension (MxN).
    V : :obj:`~numpy.array`
        Array of edge data (MxN).
    w : :obj:`~numpy.array`
        Array of intensity data (Mx1).
    radius : float or list
        Radius of the sparse filter in each dimension. Values less than
        one indicate no connectivity in that dimension.
    pindex : :obj:`~numpy.array`
        Index of points to evaluate the weighted mean.

    Returns
    -------
    :obj:`~numpy.array`
        Filtered edges (MxN).

    '''

    # Copy indices
    idx = idx.copy().astype(w.dtype)

    # Scale
    for i, r in enumerate(radius):
        # Increase inter-index distance
        if r < 1:
            idx[:, i] *= 2

        # Do nothing
        elif r == 1:
            pass

        # Decrease inter-index distance
        else:
            idx[:, i] /= r

    # If not supplied, index is all points
    if pindex is None:
        pindex = np.arange(len(idx))

    # Connectivity matrix
    tree_all = KDTree(idx)
    tree_subset = KDTree(idx[pindex])
    cmat = tree_subset.sparse_distance_matrix(
        tree_all, 1, p=np.inf, output_type='coo_matrix')
    del tree_all, tree_subset

    # Pair indices
    I, J = cmat.nonzero()

    # Self
    I = np.concatenate((I, np.arange(len(pindex))))
    J = np.concatenate((J, pindex))

    # Delete connectivity matrix
    cmat_shape = cmat.shape
    del cmat

    # Sum weights over columns
    # Only need to do this once
    V_count = sparse.bsr_matrix((w[J], (I, I)),
                                shape=cmat_shape,
                                dtype=w.dtype).diagonal(0)

    # Reshape V if 1D
    if V.ndim == 1:
        V = V.reshape((-1, 1))

    # Output container
    V_out = np.empty_like(V[pindex, :], dtype=w.dtype)

    # Enumerate value columns
    for i in range(V_out.shape[1]):
        # Sum weighted values over columns
        V_sum = sparse.bsr_matrix((w[J] * V[J, i], (I, I)),
                                  shape=cmat_shape,
                                  dtype=w.dtype).diagonal(0)

        V_out[:, i] = V_sum / V_count

    # Flatten if 1D
    if V_out.shape[1] == 1:
        return V_out.flatten()

    return V_out


[docs]def sparse_median_filter(idx, V, radius=[0, 1, 1]):
    '''
    Sparse implementation of a median filter.

    Parameters
    ----------
    idx : :obj:`~numpy.array`
        Edge indices for each dimension (MxN).
    V : :obj:`~numpy.array`
        Array of intensity data (Mx1).
    radius : float or list
        Radius of the sparse filter in each dimension. Values less than
        zero indicate no connectivity in that dimension.

    Returns
    -------
    :obj:`~numpy.array`
        Filtered intensities (Mx1).

    '''

    # Copy indices
    idx = idx.copy().astype(V.dtype)

    # Scale
    for i, r in enumerate(radius):
        # Increase inter-index distance
        if r < 1:
            idx[:, i] *= 2

        # Do nothing
        elif r == 1:
            pass

        # Decrease inter-index distance
        else:
            idx[:, i] /= r

    # Connectivity matrix
    cmat = KDTree(idx)
    cmat = cmat.sparse_distance_matrix(
        cmat, 1, p=np.inf, output_type='coo_matrix')
    cmat.setdiag(1)

    # Pair indices
    I, J = cmat.nonzero()

    # Delete cmat
    cmat_shape = cmat.shape
    del cmat

    X = sparse.csc_matrix((V[I], (I, J)),
                          shape=cmat_shape,
                          dtype=V.dtype)

    indptr = X.indptr
    n_samples, n_features = X.shape
    median = np.empty(n_features, dtype=V.dtype)

    for f_ind, (start, end) in enumerate(zip(indptr[:-1], indptr[1:])):
        # Prevent modifying X in place
        data = np.copy(X.data[start:end])
        median[f_ind] = _get_median(data, 0)

    return median


[docs]def smooth(features, index=None, factors=None, dims=['mz', 'drift_time', 'retention_time'],
           radius=[0, 1, 1], iterations=1, tol=0.0):
    '''
    Smooth data by sparse mean filtration.

    Parameters
    ----------
    features : :obj:`~pandas.DataFrame`
        Input feature coordinates and intensities.
    index : dict
        Index of features in original data array.
    factors : dict
        Unique sorted values per dimension.
    dims : str or list
        Dimensions to perform peak detection in.
    radius : float or list
        Radius of the sparse filter in each dimension. Values less than
        zero indicate no connectivity in that dimension.
    iterations : int
        Maximum number of smoothing iterations to perform.
    tol : float
        Stopping criteria based on residual with previous iteration.
        Selecting zero will perform all specified iterations.

    Returns
    -------
    :obj:`~pandas.DataFrame`
        Smoothed feature coordinates and intensities.

    '''

    # Safely cast to list
    dims = deimos.utils.safelist(dims)
    radius = deimos.utils.safelist(radius)

    # Check dims
    deimos.utils.check_length([dims, radius])

    # Check factors and index mutually exclusive
    if (factors is not None) & (index is not None):
        raise ValueError('Specify either `index`, `factors`, or neither.')

    # Copy input
    features = features.copy()

    # Build index from features directly
    if (factors is None) & (index is None):
        index = {dim: pd.factorize(features[dim], sort=True)[0].astype(np.float32)
                 for dim in dims}

    # Build index from factors
    if (factors is not None) & (index is None):
        index = deimos.build_index(features, factors)

    # Index supplied directly
    index = np.vstack([index[dim] for dim in dims]).T

    # Values
    V = features['intensity'].values

    # Residual exit criteria
    resid = np.inf

    # Sparse mean filtration iterations
    for i in range(iterations):
        # Previous iteration
        V_prev = V.copy()
        resid_prev = resid

        # Compute smooth
        V = deimos.filters.sparse_mean_filter(index, V, radius=radius)

        # Calculate residual with previous iteration
        resid = np.sqrt(np.mean(np.square(V - V_prev)))

        # Evaluate convergence
        if i > 0:
            # Percent change in residual
            test = np.abs(resid - resid_prev) / resid_prev

            # Exit criteria
            if test <= tol:
                break

    # Overwrite values
    features['intensity'] = V

    return features