Source code for deimos.io

import os
import warnings
from collections import OrderedDict, defaultdict

import dask.dataframe as dd
import h5py
import numpy as np
import pandas as pd
import pymzml

import deimos


[docs]def load(path, key='ms1', columns=None, chunksize=1E7, meta=None, accession={}, dtype=np.float32): ''' Loads data from HDF5 or mzML file. Parameters ---------- path : str or list of str Path to input file (or files if HDF5). key : str Access this level (group) of the HDF5 container. E.g., "ms1" or "ms2" for MS levels 1 or 2, respectively. HDF5 format only. columns : list A list of columns names to return. HDF5 format only. chunksize : int Dask partition chunksize. HDF5 format only. Unused when loading single file. meta : dict Dictionary of meta data per path. HDF5 format only. Unused when loading single file. accession : dict Key-value pairs signaling which features to parse for in the mzML file. mzML format only. See :func:`~deimos.io.get_accessions` to obtain available values. dtype : data type Data type to encode values. mzML format only. Returns ------- :obj:`~pandas.DataFrame` or :obj:`dict` of :obj:`~pandas.DataFrame` Feature coordinates and intensities for the specified level. Pandas is used when loading a single file, Dask for multiple files. Loading an mzML file returns a dictionary with keys per MS level. ''' # Check number of inputs paths = deimos.utils.safelist(path) # Ensure extensions match exts = [os.path.splitext(x)[-1].lower() for x in paths] if not all(x == exts[0] for x in exts): raise ValueError('All inputs must have same filetype extension.') # Get the extension ext = exts[0] # Multi loader if len(paths) > 1: # Hdf5 if ext in ['.h5', '.hdf']: return deimos.io.load_hdf_multi(paths, key=key, columns=columns, chunksize=chunksize, meta=meta) # Other raise ValueError( 'Only HDF5 currently supported for multi-file loading.') # Single loader # Hdf5 if ext in ['.h5', '.hdf']: return deimos.io.load_hdf_single(path, key=key, columns=columns) # Mzml if ext in ['.gz', '.mzml']: return deimos.io.load_mzml(path, accession=accession, dtype=dtype) # Other raise ValueError('Only HDF5 and mzML currently supported.')
[docs]def build_factors(data, dims='detect'): ''' Determine sorted unique elements (factors) for each dimension in data. Parameters ---------- data : :obj:`~pandas.DataFrame` Feature coordinates and intensities. dims : str or list Dimensions to determine factors for. Attempts to autodetect by default. Returns ------- :obj:`dict` of :obj:`~numpy.array` Unique sorted values per dimension. ''' # Autodetect if dims == 'detect': dims = deimos.utils.detect_dims(data) # Safely cast to list dims = deimos.utils.safelist(dims) # Construct per-dimension factor arrays return {dim: pd.factorize(data[dim], sort=True)[1].astype(np.float32) for dim in dims}
[docs]def build_index(data, factors): ''' Construct data index from precomputed factors. Parameters ---------- data : :obj:`~pandas.DataFrame` Feature coordinates and intensities. factors : dict Per-dimension arrays of unique values. Returns ------- :obj:`dict` of :obj:`~numpy.array` Index per dimension. ''' return {dim: np.searchsorted(factors[dim], data[dim]).astype(np.float32) for dim in factors}
[docs]def save(path, data, key='ms1', **kwargs): ''' Saves :obj:`~pandas.DataFrame` to HDF5 or MGF container. Parameters ---------- path : str Path to output file. data : :obj:`~pandas.DataFrame` Feature coordinates and intensities to be saved. Precursor m/z and intensities should be paired to MS2 spectra for MGF format. key : str Save to this level (group) of the HDF5 container. E.g., "ms1" or "ms2" for MS levels 1 or 2, respectively. HDF5 format only. kwargs Keyword arguments exposed by :meth:`~pandas.DataFrame.to_hdf` or :func:`~deimos.io.save_mgf`. ''' # Path extension ext = os.path.splitext(path)[-1].lower() # Hdf5 if ext in ['.h5', '.hdf']: return deimos.io.save_hdf(path, data, key=key, **kwargs) # MGF if ext in ['.mgf']: return deimos.io.save_mgf(path, data, **kwargs) # MGF if ext in ['.msp']: return deimos.io.save_msp(path, data, **kwargs) # CSV if ext in ['.csv']: return data.to_csv(path, sep=',', index=False, **kwargs) # TSV if ext in ['.tsv', '.txt']: return data.to_csv(path, sep='\t', index=False, **kwargs) # Other raise ValueError('Only HDF5, MGF, MSP, TSV, and CSV formats currently supported.')
[docs]def get_accessions(path): ''' Determines accession fields available in the mzML file. Parameters ---------- path : str Path to mzML file. Returns ------- :obj:`dict` of str Dictionary of accession fields. ''' # Open file data = pymzml.run.Reader(path) # Iterate single spec instance for spec in data: spec._read_accessions() break # Return accessions return spec.accessions
[docs]def load_mzml(path, accession={}, dtype=np.float32): ''' Loads in an mzML file, parsing for accession values, to yield a :obj:`~pandas.DataFrame`. Parameters ---------- path : str Path to input mzML file. accession : dict Key-value pairs signaling which features to parse for in the mzML file. See :func:`~deimos.io.get_accessions` to obtain available values. Scan, frame, m/z, and intensity are parsed by default. dtype : data type Data type to encode values. Returns ------- :obj:`dict` of :obj:`~pandas.DataFrame` Dictionary containing parsed feature coordinates and intensities, indexed by keys per MS level. ''' # Open file data = pymzml.run.Reader(path) # Ordered dict accession = OrderedDict(accession) # Result container res = {} # Row count container counter = {} # Column name container cols = {} # First pass: get nrows N = defaultdict(lambda: 0) for i, spec in enumerate(data): # Get ms level level = 'ms{}'.format(spec.ms_level) # Number of rows N[level] += spec.mz.shape[0] # Second pass: parse for i, spec in enumerate(data): # Number of rows n = spec.mz.shape[0] # No measurements if n == 0: continue # Dimension check if len(spec.mz) != len(spec.i): warnings.warn("m/z and intensity array dimension mismatch") continue # Scan/frame info id_dict = spec.id_dict # Check for precursor precursor_info = {} has_precursor = False if spec.selected_precursors: has_precursor = True precursor_info = { 'precursor_mz': spec.selected_precursors[0].get('mz', None)} # Get ms level level = 'ms{}'.format(spec.ms_level) # Columns cols[level] = list(id_dict.keys()) \ + list(accession.keys()) \ + ['mz', 'intensity'] \ + list(precursor_info.keys()) m = len(cols[level]) # Subarray init arr = np.empty((n, m), dtype=dtype) inx = 0 # Populate scan/frame info for k, v in id_dict.items(): arr[:, inx] = v inx += 1 # Populate accession fields for k, v in accession.items(): arr[:, inx] = spec.get(v) inx += 1 # Populate m/z arr[:, inx] = spec.mz inx += 1 # Populate intensity arr[:, inx] = spec.i inx += 1 # Populate precursor information if has_precursor: for k, v in precursor_info.items(): arr[:, inx] = v inx += 1 # Initialize output container if level not in res: res[level] = np.empty((N[level], m), dtype=dtype) counter[level] = 0 # Insert subarray res[level][counter[level]:counter[level] + n, :] = arr counter[level] += n # Construct data frames for level in res.keys(): res[level] = pd.DataFrame(res[level], columns=cols[level]) return res
[docs]def save_hdf(path, data, key='ms1', complevel=5, **kwargs): ''' Saves :obj:`~pandas.DataFrame` to HDF5 container. Parameters ---------- path : str Path to output file. data : :obj:`~pandas.DataFrame` Feature coordinates and intensities to be saved. key : str Save to this level (group) of the HDF5 container. E.g., "ms1" or "ms2" for MS levels 1 or 2, respectively. kwargs Keyword arguments exposed by :meth:`~pandas.DataFrame.to_hdf`. ''' data.to_hdf(path, key, format='table', complib='blosc', complevel=complevel, **kwargs)
[docs]def load_hdf(path, key='ms1', columns=None, chunksize=1E7, meta=None): ''' Loads data frame from HDF5 container(s). Parameters ---------- path : str or list of str Path to input HDF5 file or files. key : str Access this level (group) of the HDF5 container. E.g., "ms1" or "ms2" for MS levels 1 or 2, respectively. columns : list A list of columns names to return. chunksize : int Dask partition chunksize. Unused when loading single file. meta : dict Dictionary of meta data per path. Unused when loading single file. Returns ------- :obj:`~pandas.DataFrame` Feature coordinates and intensities for the specified level. Pandas is used when loading a single file, Dask for multiple files. ''' # Check number of inputs paths = deimos.utils.safelist(path) # Ensure extensions match exts = [os.path.splitext(x)[-1].lower() for x in paths] if not all(x == exts[0] for x in exts): raise ValueError('All inputs must have same filetype extension.') # Multi loader if len(paths) > 1: return deimos.io.load_hdf_multi(paths, key=key, columns=columns, chunksize=chunksize, meta=meta) # Single loader return deimos.io.load_hdf_single(paths, key=key, columns=columns)
[docs]def load_hdf_single(path, key='ms1', columns=None): ''' Loads data frame from HDF5 container. Parameters ---------- path : str Path to input HDF5 file. key : str Access this level (group) of the HDF5 container. E.g., "ms1" or "ms2" for MS levels 1 or 2, respectively. columns : list A list of columns names to return. Returns ------- :obj:`~pandas.DataFrame` Feature coordinates and intensities for the specified level. ''' return pd.read_hdf(path, key=key, columns=columns)
[docs]def load_hdf_multi(paths, key='ms1', columns=None, chunksize=1E7, meta=None): ''' Loads data frame from HDF5 containers using Dask. Appends column to indicate source filenames. Parameters ---------- paths : list of str Paths to input HDF5 files. key : str Access this level (group) of the HDF5 container. E.g., "ms1" or "ms2" for MS levels 1 or 2, respectively. columns : list A list of columns names to return. chunksize : int Dask partition chunksize. meta : dict Dictionary of meta data per path. Returns ------- :obj:`~dask.dataframe.DataFrame` Feature coordinates and intensities for the specified level. ''' # Load as dask df = [dd.read_hdf(x, key=key, chunksize=int( chunksize), columns=columns) for x in paths] # Label each sample for i in range(len(paths)): df[i]['sample_idx'] = i # force unique label in toy case df[i]['sample_id'] = os.path.splitext(os.path.basename(paths[i]))[0] if meta is not None: for k, v in meta.items(): df[i][k] = v[i] # Concatenate results return dd.concat(df, axis=0)
[docs]def _save_hdf(path, data, dtype={}, compression_level=5): ''' Deprecated version. Saves dictionary of :obj:`~pandas.DataFrame` to HDF5 container. Parameters ---------- path : str Path to output file. data : dict of :obj:`~pandas.DataFrame` Dictionary of feature coordinates and intensities to be saved. Dictionary keys are saved as "groups" (e.g., MS level) and data frame columns are saved as "datasets" in the HDF5 container. dtype : dict Specifies what data type to save each column, provided as column:dtype pairs. Defaults to 32-bit float if unspecified. compression_level : int A value from 0-9 signaling the number of compression operations to apply. Higher values result in greater compression at the expense of computational overhead. ''' with h5py.File(path, 'w') as f: for level in data.keys(): f.create_group(level) for c in data[level].columns: if c not in dtype.keys(): dtype[c] = float f[level].create_dataset(c, data=data[level][c].values, dtype=dtype[c], compression="gzip", compression_opts=compression_level)
[docs]def _load_hdf(path, level='ms1'): ''' Deprecated version. Loads data frame from HDF5 container. Parameters ---------- path : str Path to input HDF5 file. level : str Access this level (group) of the HDF5 container. E.g., "ms1" or "ms2" for MS levels 1 or 2, respectively. Returns ------- :obj:`~pandas.DataFrame` Feature coordinates and intensities for the specified level. ''' with h5py.File(path, 'r') as f: g = f[level] return pd.DataFrame({k: g[k] for k in list(g.keys())})
[docs]def save_mgf(path, features, groupby='index_ms1', precursor_mz='mz_ms1', fragment_mz='mz_ms2', fragment_intensity='intensity_ms2', precursor_metadata=None, sample_metadata=None): ''' Saves data to MGF format. Parameters ---------- path : str Path to output file. features : :obj:`~pandas.DataFrame` Precursor m/z and intensities paired to MS2 spectra. groupby : str or list of str Column(s) to group fragments by. precursor_mz : str Column containing precursor m/z values. fragment_mz : str Column containing fragment m/z values. fragment_intensity : str Column containing fragment intensity values. precursor_metadata : dict Precursor metadata key:value pairs of {MGF entry name}:{column name}. sample_metadata : dict Sample metadata key:value pairs of {MGF entry name}:{value}. ''' # Initialize default fields metadata = ['TITLE', 'PEPMASS', 'PEPINTENSITY', 'CHARGE', 'PRECURSORTYPE', 'INSTRUMENTTYPE', 'INSTRUMENT', 'IONMODE', 'COLLISIONENERGY', 'SMILES', 'INCHI', 'INCHIKEY', 'FORMULA', 'RETENTIONTIME', 'DRIFTTIME', 'CCS'] metadata = OrderedDict([(x, None) for x in metadata]) # Initialize precursor metadata dict if precursor_metadata is None: precursor_metadata = {} # Initialize sample metadata dict if sample_metadata is None: sample_metadata = {} # Add required field precursor_metadata['PEPMASS'] = precursor_mz # Update defaults metadata.update(precursor_metadata) metadata.update(sample_metadata) # Build template template = 'BEGIN IONS\n' columns = [] for k in metadata: # Key from precursor metadata if k in precursor_metadata: template += k + '={}\n' columns.append(metadata[k]) # Key from sample metadata elif k in sample_metadata: template += k + '={}\n'.format(metadata[k]) # Key was not specified else: pass # Append MS2 template information template += ('{}\n' 'END IONS\n\n') # Open file object with open(path, 'w') as f: # Enumerate groups for name, grp in features.groupby(by=groupby): # Format MS2 string ms2_str = '\n'.join('{}\t{}'.format(a, b) for a, b in zip(grp[fragment_mz].values, grp[fragment_intensity].values)) # Precursor metadata values values = list(grp[columns].values[0]) # Add MS2 info values += [ms2_str] # Write to template f.write(template.format(*values))
[docs]def save_msp(path, features, groupby='index_ms1', precursor_mz='mz_ms1', fragment_mz='mz_ms2', fragment_intensity='intensity_ms2', precursor_metadata=None, sample_metadata=None): ''' Saves data to MSP format. Parameters ---------- path : str Path to output file. features : :obj:`~pandas.DataFrame` Precursor m/z and intensities paired to MS2 spectra. groupby : str or list of str Column(s) to group fragments by. precursor_mz : str Column containing precursor m/z values. fragment_mz : str Column containing fragment m/z values. fragment_intensity : str Column containing fragment intensity values. precursor_metadata : dict Precursor metadata key:value pairs of {MSP entry name}:{column name}. sample_metadata : dict Sample metadata key:value pairs of {MSP entry name}:{value}. ''' # Initialize default fields metadata = ['NAME', 'PRECURSORMZ', 'PRECURSORINTENSITY', 'PRECURSORTYPE', 'INSTRUMENTTYPE', 'INSTRUMENT', 'IONMODE', 'COLLISIONENERGY', 'SMILES', 'INCHI', 'INCHIKEY', 'FORMULA', 'RETENTIONTIME', 'DRIFTTIME', 'CCS'] metadata = OrderedDict([(x, None) for x in metadata]) # Initialize precursor metadata dict if precursor_metadata is None: precursor_metadata = {} # Initialize sample metadata dict if sample_metadata is None: sample_metadata = {} # Add required field precursor_metadata['PRECURSORMZ'] = precursor_mz # Update defaults metadata.update(precursor_metadata) metadata.update(sample_metadata) # Build template template = '' columns = [] for k in metadata: # Key from precursor metadata if k in precursor_metadata: template += k + ': {}\n' columns.append(metadata[k]) # Key from sample metadata elif k in sample_metadata: template += k + ': {}\n'.format(metadata[k]) # Key was not specified else: pass # Append MS2 template information template += ('Num Peaks: {}\n' '{}\n\n') # Open file object with open(path, 'w') as f: # Enumerate groups for name, grp in features.groupby(by=groupby): # Number of MS2 n = len(grp.index) # Format MS2 string ms2_str = '\n'.join('{}\t{}'.format(a, b) for a, b in zip(grp[fragment_mz].values, grp[fragment_intensity].values)) # Precursor metadata values values = list(grp[columns].values[0]) # Add MS2 info values += [n, ms2_str] # Write to template f.write(template.format(*values))