Source code for magine.mappings.chemical_mapper

from bioservices import UniChem
import pandas as pd
from sortedcontainers import SortedSet, SortedDict

from magine.mappings.databases.download_libraries import HMDB

try:
    import cPickle as pickle
except:  # python3 doesnt have cPickle
    import pickle
try:
    basestring
# Allows isinstance(foo, basestring) to work in Python 3
except:
    basestring = str
chem = UniChem()


[docs]class ChemicalMapper(object):
    """ Convert chemical species across various ids.

    Database was creating using HMDB

    """

    valid_columns = ['kegg_id', 'name', 'accession', 'chebi_id', 'inchikey',
                     'chemspider_id', 'biocyc_id', 'synonyms', 'iupac_name',
                     'pubchem_compound_id', 'protein_associations',
                     'ontology', 'drugbank_id', 'chemical_formula',
                     'smiles', 'metlin_id', 'average_molecular_weight',
                     'secondary_accessions'
                     ]

    def __init__(self, fresh_download=False):
        """

        Parameters
        ----------
        fresh_download: bool
            download new copy of database

        """

        self.database = None
        self._hmdb_to_chem_name = None
        self._chem_name_to_hmdb = None
        self._hmdb_to_kegg = None
        self._kegg_to_hmdb = None
        self.synonyms_to_hmdb = None
        self._drugbank_to_hmdb = None
        self._hmdb_to_protein = None
        self._hmdb_main_to_protein = None
        self._hmdb_accession_to_main = None
        hmdb_database = HMDB().load_db(fresh_download=fresh_download)
        self.database = hmdb_database.where((pd.notnull(hmdb_database)), None)
        self.database['main_accession'] = self.database['accession']
        sub_db = self.database[
            self.database['secondary_accessions'].str.contains('|', na=False)]
        new_df = tidy_split(sub_db, 'secondary_accessions', '|')
        new_df['accession'] = new_df['secondary_accessions']
        self.database = pd.concat([self.database, new_df])
        self.kegg_hmdb = chem.get_mapping("kegg_ligand", "hmdb")
        self.kegg_to_hmdb = self._to_dict("kegg_id", "main_accession")
        self.hmdb_to_chem_name = self._to_dict("main_accession", "name")

    @property
    def hmdb_to_kegg(self):
        if self._hmdb_to_kegg is None:
            self._hmdb_to_kegg = self._to_dict("accession", "kegg_id")
        return self._hmdb_to_kegg

    @property
    def chem_name_to_hmdb(self):
        if self._chem_name_to_hmdb is None:
            self._chem_name_to_hmdb = self._to_dict("name", "main_accession")
        return self._chem_name_to_hmdb

    @property
    def drugbank_to_hmdb(self):
        if self._drugbank_to_hmdb is None:
            self._drugbank_to_hmdb = self._to_dict("drugbank_id",
                                                   "main_accession")
        return self._drugbank_to_hmdb

    @property
    def hmdb_to_protein(self):
        if self._hmdb_to_protein is None:
            self._hmdb_to_protein = self._from_list_dict(
                "accession", "protein_associations"
            )
        return self._hmdb_to_protein

    @property
    def hmdb_main_to_protein(self):
        if self._hmdb_main_to_protein is None:
            self._hmdb_main_to_protein = self._from_list_dict(
                "main_accession", "protein_associations"
            )
        return self._hmdb_main_to_protein

    @property
    def hmdb_accession_to_main(self):
        if self._hmdb_accession_to_main is None:
            self._hmdb_accession_to_main = self._from_list_dict(
                "accession", "main_accession"
            )
        return self._hmdb_accession_to_main

    def _to_dict(self, key, value):
        """ creates a dictionary with a list of values for each key

        Parameters
        ----------
        key : str
        value : str

        Returns
        -------
        dict

        """
        d = self.database[[key, value]].copy()
        d.dropna(how='any', inplace=True)
        return_dict = SortedDict()
        for i, j in d.values:
            i = i.strip()
            if i not in return_dict:
                return_dict[i] = set()
            return_dict[i].add(j)
        return return_dict

    def _from_list_dict(self, key, value):
        d = self.database[[key, value]].copy()
        d.dropna(how='any', inplace=True)
        return_dict = SortedDict()
        for i, j in d.values:
            if i in return_dict:
                return_dict[i].update(j.split('|'))
            else:
                return_dict[i] = SortedSet(j.split('|'))
        return return_dict

[docs]    def check_synonym_dict(self, term, format_name):
        """ checks hmdb database for synonyms and returns formatted name

        Parameters
        ----------
        term : str
        format_name : str

        Returns
        -------
        dict


        Examples
        --------
        >>> cm = ChemicalMapper()
        >>> cm.check_synonym_dict(term='dodecene', format_name='main_accession')
        ['HMDB0000933', 'HMDB0059874']

        """
        synonyms = self.database[['synonyms', format_name]].copy()
        synonyms.dropna(how='any', inplace=True)
        synonyms['synonyms'] = synonyms['synonyms'].str.lower()

        hits = synonyms[synonyms['synonyms'].str.contains(term.lower())]
        matches = sorted(set(hits[format_name].values))
        return matches

[docs]    def print_info(self):
        """ print information about the dataframe

        Returns
        -------

        """
        print('Number of HMDB accessions = {0}'.format(
            len(self.database['accession'].unique())))
        print('Number of unique KEGG ids = {0}'.format(
            len(self.hmdb_to_kegg.keys())))
        print('Number of HMDB to KEGG mappings = {0}'.format(
            len(self.kegg_to_hmdb.values())))

[docs]    def convert_kegg_nodes(self, network):
        """
        Maps network from kegg to gene names

        Parameters
        ----------
        network : nx.DiGraph

        Returns
        -------
        dict

        """

        still_unknown = []
        hits = [i for i in set(network.nodes) if i.startswith('cpd:')]
        net_kegg_names = dict()
        net_chem_names = dict()
        net_cpd_to_hmdb = dict()
        for i in hits:

            name_stripped = i.lstrip('cpd:')
            net_kegg_names[i] = name_stripped

            if name_stripped in self.kegg_to_hmdb:
                mapping = self.kegg_to_hmdb[name_stripped]
                if isinstance(mapping, (list, set, SortedSet)):
                    names = '|'.join(set(mapping))
                    chem_names = set()
                    for name in mapping:
                        try:
                            chem_names.update(self.hmdb_to_chem_name[name])
                        except:
                            continue
                    net_cpd_to_hmdb[i] = names
                    net_chem_names[i] = order_merge(chem_names)

                elif isinstance(mapping, basestring):

                    chem_n = self.hmdb_to_chem_name[mapping]
                    net_cpd_to_hmdb[i] = mapping
                    net_chem_names[i] = '|'.join(chem_n.encode('ascii',
                                                               'ignore'))
                else:
                    print('Returned something else...', mapping)

            elif i in compound_manual:
                loc = compound_manual[i]
                net_cpd_to_hmdb[i] = loc
                if loc in self.hmdb_to_chem_name:
                    net_chem_names[i] = order_merge(
                        self.hmdb_to_chem_name[loc])
            else:
                still_unknown.append(i)
        if len(still_unknown):

            for i in still_unknown:
                name_stripped = i.lstrip('cpd:')
                if name_stripped in self.kegg_hmdb:
                    net_cpd_to_hmdb[i] = self.kegg_hmdb[name_stripped]
                # else:
                #     print("Cannot find a HMDB mapping for %s " % i)
        return net_cpd_to_hmdb, net_kegg_names, net_chem_names


# manually created based on missing in KEGG
compound_manual = {
    'cpd:C07909': 'HMDB0015015',
    'cpd:C16844': 'HMDB0001039',
    'cpd:C00076': 'HMDB0000464',
    'cpd:C00154': 'HMDB0001338',
    'cpd:C01561': 'HMDB0003550',
    'cpd:C04043': 'HMDB0003791',
    'cpd:C01165': 'HMDB0002104',
    'cpd:C00025': 'HMDB0000148',
    'cpd:C00696': 'HMDB0001403',
    'cpd:C00124': 'HMDB0000143',
}


def order_merge(species_set):
    return '|'.join(sorted(species_set))


def tidy_split(df, column, sep='|', keep=False):
    """
    Split the values of a column and expand so the new DataFrame has one split
    value per row. Filters rows where the column is missing.

    Params
    ------
    df : pandas.DataFrame
        dataframe with the column to split and expand
    column : str
        the column to split and expand
    sep : str
        the string used to split the column's values
    keep : bool
        whether to retain the presplit value as it's own row

    Returns
    -------
    pandas.DataFrame
        Returns a dataframe with the same columns as `df`.
    """
    indexes = list()
    new_values = list()
    df = df.dropna(subset=[column])
    for i, presplit in enumerate(df[column].astype(str)):
        values = presplit.split(sep)
        if keep and len(values) > 1:
            indexes.append(i)
            new_values.append(presplit)
        for value in values:
            indexes.append(i)
            new_values.append(value)
    new_df = df.iloc[indexes, :].copy()
    new_df[column] = new_values
    return new_df


if __name__ == "__main__":
    cm = ChemicalMapper()
    print(cm.check_synonym_dict(term='dodecene', format_name='main_accession'))
    print(cm.hmdb_accession_to_main['HMDB15015'])
    print(cm.hmdb_accession_to_main['HMDB0015015'])
    # print(cm.hmdb_to_kegg['HMDB0015015'])
    print(cm.kegg_to_hmdb.keys())
    print(cm.kegg_to_hmdb['C07467'])