Source code for magine.mappings.gene_mapper

try:
    basestring
# Allows isinstance(foo, basestring) to work in Python 3
except:
    basestring = str

from bioservices import HGNC, UniProt
import pandas as pd
from sortedcontainers import SortedSet, SortedDict

from magine.mappings.databases import load_hgnc, load_uniprot, load_ncbi

pd.set_option('display.width', 20000)


[docs]class GeneMapper(object):
    """
    Mapping class between common gene ids

    Database was creating by pulling down from NCBI, UNIPROT, HGNC

    """

    def __init__(self, species='hsa'):
        self.species = species
        self.hgnc = load_hgnc()
        self.ncbi = load_ncbi()
        self.uniprot = load_uniprot()
        self._gene_name_to_uniprot = None
        self._gene_name_to_alias_name = None
        self._gene_name_to_ensembl = None
        self._gene_name_to_kegg = None
        self._uniprot_to_gene_name = None
        self._uniprot_to_kegg = None
        self._kegg_to_gene_name = None
        self._kegg_to_uniprot = None
        self._ncbi_to_symbol = None
        # hackish way to initilize
        if 'x' in self.kegg_to_gene_name:
            pass
        if 'x' in self.ncbi_to_symbol:
            pass
        if 'x' in self.gene_name_to_uniprot:
            pass

    @property
    def gene_name_to_uniprot(self):
        if self._gene_name_to_uniprot is None:
            self._gene_name_to_uniprot = _dict(self.hgnc, 'symbol',
                                               'uniprot_ids')
        return self._gene_name_to_uniprot

    @property
    def gene_name_to_alias_name(self):
        if self._gene_name_to_alias_name is None:
            self._gene_name_to_alias_name = _dict(self.hgnc, 'symbol',
                                                  'alias_name')
        return self._gene_name_to_alias_name

    @property
    def gene_name_to_ensembl(self):
        if self._gene_name_to_ensembl is None:
            self._gene_name_to_ensembl = _dict(self.hgnc, 'symbol',
                                               'ensembl_gene_id')
        return self._gene_name_to_ensembl

    @property
    def uniprot_to_gene_name(self):
        if self._uniprot_to_gene_name is None:
            self._uniprot_to_gene_name = _dict(self.hgnc, 'uniprot_ids',
                                               'symbol')
        return self._uniprot_to_gene_name

    @property
    def gene_name_to_kegg(self):
        if self._gene_name_to_kegg is None:
            self._gene_name_to_kegg = _dict(self.uniprot, 'Gene_Name', 'KEGG')
        return self._gene_name_to_kegg

    @property
    def uniprot_to_kegg(self):
        if self._uniprot_to_kegg is None:
            self._uniprot_to_kegg = _dict(self.uniprot, 'uniprot', 'KEGG')
        return self._uniprot_to_kegg

    @property
    def kegg_to_gene_name(self):
        if self._kegg_to_gene_name is None:
            self._kegg_to_gene_name = _dict(self.uniprot, 'KEGG', 'Gene_Name')
            self._kegg_to_gene_name.update(manual_dict)
        return self._kegg_to_gene_name

    @property
    def kegg_to_uniprot(self):
        if self._kegg_to_uniprot is None:
            self._kegg_to_uniprot = _dict(self.uniprot, 'KEGG', 'uniprot')
        return self._kegg_to_uniprot

    @property
    def ncbi_to_symbol(self):
        if self._ncbi_to_symbol is None:
            self._ncbi_to_symbol = _dict(self.ncbi, 'GeneID', 'Symbol')
        return self._ncbi_to_symbol

[docs]    def check_synonym_dict(self, term, format_name):
        """ checks hmdb database for synonyms and returns formatted name

        Parameters
        ----------
        term : str
        format_name : str

        Returns
        -------
        dict

        """
        index = 'alias_symbol'
        synonyms = self.hgnc.copy()
        synonyms = synonyms.loc[~synonyms[index].isna()]
        synonyms[index] = synonyms[index].str.upper()

        hits = synonyms.loc[synonyms[index].str.contains(term.upper())].copy()
        hits[index] = hits[index].str.split('|')

        for i, row in hits.iterrows():
            if term in row[index]:
                return [row[format_name]]
        matches = sorted(set(hits[format_name].values))
        return matches

[docs]    def convert_kegg_nodes(self, network, species='hsa'):
        """ Convert kegg ids to HGNC gene symbol.

        Parameters
        ----------
        network : nx.DiGraph
        species : str {'hsa'}
            Main support for humans only.

        Returns
        -------
        kegg_to_gene_name, kegg_short : dict, dict
        """

        # Create the dictionary to store all conversions to be returned
        missing = set()
        unknown_genes = set()
        hits = {i for i in set(network.nodes) if i.startswith(species)}
        prefix = species + ':'
        kegg_short = {i: i.lstrip(prefix) for i in hits}

        kegg_to_gene_name = {i: self.kegg_to_gene_name[i]
                             for i in hits if i in self.kegg_to_gene_name}
        for gene, gn in kegg_to_gene_name.items():
            if isinstance(gn, basestring):
                kegg_to_gene_name[gene] = gn
            elif len(gn) == 1:
                kegg_to_gene_name[gene] = gn[0]
            else:
                found = False
                for g in gn:
                    if g in self.gene_name_to_uniprot:
                        kegg_to_gene_name[gene] = g
                        found = True
                if not found:
                    print("Found species that cannot be mapped to HGNC.")
                    print(gene, gn)
                    # kegg_to_gene_name[gene] = gn[0]
                    missing.add(gene)
        missing.update(hits.difference(kegg_to_gene_name))

        # check stores dictionaries
        for gene in missing:
            name_stripped = gene.lstrip(prefix)
            if int(name_stripped) in self.ncbi_to_symbol:
                new = self.ncbi_to_symbol[int(name_stripped)][0]
                if not isinstance(new, float):
                    kegg_to_gene_name[gene] = new
                else:
                    unknown_genes.add(gene)
            else:
                unknown_genes.add(gene)

        if len(unknown_genes) > 0:
            add_dict, missed = self.kegg_to_symbol_through_uniprot(
                unknown_genes)
            kegg_to_gene_name.update(add_dict)

            add_dict, final_missed = self.kegg_to_hugo(missed, species)
            kegg_to_gene_name.update(add_dict)

            print("{} mappings not found from kegg to"
                  " gene name".format(len(final_missed)))
            print(final_missed)

        return kegg_to_gene_name, kegg_short

[docs]    def kegg_to_symbol_through_uniprot(self, unknown_genes):
        # create string to call uniprot for mapping
        search_string = '\t'.join(unknown_genes)
        kegg_to_gene_name = dict()
        missing = set()
        uniprot = UniProt(verbose=True)
        # This is where it gets tricky. Checking to see if there is a uniprot
        # mapping for the species, if not, trying from KEGG side. Sometimes
        # kegg  links to a different uniprot, or uniprot links to a diff kegg.
        uni_dict = dict(uniprot.mapping("KEGG_ID", "ACC", query=search_string))
        for i in unknown_genes:
            if i in uni_dict:
                for n in uni_dict[i]:
                    x = uniprot.search("accession:{}".format(n),
                                       columns='genes(PREFERRED),reviewed,id',
                                       limit=1)
                    _, data = x.rstrip('\n').split('\n')
                    name, review, entry = data.split('\t')
                    if n != entry:
                        print(i, n, entry, x, "dont match")
                    elif review == 'reviewed':
                        kegg_to_gene_name[i] = name

            else:
                missing.add(i)
        print("{} mappings not found from kegg to"
              " gene name".format(len(missing)))
        print(missing)
        return kegg_to_gene_name, missing

[docs]    def kegg_to_hugo(self, genes, species='hsa'):
        """
        Converts all KEGG names to HGNC

        Parameters
        ----------
        genes : list
        species : str

        Returns
        -------
        dict
        """
        prefix = species + ':'
        hugo = HGNC(verbose=True)
        hugo_dict = {}
        not_found = set()
        for i in genes:
            tmp_name = i.lstrip(prefix)
            mapping = hugo.search(tmp_name)
            if 'response' in mapping:
                response = mapping['response']
                if 'numFound' in response:
                    if response['numFound'] == 0:
                        not_found.add(i)
                        continue
                    elif response['numFound'] == 1:
                        docs = response['docs'][0]
                        hugo_dict[i] = docs['symbol']
                        continue
                    else:
                        if 'symbol' in response['docs'][0]:
                            hugo_dict[i] = response['docs'][0]['symbol']
            else:
                not_found.add(i)
        if not_found != 0:
            print("{} not found after HGNC mapping".format(len(not_found)))
            print("{} ".format(not_found))
        return hugo_dict, not_found


def _dict(data, key, value):
    """
    creates a dictionary with a list of values for each key

    Parameters
    ----------
    data : pandas.DataFrame
    key : str
    value : str

    Returns
    -------

    """
    return_dict = SortedDict()

    d = data[[key, value]].copy()
    d.dropna(how='any', inplace=True)

    for i, j in d.values:
        if i in return_dict:
            return_dict[i].add(j)
        else:
            return_dict[i] = SortedSet([j])
    return return_dict


manual_dict = {'hsa:857': 'CAV1',
               'hsa:2250': 'FGF5',
               'hsa:5337': 'PLD1',
               'hsa:4312': 'MMP1',
               'hsa:102723407': 'IGHV4OR15-8',
               'hsa:100132074': 'FOXO6',
               'hsa:728635': 'DHRS4L1',
               'hsa:10411': 'RAPGEF3',
               'hsa:100101267': 'POM121C',
               'hsa:2768': 'GNA12',
               'hsa:2044': 'EPHA5',
               'hsa:100533467': 'BIVM-ERCC5',
               'hsa:7403': 'KDM6A',
               'hsa:1981': 'EIF4G1',
               'hsa:2906': 'GRIN2D',
               'hsa:4088': 'SMAD3',
               'hsa:6776': 'STAT5A',
               'hsa:182': 'JAG1',
               'hsa:3708': 'ITPR1',
               'hsa:1293': 'COL6A3',
               'hsa:93034': 'NT5C1B',
               'hsa:574537': 'UGT2A2',
               'hsa:11044': 'PAPD7',
               'hsa:57292': 'KIR2DL5A',
               'hsa:107984026': 'ACAP3'
               }

if __name__ == '__main__':
    gm = GeneMapper()