Source code for magine.enrichment.enrichr

import json
import logging
import os
import re

import numpy as np
import pandas as pd
import requests

from magine.enrichment.enrichment_result import EnrichmentResult
from magine.logging import get_logger
from magine.plotting.species_plotting import write_table_to_html

logger = get_logger(__name__, log_level=logging.INFO)

_valid_libs = set()
with open(os.path.join(os.path.dirname(__file__),
                       '_valid_enricher_libs.txt'), 'r') as f:
    for n in f.read().splitlines():
        _valid_libs.add(n)

gene = 'gene'

db_types = {
    'histone': [
        'Epigenomics_Roadmap_HM_ChIP-seq',
        'ENCODE_Histone_Modifications_2015',
        'ESCAPE',
    ],
    'mrna': [
        'TargetScan_microRNA_2017',
        'miRTarBase_2017',
    ],
    'kinases': [
        'KEA_2015',
        'Kinase_Perturbations_from_GEO_down',
        'Kinase_Perturbations_from_GEO_up',
        'Phosphatase_Substrates_from_DEPOD',
        'ARCHS4_Kinases_Coexp',
        'ARCHS4_IDG_Coexp',
    ],
    'transcription': [
        'ChEA_2016',
        'TRANSFAC_and_JASPAR_PWMs',
        'ARCHS4_TFs_Coexp',
        'Enrichr_Submissions_TF-Gene_Coocurrence',
        'Genome_Browser_PWMs',
        'ENCODE_and_ChEA_Consensus_TFs_from_ChIP-X',
        'ENCODE_TF_ChIP-seq_2015',
        'TF-LOF_Expression_from_GEO',
        'Transcription_Factor_PPIs'
    ],
    'pathways': [
        'KEGG_2016',
        'WikiPathways_2016',
        'Reactome_2016',
        'BioCarta_2016',
        'NCI-Nature_2016',
        'Panther_2016',
    ],
    'complex': [
        'NURSA_Human_Endogenous_Complexome',
        'CORUM',
        'PPI_Hub_Proteins',
        'BioPlex_2017',
    ],
    'ontologies': [
        'GO_Cellular_Component_2018',
        'GO_Biological_Process_2018',
        'GO_Molecular_Function_2018',
        'MGI_Mammalian_Phenotype_2017',
        'Jensen_COMPARTMENTS',
    ],
    'drug': [
        'DrugMatrix',
        'Drug_Perturbations_from_GEO_2014',
        'Old_CMAP_up',
        'Old_CMAP_down',
        'LINCS_L1000_Chem_Pert_up',
        'LINCS_L1000_Chem_Pert_down',
        'LINCS_L1000_Ligand_Perturbations_up',
        'LINCS_L1000_Ligand_Perturbations_down',
    ],
    'disease': [
        'OMIM_Disease',
        'OMIM_Expanded',
        'Jensen_DISEASES',
        'Human_Phenotype_Ontology'
    ],
    'cell_type': [
        'ARCHS4_Tissues',
        'ARCHS4_Cell-lines',
        'Allen_Brain_Atlas_up',
        'Allen_Brain_Atlas_down',
        'Cancer_Cell_Line_Encyclopedia',
        'Human_Gene_Atlas',
        'NCI-60_Cancer_Cell_Lines',
        # 'Jensen_TISSUES',
    ],
    'crowd': [
        'Disease_Perturbations_from_GEO_down',
        'Disease_Perturbations_from_GEO_up',
        'Drug_Perturbations_from_GEO_down',
        'Drug_Perturbations_from_GEO_up',
        'RNA-Seq_Disease_Gene_and_Drug_Signatures_from_GEO',
        'Aging_Perturbations_from_GEO_up',
        'Aging_Perturbations_from_GEO_down',
        'Ligand_Perturbations_from_GEO_up',
        'Ligand_Perturbations_from_GEO_down',
        'MCF7_Perturbations_from_GEO_up',
        'MCF7_Perturbations_from_GEO_down',
        'Microbe_Perturbations_from_GEO_up',
        'Microbe_Perturbations_from_GEO_down',
        'SysMyo_Muscle_Gene_Sets',
    ]
}
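# Usage sketch (illustrative only, not part of the library): each db_types
# category is a list of Enrichr library names, and Enrichr.run accepts either
# a single library name or a list of them, so a whole category can be run at
# once. Requires network access to the Enrichr service.
#
#   >>> e = Enrichr()
#   >>> df = e.run(['BAX', 'BCL2', 'CASP3'], gene_set_lib=db_types['pathways'])
#   >>> sorted(df['db'].unique())  # one block of results per library queried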


def get_libraries():
    url = 'http://maayanlab.cloud/Enrichr/datasetStatistics'
    libs_json = json.loads(requests.get(url).text)
    return [lib['libraryName'] for lib in libs_json['statistics']]
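# Usage sketch (assumes network access): get_libraries() queries the live
# Enrichr service, whereas _valid_libs above is the cached list shipped with
# the package, so the two may differ as Enrichr adds new libraries.
#
#   >>> libs = get_libraries()
#   >>> len(libs)  # number of libraries currently hosted by Enrichr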


class Enrichr(object):
    """ Interface to the Enrichr enrichment analysis web service. """

    _query = '{url}/enrich?userListId={list_id}&backgroundType={lib}'

    def __init__(self, verbose=False):
        self._url = 'http://maayanlab.cloud/Enrichr/'
        self._valid_libs = _valid_libs
        self.verbose = verbose
    def print_valid_libs(self):
        """ Print a list of all available libraries EnrichR has to offer. """
        for lib_name in sorted(self._valid_libs):
            print(lib_name)
    def run(self, list_of_genes, gene_set_lib='GO_Biological_Process_2017'):
        """ Run enrichment analysis on a single list of genes.

        Parameters
        ----------
        list_of_genes : list_like
            List of genes using HGNC gene names
        gene_set_lib : str or list
            Name of gene set library.
            To print options use Enrichr.print_valid_libs

        Examples
        --------
        >>> import pandas as pd
        >>> pd.set_option('display.max_colwidth', 40)
        >>> pd.set_option('precision', 3)
        >>> e = Enrichr()
        >>> df = e.run(['BAX', 'BCL2', 'CASP3', 'CASP8'],
        ...            gene_set_lib='Reactome_2016')
        >>> print(df[['term_name','combined_score']].head(5)) #doctest: +NORMALIZE_WHITESPACE
                                         term_name  combined_score
        0  intrinsic pathway for apoptosis hsa ...       11814.410
        1               apoptosis hsa r-hsa-109581        2365.141
        2  programmed cell death hsa r-hsa-5357801        2313.527
        3  caspase-mediated cleavage of cytoske...       10944.261
        4  caspase activation via extrinsic apo...        4245.542

        Returns
        -------
        df : EnrichmentResult
            Results from enrichR
        """
        if not isinstance(list_of_genes, (list, set)):
            raise AssertionError("list_of_genes must be list like")

        logger.debug("Running Enrichr with gene set {}".format(gene_set_lib))

        user_list_id = self._add_gene_list(list_of_genes)
        if isinstance(gene_set_lib, str):
            df = self._run_id(user_list_id, gene_set_lib)
        else:
            df = self._run_list_of_dbs(user_list_id, gene_set_lib)

        init_size = len(df)
        if init_size == 0:
            logger.debug("No terms returned")
            return df

        # flag all terms with an adj_p_value at or below 0.05 as significant
        df['significant'] = False
        crit = (df['adj_p_value'] <= 0.05)
        df.loc[crit, 'significant'] = True

        after_size = len(df)
        if init_size != after_size:
            raise AssertionError('not the same shape {}'.format(gene_set_lib))
        logger.debug("Done calling Enrichr.")
        return df
    def run_samples(self, sample_lists, sample_ids,
                    gene_set_lib='GO_Biological_Process_2017', save_name=None,
                    create_html=False, out_dir=None, run_parallel=False,
                    exp_data=None, pivot=False):
        """ Run enrichment analysis on a list of samples.

        Parameters
        ----------
        sample_lists : list_like
            List of lists of genes for enrichment analysis
        sample_ids : list
            List of ids for the provided sample lists
        gene_set_lib : str or list
            Type of gene set, refer to Enrichr.print_valid_libs
        save_name : str, optional
            If provided, a pivoted table of term_ids vs sample_ids is saved
            to file
        create_html : bool
            Create html output with plots of species across samples
        out_dir : str
            If create_html, all html plots are placed into this directory
        run_parallel : bool
            If create_html, plots are created using multiprocessing
        exp_data : magine.data.ExperimentalData
            Must be provided if create_html=True
        pivot : bool

        Examples
        --------

        .. plot::
            :context: close-figs

            >>> import pandas as pd
            >>> import matplotlib.pyplot as plt
            >>> from magine.enrichment.enrichr import Enrichr
            >>> pd.set_option('display.max_colwidth', 40)
            >>> pd.set_option('precision', 3)
            >>> samples = [['BAX', 'BCL2', 'CASP3', 'CASP8'],
            ...            ['ATR', 'ATM', 'TP53', 'CHEK1']]
            >>> sample_ids = ['apoptosis', 'dna_repair']
            >>> e = Enrichr()
            >>> df = e.run_samples(samples, sample_ids,
            ...                    gene_set_lib='Reactome_2016')
            >>> print(df[['term_name','combined_score']].head(5)) #doctest: +NORMALIZE_WHITESPACE
                                             term_name  combined_score
            0  intrinsic pathway for apoptosis hsa ...       11814.410
            1               apoptosis hsa r-hsa-109581        2365.141
            2  programmed cell death hsa r-hsa-5357801        2313.527
            3  caspase-mediated cleavage of cytoske...       10944.261
            4  caspase activation via extrinsic apo...        4245.542
            >>> df.filter_multi(rank=10, inplace=True)
            >>> df['term_name'] = df['term_name'].str.split('_').str.get(0)
            >>> fig = df.sig.heatmap(figsize=(6, 6), linewidths=.05)

        Returns
        -------
        EnrichmentResult
        """
        if not isinstance(sample_lists, list):
            raise AssertionError("List required")
        if not isinstance(sample_lists[0], (list, set)):
            raise AssertionError("List of lists required")

        df_final = self.run(sample_lists[0], gene_set_lib)
        df_final['sample_id'] = sample_ids[0]

        for count, (i, j) in enumerate(zip(sample_lists[1:],
                                           sample_ids[1:])):
            df = self.run(i, gene_set_lib)
            df['sample_id'] = j
            df_final = df_final.append(df, ignore_index=True)

        # remove terms that do not have at least 1 significant hit across
        # any of the sample lists provided
        df_final.require_n_sig(n_sig=1, inplace=True)
        df_final = EnrichmentResult(df_final)

        if save_name:
            s_name = '{}_enrichr'.format(save_name)
            if pivot:
                # pivot output
                p_df = pd.pivot_table(
                    df_final, index=['term_name', 'db'],
                    columns='sample_id', aggfunc='first', fill_value=np.nan,
                    values=['term_name', 'rank', 'p_value', 'z_score',
                            'combined_score', 'adj_p_value', 'genes',
                            'significant'],
                )
                p_df.to_excel('{}.xlsx'.format(s_name), merge_cells=True)
            # save files
            df_final.to_csv('{}.csv'.format(s_name), index=False)

        if create_html:
            if exp_data is None:
                raise AssertionError("exp_data required for plots")
            write_table_to_html(data=df_final, save_name=save_name,
                                out_dir=out_dir, run_parallel=run_parallel,
                                exp_data=exp_data)

        return df_final
    def _run_id(self, list_id, gene_set_lib):
        if gene_set_lib not in _valid_libs:
            raise AssertionError(
                "{} not in valid ids {}".format(gene_set_lib, _valid_libs)
            )
        q = self._query.format(url=self._url, list_id=list_id,
                               lib=gene_set_lib)
        response = requests.get(q, timeout=300)
        if not response.ok:
            logger.warning("{} library failed to run on enrichR's side. "
                           "View their response for more info: {}"
                           "".format(gene_set_lib, q))
            return EnrichmentResult()

        data = json.loads(response.text)
        if gene_set_lib not in data:
            raise Exception("{} not in enrichR".format(gene_set_lib))
        if len(data[gene_set_lib]) == 0:
            return EnrichmentResult()

        #####
        # EnrichR returns a list of entries, each entry containing
        # Rank, Term name, P-value, Z-score, Combined score,
        # Overlapping genes, Adjusted p-value, Old p-value,
        # Old adjusted p-value
        #####
        df = EnrichmentResult(
            data[gene_set_lib],
            columns=['rank', 'term_name', 'p_value', 'z_score',
                     'combined_score', 'gene_hits', 'adj_p_value', '_', '_']
        )

        def compress_genes(row):
            return ','.join(g for g in sorted(row['gene_hits']))

        def get_length(row):
            return len(row['gene_hits'])

        # compress genes into a string and count the hits
        df['genes'] = df.apply(compress_genes, axis=1)
        df['n_genes'] = df.apply(get_length, axis=1)

        # remove rows with null term names
        df = df[~df['term_name'].isnull()].copy()

        # keep only desired columns
        cols = ['term_name', 'rank', 'p_value', 'z_score', 'combined_score',
                'adj_p_value', 'genes', 'n_genes']
        df = df[cols]
        df['db'] = gene_set_lib
        df['combined_score'] = pd.to_numeric(df['combined_score'],
                                             errors='coerce')

        with pd.option_context('mode.use_inf_as_null', True):
            # remove rows without a term name, NaNs, or infinite values
            df = df.dropna(
                subset=['term_name', 'adj_p_value', 'combined_score']
            )
        try:
            df = _prepare_output(df)
        except Exception:
            logger.warning("Unable to simplify enrichR term names. This "
                           "could be due to a change in enrichR formatting. "
                           "Returning the default output.")
        return df

    def _run_list_of_dbs(self, gene_list_id, databases):
        data = self._run_id(gene_list_id, databases[0])
        for db in databases[1:]:
            data = data.append(self._run_id(gene_list_id, db),
                               ignore_index=True)
            logger.debug('\t\t{}/{} databases'.format(db, len(databases)))
        return data

    def _add_gene_list(self, gene_list):
        """ Upload a gene list to enrichR.

        Parameters
        ----------
        gene_list : list

        Returns
        -------
        str :
            userListId that maps to the gene list provided, used within
            enrichR to gather enrichment results.
        """
        genes_str = '\n'.join(gene_list)
        payload = {
            'list': (None, genes_str),
            'description': (None, 'MAGINE analysis')
        }

        response = requests.post(self._url + '/addList', files=payload)
        if not response.ok:
            raise Exception('Error analyzing gene list', response.ok)
        data = json.loads(response.text)
        return data['userListId']
def _prepare_output(df):
    df['term_name'] = df.apply(clean_term_names, axis=1)
    return df
def clean_term_names(row):
    """ Simplify a term name based on the database it came from. """
    term_name = row['term_name']
    if not isinstance(term_name, str):
        return term_name
    db = row['db']
    if db in ['GO_Biological_Process_2018', 'GO_Molecular_Function_2018',
              'GO_Cellular_Component_2018', 'GO_Biological_Process_2017',
              'GO_Molecular_Function_2017', 'GO_Cellular_Component_2017',
              'GO_Biological_Process_2017b', 'GO_Molecular_Function_2017b',
              'GO_Cellular_Component_2017b']:
        if 'GO:' in term_name:
            term_name = term_name.split('(GO:', 1)[0]
        elif 'go:' in term_name:
            term_name = term_name.split('(go:', 1)[0]
    elif db == 'Human_Phenotype_Ontology':
        if 'HP:' in term_name:
            term_name = term_name.split('(HP:', 1)[0]
    elif db == 'MGI_Mammalian_Phenotype_2017':
        if term_name.startswith('MP:'):
            term_name = term_name.split(' ', 1)[1]
    elif db == 'DrugMatrix':
        try:
            drug_name = re.search(r'^(.*)(-\d*.*\d_)', term_name).group(1)
            direction = re.search(r'-(.{2})$', term_name).group(0)
            term_name = drug_name + direction
        except AttributeError:
            # no regex match; leave the term name unchanged
            pass
    elif db in ['Old_CMAP_down', 'Old_CMAP_up']:
        term_name = term_name.rsplit('-', 1)[0]
    elif db in ['Ligand_Perturbations_from_GEO_down',
                'Ligand_Perturbations_from_GEO_up']:
        term_name = term_name.split('_', 1)[0]

    term_name = term_name.strip()
    term_name = term_name.lower()
    for i, j in replace_pairs:
        if i in term_name:
            term_name = term_name.replace(i, j)
    return term_name
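# Illustrative sketch (the term name below is hypothetical, not taken from the
# library's tests): for a GO library the "(GO:...)" accession is stripped and
# the remainder is lower-cased.
#
#   >>> clean_term_names({'term_name': 'Apoptotic Process (GO:0006915)',
#   ...                   'db': 'GO_Biological_Process_2018'})
#   'apoptotic process'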
def clean_lincs(df):
    """ Cleans the LINCS database term_names returned from enrichR.

    Parameters
    ----------
    df : pd.DataFrame

    Returns
    -------
    pd.DataFrame
    """
    pattern = re.compile(
        r'(?P<id>\w+) (?P<cell>\w+) (?P<time>\w+)-(?P<drug>.*)-(\S*)'
    )

    def get_drug(row):
        db = row['db']
        term_name = row['term_name']
        if db not in ['LINCS_L1000_Chem_Pert_up',
                      'LINCS_L1000_Chem_Pert_down']:
            return term_name
        try:
            return pattern.search(term_name).group('drug')
        except AttributeError:
            # If the string doesn't match the pattern above, there is no way
            # to parse it reliably. A few entries use spaces instead of "_",
            # which is hard to parse because drug names can also contain
            # spaces ("sulfide salts"), e.g.
            # cpc006 snuc5 6h-quinine hemisulfate salt monohydrate-10.0
            return term_name

    df['term_name'] = df.apply(get_drug, axis=1)
    return df
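# Illustrative sketch (the term name below is hypothetical, following the
# "<id> <cell> <time>-<drug>-<dose>" layout the regex above expects):
#
#   >>> df = pd.DataFrame([{'db': 'LINCS_L1000_Chem_Pert_up',
#   ...                     'term_name': 'ABC123 MCF7 24h-vorinostat-10.0'}])
#   >>> clean_lincs(df)['term_name'].iloc[0]
#   'vorinostat'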
def clean_drug_pert_geo(df):
    """ Cleans Drug_Perturbations_from_GEO_2014 term_names from enrichR. """

    def get_drug(row):
        db = row['db']
        term_name = row['term_name']
        if db != 'Drug_Perturbations_from_GEO_2014':
            return term_name
        try:
            segs = term_name.split('_')
            drug_name = segs[0] + '_' + segs[-1][0] + segs[-1][-1]
        except IndexError:
            # term name did not follow the expected '_'-delimited layout
            drug_name = term_name
        return drug_name

    df['term_name'] = df.apply(get_drug, axis=1)
    return df
def clean_drug_dbs(data):
    df_copy = data.copy()
    if 'Drug_Perturbations_from_GEO_2014' in df_copy.db.unique():
        df_copy = clean_drug_pert_geo(df_copy)
    if 'LINCS_L1000_Chem_Pert_up' in df_copy.db.unique() or \
            'LINCS_L1000_Chem_Pert_down' in df_copy.db.unique():
        df_copy = clean_lincs(df_copy)
    return df_copy
def clean_tf_names(data):
    """ Cleans transcription factor databases by removing everything
    after '_'.

    Parameters
    ----------
    data : pd.DataFrame

    Returns
    -------
    pd.DataFrame
    """

    def return_single_tf(row):
        """ Gets the TF name only. """
        tf = row['term_name']
        if '_' in tf:
            tf = tf.split('_')[0]
        return tf.upper()

    tf_dbs = ['ARCHS4_TFs_Coexp',
              'ChEA_2016',
              'ENCODE_and_ChEA_Consensus_TFs_from_ChIP-X',
              'ENCODE_TF_ChIP-seq_2015',
              'Enrichr_Submissions_TF-Gene_Coocurrence',
              'TRANSFAC_and_JASPAR_PWMs',
              'TF-LOF_Expression_from_GEO',
              'Transcription_Factor_PPIs']

    tfs = data[data['db'].isin(tf_dbs)].copy()
    tfs['term_name'] = tfs.apply(return_single_tf, axis=1)
    return tfs
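# Illustrative sketch (hypothetical ChEA-style term name): everything after
# the first '_' is dropped and the TF symbol is upper-cased. Note that only
# rows whose 'db' is in tf_dbs are returned.
#
#   >>> df = pd.DataFrame([{'db': 'ChEA_2016', 'term_name': 'Stat3_chip-seq'}])
#   >>> clean_tf_names(df)['term_name'].iloc[0]
#   'STAT3'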
replace_pairs = [
    ('mus musculus', 'mus'),
    ('homo sapiens', 'hsa'),
    ('rattus norvegicus', 'rat'),
    ('oncorhynchus mykiss', 'onc_mykiss'),
    ('macaca fascicularis', 'mfa'),
    ('oryzias latipes', 'ola'),
    ('sus scrofa', 'ssc'),
    ('danio rerio', 'dre'),
    ('bos taurus', 'bta'),
    ('dictyostelium discoideum', 'dicty'),
    ('myzus persicae', 'm.persicae'),
    ('staphylococcus aureus', 's.aureus'),
    ('escherichia coli', 'e.coli'),
    ('pseudomonas aeruginosa', 'p.aeruginosa'),
    ('drosophila melanogaster', 'fly'),
    ('mycobacterium tuberculosis', 'm.tuberculosis'),
    ('hordeum vulgare', 'h.vulgare'),
    (' (mouse)', '_mus'),
    (' (human)', '_hsa'),
    ('Homo sapiens', 'hsa'),
    ('Mus musculus', 'mus'),
    ('hg19', '_hsa'),
    ('mm9', '_mus'),
    (' hsa r-hsa', '_hsa'),
]

standard_dbs = []
for db in ['drug', 'disease', 'ontologies', 'pathways', 'transcription',
           'kinases', 'histone', 'cell_type']:
    standard_dbs += db_types[db]
def get_background_list(lib_name):
    """ Return the reference gene list for a given gene set library.

    Parameters
    ----------
    lib_name : str

    Returns
    -------
    list of dict
    """
    enrichment_url = 'http://maayanlab.cloud/Enrichr/geneSetLibrary'
    query_string = '?userListId&libraryName={}'
    response = requests.get(
        enrichment_url + query_string.format(lib_name)
    )
    if not response.ok:
        raise Exception('Error fetching enrichment results')

    results = json.loads(response.text)
    term_to_gene = []
    if lib_name not in results:
        raise AssertionError("Something happened when calling enrichR.")
    for term, genes_dict in results[lib_name]['terms'].items():
        genes = sorted(set(i for i in genes_dict))
        term_to_gene.append(
            dict(term=term.lower(), gene_list=genes, n_genes=len(genes))
        )
    return term_to_gene
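# Usage sketch (requires network access; only the output shape is shown, the
# values depend on the live Enrichr library): each entry is a dict with
# 'term', 'gene_list', and 'n_genes' keys.
#
#   >>> ref = get_background_list('Reactome_2016')
#   >>> sorted(ref[0].keys())
#   ['gene_list', 'n_genes', 'term']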
def run_enrichment_for_project(exp_data, project_name, databases=None,
                               output_path=None):
    """ Run enrichment across all samples and sources of a project.

    Parameters
    ----------
    exp_data : magine.data.experimental_data.ExperimentalData
    project_name : str
    databases : list
    output_path : str
        Location to save all individual enrichment output files created.

    """
    if databases is None:
        databases = standard_dbs
    logger.info("Running enrichment on project")
    logger.info("Running {} databases".format(len(databases)))
    e = Enrichr(verbose=True)
    all_df = []

    if output_path is None:
        _dir = os.path.join(os.getcwd(), 'enrichment_output')
    else:
        _dir = output_path
    if not os.path.exists(_dir):
        logger.info("Creating output directory: {}".format(_dir))
        os.mkdir(_dir)

    def _run_new(samples, timepoints, category):
        logger.info("Running {}".format(category))
        for genes, sample_id in zip(samples, timepoints):
            if not len(genes):
                continue
            logger.info('sample_id = {}'.format(sample_id))
            current = "{}_{}_{}".format(category, sample_id, project_name)
            name = os.path.join(_dir, current + '.csv.gz')
            try:
                # reuse cached results if this sample was already run
                df = pd.read_csv(name, index_col=None, encoding='utf-8')
            except Exception:
                # no cached result (or it could not be read); run enrichment
                df = e.run(genes, databases)
                df['sample_id'] = sample_id
                df['category'] = category
                df.to_csv(name, index=False, encoding='utf-8',
                          compression='gzip')
            df['sample_id'] = sample_id
            df['category'] = category
            all_df.append(df)

    # run each experimental 'source' by time point
    # (label-free, ph-silac, etc. are all separate)
    for source in exp_data.exp_methods:
        df = exp_data[source].sig
        col_options = df['species_type'].unique()
        # make sure there is only a single species type and it is protein;
        # this ensures the RNA-seq doesn't get run twice
        if len(col_options) == 1 and col_options[0] == 'protein':
            _run_new(df.by_sample, df.sample_ids, '{}_both'.format(source))
            _run_new(df.up_by_sample, df.sample_ids, '{}_up'.format(source))
            _run_new(df.down_by_sample, df.sample_ids,
                     '{}_down'.format(source))

    # run all protein labeled species grouped by time point
    # (label-free, ph-silac, etc. are all combined)
    if len(exp_data.proteins.sample_ids) != 0:
        sample = exp_data.proteins.sig
        if len(sample.source.unique()) == 1:
            print("Only single source found. "
                  "No aggregation enrichment to perform")
        else:
            _run_new(sample.by_sample, sample.sample_ids, 'proteomics_both')
            _run_new(sample.up_by_sample, sample.sample_ids, 'proteomics_up')
            _run_new(sample.down_by_sample, sample.sample_ids,
                     'proteomics_down')

    # run all RNA labeled species grouped by time point
    if len(exp_data.rna.sample_ids) != 0:
        sample = exp_data.rna.sig
        _run_new(sample.by_sample, sample.sample_ids, 'rna_both')
        _run_new(sample.down_by_sample, sample.sample_ids, 'rna_down')
        _run_new(sample.up_by_sample, sample.sample_ids, 'rna_up')

    # merge all outputs
    final_df = pd.concat(all_df, ignore_index=True)
    final_df = final_df[
        ['term_name', 'rank', 'combined_score', 'adj_p_value', 'genes',
         'n_genes', 'sample_id', 'category', 'db']
    ]
    final_df.dropna(inplace=True)

    # enrichR started returning infinite values, which come back as str
    # rather than numeric. To filter them, the column must be converted
    # to numeric first.
    final_df['combined_score'] = pd.to_numeric(final_df['combined_score'],
                                               errors='coerce')
    with pd.option_context('mode.use_inf_as_null', True):
        # remove rows without a term name, NaNs, or infinite values
        final_df = final_df.dropna(
            subset=['term_name', 'adj_p_value', 'combined_score']
        )

    # add the significant column
    final_df.loc[(final_df['adj_p_value'] <= 0.05) &
                 (final_df['combined_score'] > 0.0), 'significant'] = True

    logger.info("Saving output: {}_enrichment.csv.gz".format(project_name))
    final_df.to_csv('{}_enrichment.csv.gz'.format(project_name),
                    encoding='utf-8', compression='gzip', index=False)
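# Usage sketch (assumes ``exp_data`` is an already-constructed
# magine.data.experimental_data.ExperimentalData object; the project name and
# output path below are placeholders):
#
#   >>> run_enrichment_for_project(exp_data, 'my_project',
#   ...                            databases=db_types['pathways'],
#   ...                            output_path='enrichment_output')
#
# Per-sample results are cached as csv.gz files in ``output_path`` and the
# merged table is written to 'my_project_enrichment.csv.gz'.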