Source code for magine.enrichment.enrichment_result

import itertools
import operator
from collections import OrderedDict
from itertools import combinations

import numpy as np
import pandas as pd

from magine.data.base import BaseData
from magine.plotting.heatmaps import cluster_distance_mat

# Python 2 ships a builtin ``basestring``; Python 3 does not.  Looking the
# name up raises NameError on Python 3, in which case it is aliased to ``str``
# so that ``isinstance(foo, basestring)`` works on both major versions.
try:
    basestring
except NameError:
    # Only the missing-name case is handled; anything else should propagate.
    basestring = str

# Module-level label; not referenced anywhere in the visible part of this file.
sig = 'significant'


def load_enrichment_csv(file_name, **args):
    """ Read a CSV file and wrap its contents in an EnrichmentResult

    Parameters
    ----------
    file_name : str
        Location of the CSV file, handed to ``pandas.read_csv``.
    **args
        Extra keyword arguments forwarded unchanged to ``pandas.read_csv``.

    Returns
    -------
    EnrichmentResult

    """
    return EnrichmentResult(pd.read_csv(file_name, **args))


class EnrichmentResult(BaseData):
    """ Enrichment results table with term-level filtering helpers

    The methods below read the columns ``term_name``, ``genes`` (a comma
    separated string of gene names), ``sample_id``, ``db``, ``category``,
    ``rank``, ``adj_p_value`` and ``combined_score``.

    NOTE(review): ``BaseData`` is presumably a ``pandas.DataFrame`` subclass;
    the methods rely on ``copy``, ``sort_values`` and ``_update_inplace``.
    """

    def __init__(self, *args, **kwargs):
        super(EnrichmentResult, self).__init__(*args, **kwargs)
        # Column-name settings; presumably consumed by BaseData helpers
        # (not used directly in this class) -- confirm against BaseData.
        self._index = 'term_name'
        self._identifier = 'term_name'
        self._value_name = 'combined_score'
        self._sample_id_name = 'sample_id'

    @property
    def _constructor(self):
        # Class used when a new object of this type has to be built.
        return EnrichmentResult

    def filter_rows(self, column, options, inplace=False):
        """
        Keep only the rows whose `column` value is one of `options`.

        Parameters
        ----------
        column : str
            Name of the column to filter on.
        options : str, list, tuple, set
            A single value or a collection of values to keep. A message is
            printed for every value that does not occur in `column`. A single
            unknown value leaves the data unfiltered; unknown values inside a
            collection simply match nothing. Any other type is ignored.
        inplace : bool
            Filter inplace

        Returns
        -------
        EnrichmentResult or None
            Filtered copy, or None when `inplace` is True.
        """
        new_data = self.copy()
        valid_opts = sorted(new_data[column].unique())
        if isinstance(options, basestring):
            if options not in valid_opts:
                print('{} not in {}'.format(options, valid_opts))
            else:
                new_data = new_data[new_data[column] == options]
        elif isinstance(options, (list, tuple, set)):
            wanted = list(options)
            for opt in wanted:
                if opt not in valid_opts:
                    print('{} not in {}'.format(opt, valid_opts))
            new_data = new_data[new_data[column].isin(wanted)]

        if inplace:
            self._update_inplace(new_data)
        else:
            return new_data

    def filter_multi(self, p_value=None, combined_score=None, db=None,
                     sample_id=None, category=None, rank=None, inplace=False):
        """
        Filters an enrichment array.

        This is an aggregate function that allows one to filter an entire
        dataframe with a single function call. Every criterion left as None
        is skipped.

        Parameters
        ----------
        p_value : float
            Keeps rows whose 'adj_p_value' is less than or equal
        combined_score : float
            Keeps rows whose 'combined_score' is greater than or equal
        db : str, list
            Passed to `filter_rows` for column 'db'
        sample_id : str, list
            Passed to `filter_rows` for column 'sample_id'
        category : str, list
            Passed to `filter_rows` for column 'category'
        rank : int
            Keeps rows whose 'rank' is less than or equal. Values that are
            not int or float are ignored.
        inplace : bool
            Filter inplace

        Returns
        -------
        new_data : EnrichmentResult or None
            Filtered copy, or None when `inplace` is True.

        Raises
        ------
        AssertionError
            If `p_value` or `combined_score` is given but is not a number.
        """
        new_data = self.copy()
        if p_value is not None:
            if not isinstance(p_value, (int, float)):
                raise AssertionError("p_value must be a float or int")
            new_data = new_data[new_data['adj_p_value'] <= p_value]
        if combined_score is not None:
            if not isinstance(combined_score, (int, float)):
                raise AssertionError("combined_score must be a float or int")
            new_data = new_data[new_data['combined_score'] >= combined_score]
        if isinstance(rank, (int, float)):
            new_data = new_data[new_data['rank'] <= rank]

        # Categorical filters share one code path through filter_rows.
        categorical = (('db', db), ('sample_id', sample_id),
                       ('category', category))
        for column, choice in categorical:
            if choice is not None:
                new_data = new_data.filter_rows(column, choice)

        if inplace:
            self._update_inplace(new_data)
        else:
            return new_data

    def term_to_genes(self, term):
        """ Get set of genes of provided term(s)

        Parameters
        ----------
        term : str, list
            One term name or a collection of term names.

        Returns
        -------
        set
            Union of the comma separated 'genes' entries of all matching rows.

        """
        if isinstance(term, basestring):
            term = [term]
        genes = self[self['term_name'].isin(term)]['genes']
        return set(itertools.chain.from_iterable(genes.str.split(',').values))

    def term_to_genes_dict(self, term_list=None):
        """ Group genes by the exact combination of terms they belong to

        Parameters
        ----------
        term_list : list, str, optional
            Terms to consider. Defaults to every term in 'term_name'.

        Returns
        -------
        OrderedDict
            Keys are comma-joined, sorted term names; each value is the set
            of genes that occur in exactly that combination of terms. Items
            are ordered by key.

        """
        if term_list is None:
            term_list = set(self['term_name'].values)
        elif isinstance(term_list, basestring):
            term_list = [term_list]

        # First invert: gene -> all terms that contain it.
        gene_to_term = {}
        for term in term_list:
            for gene in self.term_to_genes(term):
                gene_to_term.setdefault(gene, set()).add(term)

        # Then group genes that share the same set of terms.
        term_to_gene = {}
        for gene, terms in gene_to_term.items():
            name = ','.join(sorted(terms))
            term_to_gene.setdefault(name, set()).add(gene)

        return OrderedDict(
            sorted(term_to_gene.items(), key=operator.itemgetter(0))
        )

    def all_genes_from_df(self):
        """ Returns all genes from the 'genes' column in a set

        Returns
        -------
        set
        """
        gene_lists = self['genes'].str.split(',').values
        return set(itertools.chain.from_iterable(gene_lists))

    def filter_based_on_words(self, words, inplace=False):
        """ Filter term_name based on key terms

        Matching is case-insensitive. The words are joined with '|' and used
        as a regular expression, so a row is kept when any word matches.

        Parameters
        ----------
        words : list, str
            List of words to use to keep rows in dataframe
        inplace : bool
            Filter the dataframe in place or return filtered copy

        Returns
        -------
        EnrichmentResult or None
            Filtered copy, or None when `inplace` is True.

        """
        if isinstance(words, basestring):
            words = [words]
        df = self.copy()
        # case=False compares both sides case-insensitively; lower-casing only
        # the term names made capitalised search words impossible to match.
        pattern = '|'.join(words)
        df = df[df['term_name'].str.contains(pattern, case=False, na=False)]
        if inplace:
            self._update_inplace(df)
        else:
            return df

    def find_similar_terms(self, term, level='sample', remove_subset=True):
        """ Calculates similarity of all other terms to given term

        Parameters
        ----------
        term : str
        level : str
            'dataframe' merges the genes of all rows sharing a term_name
            into one set per term. Any other value scores every row
            individually.
        remove_subset : bool
            If any term is a subset of the other term, a score of 1 will be
            used instead of jaccard index.

        Returns
        -------
        pd.DataFrame
            Columns 'term_name' and 'similarity_score', sorted by descending
            score.
        """
        rest_of_df = self[self['term_name'] != term].copy()
        first_genes = self.term_to_genes(term)

        if level == 'dataframe':
            vals = [[name, self.term_to_genes(name)]
                    for name in rest_of_df['term_name'].unique()]
        else:
            # Use a set so repeated gene names cannot distort the set sizes
            # that jaccard_index compares.
            vals = [[name, set(genes.split(','))] for name, genes in
                    rest_of_df[['term_name', 'genes']].values]

        dist_m = [[name, jaccard_index(first_genes, genes, remove_subset)]
                  for name, genes in vals]

        df = pd.DataFrame(dist_m, columns=['term_name', 'similarity_score'])
        df.sort_values('similarity_score', inplace=True, ascending=False)
        return df

    def show_terms_below(self, term, level='dataframe', threshold=.7,
                         remove_subset=True):
        """
        Find terms that were removed by remove_redundant

        Parameters
        ----------
        term : str
        level : str
            Passed to `find_similar_terms` and `remove_redundant`.
        threshold : float
            Minimum similarity to `term`; also the `remove_redundant`
            threshold.
        remove_subset : bool
            Passed to `find_similar_terms`.

        Returns
        -------
        EnrichmentResult
            Rows of `term` and of the similar terms that `remove_redundant`
            would drop, ordered by descending 'combined_score'.
        """
        temp_df = self.copy()
        # calculate similarity of term to all terms
        sim_terms = temp_df.find_similar_terms(term,
                                               remove_subset=remove_subset,
                                               level=level)
        # gather terms that are highly similar
        sim_terms = sim_terms.loc[sim_terms.similarity_score >= threshold]
        high_similar_terms = set(sim_terms.term_name.values)

        # this shows all terms that remained after filtering
        # Will be used to compare against
        term_kept = temp_df.remove_redundant(
            threshold=threshold,
            level=level,
            inplace=False,
            sort_by='combined_score'
        )

        term_kept = set(term_kept.term_name.values)
        terms_removed = high_similar_terms.difference(term_kept)
        # Adding the original term to show similarity
        terms_removed.add(term)
        # Sort explicitly so rows come back ordered by combined_score.
        ordered = temp_df.sort_values('combined_score', ascending=False)
        return ordered.loc[ordered.term_name.isin(terms_removed)]

    def remove_redundant(self, threshold=0.75, verbose=False, level='sample',
                         sort_by='combined_score', inplace=False):
        """
        Calculate similarity between all term sets and removes redundant terms.

        Parameters
        ----------
        threshold : float, default 0.75
            Terms scoring above this against a kept term are removed.
        verbose : bool, default False
            Print similarity scores and removed terms.
        level : {'sample', 'dataframe'}, default 'sample'
            Level to filter dataframe. 'sample' filters each group of
            'sample_id' individually and keeps the union of the surviving
            terms. 'dataframe' merges all genes that share the same
            'term_name'. 'dataframe' behaviour is also used when there is no
            'sample_id' column.
        sort_by : {'combined_score', 'rank', 'adj_p_value', 'n_genes'},
            default 'combined_score'
            Keyword to sort the dataframe. The scoring starts at the top term
            and compares to all the lower terms. 'rank' and 'adj_p_value' are
            sorted ascending, everything else descending.
        inplace : bool
            Filter the dataframe in place or return filtered copy

        Returns
        -------
        EnrichmentResult or None
            Sorted and filtered copy, or None when `inplace` is True.

        """
        ascending = sort_by in ('rank', 'adj_p_value')

        # Sort a copy: the caller's frame must stay untouched unless
        # `inplace` is requested.
        data_copy = self.sort_values(sort_by, ascending=ascending)
        if 'sample_id' not in data_copy.columns or level == 'dataframe':
            to_keep = data_copy.unique_terms(threshold, verbose, level=level)
        else:
            to_keep = set()
            for sample in sorted(data_copy['sample_id'].unique()):
                tmp = data_copy[data_copy['sample_id'] == sample]
                to_keep.update(
                    tmp.unique_terms(threshold, verbose, level=level)
                )

        data_copy = data_copy[data_copy['term_name'].isin(to_keep)]
        # The counts printed here are unique term names, before and after.
        print("Number of rows went from {} to {}"
              "".format(len(self['term_name'].unique()),
                        len(data_copy['term_name'].unique()))
              )
        if inplace:
            self._update_inplace(data_copy)
        else:
            return data_copy

    def unique_terms(self, threshold=0.75, verbose=False, level='dataframe'):
        """ Greedily select terms that are not redundant with earlier terms

        Terms are visited in row order. A term that has not been marked for
        removal is kept, and every later term whose similarity to it is
        greater than `threshold` is marked for removal.

        Parameters
        ----------
        threshold : float
        verbose : bool
            Print every score and every removal.
        level : str
            'dataframe' merges genes per unique term_name. Any other value
            treats each row individually.

        Returns
        -------
        set
            Names of the terms to keep.

        """
        if level == 'dataframe':
            names = self['term_name'].unique()
            scores = self._get_distance_all()
        else:
            names = self['term_name'].values
            scores = self._get_distance_each()
        to_remove, to_keep = set(), set()

        n_dim = len(names)
        # `scores` is the flattened upper triangle in combinations() order;
        # `ind` tracks the position of the pair (i, i + 1).
        ind = 0
        for i, term_1 in enumerate(names):
            if term_1 in to_remove:
                # Skip this term's n_dim - i - 1 comparisons in `scores`.
                ind += n_dim - i - 1
                continue
            to_keep.add(term_1)
            if verbose:
                print("Finding matches for {}".format(term_1))
            for term_2 in names[i + 1:]:
                score = scores[ind]
                ind += 1
                is_redundant = score > threshold
                if is_redundant:
                    to_remove.add(term_2)
                if verbose:
                    print("\tScore for {} is {:.3f}".format(term_2, score))
                    # Report a removal only when one actually happened.
                    if is_redundant:
                        print("\t\tRemoving {}".format(term_2))

        return to_keep

    def dist_matrix(self, figsize=(8, 8), level='dataframe'):
        """ Create a distance matrix of all term similarity

        Parameters
        ----------
        figsize : tuple
            Size of figure
        level : str, {'dataframe', 'each'}
            How to treat term_name to genes. 'dataframe' compresses all genes
            from all sample_ids into same term. 'each' treats each term_name
            individually.

        Returns
        -------
        matplotlib.Figure
            Whatever `cluster_distance_mat` returns for the matrix.

        """
        mat, names = self.calc_dist(level)
        return cluster_distance_mat(mat, names, figsize)

    def calc_dist(self, level='dataframe'):
        """ Build the symmetric term-by-term similarity matrix

        Parameters
        ----------
        level : str, {'dataframe', 'each'}
            'each' scores every row against every other row. Any other value
            merges genes per unique term_name first.

        Returns
        -------
        mat : numpy.ndarray
            Square float matrix of pairwise scores with ones on the diagonal.
        names : numpy.ndarray
            Term names labelling the rows and columns of `mat`.

        """
        if level == 'each':
            names = self['term_name'].values
            scores = self._get_distance_each()
        else:
            names = self['term_name'].unique()
            scores = self._get_distance_all()
        n_dim = len(names)

        mat = np.ones((n_dim, n_dim), dtype=float)
        if scores:
            # triu_indices walks the strict upper triangle row by row, the
            # same order in which combinations() produced the scores.
            upper_rows, upper_cols = np.triu_indices(n_dim, k=1)
            mat[upper_rows, upper_cols] = scores
            mat[upper_cols, upper_rows] = scores
        return mat, names

    def _get_distance_each(self):
        # Pairwise scores between the gene sets of individual rows, in
        # itertools.combinations order.
        vals = [set(genes.split(',')) for genes in self['genes'].values]
        return [jaccard_index(a, b) for a, b in combinations(vals, 2)]

    def _get_distance_all(self):
        # Pairwise scores between the merged gene sets of each unique term,
        # in itertools.combinations order.
        vals = [self.term_to_genes(name)
                for name in self['term_name'].unique()]
        return [jaccard_index(a, b) for a, b in combinations(vals, 2)]


def _score(vals):
    # Adapter for map(): scores the first two items of `vals` against each
    # other with jaccard_index and its default settings.
    first, second = vals[0], vals[1]
    return jaccard_index(first, second)


def jaccard_index(set1, set2, remove_subset=True):
    """
    Computes the similarity between two sets.
        https://en.wikipedia.org/wiki/Jaccard_index

    Parameters
    ----------
    set1 : set
        Any iterable is accepted; non-sets are converted to a set first.
    set2 : set
        Any iterable is accepted; non-sets are converted to a set first.
    remove_subset : bool
        If a set is a subset of the other, return 1.


    Returns
    -------
    index : float
        Size of the intersection divided by size of the union. Two empty
        sets score 1.


    References
    ----------
    .. [1] `Wikipedia entry for the Jaccard index
           <https://en.wikipedia.org/wiki/Jaccard_index>`_
    """
    # Convert only when needed so repeated items cannot inflate the sizes.
    if not isinstance(set1, (set, frozenset)):
        set1 = set(set1)
    if not isinstance(set2, (set, frozenset)):
        set2 = set(set2)

    union = len(set1 | set2)
    if union == 0:
        # Both sets are empty: treat them as identical rather than divide
        # by zero.
        return 1.
    # The union is as large as the bigger set exactly when the smaller set
    # is contained in it.
    if remove_subset and union == max(len(set1), len(set2)):
        return 1.
    return float(len(set1 & set2)) / float(union)