Source code for magine.data.experimental_data

import os
import subprocess
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas.plotting import table

from magine.data.base import BaseData
from magine.data.tools import log2_normalize_df
from magine.plotting import volcano_plots as v_plot
from magine.plotting.species_plotting import plot_dataframe, plot_species

# pandas.set_option('display.max_colwidth', -1)
# Column definitions: names of the columns (and a few cell values) that the
# rest of this module looks up in the experimental data table.

fold_change = 'fold_change'  # numeric column; its sign separates up/down
flag = 'significant'  # column used as a boolean mask in Sample.up / Sample.down
exp_method = 'source'  # column naming the experimental method of each row
p_val = 'p_value'  # column dropped-on-NaN before volcano/histogram plots
rna = 'rna_seq'  # value of the 'source' column that marks RNA rows
gene = 'gene'  # value of the 'species_type' column
protein = 'protein'  # value of the 'species_type' column
metabolites = 'metabolites'  # value of the 'species_type' column
species_type = 'species_type'  # column holding gene/protein/metabolites tags
sample_id = 'sample_id'  # column identifying the sample of each row
identifier = 'identifier'  # column of species identifiers
label = 'label'  # column of species labels
# Columns whose absence is reported (printed) by ExperimentalData.__init__.
valid_cols = [fold_change, flag, p_val, species_type, sample_id]


def load_data_csv(file_name, **kwargs):
    """ Deprecated alias of :func:`load_data`.

    Emits a ``DeprecationWarning`` attributed to the caller and then
    forwards every argument to :func:`load_data`.

    Parameters
    ----------
    file_name : str
        Passed through to :func:`load_data`.
    kwargs :
        Passed through to :func:`load_data`.

    Returns
    -------
    Whatever :func:`load_data` returns.

    """
    message = ("load_data_csv will be removed in a future "
               "version of MAGINE. Use load_data instead.")
    # stacklevel=2 points the warning at the code calling this function
    warnings.warn(message, category=DeprecationWarning, stacklevel=2)
    return load_data(file_name, **kwargs)


def load_data(file_name, **kwargs):
    """ Read a delimited file and wrap it in an ExperimentalData instance

    Rows whose ``fold_change`` entry is null are discarded before the
    table is handed to :class:`ExperimentalData`.

    Parameters
    ----------
    file_name : str
        File handed to ``pandas.read_csv``.
    kwargs :
        Extra keyword arguments forwarded to ``pandas.read_csv``.

    Returns
    -------
    ExperimentalData

    """
    raw = pd.read_csv(file_name, **kwargs)
    has_fold_change = raw[fold_change].notnull()
    return ExperimentalData(raw[has_fold_change])


class Sample(BaseData):
    """ Provides tools for subsets of data types

    Adds filtering helpers (up/down regulated rows, per-sample identifier
    sets, :meth:`subset`) and plotting helpers (pie chart, volcano plots,
    histogram, per-species plots) on top of :class:`BaseData`.

    """

    def __init__(self, *args, **kwargs):
        super(Sample, self).__init__(*args, **kwargs)
        # self.drop_duplicates(inplace=True)
        # Names of the columns the helpers below operate on.
        self._index = identifier
        self._identifier = identifier
        self._value_name = fold_change
        self._sample_id_name = sample_id
        self._label = label
        self._up = None
        self._down = None
        self._sig = None

    @property
    def _constructor(self):
        # Operations that build a new object from this one yield a Sample.
        return Sample

    @property
    def exp_methods(self):
        """ Sorted list of unique values in the 'source' column"""
        return sorted(set(self[exp_method].values))

    @property
    def sample_ids(self):
        """ Sorted list of unique sample_ids in data"""
        return sorted(set(self[sample_id].values))

    @property
    def up(self):
        """return up regulated species (flagged rows with fold_change > 0)"""
        return self.loc[self[flag] & (self[fold_change] > 0)]

    @property
    def down(self):
        """return down regulated species (flagged rows with fold_change < 0)"""
        return self.loc[self[flag] & (self[fold_change] < 0)]

    @property
    def id_list(self):
        """ Set of species identifiers """
        return set(self[self._identifier].values)

    @property
    def label_list(self):
        """ Set of species labels """
        return set(self[self._label].values)

    @property
    def up_by_sample(self):
        """List of sets of up regulated identifiers, one per sample_id"""
        return [self.loc[self[sample_id] == i].up.id_list
                for i in self.sample_ids]

    @property
    def down_by_sample(self):
        """List of sets of down regulated identifiers, one per sample_id"""
        return [self.loc[self[sample_id] == i].down.id_list
                for i in self.sample_ids]

    @property
    def by_sample(self):
        """List of sets of identifiers, one per sample_id

        No significance filter is applied here; the sets contain every
        identifier present in this object for the given sample.
        """
        return [self.loc[self[sample_id] == i].id_list
                for i in self.sample_ids]

    def subset(self, species=None, index='identifier', sample_ids=None,
               exp_methods=None):
        """ Filter rows by species, sample id and/or experimental method

        Each filter is optional and they are applied one after another.
        For every filter a ``str`` is matched with ``Series.str.contains``
        while any other value is matched with ``Series.isin``.

        Parameters
        ----------
        species : list, tuple, set, str
            Species to keep. Other types (including None) skip this filter.
        index : str
            Column to compare against 'species'
        sample_ids : str, list
            Sample id pattern (str) or collection of sample ids to keep
        exp_methods : str, list
            'source' pattern (str) or collection of sources to keep

        Returns
        -------
        magine.data.experimental_data.Sample
        """
        df = self.copy()
        if isinstance(species, str):
            df = df.loc[df[index].str.contains(species)]
        elif isinstance(species, (list, tuple, set)):
            df = df.loc[df[index].isin(species)]
        if sample_ids is not None:
            # Dispatch on the type of the filter itself, not on `species`.
            if isinstance(sample_ids, str):
                df = df.loc[df[sample_id].str.contains(sample_ids)]
            else:
                df = df.loc[df[sample_id].isin(sample_ids)]
        if exp_methods is not None:
            if isinstance(exp_methods, str):
                df = df.loc[df[exp_method].str.contains(exp_methods)]
            else:
                df = df.loc[df[exp_method].isin(exp_methods)]
        return df

    def plot_pie_sig_ratio(self, save_name=None, ax=None, fig=None,
                           figsize=None):
        """ Pie chart of identifier counts: all versus ``self.sig``

        The first wedge is the number of unique identifiers in this object,
        the second the number of unique identifiers in ``self.sig``
        (``sig`` is not defined in this class; presumably provided by
        BaseData). Wedge labels show absolute counts.

        Parameters
        ----------
        save_name : str
            If given, the figure is saved as '<save_name>.png'
        ax : matplotlib.axes, optional
            Axes to draw on. Created when missing.
        fig : matplotlib.figure
            Figure to add the axes to when 'ax' is missing. Created when
            both are missing.
        figsize : tuple
            Size of figure, only used when a new figure is created.
            Defaults to (3, 3).

        Returns
        -------
        matplotlib.figure
            The figure that holds the pie chart.

        """
        x = len(self.id_list)
        y = len(self.sig.id_list)
        total = x + y
        if ax is None:
            if fig is None:
                if figsize is None:
                    figsize = (3, 3)
                fig = plt.figure(figsize=figsize)
            ax = fig.add_subplot(111)
        elif fig is None:
            # Only axes were supplied: save/return the figure that owns them.
            fig = ax.figure
        wedges, texts, autotexts = ax.pie(
            [x, y],
            explode=(0.05, 0.05),
            textprops={'fontsize': 16},
            # autopct receives a percentage; convert it back to a count
            autopct=lambda p: '{:.0f}'.format(p * total / 100),
            shadow=True,
            startangle=140
        )

        plt.setp(autotexts, size=20)
        ax.axis('equal')
        if save_name is not None:
            fig.savefig('{}.png'.format(save_name), dpi=300,
                        bbox_inches='tight')
        return fig

    def volcano_plot(self, save_name=None, out_dir=None, sig_column=False,
                     p_value=0.1, fold_change_cutoff=1.5, x_range=None,
                     y_range=None):
        """ Create a volcano plot of data

        Thin wrapper that forwards all arguments to
        ``magine.plotting.volcano_plots.volcano_plot``.

        Parameters
        ----------
        save_name: str
            name to save figure
        out_dir: str, directory
            Location to save figure
        sig_column: bool, optional
            If to use significant flags of data
        p_value: float, optional
            Criteria for significant
        fold_change_cutoff: float, optional
            Criteria for significant
        y_range: array_like
            upper and lower bounds of plot in y direction
        x_range: array_like
            upper and lower bounds of plot in x direction

        Returns
        -------
        matplotlib.Figure

        """
        fig = v_plot.volcano_plot(self, save_name=save_name, out_dir=out_dir,
                                  sig_column=sig_column, p_value=p_value,
                                  fold_change_cutoff=fold_change_cutoff,
                                  x_range=x_range, y_range=y_range)
        return fig

    def volcano_by_sample(self, save_name=None, p_value=0.1,
                          out_dir=None, fold_change_cutoff=1.5, y_range=None,
                          x_range=None, sig_column=False):
        """
        Creates a figure with one volcano subplot per sample_id

        Parameters
        ----------
        save_name: str
            name to save figure
        out_dir: str, directory
            Location to save figure
        sig_column: bool, optional
            If to use significant flags of data
        p_value: float, optional
            Criteria for significant
        fold_change_cutoff: float, optional
            Criteria for significant
        y_range: array_like
            upper and lower bounds of plot in y direction
        x_range: array_like
            upper and lower bounds of plot in x direction

        Returns
        -------
        matplotlib.Figure

        """

        data = self.copy()
        n_sample = np.sort(data[sample_id].unique())

        # Pick a grid: 2 columns, or 3 once there are more than 8 samples.
        if len(n_sample) > 8:
            n_cols = 3
        else:
            n_cols = 2
        n_rows = int(np.rint(np.rint(len(n_sample) / float(n_cols))))
        # Rounding can leave the grid one row/column short; grow it.
        if n_cols * n_rows < len(n_sample):
            if n_cols >= n_rows:
                n_rows += 1
            else:
                n_cols += 1

        # figsize is (width, height): width scales with columns,
        # height with rows.
        fig = plt.figure(figsize=(4 * n_cols, 3 * n_rows))
        for n, i in enumerate(n_sample):
            sample = data[data[sample_id] == i].copy()

            # Rows without a p-value or a finite fold change cannot be drawn
            sample = sample.dropna(subset=[p_val])
            sample = sample[np.isfinite(sample[fold_change])]
            sample = sample.dropna(subset=[fold_change])
            sec_0, sec_1, sec_2 = v_plot.create_mask(sample, sig_column,
                                                     p_value,
                                                     fold_change_cutoff)
            ax = fig.add_subplot(n_rows, n_cols, n + 1)
            ax.set_title(i)
            v_plot.add_volcano_plot(ax, sec_0, sec_1, sec_2)
            if not sig_column:
                # Draw the cutoff lines used to decide significance
                fc = np.log2(fold_change_cutoff)
                log_p_val = -1 * np.log10(p_value)
                ax.axvline(x=fc, linestyle='--')
                ax.axvline(x=-1 * fc, linestyle='--')
                ax.axhline(y=log_p_val, linestyle='--')
            if y_range is not None:
                ax.set_ylim(y_range[0], y_range[1])
            if x_range is not None:
                ax.set_xlim(x_range[0], x_range[1])
        fig.tight_layout()
        if save_name is not None:
            v_plot.save_plot(fig, save_name=save_name, out_dir=out_dir)
        return fig

    def plot_species(self, species_list=None, subset_index=None,
                     save_name=None, out_dir=None, title=None,
                     plot_type='plotly', image_format='png'):
        """
        Create scatter plot of species list

        Parameters
        ----------
        species_list : list
            list of compounds. When None, all rows are plotted.
        subset_index : str
            Column to filter based on species_list. Defaults to the
            identifier column.
        save_name : str
            Name of html output file
        out_dir : str
            Location to place plots
        title : str
            Title for HTML page
        plot_type : str
            Type of plot outputs, can be "plotly" or "matplotlib"
        image_format : str
            pdf or png, only used if plot_type="matplotlib"

        Returns
        -------
        matplotlib.Figure or plotly.Figure
        """
        df = self.copy()
        if species_list is not None:
            if subset_index is None:
                subset_index = self._index
            df = df.subset(species_list, index=subset_index)
        return plot_species(
            df, save_name=save_name, out_dir=out_dir, title=title,
            plot_type=plot_type, image_format=image_format
        )

    def plot_all(self, html_file_name, out_dir='out',
                 plot_type='plotly', run_parallel=False):
        """
        Creates plots for all species in this object

        Forwards to ``magine.plotting.species_plotting.plot_dataframe``.

        Parameters
        ----------
        html_file_name : str
            filename to save html of all plots
        out_dir: str, path
            Directory that will contain the plots
        plot_type : str
            plotly or matplotlib output
        run_parallel : bool
            Create the plots in parallel
        Returns
        -------
        None

        """

        plot_dataframe(self, html_filename=html_file_name,
                       out_dir=out_dir, plot_type=plot_type,
                       run_parallel=run_parallel)

    def plot_histogram(self, save_name=None, y_range=None, out_dir=None):
        """
        Plots a histogram of log2 normalized fold change values

        Parameters
        ----------
        save_name: str
            Name of figure
        out_dir: str, path
            Path to location to save figure
        y_range: array_like
            Lower and upper bound of the plotted fold change values.
            Despite its name this is applied to the x axis.


        Returns
        -------
        matplotlib.Figure

        """
        data = self.copy()
        data = data.dropna(subset=[p_val])
        data = data[np.isfinite(data[fold_change])]
        data = data.dropna(subset=[fold_change])

        tmp = np.array(log2_normalize_df(data, fold_change)[fold_change])

        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.hist(tmp, 50, color='gray')
        if y_range is not None:
            ax.set_xlim(y_range[0], y_range[1])

        # Base 10 is matplotlib's default for log scales; the 'basey'
        # keyword is not accepted by newer matplotlib releases.
        ax.set_yscale('log')
        ax.set_xlabel('log$_2$ Fold Change', fontsize=16)
        ax.set_ylabel('Count', fontsize=16)
        fig.tight_layout()
        if save_name is not None:
            v_plot.save_plot(fig, save_name, out_dir)
        return fig


class ExperimentalData(object):
    """
    Manages all experimental data

    The full table is kept in ``self.data``. In addition, one
    :class:`Sample` attribute is created per unique 'source' value and per
    unique 'sample_id' value, reachable as ``obj.<name>`` or ``obj[<name>]``.

    """

    def __init__(self, data_file):
        """

        Parameters
        ----------
        data_file : str, pandas.DataFrame
            Name of file, generally csv.
            If provided a str, the file will be read in as a pandas.DataFrame


        """
        if isinstance(data_file, pd.DataFrame):
            df = data_file.copy()
        else:
            df = pd.read_csv(data_file, parse_dates=False, low_memory=False)
        df.reset_index(drop=True, inplace=True)
        df.drop_duplicates(inplace=True)
        # Missing columns are only reported, not treated as an error here.
        for i in valid_cols:
            if i not in df.dtypes:
                print("{} not in columns.".format(i))

        self.data = BaseData(df)
        self._index = 'identifier'
        # Lazily filled caches for the properties below
        self.__proteins = None
        self.__genes = None
        self.__species = None
        self.__rna = None
        self.__compounds = None
        # Expose each experimental method and each sample id as an attribute
        for i in self.exp_methods:
            self.__setattr__(i, Sample(
                self.data.loc[self.data[exp_method] == i]))
        for i in self.sample_ids:
            self.__setattr__(i, Sample(
                self.data.loc[self.data[sample_id] == i]))

    def __setattr__(self, name, value):
        super(ExperimentalData, self).__setattr__(name, value)

    def __getitem__(self, name):
        # Item access is attribute access: data['x'] is data.x
        return super(ExperimentalData, self).__getattribute__(name)

    @property
    def genes(self):
        """ All data tagged with gene

        Rows whose species_type is 'protein' or 'gene'; this includes RNA.

        Returns
        -------
        Sample

        """
        if self.__genes is None:
            tmp = self.data.copy()
            tmp = tmp.loc[tmp[species_type].isin([protein, gene])]
            self.__genes = Sample(tmp)
        return self.__genes

    @property
    def proteins(self):
        """ Protein level data

        Rows whose species_type is 'protein' or 'gene' and whose source
        is not RNA

        Returns
        -------
        Sample

        """
        if self.__proteins is None:
            tmp = self.data.copy()
            tmp = tmp.loc[(tmp[species_type].isin([protein, gene])) &
                          ~(tmp[exp_method] == rna)]
            self.__proteins = Sample(tmp)
        return self.__proteins

    @property
    def rna(self):
        """ RNA level data

        Rows whose source is 'rna_seq'

        Returns
        -------
        Sample

        """
        if self.__rna is None:
            tmp = self.data.copy()
            tmp = tmp.loc[tmp[exp_method] == rna]
            self.__rna = Sample(tmp)
        return self.__rna

    @property
    def compounds(self):
        """ Only compounds in data

        Rows whose species_type is 'metabolites'

        Returns
        -------
        Sample

        """
        if self.__compounds is None:
            tmp = self.data.copy()
            tmp = tmp.loc[tmp[species_type] == metabolites]
            self.__compounds = Sample(tmp)
        return self.__compounds

    @property
    def species(self):
        """ Returns data in Sample format

        Returns
        -------
        Sample

        """
        if self.__species is None:
            self.__species = Sample(self.data.copy())
        return self.__species

    @property
    def exp_methods(self):
        """ List of unique values of the source column """
        return list(self.data[exp_method].unique())

    @property
    def sample_ids(self):
        """ Sorted list of sample_ids """
        return sorted(list(self.data[sample_id].unique()))

    def subset(self, species, index='identifier'):
        """ Filter all data by species

        Parameters
        ----------
        species : list, str
            A str is matched with ``Series.str.contains``; anything else
            is matched with ``Series.isin``.
        index : str
            Column to compare against 'species'

        Returns
        -------
        magine.data.experimental_data.Sample
        """
        df = self.species.copy()
        if isinstance(species, str):
            df = df.loc[df[index].str.contains(species)]
        else:
            df = df.loc[df[index].isin(species)]
        return df

    def get_measured_by_datatype(self):
        """
        Returns sets of species per data type

        Returns
        -------
        measured, sig_measured : dict, dict
            Dictionaries keyed by 'source' whose values are sets of ids;
            the second one is restricted to significant species.

        """
        return get_measured_by_datatype(self)

    def create_summary_table(self, sig=False, index=identifier, save_name=None,
                             plot=False, write_latex=False):
        """
        Creates a summary table of data.


        Parameters
        ----------
        sig: bool
            Flag to summarize significant species only
        save_name: str
            Name to save csv and .tex file
        index: str
           Index for counts
        plot: bool
            If you want to create a plot of the table
        write_latex: bool
            Create latex file of table


        Returns
        -------
        pandas.DataFrame

        """
        return create_table_of_data(self, sig=sig, index=index,
                                    save_name=save_name, plot=plot,
                                    write_latex=write_latex)

    def volcano_analysis(self, out_dir, use_sig_flag=True,
                         p_value=0.1, fold_change_cutoff=1.5):
        """
        Creates a volcano plot for each experimental method

        Parameters
        ----------
        out_dir: str, path
            Path to where the output figures will be saved. Created,
            including missing parent directories, when it does not exist.
        use_sig_flag: bool
            Use significant flag of data
        p_value: float, optional
            p value criteria for significant
            Will not be used if use_sig_flag
        fold_change_cutoff: float, optional
            fold change criteria for significant
            Will not be used if use_sig_flag

        Returns
        -------
        None

        """
        if not os.path.exists(out_dir):
            # makedirs also handles nested paths such as 'out/volcano'
            os.makedirs(out_dir)
        for i in self.exp_methods:
            # The method name doubles as the file name of its figure
            self[i].volcano_plot(
                i, out_dir=out_dir, sig_column=use_sig_flag,
                p_value=p_value, fold_change_cutoff=fold_change_cutoff
            )


# LaTeX document wrapper used by _write_to_latex via template.format(...).
# Braces are doubled so str.format emits them literally; the single '{}'
# placeholder receives the table produced by DataFrame.to_latex.
template = r'''
\documentclass[12pt, letterpaper]{{article}}
\usepackage{{booktabs}}
\usepackage{{geometry}}
\usepackage{{pdflscape}}
%\usepackage{{nopageno}}
\geometry{{ papersize={{4.444in,12.681in}},total={{2.8in,6.8in}} }}
\begin{{document}}
\begin{{landscape}}
\begin{{table}}
{}
\end{{table}}
\end{{landscape}}
\end{{document}}
'''


def get_measured_by_datatype(data):
    """ Collect the identifiers seen for every 'source' label in data.

    Parameters
    ----------
    data : ExperimentalData
        Must expose ``exp_methods`` and item access by method name.

    Returns
    -------
    measured, sig_measured : dict, dict
        Both are keyed by 'source'. ``measured`` maps to the set of all
        ids of that source, ``sig_measured`` to the set of ids found in
        the ``sig`` view of that source.

    """
    measured, sig_measured = {}, {}
    for method in data.exp_methods:
        per_method = data[method]
        measured[method] = set(per_method.id_list)
        sig_measured[method] = set(per_method.sig.id_list)
    return measured, sig_measured


def create_table_of_data(data, sig=False, index='identifier', save_name=None,
                         plot=False, write_latex=False):
    """
    Build a table of unique species counts per source and sample.

    Rows are 'source' values, columns are 'sample_id' values, and each
    cell holds the number of unique non-null entries of the `index`
    column ('-' where there is no data). A final column,
    'Total Unique Across', holds the unique count per source over all
    samples.

    Parameters
    ----------
    data : ExperimentalData
    sig: bool
        Count significant species only
    save_name: None, str
        Base name for the csv (and optional .png / .tex) output.
        Nothing is written when None.
    index: str
        Column whose unique values are counted
    plot: bool
        Render the table with matplotlib
    write_latex: bool
        Also write a latex file of the table (requires save_name)


    Returns
    -------
    pandas.DataFrame

    """
    source = data.species.sig if sig else data.species
    data_copy = source.copy()

    count_table = data_copy.pivot_table(
        values=index, index=exp_method, columns=sample_id,
        fill_value=np.nan,
        aggfunc=lambda col: col.dropna().nunique()
    )

    # Show counts as ints rather than floats; empty cells become '-'
    for column in count_table.columns:
        as_int = count_table[column].fillna(-1).astype(int)
        count_table[column] = as_int.replace(-1, '-')

    unique_col = {}
    for method in data.exp_methods:
        per_method = data[method].sig if sig else data[method]
        unique_col[method] = len(set(per_method[index].values))
    count_table['Total Unique Across'] = pd.Series(unique_col,
                                                   index=count_table.index)

    should_save = save_name is not None
    if plot:
        ax = plt.subplot(111, frame_on=False)

        table(ax, count_table, loc='center')
        ax.xaxis.set_visible(False)
        ax.yaxis.set_visible(False)
        plt.tight_layout()
        if should_save:
            plt.savefig('{}.png'.format(save_name), dpi=300,
                        bbox_inches='tight')

    if should_save:
        count_table.to_csv('{}.csv'.format(save_name))
        if write_latex:
            _write_to_latex(pd_table=count_table, save_name=save_name)
    return count_table


def _write_to_latex(pd_table, save_name):
    """ Write a table to '<save_name>.tex' and compile it when possible.

    The .tex file is always written. If ``pdflatex`` is found it is
    compiled to '<save_name>.pdf'; if ``pdfcrop`` is also found the pdf
    is cropped in place; if ``convert`` is found as well a
    '<save_name>.png' is produced. Output of the external tools is
    discarded and their exit codes are not checked.

    Parameters
    ----------
    pd_table : pandas.DataFrame
        Table to export
    save_name : str
        Output path without extension; may include a directory

    Returns
    -------
    None

    """
    filename = '{0}.tex'.format(save_name)

    with open(filename, 'wt') as f:
        st = pd_table.to_latex(
            column_format='*{{{}}}{{c}}'.format(str(pd_table.shape[1] + 2)))
        f.write(template.format(st))

    if not _which('pdflatex'):
        print('Install pdflatex to compile to pdf or png\n'
              'You can use the csv file for use in outside tools')
        return

    print('Compiling table')
    # pdflatex writes its output into the working directory, so run it
    # next to the .tex file; otherwise the pdf would not end up at
    # '<save_name>.pdf' when save_name contains a directory.
    tex_dir, tex_name = os.path.split(filename)
    with open(os.devnull, "w") as fnull:
        # nonstopmode: never wait for keyboard input on a LaTeX error,
        # which would look like a hang since the output is discarded.
        subprocess.call(['pdflatex', '-interaction=nonstopmode', tex_name],
                        cwd=tex_dir or None,
                        stderr=subprocess.STDOUT, stdout=fnull)

        if not _which('pdfcrop'):
            return
        pdffile = '{0}.pdf'.format(save_name)
        subprocess.call(['pdfcrop', pdffile, pdffile],
                        stderr=subprocess.STDOUT, stdout=fnull)

        if not _which('convert'):
            return
        tmp_png_name = '{0}.png'.format(save_name)
        subprocess.call(['convert', '-density', '300', pdffile,
                         '-quality', '90', tmp_png_name],
                        stderr=subprocess.STDOUT, stdout=fnull)


def _which(program):
    """ Locate an executable, similar to the shell command ``which``.

    Parameters
    ----------
    program : str
        Either a bare program name, which is searched for in the
        directories of the PATH environment variable (``os.defpath`` when
        PATH is not set), or a path containing a directory, which is
        checked directly.

    Returns
    -------
    str or None
        Path of the first match that is a file and executable. The name
        with '.exe' appended is tried after the plain name. None when
        nothing is found.

    """

    def _is_exe(filepath):
        return os.path.isfile(filepath) and os.access(filepath, os.X_OK)

    def _resolve(candidate):
        # Try the name as given, then with a Windows style extension
        for name in (candidate, candidate + '.exe'):
            if _is_exe(name):
                return name
        return None

    fpath, _ = os.path.split(program)
    if fpath:
        # Explicit location: do not consult PATH
        return _resolve(program)

    # .get avoids a KeyError in environments where PATH is not defined
    search_path = os.environ.get("PATH", os.defpath)
    for directory in search_path.split(os.pathsep):
        # Entries may be wrapped in double quotes
        found = _resolve(os.path.join(directory.strip('"'), program))
        if found is not None:
            return found

    return None