Source code for magine.networks.network_generator

import logging
import os
import networkx as nx
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import magine.networks.utils as nt
import magine.networks.databases as db
from magine.mappings.chemical_mapper import ChemicalMapper
from magine.mappings.gene_mapper import GeneMapper

try:
    import cPickle as pickle
except ImportError:
    import pickle
from magine.logging import get_logger

logger = get_logger("magine.networks.network_generator",
                    log_level=logging.INFO)


# logger = get_logger(__name__, log_level=logging.INFO)


[docs]def build_network(seed_species, species='hsa', save_name=None, all_measured_list=None, trim_source_sink=False, use_reactome=True, use_hmdb=False, use_biogrid=True, use_signor=True, verbose=False): """ Construct a network from a list of gene names. Parameters ---------- seed_species : list list of genes to construct network save_name : str, optional output name to save network. Will save one before and after ID conversion species : str species of proteins ('hsa': human, 'mmu':murine) all_measured_list : list list of all species that should be considered in network use_reactome : bool Add ReactomeFunctionalInteraction reaction to network use_biogrid : bool Add BioGrid reaction to network use_hmdb : bool Add HMDB reaction to network all_measured_list use_signor : bool Add SIGNOR reaction to network trim_source_sink : bool, optional Remove source and sink nodes if they are not measured in network verbose : bool Returns ------- networkx.DiGraph """ cm = ChemicalMapper() path_to_graph, node_to_path = db.load_kegg_mappings(species) seed_species = set(x.upper() for x in seed_species) updated_accession = set() old_accession = set() # This maps HMDB numbers to the new numbering format for i in seed_species: if i.startswith('HMDB'): if i in cm.hmdb_accession_to_main: old_accession.add(i) updated_accession.add(cm.hmdb_accession_to_main[i][0]) seed_species.difference_update(old_accession) seed_species.update(updated_accession) seeds_in_kegg = seed_species.intersection(node_to_path) pathway_list = set() for seed in seeds_in_kegg: pathway_list.update(node_to_path[seed]) graph_list = [] for each in pathway_list: tmp = path_to_graph[each] if len(tmp.edges) == 0: continue graph_list.append(tmp) end_network = nt.compose_all(graph_list) if all_measured_list is None: all_measured_set = set(i.upper() for i in end_network.nodes) else: all_measured_set = set(str(x).upper() for x in all_measured_list) all_measured_set.update(seed_species) hmdb_ids = set(i for i in all_measured_set if i.startswith('HMDB')) updated_accession = set() old_accession = set() for i in hmdb_ids: if i in cm.hmdb_accession_to_main: all_measured_set.remove(i) all_measured_set.add(cm.hmdb_accession_to_main[i][0]) networks_to_expand = [] logger.info("Gathering networks") if use_hmdb: networks_to_expand.append(db.load_hmdb_network()) if use_reactome: networks_to_expand.append(db.load_reactome_fi()) if use_biogrid: networks_to_expand.append(db.load_biogrid_network()) if use_signor: networks_to_expand.append(db.load_signor()) logger.info("Merging networks") if len(networks_to_expand) != 0: entire_expansion_network = nt.compose_all(networks_to_expand) end_network = expand_by_db(end_network, entire_expansion_network, all_measured_set) logger.info("Trimming network") # makes all similar edge names the same nt.standardize_edge_types(end_network) # removes everything not connected to the largest graph end_network = nt.delete_disconnected_network(end_network) if trim_source_sink: end_network = nt.trim_sink_source_nodes(end_network, all_measured_list, remove_self_edge=True) if save_name is not None: nx.write_gml(end_network, '{}.gml'.format(save_name)) nx.write_gpickle(end_network, '{}.p'.format(save_name)) final_nodes = set(end_network.nodes) n_hits = len(seed_species.intersection(final_nodes)) logger.info('Network has {} nodes and {} edges'.format( len(final_nodes), len(end_network.edges)) ) logger.info("Found {} of {} seed species in network".format( n_hits, len(seed_species)) ) if all_measured_list is not None: n_measured_hits = len(set(all_measured_list).intersection(final_nodes)) logger.info("Found {} of {} background species in network" "".format(n_measured_hits, len(all_measured_list))) return end_network
[docs]def expand_by_db(starting_network, expansion_source, measured_list, verbose=False): """ add reference network to main network Parameters ---------- starting_network : nx.DiGraph expansion_source : nx.DiGraph measured_list : list_like verbose : bool Returns ------- new_graph : nx.DiGraph """ new_graph = nx.DiGraph() measured_set = set(measured_list) current_nodes = set(starting_network.nodes) # gathers possible nodes to expand # must be in new nodes AND measured set nodes_to_check = set(expansion_source.nodes).intersection(measured_set) # add all current nodes, might be edges between them that are missed nodes_to_check.update(current_nodes) added_nodes = set() for i, j, k in expansion_source.edges(data=True): if i in nodes_to_check and j in nodes_to_check: added_nodes.add(i) added_nodes.add(j) new_graph.add_edge(i, j, **k) for node in added_nodes: new_graph.add_node(node, **expansion_source.node[node]) new_graph = nt.compose(starting_network, new_graph) logger.info("\t\t\tbefore\tafter") logger.info("\tNodes\t{}\t{}".format(len(starting_network.nodes), len(new_graph.nodes))) logger.info("\tEdges\t{}\t{}".format(len(starting_network.edges), len(new_graph.edges))) return new_graph
[docs]def create_background_network(save_name='background_network', fresh_download=False, verbose=True, create_overlap=False): """ Parameters ---------- save_name : str Name of the network fresh_download : bool Download a fresh copy of the databases verbose: bool Print information about the databases create_overlap : bool Creates a figure comparing the databses Returns ------- nx.DiGraph """ logger.info("Generating background network from all databases") kegg_network = db.load_kegg(fresh_download=fresh_download) hmdb_network = db.load_hmdb_network(fresh_download=fresh_download) biogrid_network = db.load_biogrid_network(fresh_download=fresh_download) signor_network = db.load_signor(fresh_download=fresh_download) reactome_network = db.load_reactome_fi() network_list = [hmdb_network, kegg_network, biogrid_network, reactome_network, signor_network] names = ['hmdb', 'kegg', 'biogrid', 'reactome', 'signor'] def find_overlap(n1, n2): nodes1 = set(n1.nodes()) nodes2 = set(n2.nodes()) e1 = set(n1.edges()) e2 = set(n2.edges()) edge_overlap = len(e2.intersection(e1)) node_overlap = len(nodes1.intersection(nodes2)) return node_overlap, edge_overlap if create_overlap: db_maps = {i: j for i, j in zip(names, network_list)} n_dbs = len(names) pal = sns.light_palette("purple", as_cmap=True) node_mat = np.zeros((n_dbs, n_dbs), dtype=np.int) edge_mat = np.zeros((n_dbs, n_dbs), dtype=np.int) for i in range(n_dbs): row = db_maps[names[i]] for j in range(i + 1, n_dbs): col = db_maps[names[j]] n_overlap, e_overlap = find_overlap(row, col) node_mat[i, j] = n_overlap node_mat[j, i] = node_mat[i, j] edge_mat[i, j] = e_overlap edge_mat[j, i] = edge_mat[i, j] fig = plt.figure(figsize=(10, 4)) ax = fig.add_subplot(121) plt.title("Number of node overlaps") sns.heatmap(node_mat, fmt='d', annot=True, linewidths=0.02, cmap=pal, yticklabels=names, xticklabels=names) plt.yticks(rotation=0) ax = fig.add_subplot(122) plt.title("Number of edge overlaps") sns.heatmap(edge_mat, fmt='d', annot=True, linewidths=0.02, cmap=pal, yticklabels=names, xticklabels=names) plt.tight_layout() plt.yticks(rotation=0) plt.subplots_adjust(wspace=.3) plt.savefig('compare_network_dbs.png', dpi=300, bbox_inches='tight') plt.close() logger.info("Combining networks") full_network = nt.compose_all(network_list) logger.info("Finished combining networks") nt.standardize_edge_types(full_network) full_network = nt.delete_disconnected_network(full_network) n_nodes = len(full_network.nodes) n_edges = len(full_network.edges) logger.info("Background network {} nodes and {} edges".format(n_nodes, n_edges)) nx.write_gpickle(full_network, '{}.p.gz'.format(save_name)) nx.write_gml(full_network, '{}.gml'.format(save_name)) return full_network
if __name__ == '__main__': create_background_network(fresh_download=False, verbose=True, create_overlap=True)