Source code for tmap.netx.coenrichment_analysis

import numpy as np
import pandas as pd
import scipy.stats as scs
from statsmodels.sandbox.stats.multicomp import multipletests
from tqdm import tqdm

from tmap.netx.SAFE import get_significant_nodes



def is_enriched(s1, s2, s3, s4):
    """
    Accessory function for further determine whether it is a co-enrichment after fisher-exact test.
    """
    total_1 = len(s1) + len(s2)
    total_2 = len(s3) + len(s4)
    if total_1 == 0 or total_2 == 0:
        return False
    return len(s1) / total_1 > len(s3) / total_2
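

# A minimal sketch of how ``is_enriched`` is meant to be read: s1..s4 are the four
# cells of the contingency table as node sets. The node ids below are made up purely
# for illustration; ``_example_is_enriched`` is not part of the tmap API.
def _example_is_enriched():
    s1 = {0, 1, 2}     # o_f enriched & fea enriched
    s2 = {3}           # o_f enriched & fea non-enriched
    s3 = {4}           # o_f non-enriched & fea enriched
    s4 = {5, 6, 7, 8}  # o_f non-enriched & fea non-enriched
    # 3/4 of the o_f-enriched nodes are also fea-enriched, versus 1/5 of the
    # o_f-non-enriched nodes, so the association points in the enriched direction.
    return is_enriched(s1, s2, s3, s4)  # True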


def coenrichment_for_nodes(graph, nodes, enriched_centroid, name, safe_scores=None, SAFE_pvalue=None, _filter=True, mode='both'):
    """
    Co-enrichment main function.

    Given a feature and its enriched nodes, a contingency table can be constructed against any other
    feature and its enriched nodes. To test the association between two features, a Fisher's exact test
    is applied to each contingency table. Since the Fisher's exact test only considers the association
    between two classifications and not the direction of the ratio, the accessory function ``is_enriched``
    is used to further check that the ratio of enrichment is larger than that of non-enrichment.

    Besides the global co-enrichment, several local enrichments may be observed. Using the ``networkx``
    algorithm for finding connected components, the nodes of each local enrichment can be extracted from
    the global enrichment. Because of the complex combinations when comparing local enrichments, two
    different contingency tables are constructed.

    The contingency table comparing the enriched nodes of one component against the enriched nodes of the
    other components, versus the enriched/non-enriched nodes of the other feature:

    ======================== ================================= =================================
    fea                      fea this comp enriched nodes      fea other comp enriched nodes
    ======================== ================================= =================================
    o_f_enriched_nodes       s1                                s2
    o_f_non-enriched_nodes   s3                                s4
    ======================== ================================= =================================

    The other contingency table, comparing the enriched nodes within a specific component against the
    non-enriched nodes, versus the enriched/non-enriched nodes of the other feature:

    ======================== ================================= =================================
    fea                      fea this comp enriched nodes      fea non-enriched nodes
    ======================== ================================= =================================
    o_f_enriched_nodes       s1                                s2
    o_f_non-enriched_nodes   s3                                s4
    ======================== ================================= =================================

    For convenience, one of three modes [both|global|local] can be chosen.

    The output contains two kinds of dictionaries. One records the correlation information between
    different features; the other (``metainfo``) records the raw contingency table of each comparison as
    node sets representing s1, s2, s3, s4.

    If mode equals 'both', it outputs the global correlative dict, the local correlative dict and metainfo.
    If mode equals 'global', it outputs the global correlative dict and metainfo.
    If mode equals 'local', it outputs the local correlative dict and metainfo.

    For the global correlative dict, keys are the compared features and values are tuples
    (oddsratio, pvalue) from the Fisher's exact test. For the local correlative dict, keys are tuples of
    (index of component, size of component, feature) and values are the p-values of the two Fisher's exact
    tests. The remaining metainfo is a dictionary sharing the same keys as the global/local correlative
    dicts but containing the contingency table info.

    :param tmap.tda.Graph.Graph graph: tmap constructed graph
    :param list nodes: a list of nodes you want to process from a specific feature
    :param str name: feature name, which doesn't need to exist in enriched_centroid
    :param dict enriched_centroid: enriched_centroid output from ``get_significant_nodes``
    :param float SAFE_pvalue: None or a threshold for the SAFE score. If it is None, the nodes will not be filtered.
    :param pd.DataFrame safe_scores: A DataFrame storing SAFE scores used to filter the nodes. If you want to filter the nodes, safe_scores and SAFE_pvalue must be given together.
    :param bool _filter: If True, only comparisons with p-value <= 0.05 that pass the ``is_enriched`` check are kept.
    :param str mode: [both|global|local]
    :return:
    """
    if mode not in ['both', 'global', 'local']:
        print("Wrong syntax input, mode should be one of [both|global|local]")
        return
    total_nodes = set(graph.nodes)
    comp_nodes = graph.get_component_nodes(nodes)

    metainfo = {}
    global_correlative_feas = {}
    sub_correlative_feas = {}

    if SAFE_pvalue is not None and safe_scores is not None:
        safe_scores = safe_scores.to_dict(orient='dict')
        fea_enriched_nodes = set([_ for _ in nodes if safe_scores.get(_, 0) >= SAFE_pvalue])
    else:
        fea_enriched_nodes = set(nodes)
    fea_nonenriched_nodes = total_nodes.difference(fea_enriched_nodes)
    metainfo[name] = fea_enriched_nodes, comp_nodes

    for o_f in enriched_centroid.keys():
        if o_f != name:
            o_f_enriched_nodes = set(enriched_centroid[o_f])
            o_f_nonenriched_nodes = total_nodes.difference(o_f_enriched_nodes)
            if mode in ('both', 'global'):
                # contingency table
                #                          fea enriched nodes    fea non-enriched nodes
                # o_f_enriched_nodes       s1                    s2
                # o_f_non-enriched_nodes   s3                    s4
                s1 = o_f_enriched_nodes.intersection(fea_enriched_nodes)
                s2 = o_f_enriched_nodes.intersection(fea_nonenriched_nodes)
                s3 = o_f_nonenriched_nodes.intersection(fea_enriched_nodes)
                s4 = o_f_nonenriched_nodes.intersection(fea_nonenriched_nodes)
                oddsratio, pvalue = scs.fisher_exact([[len(s1), len(s2)],
                                                      [len(s3), len(s4)]])
                if _filter:
                    if pvalue <= 0.05 and is_enriched(s1, s2, s3, s4):
                        global_correlative_feas[o_f] = (oddsratio, pvalue)
                        metainfo[o_f] = (s1, s2, s3, s4)
                else:
                    global_correlative_feas[o_f] = (oddsratio, pvalue)
                    metainfo[o_f] = (s1, s2, s3, s4)
            if mode in ('both', 'local'):
                for idx, comp in enumerate(comp_nodes):
                    # contingency table
                    #                          fea this comp enriched nodes    fea other comp enriched nodes
                    # o_f_enriched_nodes       s1                              s2
                    # o_f_non-enriched_nodes   s3                              s4
                    comp = set(comp)
                    _s1 = o_f_enriched_nodes.intersection(comp)
                    _s2 = o_f_enriched_nodes.intersection(fea_enriched_nodes.difference(comp))
                    _s3 = o_f_nonenriched_nodes.intersection(comp)
                    _s4 = o_f_nonenriched_nodes.intersection(fea_enriched_nodes.difference(comp))
                    oddsratio, pvalue1 = scs.fisher_exact([[len(_s1), len(_s2)],
                                                           [len(_s3), len(_s4)]])
                    # contingency table
                    #                          fea this comp enriched nodes    fea non-enriched nodes
                    # o_f_enriched_nodes       s1                              s2
                    # o_f_non-enriched_nodes   s3                              s4
                    s1 = o_f_enriched_nodes.intersection(comp)
                    s2 = o_f_enriched_nodes.intersection(fea_nonenriched_nodes)
                    s3 = o_f_nonenriched_nodes.intersection(comp)
                    s4 = o_f_nonenriched_nodes.intersection(fea_nonenriched_nodes)
                    oddsratio, pvalue2 = scs.fisher_exact([[len(s1), len(s2)],
                                                           [len(s3), len(s4)]])
                    if _filter:
                        if pvalue1 <= 0.05 and pvalue2 <= 0.05 and is_enriched(s1, s2, s3, s4) and is_enriched(_s1, _s2, _s3, _s4):
                            sub_correlative_feas[(idx, len(comp), o_f)] = (pvalue1, pvalue2)
                            metainfo[(idx, len(comp), o_f)] = (_s1, _s2, s2)
                    else:
                        sub_correlative_feas[(idx, len(comp), o_f)] = (pvalue1, pvalue2)
                        metainfo[(idx, len(comp), o_f)] = (_s1, _s2, s2)

    if mode == 'both':
        return global_correlative_feas, sub_correlative_feas, metainfo
    elif mode == 'global':
        return global_correlative_feas, metainfo
    elif mode == 'local':
        return sub_correlative_feas, metainfo
    else:
        return
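

# A hedged usage sketch for ``coenrichment_for_nodes``. ``graph`` is assumed to be a tmap
# Graph and ``safe_scores`` a nodes-by-features SAFE score DataFrame from ``SAFE_batch``;
# 'Bacteroides' is a hypothetical feature name and ``_example_coenrichment_for_nodes`` is
# not part of the tmap API.
def _example_coenrichment_for_nodes(graph, safe_scores):
    enriched_centroid = get_significant_nodes(graph=graph,
                                              safe_scores=safe_scores,
                                              pvalue=0.05,
                                              n_iter=5000)
    fea = 'Bacteroides'  # hypothetical feature present in safe_scores.columns
    _global, _local, _meta = coenrichment_for_nodes(graph,
                                                    enriched_centroid[fea],
                                                    enriched_centroid,
                                                    fea,
                                                    mode='both')
    # _global: {other feature: (oddsratio, pvalue)} from the Fisher's exact test
    # _local:  {(component idx, component size, other feature): (pvalue1, pvalue2)}
    # _meta:   the raw contingency-table node sets behind each comparison
    return _global, _local, _meta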


def batch_coenrichment(fea, graph, safe_scores=None, n_iter=5000, p_value=0.05, _pre_cal_enriched=None):
    """
    Batch calculation of co-enrichment between the given feature(s) and all features found in safe_scores.

    If _pre_cal_enriched is given, n_iter and p_value are ignored. Otherwise, n_iter and p_value should
    match the parameters you passed to the ``SAFE`` algorithm.

    :param str/list fea: A single feature or a list of features to which the SAFE algorithm has already been applied.
    :param tmap.tda.Graph.Graph graph:
    :param pd.DataFrame safe_scores: A SAFE score DataFrame output from ``SAFE_batch`` which must contain all values occurring in fea.
    :param int n_iter: Number of permutations used in ``SAFE_batch``.
    :param float p_value: The p-value used to determine the enriched nodes.
    :param dict _pre_cal_enriched: A pre-calculated enriched_centroid comprising all necessary features; passing it saves time.
    :return:
    """
    global_correlative_feas = {}
    sub_correlative_feas = {}
    metainfo = {}
    print('building network...')
    # check for str first, because strings are themselves iterable
    if type(fea) == str:
        fea_batch = [fea]
    elif '__iter__' in dir(fea):
        fea_batch = list(fea)[::]
    else:
        raise SyntaxError
    if _pre_cal_enriched is None and safe_scores is None:
        raise Exception('_pre_cal_enriched and safe_scores must pass one of them')
    elif _pre_cal_enriched is None and safe_scores is not None:
        enriched_centroid = get_significant_nodes(graph=graph,
                                                  safe_scores=safe_scores,
                                                  pvalue=p_value,
                                                  n_iter=n_iter,
                                                  )
    else:
        enriched_centroid = _pre_cal_enriched
    for fea in set(fea_batch):
        if fea in safe_scores.columns:
            _global, _local, _meta = coenrichment_for_nodes(graph,
                                                            enriched_centroid[fea],
                                                            enriched_centroid,
                                                            fea,
                                                            mode='both')
            global_correlative_feas.update(_global)
            sub_correlative_feas.update(_local)
            metainfo.update(_meta)
        else:
            print("%s doesn't exist at the columns of provided safe_scores." % fea)
    return global_correlative_feas, sub_correlative_feas, metainfo


def construct_correlative_metadata(fea, global_correlative_feas, sub_correlative_feas, metainfo, node_data, verbose=1):
    """
    Downstream transformation of the output of ``batch_coenrichment``.

    :param fea:
    :param global_correlative_feas:
    :param sub_correlative_feas:
    :param dict metainfo:
    :param pd.DataFrame node_data:
    :return:
    """
    if verbose:
        print('Transforming given correlative relationships into a human-readable DataFrame......')
    # processing global correlative feas
    global_headers = ['other feature',
                      'fisher-exact test pvalue',
                      'ranksum in co-enriched nodes',
                      'ranksum in others nodes',
                      'coverage/%',
                      ]
    sub_headers = ['n_comps',
                   'comps_size',
                   'other feature',
                   'Fisher test pvalue(co-enriched,enriched)',
                   'Fisher test pvalue(co-enriched,others)',
                   'coenriched-enriched pvalue',
                   'coenriched-others pvalue',
                   'enriched-others pvalue',
                   'coverage/%',
                   ]
    global_corr_df = pd.DataFrame(columns=global_headers)
    for o_f in list(global_correlative_feas.keys()):
        _1, f_p = global_correlative_feas[o_f]
        s1, s2, s3, s4 = metainfo[o_f]
        if o_f in node_data.columns:
            _data = node_data
        else:
            print('error feature %s' % o_f)
            return
        y1 = _data.loc[s1, o_f]
        y2 = _data.loc[set.union(s2, s3, s4), o_f]
        if fea in node_data.columns:
            _data = node_data
        else:
            print('error feature %s' % fea)
            return
        _y1 = _data.loc[s1, fea]
        _y2 = _data.loc[set.union(s2, s3, s4), fea]
        ranksum_p1 = scs.ranksums(y1, y2)[1]
        ranksum_p2 = scs.ranksums(_y1, _y2)[1]
        if not len(s1) + len(s3):
            coverage = np.nan
        else:
            coverage = ((len(s1) + len(s2)) / (len(s1) + len(s3))) * 100
        global_corr_df = global_corr_df.append(pd.DataFrame([[o_f, f_p, ranksum_p1, ranksum_p2, coverage]],
                                                            columns=global_headers))
    # processing subgraph correlative feas
    sub_corr_df = pd.DataFrame(columns=sub_headers)
    for n_c, size, o_f in sub_correlative_feas.keys():
        if o_f in node_data.columns:
            _data = node_data
        else:
            print('error feature %s' % o_f)
            return
        coenriched_nodes, enriched_nodes_o_f_enriched, nonenriched_nodes_o_f_enriched = metainfo[(n_c, size, o_f)]
        f_p1, f_p2 = sub_correlative_feas[(n_c, size, o_f)]
        y1 = _data.loc[coenriched_nodes, o_f]
        y2 = _data.loc[enriched_nodes_o_f_enriched, o_f]
        y3 = _data.loc[nonenriched_nodes_o_f_enriched, o_f]
        p1 = scs.ranksums(y1, y2)[1]
        p2 = scs.ranksums(y1, y3)[1]
        p3 = scs.ranksums(y2, y3)[1]
        coverage = len(coenriched_nodes) / len(set.union(coenriched_nodes,
                                                         enriched_nodes_o_f_enriched,
                                                         nonenriched_nodes_o_f_enriched)) * 100
        sub_corr_df = sub_corr_df.append(pd.DataFrame([['comps%s' % n_c, size, o_f, f_p1, f_p2, p1, p2, p3, coverage]],
                                                      columns=sub_headers))
    return global_corr_df, sub_corr_df
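

# A hedged end-to-end sketch of the batch pipeline. ``graph`` and ``safe_scores`` are
# assumed to come from the usual tmap workflow, ``node_data`` is assumed to be a
# nodes-by-features DataFrame of node values, and 'Bacteroides' is a hypothetical
# feature; ``_example_batch_pipeline`` is not part of the tmap API.
def _example_batch_pipeline(graph, safe_scores, node_data):
    fea = 'Bacteroides'  # hypothetical feature present in safe_scores.columns
    _global, _local, _meta = batch_coenrichment(fea,
                                                graph,
                                                safe_scores=safe_scores,
                                                n_iter=5000,
                                                p_value=0.05)
    global_corr_df, sub_corr_df = construct_correlative_metadata(fea,
                                                                 _global,
                                                                 _local,
                                                                 _meta,
                                                                 node_data)
    # global_corr_df: one row per globally co-enriched feature
    # sub_corr_df:    one row per (component, other feature) local comparison
    return global_corr_df, sub_corr_df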


def pairwise_coenrichment(graph, safe_scores, n_iter=5000, p_value=0.05, _pre_cal_enriched=None, verbose=1):
    """
    Pair-wise calculation of the co-enrichment of each feature found in safe_scores.

    If _pre_cal_enriched is given, n_iter and p_value are ignored. Otherwise, n_iter and p_value should
    match the parameters you passed to the ``SAFE`` algorithm.

    :param tmap.tda.Graph.Graph graph:
    :param pd.DataFrame safe_scores: A SAFE score DataFrame output from ``SAFE_batch`` which must contain all compared features.
    :param int n_iter: Number of permutations used in ``SAFE_batch``.
    :param float p_value: The p-value used to determine the enriched nodes.
    :param dict _pre_cal_enriched: A pre-calculated enriched_centroid comprising all necessary features; passing it saves time.
    :param verbose:
    :return:
    """
    dist_matrix = pd.DataFrame(data=np.nan,
                               index=safe_scores.columns,
                               columns=safe_scores.columns)
    if verbose:
        print('building network...')
        iter_obj = tqdm(safe_scores.columns)
    else:
        iter_obj = safe_scores.columns

    if not _pre_cal_enriched:
        enriched_centroid = get_significant_nodes(graph=graph,
                                                  safe_scores=safe_scores,
                                                  pvalue=p_value,
                                                  n_iter=n_iter)
    else:
        enriched_centroid = _pre_cal_enriched

    for fea in iter_obj:
        _global, _meta = coenrichment_for_nodes(graph,
                                                enriched_centroid[fea],
                                                enriched_centroid,
                                                name=fea,
                                                safe_scores=safe_scores,
                                                mode='global',
                                                _filter=False)
        # _filter=False fetches the raw fisher-exact test results without any cut-off values.
        for o_f in safe_scores.columns:
            if fea != o_f:
                s1, s2, s3, s4 = _meta[o_f]
                oddsratio, pvalue = _global[o_f]
                if is_enriched(s1, s2, s3, s4):
                    dist_matrix.loc[fea, o_f] = pvalue
                else:
                    dist_matrix.loc[fea, o_f] = 1
        dist_matrix.loc[fea, fea] = 0
    # correct for multiple testing
    corrected_dist_matrix = pd.DataFrame(multipletests(dist_matrix.values.reshape(-1, ),
                                                       method='fdr_bh')[1].reshape(dist_matrix.shape),
                                         index=dist_matrix.index,
                                         columns=dist_matrix.columns)
    return corrected_dist_matrix
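

# A hedged sketch of a typical call to ``pairwise_coenrichment``: the FDR-corrected
# p-value matrix can be thresholded to get a boolean feature-by-feature co-enrichment
# matrix. The 0.05 cut-off is an illustrative choice, not a tmap default, and
# ``_example_pairwise`` is not part of the tmap API.
def _example_pairwise(graph, safe_scores):
    corrected = pairwise_coenrichment(graph,
                                      safe_scores,
                                      n_iter=5000,
                                      p_value=0.05)
    coenriched_pairs = corrected < 0.05  # boolean matrix (diagonal is 0 by construction)
    return coenriched_pairs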