From 1b9823b39bd1cb12b3acd3109be96cce5ccc3099 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sophia=20M=C3=BCller-Dott?=
Date: Wed, 2 Aug 2023 15:11:45 +0200
Subject: [PATCH] updated benchmark to remove NAs in activities

---
 decoupler/tests/test_utilsbenchmark.py | 49 ++++++++++++++++++-
 decoupler/utils_benchmark.py           | 67 ++++++++++++++++----------
 2 files changed, 88 insertions(+), 28 deletions(-)

diff --git a/decoupler/tests/test_utilsbenchmark.py b/decoupler/tests/test_utilsbenchmark.py
index 84a5712..e622768 100644
--- a/decoupler/tests/test_utilsbenchmark.py
+++ b/decoupler/tests/test_utilsbenchmark.py
@@ -182,8 +182,21 @@ def test_append_by_experiment():
 
     append_by_experiment(df, grpby_i=None, grp=None, act=act, grt=grt, srcs=srcs,
                          mthds=mthds, metrics=metrics, min_exp=1)
+
+    act_na = act.astype(float)
+    act_na[1, 0, 0] = np.nan
+    act_na[1, 1, 0] = np.nan
+    act_na[1, 2, 0] = np.nan
+
+    df_na = []
+
+    append_by_experiment(df_na, grpby_i=None, grp=None, act=act_na, grt=grt, srcs=srcs,
+                         mthds=mthds, metrics=metrics, min_exp=1)
+
     assert len(df) == 2
     assert df[0][5] < df[1][5]
+    assert df[0][5] < df_na[0][5]  # check that performance improves after removal of NAs
+    assert df[0][6] < df_na[0][6]  # check that class imbalance changes after removal of NAs
 
 
 def test_append_by_source():
@@ -216,7 +229,39 @@ def test_append_by_source():
     append_by_source(df, grpby_i=None, grp=None, act=act, grt=grt, srcs=srcs,
                      mthds=mthds, metrics=metrics, min_exp=1)
     assert len(df) == 4
-    assert df[0][5] < df[1][5]
+    assert df[0][5] < df[2][5]
+
+    act_na = act.astype(float)
+    act_na[1, 4, 0] = np.nan
+
+    df_na = []
+
+    append_by_source(df_na, grpby_i=None, grp=None, act=act_na, grt=grt, srcs=srcs,
+                     mthds=mthds, metrics=metrics, min_exp=1)
+
+    assert len(df_na) == 3
+    assert df_na[0][2] == 'T1'
+
+    act_na[1, 0, 0] = np.nan
+
+    df_na_2 = []
+    append_by_source(df_na_2, grpby_i=None, grp=None, act=act_na, grt=grt, srcs=srcs,
+                     mthds=mthds, metrics=metrics, min_exp=1)
+
+    assert len(df_na_2) == 2
+
+    act_na_3 = act.astype(float)
+    act_na_3[1, 0, 0] = np.nan
+
+    df_na_3 = []
+
+    append_by_source(df_na_3, grpby_i=None, grp=None, act=act_na_3, grt=grt, srcs=srcs,
+                     mthds=mthds, metrics=metrics, min_exp=1)
+
+    assert len(df_na_3) == 3
+    assert df_na_3[0][2] == 'T5'
+
+
 
 
 def test_append_metrics_scores():
@@ -260,7 +305,7 @@ def test_append_metrics_scores():
     append_metrics_scores(df, grpby_i=None, grp=None, act=act, grt=grt, srcs=srcs,
                           mthds=mthds, metrics=metrics, by='source', min_exp=1)
     assert len(df) == 4
-    assert df[0][5] < df[1][5]
+    assert df[0][5] < df[2][5]
 
 
 def test_check_groupby():
diff --git a/decoupler/utils_benchmark.py b/decoupler/utils_benchmark.py
index 1c7efaf..5da1c45 100644
--- a/decoupler/utils_benchmark.py
+++ b/decoupler/utils_benchmark.py
@@ -123,14 +123,20 @@ def append_by_experiment(df, grpby_i, grp, act, grt, srcs, mthds, metrics, min_e
     # Flatten act by method
     act, grt = act.reshape(-1, act.shape[-1]).T, grt.flatten()
 
-    # Compute Class Imbalance
-    ci = np.sum(grt) / len(grt)
-
     # Compute per method and metric
     for m in range(len(mthds)):
         mth = mthds[m]
         for metric in metrics:
-            scores = compute_metric(act[m], grt, metric, pi0=pi0, n_iter=n_iter, seed=seed)
+            # Identify activity scores with NAs for each method
+            act_i = act[m]
+            nan_mask = np.isnan(act_i)
+            # Remove NAs from activity scores and ground truth
+            act_i = act_i[~nan_mask]
+            grt_i = grt[~nan_mask]
+            # Compute Class Imbalance on the filtered ground truth
+            ci = np.sum(grt_i) / len(grt_i)
+            # Compute metrics
+            scores = compute_metric(act_i, grt_i, metric, pi0=pi0, n_iter=n_iter, seed=seed)
             for score in scores:
                 row = [grpby_i, grp, None, mth, metric, score, ci]
                 df.append(row)
@@ -138,28 +144,37 @@ def append_by_experiment(df, grpby_i, grp, act, grt, srcs, mthds, metrics, min_e
 
 def append_by_source(df, grpby_i, grp, act, grt, srcs, mthds, metrics, min_exp=5,
                      pi0=0.5, n_iter=1000, seed=42):
-
-    # Remove sources with less than min_exp
-    src_msk = np.sum(grt > 0., axis=0) >= min_exp
-    act, grt = act[:, src_msk, :], grt[:, src_msk]
-    srcs = srcs[src_msk]
-
-    # Compute per source, method and metric
-    for s in range(len(srcs)):
-        src = srcs[s]
-        tmp_grt = grt[:, s]
-
-        # Compute Class Imbalance
-        ci = np.sum(tmp_grt) / len(tmp_grt)
-
-        for m in range(len(mthds)):
-            mth = mthds[m]
-            tmp_act = act[:, s, m]
-            for metric in metrics:
-                scores = compute_metric(tmp_act, tmp_grt, metric, pi0=pi0, n_iter=n_iter, seed=seed)
-                for score in scores:
-                    row = [grpby_i, grp, src, mth, metric, score, ci]
-                    df.append(row)
+
+    for m in range(len(mthds)):
+        mth = mthds[m]
+        act_i = act[:, :, m]
+        nan_mask = np.isnan(act_i)
+
+        grt_i = grt.copy()
+        grt_i[nan_mask] = np.nan
+
+        # Remove sources with fewer than min_exp experiments
+        src_msk = np.sum(grt_i > 0., axis=0) >= min_exp
+        act_i, grt_i = act[:, src_msk, :], grt_i[:, src_msk]
+        srcs_method = srcs[src_msk]
+
+        # Compute per source, method and metric
+        for s in range(len(srcs_method)):
+            src = srcs_method[s]
+            tmp_grt = grt_i[:, s]
+            nan_mask = np.isnan(tmp_grt)
+
+            grt_source = tmp_grt[~nan_mask]
+            act_source = act_i[:, s, m][~nan_mask]
+
+            # Compute Class Imbalance
+            ci = np.sum(grt_source) / len(grt_source)
+            if ci != 0. and ci != 1.:
+                for metric in metrics:
+                    scores = compute_metric(act_source, grt_source, metric, pi0=pi0, n_iter=n_iter, seed=seed)
+                    for score in scores:
+                        row = [grpby_i, grp, src, mth, metric, score, ci]
+                        df.append(row)
 
 
 def append_metrics_scores(df, grpby_i, grp, act, grt, srcs, mthds, metrics, by, min_exp=5, pi0=0.5,
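
Note for reviewers (illustration only, not part of the patch): below is a minimal
numpy sketch of the NaN handling this patch introduces. The variable names mirror
the patch (act_i, grt, nan_mask, src_msk, min_exp), but all shapes and values are
invented for the example and do not come from the decoupler test fixtures.

    import numpy as np

    # Experiment-level masking, as in append_by_experiment: drop NaN
    # activity scores, then recompute class imbalance on what remains.
    act_i = np.array([0.9, np.nan, 0.8, 0.3, 0.2, 0.1])  # one method's scores
    grt = np.array([1., 1., 1., 0., 0., 0.])             # binary ground truth

    nan_mask = np.isnan(act_i)
    act_i, grt_i = act_i[~nan_mask], grt[~nan_mask]

    ci = np.sum(grt_i) / len(grt_i)
    print(ci)  # 0.4: dropping one positive shifts ci from 3/6 = 0.5 to 2/5 = 0.4

    # Source-level masking, as in append_by_source: propagate the NaN mask
    # into a copy of the ground truth so that NaN experiments no longer
    # count toward the min_exp filter of that source.
    grt2 = np.array([[1., 1.], [1., 1.], [0., 0.]])           # 3 experiments x 2 sources
    act2 = np.array([[0.9, 0.1], [np.nan, 0.8], [0.7, 0.2]])  # one method's scores
    grt2_na = grt2.copy()
    grt2_na[np.isnan(act2)] = np.nan

    src_msk = np.sum(grt2_na > 0., axis=0) >= 2               # min_exp = 2
    print(src_msk)  # [False  True]: source 0 loses a positive and is dropped

This is why the new tests expect fewer rows when NAs are present: a source can
fall below min_exp, or be skipped entirely once its filtered ground truth becomes
all-positive or all-negative (ci of 0 or 1).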