Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Moran's I as bio conservation metric #245

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
113 changes: 113 additions & 0 deletions scIB/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -1770,6 +1770,119 @@ def graph_connectivity(adata_post, label_key):

return np.mean(clust_res)

def moransi_conservation(adata_pre, adata_post, batch_key, n_hvg=1000, embed='X_pca',
rescale=True,compare_pre=False, hvgs=None):
"""
Compute Moran's I on HVGs (defined across batches) on integrated data
as a measure of bio conservation. The metric can be computed either as
A.) mean difference between Morans's I on embedding of integrated data and
max Moran's I of individual non-integrated batches or
B.) mean Morans's I on embedding of integrated data.
:param adata_pre: Non-integrated data.
Embedding is computed on scaled (m=0, s=1) X (expression)
per batch (recomputing HVGs, scaling, pca, neighbours).
For Moran's I unscaled X is used.
:param adata_post: Integrate data. Should have precomputed embedding and expression in X.
:param batch_key: Batch obs col in adata_pre.
:param n_hvg: How many top HVGs to use for Moran's I computation.
:param embed: Embedding to use for computation of neighbours in adata_post
:param rescale: Whether to rescale result to [0,1] so that 1 is better bio conservation
:param compare_pre: Whether to compute metric under scenario A instead of B.
:param hvgs: Custom list of genes to use for Moran's I computation (instead of hvgs
computed across batches).
:return: The resulting metric.
"""
# Prepare adatas
# Get integrated connectivities for Moran's I
adata_post=adata_post.copy()
sc.pp.neighbors(adata_post,use_rep=embed)
# Prepare pre data
adata_pre=adata_pre.copy()
adata_pre.obs[batch_key]=adata_pre.obs[batch_key].astype('category')

# Get HVGs across samples
if hvgs is None:
hvgs=hvg_batch(adata_pre, batch_key=batch_key, target_genes=n_hvg, flavor='cell_ranger')

def compute_mi(data,hvgs):
"""
Helper function for computing Moran's I on HVGS that are non-constant
and adding them to var
:param data: adata, result is added to var
:param hvgs: Genes for which to compute Moran's I
"""
# Make sure taht genes used for computation are non-constant
hvgs_used=np.array(hvgs)[~(data[:,hvgs].X==data[0,hvgs].X).all(axis=0)]
if len(hvgs)>len(hvgs_used):
print('Using subset (%i) of hvgs (%i) that are not constant in the data'%
(len(hvgs_used),len(hvgs)))
# Compute Moran's I
morans_i=sc.metrics.morans_i(data[:,hvgs_used])
morans_i=pd.Series(morans_i,index=hvgs_used)
# This should not happen, just in case
if (morans_i>1).any() or (morans_i<-1).any():
raise ValueError(
"Problems in computing Moran's I, value not between -1 and 1")
data.var['morans_i']=morans_i

if compare_pre:
# Get per sample connectivities
adatas_sample=dict()
for sample in adata_pre.obs[batch_key].unique():
# Subset only to cells from 1 sample
adata_sample=adata_pre[adata_pre.obs[batch_key]==sample,:].copy()
# Compute HVG for each sample
sc.pp.highly_variable_genes(adata_sample, flavor='cell_ranger',
batch_key='study_sample',n_top_genes =2000)
adata_sample_scl=adata_sample.copy()
# PP each sample: scaling, pca, neighbours
sc.pp.scale(adata_sample_scl,max_value=10)
sc.pp.pca(adata_sample_scl, n_comps=15, use_highly_variable=True, svd_solver='arpack')
sc.pp.neighbors(adata_sample_scl,n_pcs = 15)
# Add computed info back to unscaled data
adata_sample.uns['neighbors']=adata_sample_scl.uns['neighbors']
adata_sample.obsp['connectivities']=adata_sample_scl.obsp['connectivities']
adata_sample.obsp['distances']=adata_sample_scl.obsp['distances']
adata_sample.uns['pca']=adata_sample_scl.uns['pca']
adata_sample.obsm['X_pca']=adata_sample_scl.obsm['X_pca']
# Save adata of sample
adatas_sample[sample]=adata_sample
del adata_sample
del adata_sample_scl

# Compute Moran's I across samples and on integrated data
for data in list(adatas_sample.values())+[adata_post]:
compute_mi(data,hvgs)

# Compute differences in Moran's I
# Table of Moran's I-s across batches
batch_mis=[]
for sample,data in adatas_sample.items():
mi=data.var['morans_i']
mi.name=sample
batch_mis.append(mi)
batch_mis=pd.concat(batch_mis,axis=1)
# Difference with integrated data
mi_diffs=adata_post.var['morans_i']-batch_mis.max(axis=1)
avg_mi_diff=mi_diffs.mean()

# Rescale so that it is between [0,1] where 1 is better
# Moran's I will be in [-1,1] and thus difference can be in [-2,2]
if rescale:
res=(avg_mi_diff+2)/4
else:
res=avg_mi_diff

else:
# Moran's I on integrated data
compute_mi(adata_post,hvgs)
res=adata_post.var['morans_i'].mean()
if rescale:
# Moran's I will be in [-1,1]
res=(res+1)/2

return res


### Time and Memory
def measureTM(*args, **kwargs):
Expand Down