Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimise PC regression #408

Open
wants to merge 19 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ jobs:
- name: Test with pytest
if: ${{ matrix.os != 'macos-latest'}}
run: |
pytest --cov=scib --cov-report=xml -vv --ignore=tests/integration/ --ignore=tests/metrics/rpy2 -vv
pytest --cov=scib --cov-report=xml -vv --ignore=tests/integration/ --ignore=tests/metrics/rpy2 -vv --durations 0 --durations-min=1.0
mv coverage.xml "$(echo 'coverage_metrics_${{ matrix.os }}_${{ matrix.python }}.xml' | sed 's/[^a-z0-9\.\/]/_/g')"

- name: Upload coverage to GitHub Actions
Expand Down Expand Up @@ -98,7 +98,7 @@ jobs:

- name: Test with pytest
run: |
pytest --cov=scib --cov-report=xml -vv --tb=native -k rpy2
pytest --cov=scib --cov-report=xml -vv --tb=native -k rpy2 --durations 0 --durations-min=1.0
mv coverage.xml "$(echo 'coverage_rpy2_${{ matrix.os }}_${{ matrix.python }}.xml' | sed 's/[^a-z0-9\.\/]/_/g')"

- name: Upload coverage to GitHub Actions
Expand Down Expand Up @@ -129,7 +129,7 @@ jobs:

- name: Test with pytest
run: |
pytest --cov=scib --cov-report=xml -vv --tb=native -k integration
pytest --cov=scib --cov-report=xml -vv --tb=native -k integration --durations 0 --durations-min=1.0
mv coverage.xml "$(echo 'coverage_integration_${{ matrix.os }}_${{ matrix.python }}.xml' | sed 's/[^a-z0-9\.\/]/_/g')"

- name: Upload coverage to GitHub Actions
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@ build-backend = "setuptools.build_meta"
log_cli = 'True'
log_cli_level = 'INFO'
addopts = '-p no:warnings'
durations = 0
50 changes: 35 additions & 15 deletions scib/metrics/cell_cycle.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import numpy as np
import pandas as pd
from tqdm import tqdm

from ..preprocessing import score_cell_cycle
from ..utils import check_adata
Expand All @@ -11,12 +12,14 @@ def cell_cycle(
adata_post,
batch_key,
embed=None,
agg_func=np.mean,
agg_func=np.nanmean,
organism="mouse",
n_comps=50,
recompute_cc=True,
precompute_pcr_key=None,
verbose=False,
linreg_method="numpy",
n_threads=1,
):
"""Cell cycle conservation score

Expand Down Expand Up @@ -44,6 +47,7 @@ def cell_cycle(
precomputed scores if available as 'S_score' and 'G2M_score' in ``adata_post.obs``
:param precompute_pcr_key: Key in adata_pre for precomputed PCR values for cell
cycle scores. Ignores cell cycle scores in adata_pre if present.
:param n_threads: Number of threads for linear regressions per principal component

:return:
A score between 1 and 0. The larger the score, the stronger the cell cycle
Expand All @@ -70,11 +74,6 @@ def cell_cycle(
if embed == "X_pca":
embed = None

batches = adata_pre.obs[batch_key].unique()
scores_final = []
scores_before = []
scores_after = []

recompute_cc = (
recompute_cc
or "S_score" not in adata_pre.obs_keys()
Expand All @@ -84,19 +83,26 @@ def cell_cycle(
precompute_pcr_key is None or precompute_pcr_key not in adata_pre.uns_keys()
)

for batch in batches:
batches = adata_pre.obs[batch_key].unique()
scores_before = []
scores_after = []
scores_final = []

for batch in tqdm(batches):
before, after = get_pcr_before_after(
adata_pre,
adata_post,
batch_key=batch_key,
batch=batch,
embed=embed,
organism=organism,
pcr_key=precompute_pcr_key,
recompute_cc=recompute_cc,
recompute_pcr=recompute_pcr,
pcr_key=precompute_pcr_key,
n_comps=n_comps,
verbose=verbose,
n_threads=n_threads,
linreg_method=linreg_method,
)

# scale result
Expand Down Expand Up @@ -140,11 +146,13 @@ def get_pcr_before_after(
batch,
embed,
organism,
recompute_cc,
recompute_pcr,
pcr_key,
n_comps,
verbose,
recompute_cc=False,
recompute_pcr=False,
n_comps=50,
verbose=True,
n_threads=1,
linreg_method="numpy",
):
"""
Principal component regression value on cell cycle scores for one batch
Expand Down Expand Up @@ -190,16 +198,28 @@ def get_pcr_before_after(
covariate = raw_sub.obs[["S_score", "G2M_score"]]

# PCR on adata before integration
if recompute_pcr:
if recompute_pcr: # TODO: does this work for precomputed values?
before = pc_regression(
raw_sub.X, covariate, pca_var=None, n_comps=n_comps, verbose=verbose
raw_sub.X,
covariate,
pca_var=None,
n_comps=n_comps,
verbose=verbose,
n_threads=n_threads,
linreg_method=linreg_method,
)
else:
before = pd.Series(raw_sub.uns[pcr_key])

# PCR on adata after integration
after = pc_regression(
int_sub, covariate, pca_var=None, n_comps=n_comps, verbose=verbose
int_sub,
covariate,
pca_var=None,
n_comps=n_comps,
verbose=verbose,
n_threads=n_threads,
linreg_method=linreg_method,
)

return before, after
130 changes: 108 additions & 22 deletions scib/metrics/pcr.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,22 @@
import pandas as pd
import scanpy as sc
from scipy import sparse
from sklearn.linear_model import LinearRegression
from tqdm import tqdm

from ..utils import check_adata, check_batch


def pcr_comparison(
adata_pre, adata_post, covariate, embed=None, n_comps=50, scale=True, verbose=False
adata_pre,
adata_post,
covariate,
embed=None,
n_comps=50,
linreg_method="sklearn",
recompute_pca=False,
scale=True,
verbose=False,
n_threads=1,
):
"""Principal component regression score

Expand All @@ -25,6 +34,8 @@ def pcr_comparison(
If None, use the full expression matrix (``adata_post.X``), otherwise use the embedding
provided in ``adata_post.obsm[embed]``.
:param n_comps: Number of principal components to compute
:param recompute_pca: Whether to recompute PCA with default settings
:param linreg_method: Method for linear regression, either 'sklearn' or 'numpy'
:param scale: If True, scale score between 0 and 1 (default)
:param verbose:
:return:
Expand Down Expand Up @@ -52,17 +63,21 @@ def pcr_comparison(
pcr_before = pcr(
adata_pre,
covariate=covariate,
recompute_pca=True,
recompute_pca=recompute_pca,
n_comps=n_comps,
linreg_method=linreg_method,
n_threads=n_threads,
verbose=verbose,
)

pcr_after = pcr(
adata_post,
covariate=covariate,
embed=embed,
recompute_pca=True,
recompute_pca=recompute_pca,
n_comps=n_comps,
linreg_method=linreg_method,
n_threads=n_threads,
verbose=verbose,
)

Expand All @@ -79,7 +94,16 @@ def pcr_comparison(
return pcr_after - pcr_before


def pcr(adata, covariate, embed=None, n_comps=50, recompute_pca=True, verbose=False):
def pcr(
adata,
covariate,
embed=None,
n_comps=50,
recompute_pca=False,
linreg_method="sklearn",
verbose=False,
n_threads=1,
):
"""Principal component regression for anndata object

Wraps :func:`~scib.metrics.pc_regression` while checking whether to:
Expand Down Expand Up @@ -127,25 +151,48 @@ def pcr(adata, covariate, embed=None, n_comps=50, recompute_pca=True, verbose=Fa
assert embed in adata.obsm
if verbose:
print(f"Compute PCR on embedding n_comps: {n_comps}")
return pc_regression(adata.obsm[embed], covariate_values, n_comps=n_comps)
return pc_regression(
adata.obsm[embed],
covariate_values,
n_comps=n_comps,
linreg_method=linreg_method,
n_threads=n_threads,
)

# use existing PCA computation
elif (recompute_pca is False) and ("X_pca" in adata.obsm) and ("pca" in adata.uns):
if verbose:
print("using existing PCA")
return pc_regression(
adata.obsm["X_pca"], covariate_values, pca_var=adata.uns["pca"]["variance"]
adata.obsm["X_pca"],
covariate_values,
pca_var=adata.uns["pca"]["variance"],
linreg_method=linreg_method,
n_threads=n_threads,
)

# recompute PCA
else:
if verbose:
print(f"compute PCA n_comps: {n_comps}")
return pc_regression(adata.X, covariate_values, n_comps=n_comps)
return pc_regression(
adata.X,
covariate_values,
n_comps=n_comps,
linreg_method=linreg_method,
n_threads=n_threads,
)


def pc_regression(
data, covariate, pca_var=None, n_comps=50, svd_solver="arpack", verbose=False
data,
covariate,
pca_var=None,
n_comps=50,
svd_solver="arpack",
linreg_method="sklearn",
verbose=False,
n_threads=1,
):
"""Principal component regression

Expand All @@ -172,14 +219,20 @@ def pc_regression(
:return:
Variance contribution of regression
"""

if isinstance(data, (np.ndarray, sparse.csr_matrix, sparse.csc_matrix)):
matrix = data
else:
raise TypeError(
f"invalid type: {data.__class__} is not a numpy array or sparse matrix"
)

if linreg_method == "sklearn":
linreg_method = linreg_sklearn
elif linreg_method == "numpy":
linreg_method = linreg_np
else:
raise ValueError(f"invalid linreg_method: {linreg_method}")

# perform PCA if no variance contributions are given
if pca_var is None:

Expand All @@ -193,7 +246,7 @@ def pc_regression(
matrix = matrix.toarray()

if verbose:
print("compute PCA")
print("compute PCA...")
X_pca, _, _, pca_var = sc.tl.pca(
matrix,
n_comps=n_comps,
Expand All @@ -216,18 +269,51 @@ def pc_regression(
else:
if verbose:
print("one-hot encode categorical values")
covariate = pd.get_dummies(covariate)
covariate = pd.get_dummies(covariate).to_numpy()

# fit linear model for n_comps PCs
r2 = []
for i in range(n_comps):
pc = X_pca[:, [i]]
lm = LinearRegression()
lm.fit(covariate, pc)
r2_score = np.maximum(0, lm.score(covariate, pc))
r2.append(r2_score)

Var = pca_var / sum(pca_var) * 100
R2Var = sum(r2 * Var) / 100
if verbose:
print(f"Use {n_threads} threads for regression...")
if n_threads == 1:
r2 = []
for i in tqdm(range(n_comps), total=n_comps):
r2_score = linreg_method(X=covariate, y=X_pca[:, [i]])
r2.append(r2_score)
else:
from concurrent.futures import ThreadPoolExecutor, as_completed

r2 = []
# parallelise over all principal components
with tqdm(total=n_comps) as pbar:
with ThreadPoolExecutor(max_workers=n_threads) as executor:
futures = [
executor.submit(linreg_method, X=covariate, y=pc) for pc in X_pca.T
]
for future in as_completed(futures):
r2.append(future.result())
pbar.update(1)

Var = pca_var / sum(pca_var) # * 100
R2Var = sum(r2 * Var) # / 100

return R2Var


def linreg_sklearn(X, y):
    """R^2 of an ordinary least squares fit of ``y`` on ``X`` via scikit-learn.

    :param X: design matrix of shape (n_samples, n_features)
    :param y: response values of shape (n_samples,) or (n_samples, n_targets)
    :return: coefficient of determination R^2, clipped at 0 so that fits worse
        than the mean do not contribute negative variance
    """
    from sklearn.linear_model import LinearRegression

    lm = LinearRegression()
    lm.fit(X, y)
    r2_score = lm.score(X, y)
    # clip negative R^2 to 0 (removed a duplicated no-op np.maximum call here)
    return np.maximum(0, r2_score)


def linreg_np(X, y):
mumichae marked this conversation as resolved.
Show resolved Hide resolved
_, residuals, _, _ = np.linalg.lstsq(X, y, rcond=None)
if residuals.size == 0:
residuals = np.array([0])
rss = residuals[0] if len(residuals) > 0 else 0
tss = np.sum((y - y.mean()) ** 2)
r2_score = 1 - (rss / tss)
return np.maximum(0, r2_score)
2 changes: 1 addition & 1 deletion scib/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -686,7 +686,7 @@ def score_cell_cycle(adata, organism="mouse"):
f"cell cycle genes not in adata\n organism: {organism}\n varnames: {rand_genes}"
)

sc.tl.score_genes_cell_cycle(adata, s_genes, g2m_genes)
sc.tl.score_genes_cell_cycle(adata, s_genes=s_genes, g2m_genes=g2m_genes)


def save_seurat(adata, path, batch, hvgs=None):
Expand Down
Loading
Loading