Installation

if (!require("devtools")) {
  install.packages("devtools")
}
devtools::install_github("chencxxy28/InteRD")

In this tutorial, we use SCDC (Dong et al. (2021)) for illustration to conduct reference-based deconvolution.

Implementation (Case study 1)

First, read in scRNA-seq data from our website

readRDSFromWeb <- function(ref) {
  readRDS(gzcon(url(ref)))
}
seger <- readRDSFromWeb("https://github.com/chencxxy28/Data/raw/main/data_InteRD/segerstolpe.rds")
baron <- readRDSFromWeb("https://github.com/chencxxy28/Data/raw/main/data_InteRD/baron.rds")
xin<-readRDSFromWeb("https://github.com/chencxxy28/Data/raw/main/data_InteRD/Xin_nonD.rds")

Second, use function generateBulk() to create the pseudo bulk samples, where ct.varname specifies the name of cell type clustering result variable. sample specifies the name of subjects information variable. ct.sub specifies the names of cell types used to construct pseudo bulk samples. Here we provide an example where the pseudo bulk samples are generated from Segerstolpe et al. (2016).

set.seed(1234567)
pseudo.seger<-generateBulk(seger[["sc.eset.qc"]], ct.varname = "cluster", sample = "sample", ct.sub = c("alpha","beta","delta","gamma"), nbulk = 40, low_s = 0.3, upp_s = 0.7)
truep<-pseudo.seger$true_p[complete.cases(pseudo.seger$true_p),]

The generated pseudo bulk object contains a matrix of true cell type proportions (pseudo.seger$truep) and the ExpressionSet object (pseudo.seger$pseudo_eset).

InteRD1 algorithm is the first step of InteRD to do multiple reference ensemble, which takes as input the target bulk data, a list of marker genes (pre-selected by Seurat based on a single cell data from Xin et al. (2016)), a list of subject-level cell type proportions based on each reference set. For illustration, we use the function SCDC_ENSEMBLE() from SCDC package (Dong et al. (2021)) to obtain single-reference-based estimates and ensemble estimates. For reference panels, we consider two single cell data, one from Baron et al. (2016), and the other from Xin et al. (2016). The estimates can be obtained by running the following code

set.seed(1234567)
library(SCDC)
##ensemble of multiple reference sets 
#resuts based on SCDC
  pancreas.sc <- list(baron = baron$sc.eset.qc,
                      xin   = xin
  )
  SCDC_results<-SCDC_ENSEMBLE(bulk.eset = pseudo.seger$pseudo_eset, sc.eset.list = pancreas.sc, ct.varname = "cluster",
                                sample = "sample", weight.basis =TRUE,truep = truep, ct.sub =  c("alpha","beta","delta","gamma"), search.length = 0.02,grid.search=TRUE)
  comb<-SCDC_results$prop.only
  weight_matrix<-SCDC_results$w_table["mAD_Y",1:2]
  SCDC_ENSEMBLE_MAD<-SCDC:::wt_prop(weight_matrix,comb)
  
  # saveRDS(SCDC_ENSEMBLE_MAD,"/Users/chixiang.chen/Library/CloudStorage/OneDrive-UniversityofMarylandSchoolofMedicine/postdoc/postdoc/deconvolution/ref_based_rd/data_InteRD/SCDC_ENSEMBLE_MAD_seger.rds")
  # saveRDS(comb,"/Users/chixiang.chen/Library/CloudStorage/OneDrive-UniversityofMarylandSchoolofMedicine/postdoc/postdoc/deconvolution/ref_based_rd/data_InteRD/comb_seger.rds")
#results based on InteRD1
SCDC_ENSEMBLE_MAD<-readRDSFromWeb("https://github.com/chencxxy28/Data/raw/main/data_InteRD/SCDC_ENSEMBLE_MAD_seger.rds")
comb<-readRDSFromWeb("https://github.com/chencxxy28/Data/raw/main/data_InteRD/comb_seger.rds")
  list_marker<-readRDSFromWeb("https://github.com/chencxxy28/Data/raw/main/data_InteRD/list_markerbaron20.rds") #get markers selected from xin et al (2016)
  lambda_option<-c(0,0.01,0.05,0.1,1,5,100)
  cell_type_unique<-c("alpha","beta","delta","gamma")
  InteRD1.output<-InteRD1(bulk.data =pseudo.seger$pseudo_eset,list_marker,cell_type_unique,comb_used=comb,lambda_option)
  InteRD1<-InteRD.predict.prop(InteRD.output=InteRD1.output)

We provide the function evaluate to assess the performance of estimated proportions versus the true proportions based on mean absolute deviance (MAD, the smaller the better), Kendall correlation coefficient (Ken, the larger the better), and Pearson correlation coefficient (Cor, the larger the better).

evaluate(SCDC_ENSEMBLE_MAD,pseudo.seger$true_p)$all.eva
##     all.mad   all.ken   all.cor
## 1 0.1069572 0.7358974 0.8939433
evaluate(InteRD1,pseudo.seger$true_p)$all.eva
##      all.mad     all.ken    all.cor
## 1 0.04851359 0.004487179 0.02931761

InteRD2 algorithm is the second step of InteRD to further integrate prior biological information into the deconvolution in a robust manner. The prior information is the population-level mean cell-type proportions and their corresponding standard deviations across samples. In this tutorial, we obtain this information from scRNA-seq data from Segerstolpe et al. (2016).

ave_est = pop.ct.prop.scRNA(scRNA=seger[["sc.eset.qc"]],cell_type_unique=cell_type_unique)$pop.ct.prop
ave_sd = pop.ct.prop.scRNA(scRNA=seger[["sc.eset.qc"]],cell_type_unique=cell_type_unique)$pop.ct.sd
lambda_option<-c(0,seq(from=1,to=20,length=4),seq(from=30,to=100,length=4),200,500,1000000^2)
InteRD2.output<-InteRD2(bulk.data=pseudo.seger$pseudo_eset,list_marker,cell_type_unique,comb_sampled=InteRD1,ave_est,ave_sd,lambda_option=lambda_option)
InteRD2<-InteRD.predict.prop(InteRD.output=InteRD2.output)

Note that Reference-free approach and TOAST are special cases of InteRD2. The TOAST can be run based on the function InteRD2 by specifying lambda_option=0, and Reference-free approach can be run by the function Ref_free. The following is the demo for reference-free approach.

ref_free.output<-Ref_free(bulk.data=pseudo.seger$pseudo_eset,list_marker=list_marker,cell_type_unique=cell_type_unique)
## [1] 1
reffree<-InteRD.predict.prop(InteRD.output=ref_free.output)

Based on the evaluations, we can tell that InteRD2 is better than InteRD1, the InteRD estimates are all better than the existing method.

evaluate(SCDC_ENSEMBLE_MAD,pseudo.seger$true_p)$all.eva
##     all.mad   all.ken   all.cor
## 1 0.1069572 0.7358974 0.8939433
evaluate(reffree,pseudo.seger$true_p)$all.eva
##      all.mad   all.ken   all.cor
## 1 0.03773585 0.7487179 0.9264559
evaluate(InteRD1,pseudo.seger$true_p)$all.eva
##      all.mad     all.ken    all.cor
## 1 0.04851359 0.004487179 0.02931761
evaluate(InteRD2,pseudo.seger$true_p)$all.eva
##      all.mad   all.ken   all.cor
## 1 0.01548666 0.7583333 0.9313362

Implementation (Case study 2)

Now let us consider another pseudo data generated from Baron et al. (2016).

set.seed(1234567)
pseudo.baron<-generateBulk(baron[["sc.eset.qc"]], ct.varname = "cluster", sample = "sample", ct.sub = c("alpha","beta","delta","gamma"), nbulk = 40, low_s = 0.3, upp_s = 0.7)
truep<-pseudo.baron$true_p[complete.cases(pseudo.baron$true_p),]

Then we apply SCDC to obtain the estimates based on InteRD and SCDC-ENSEMBLE, where we consider two single cell data, one from Segerstolpe et al. (2016), and the other from Xin et al. (2016). The estimates can be obtained by running the following code

set.seed(1234567)
##ensemble of multiple reference sets 
#resuts based on SCDC
  pancreas.sc <- list(seger = seger$sc.eset.qc,
                      xin   = xin
  )
  SCDC_results<-SCDC_ENSEMBLE(bulk.eset = pseudo.baron$pseudo_eset, sc.eset.list = pancreas.sc, ct.varname = "cluster",
                                sample = "sample", weight.basis =TRUE,truep = truep, ct.sub =  c("alpha","beta","delta","gamma"), search.length = 0.02,grid.search=TRUE)
  comb<-SCDC_results$prop.only
  weight_matrix<-SCDC_results$w_table["mAD_Y",1:2]
  SCDC_ENSEMBLE_MAD<-SCDC:::wt_prop(weight_matrix,comb)
  
  # saveRDS(SCDC_ENSEMBLE_MAD,"/Users/chixiang.chen/Library/CloudStorage/OneDrive-UniversityofMarylandSchoolofMedicine/postdoc/postdoc/deconvolution/ref_based_rd/data_InteRD/SCDC_ENSEMBLE_MAD_baron.rds")
    # saveRDS(comb,"/Users/chixiang.chen/Library/CloudStorage/OneDrive-UniversityofMarylandSchoolofMedicine/postdoc/postdoc/deconvolution/ref_based_rd/data_InteRD/comb_baron.rds")
#results based on InteRD1
SCDC_ENSEMBLE_MAD<-readRDSFromWeb("https://github.com/chencxxy28/Data/raw/main/data_InteRD/SCDC_ENSEMBLE_MAD_baron.rds")
comb<-readRDSFromWeb("https://github.com/chencxxy28/Data/raw/main/data_InteRD/comb_baron.rds")
  list_marker<-readRDSFromWeb("https://github.com/chencxxy28/Data/raw/main/data_InteRD/list_markerbaron20.rds") #get markers selected from xin et al (2016)
  lambda_option<-c(0,0.01,0.05,0.1,1,5,100)
  cell_type_unique<-c("alpha","beta","delta","gamma")
  InteRD1.output<-InteRD1(bulk.data =pseudo.baron$pseudo_eset,list_marker,cell_type_unique,comb_used=comb,lambda_option)
  InteRD1<-InteRD.predict.prop(InteRD.output=InteRD1.output)

After obtaining InteRD1, we can obtain InteRD2 by integrating prior biological information into the deconvolution in a robust manner. The prior information is the population-level mean cell-type proportions and their corresponding standard deviations across samples that are obtained from scRNA-seq data from Baron et al. (2016).

ave_est = pop.ct.prop.scRNA(scRNA=baron[["sc.eset.qc"]],cell_type_unique=cell_type_unique)$pop.ct.prop
ave_sd = pop.ct.prop.scRNA(scRNA=baron[["sc.eset.qc"]],cell_type_unique=cell_type_unique)$pop.ct.sd
lambda_option<-c(0,seq(from=1,to=20,length=4),seq(from=30,to=100,length=4),200,500,1000000^2)
InteRD2.output<-InteRD2(bulk.data=pseudo.baron$pseudo_eset,list_marker,cell_type_unique,comb_sampled=InteRD1,ave_est,ave_sd,lambda_option=lambda_option)
InteRD2<-InteRD.predict.prop(InteRD.output=InteRD2.output)

Based on the evaluations, we can tell that InteRD-based estimates (both InteRD1 and InteRD2) are better than the existing method.

evaluate(SCDC_ENSEMBLE_MAD,pseudo.baron$true_p)$all.eva
evaluate(InteRD1,pseudo.baron$true_p)$all.eva
evaluate(InteRD2,pseudo.baron$true_p)$all.eva