Example data set

First, load the SingleCellExperiment object of 10X Genomics 2,700 peripheral blood mononuclear cells (PBMC) preprocessed in a previous vignette.

library(SingleCellExperiment)
sce <- readRDS("pbmc3k_tutorial.sce.rds")
sce
## class: SingleCellExperiment 
## dim: 32738 2638 
## metadata(0):
## assays(1): counts
## rownames(32738): ENSG00000243485 ENSG00000237613 ...
##   ENSG00000215616 ENSG00000215611
## rowData names(3): ENSEMBL_ID Symbol_TENx Symbol
## colnames: NULL
## colData names(12): Sample Barcode ... Date_published seurat.ident
## reducedDimNames(2): PCA TSNE
## spikeNames(0):

Run paired t-tests to identify markers

First, we normalize and log-transform the count data.

library(scran)
library(scater)
sce <- computeSumFactors(sce)
sce <- normalize(sce)
sce
## class: SingleCellExperiment 
## dim: 32738 2638 
## metadata(1): log.exprs.offset
## assays(2): counts logcounts
## rownames(32738): ENSG00000243485 ENSG00000237613 ...
##   ENSG00000215616 ENSG00000215611
## rowData names(3): ENSEMBL_ID Symbol_TENx Symbol
## colnames: NULL
## colData names(12): Sample Barcode ... Date_published seurat.ident
## reducedDimNames(2): PCA TSNE
## spikeNames(0):

In this example, we look for positive markers. Namely, up-regulated genes in each cluster relative to all other clusters.

library(scran)
out <- findMarkers(sce, clusters=sce$seurat.ident, direction="up", lfc=1)
out
## DataFrameList of length 8
## names(8): 0 1 2 3 4 5 6 7

The result out is a list of DataFrame. For each cluster, the log fold-change against every other cluster is reported, along with a combined p-value and FDR.

colnames(out[[1]])
##  [1] "Top"     "p.value" "FDR"     "logFC.1" "logFC.2" "logFC.3" "logFC.4"
##  [8] "logFC.5" "logFC.6" "logFC.7"

Obtain gene signatures for relevant cell types

Load xCellData

This package contains published gene signatures for a number of cell types.

library(xCellData)
library(unisets)
xcell <- xCellData()
xcell
## Sets with 20803 relations between 5079 elements and 489 sets
##             element          set
##         <character>  <character>
##     [1]        C1QA   aDC_HPCA_1
##     [2]        C1QB   aDC_HPCA_1
##     [3]       CCL13   aDC_HPCA_1
##     [4]       CCL17   aDC_HPCA_1
##     [5]       CCL19   aDC_HPCA_1
##     ...         ...          ...
## [20799]       IL2RA Tregs_HPCA_3
## [20800]       KCNA2 Tregs_HPCA_3
## [20801]       LAIR2 Tregs_HPCA_3
## [20802]      MCF2L2 Tregs_HPCA_3
## [20803]        RGS1 Tregs_HPCA_3
## -----------
## elementInfo: IdVector with 0 metadata
##     setInfo: IdVector with 1 metadata (source)

Prepare annotations for the analysis

In this vignette, we are about to use the clusterProfiler package to compute the enrichment of published cell type signatures among sets of genes up-regulated in each cluster. To do so, we need to match the ensembl gene identifiers used in the PBMC data set to the gene symbols used in the published cell type gene signatures.

First, we use the org.Hs.eg.db to convert the xCellData published signatures from gene symbols to ensembl gene identifiers.

library(org.Hs.eg.db)
TERM2GENE <- as.data.frame(xcell)[, c("set", "element")]
TERM2GENE$element <- mapIds(org.Hs.eg.db, TERM2GENE$element, "ENSEMBL", "SYMBOL")

The clusterProfiler also requires a table mapping term identifiers to their name. In this example, each signature identifier is already a descriptive name, so we provide empty strings.

TERM2NAME <- data.frame(
    id = ids(setInfo(xcell)),
    name = rep("", nSets(xcell))
)

Run the analysis

In this example, we apply Gene Set Enrichment Analysis (GSEA) on the lists of genes ranked by decreasing detection average log-fold change between each cluster and all other clusters. Below, we display the 2 most signicantly enriched cell type gene signature for each cluster. Note that for some clusters, none of the signatures reached signifcance.

library(clusterProfiler)
runGSEA <- function(clusterName, threshold=0, top=10) {
    x <- out[[clusterName]]
    x_lfc_colnames <- grep("^logFC\\.", colnames(x))
    x_rowmeans <- rowMeans(as.matrix(x[, x_lfc_colnames]))
    x_rowmeans <- sort(x_rowmeans, decreasing = TRUE)
    x_rowmeans <- x_rowmeans[x_rowmeans > threshold]
    y = GSEA(x_rowmeans, TERM2GENE=TERM2GENE, TERM2NAME=TERM2NAME, pvalueCutoff=1, verbose = FALSE)
    y <- as.data.frame(y)
    y <- head(y, top)
    y <- cbind(cluster = clusterName, y)
    y
}
resList <- lapply(levels(sce$seurat.ident), runGSEA, threshold=0, top=2)
res <- do.call(rbind, resList)
res <- as.data.frame(res)
show_columns <- setdiff(colnames(res), "core_enrichment")
kable(res[, show_columns])
cluster ID Description setSize enrichmentScore NES pvalue p.adjust qvalues rank leading_edge
CD4+ naive T-cells_IRIS_3 0 CD4+ naive T-cells_IRIS_3 35 0.9243340 1.330569 0.0019980 0.2171000 0.1970526 219 tags=37%, list=3%, signal=36%
CD4+ memory T-cells_FANTOM_1 0 CD4+ memory T-cells_FANTOM_1 113 0.8580016 1.209374 0.0039960 0.2171000 0.1970526 552 tags=41%, list=8%, signal=38%
Monocytes_FANTOM_2 1 Monocytes_FANTOM_2 146 0.8235345 1.193806 0.0039960 0.1468531 0.1346022 700 tags=50%, list=13%, signal=45%
Smooth muscle_FANTOM_3 1 Smooth muscle_FANTOM_3 13 0.9662616 1.423483 0.0040363 0.1468531 0.1346022 41 tags=23%, list=1%, signal=23%
B-cells_NOVERSHTERN_1 2 B-cells_NOVERSHTERN_1 46 0.9022696 1.393815 0.0009990 0.0253080 0.0194542 361 tags=63%, list=7%, signal=59%
B-cells_NOVERSHTERN_2 2 B-cells_NOVERSHTERN_2 64 0.8886044 1.368590 0.0009990 0.0253080 0.0194542 459 tags=53%, list=8%, signal=49%
CD4+ T-cells_HPCA_2 3 CD4+ T-cells_HPCA_2 28 0.8848310 1.530851 0.0009990 0.0134403 0.0093614 475 tags=57%, list=7%, signal=53%
CD8+ T-cells_HPCA_3 3 CD8+ T-cells_HPCA_3 22 0.9175060 1.581629 0.0009990 0.0134403 0.0093614 373 tags=68%, list=6%, signal=65%
aDC_IRIS_2 4 aDC_IRIS_2 96 0.7844809 1.262957 0.0009990 0.0293040 0.0261142 819 tags=34%, list=13%, signal=30%
Macrophages_BLUEPRINT_2 4 Macrophages_BLUEPRINT_2 71 0.7999167 1.285911 0.0009990 0.0293040 0.0261142 1056 tags=62%, list=16%, signal=52%
CD8+ Tem_BLUEPRINT_2 5 CD8+ Tem_BLUEPRINT_2 106 0.8379398 1.564130 0.0009990 0.0120036 0.0092408 444 tags=31%, list=6%, signal=30%
CD8+ Tem_BLUEPRINT_3 5 CD8+ Tem_BLUEPRINT_3 49 0.9179788 1.696262 0.0009990 0.0120036 0.0092408 349 tags=53%, list=5%, signal=51%
Melanocytes_FANTOM_3 6 Melanocytes_FANTOM_3 74 0.7773676 1.494800 0.0009990 0.0770186 0.0675602 277 tags=19%, list=5%, signal=18%
Monocytes_FANTOM_3 6 Monocytes_FANTOM_3 110 0.7108240 1.376296 0.0019980 0.0770186 0.0675602 801 tags=34%, list=13%, signal=30%
Platelets_HPCA_2 7 Platelets_HPCA_2 74 0.7431587 1.358691 0.0009990 0.0405812 0.0395528 341 tags=46%, list=15%, signal=40%
Astrocytes_FANTOM_1 7 Astrocytes_FANTOM_1 18 0.8523369 1.493779 0.0010020 0.0405812 0.0395528 98 tags=39%, list=4%, signal=38%