Example data set

First, load the SingleCellExperiment object of 10X Genomics 2,700 peripheral blood mononuclear cells (PBMC) preprocessed in a previous vignette.

library(SingleCellExperiment)
sce <- readRDS("pbmc3k_tutorial.sce.rds")
sce

## class: SingleCellExperiment 
## dim: 32738 2638 
## metadata(0):
## assays(1): counts
## rownames(32738): ENSG00000243485 ENSG00000237613 ...
##   ENSG00000215616 ENSG00000215611
## rowData names(3): ENSEMBL_ID Symbol_TENx Symbol
## colnames: NULL
## colData names(12): Sample Barcode ... Date_published seurat.ident
## reducedDimNames(2): PCA TSNE
## spikeNames(0):

Run paired t-tests to identify markers

First, we normalize and log-transform the count data.

library(scran)
library(scater)
sce <- computeSumFactors(sce)
sce <- normalize(sce)
sce

## class: SingleCellExperiment 
## dim: 32738 2638 
## metadata(1): log.exprs.offset
## assays(2): counts logcounts
## rownames(32738): ENSG00000243485 ENSG00000237613 ...
##   ENSG00000215616 ENSG00000215611
## rowData names(3): ENSEMBL_ID Symbol_TENx Symbol
## colnames: NULL
## colData names(12): Sample Barcode ... Date_published seurat.ident
## reducedDimNames(2): PCA TSNE
## spikeNames(0):

In this example, we look for positive markers. Namely, up-regulated genes in each cluster relative to all other clusters.

library(scran)
out <- findMarkers(sce, clusters=sce$seurat.ident, direction="up", lfc=1)
out

## DataFrameList of length 8
## names(8): 0 1 2 3 4 5 6 7

The result out is a list of DataFrame. For each cluster, the log fold-change against every other cluster is reported, along with a combined p-value and FDR.

colnames(out[[1]])

##  [1] "Top"     "p.value" "FDR"     "logFC.1" "logFC.2" "logFC.3" "logFC.4"
##  [8] "logFC.5" "logFC.6" "logFC.7"

Obtain gene signatures for relevant cell types

Load xCellData

This package contains published gene signatures for a number of cell types.

library(xCellData)
library(unisets)
xcell <- xCellData()
xcell

## Sets with 20803 relations between 5079 elements and 489 sets
##             element          set
##         <character>  <character>
##     [1]        C1QA   aDC_HPCA_1
##     [2]        C1QB   aDC_HPCA_1
##     [3]       CCL13   aDC_HPCA_1
##     [4]       CCL17   aDC_HPCA_1
##     [5]       CCL19   aDC_HPCA_1
##     ...         ...          ...
## [20799]       IL2RA Tregs_HPCA_3
## [20800]       KCNA2 Tregs_HPCA_3
## [20801]       LAIR2 Tregs_HPCA_3
## [20802]      MCF2L2 Tregs_HPCA_3
## [20803]        RGS1 Tregs_HPCA_3
## -----------
## elementInfo: IdVector with 0 metadata
##     setInfo: IdVector with 1 metadata (source)

Prepare annotations for the analysis

In this vignette, we are about to use the clusterProfiler package to compute the enrichment of published cell type signatures among sets of genes up-regulated in each cluster. To do so, we need to match the ensembl gene identifiers used in the PBMC data set to the gene symbols used in the published cell type gene signatures.

First, we use the org.Hs.eg.db to convert the xCellData published signatures from gene symbols to ensembl gene identifiers.

library(org.Hs.eg.db)
TERM2GENE <- as.data.frame(xcell)[, c("set", "element")]
TERM2GENE$element <- mapIds(org.Hs.eg.db, TERM2GENE$element, "ENSEMBL", "SYMBOL")

The clusterProfiler also requires a table mapping term identifiers to their name. In this example, each signature identifier is already a descriptive name, so we provide empty strings.

TERM2NAME <- data.frame(
    id = ids(setInfo(xcell)),
    name = rep("", nSets(xcell))
)

Run the analysis

In this example, we apply Gene Set Enrichment Analysis (GSEA) on the lists of genes ranked by decreasing detection average log-fold change between each cluster and all other clusters. Below, we display the 2 most signicantly enriched cell type gene signature for each cluster. Note that for some clusters, none of the signatures reached signifcance.

library(clusterProfiler)
runGSEA <- function(clusterName, threshold=0, top=10) {
    x <- out[[clusterName]]
    x_lfc_colnames <- grep("^logFC\\.", colnames(x))
    x_rowmeans <- rowMeans(as.matrix(x[, x_lfc_colnames]))
    x_rowmeans <- sort(x_rowmeans, decreasing = TRUE)
    x_rowmeans <- x_rowmeans[x_rowmeans > threshold]
    y = GSEA(x_rowmeans, TERM2GENE=TERM2GENE, TERM2NAME=TERM2NAME, pvalueCutoff=1, verbose = FALSE)
    y <- as.data.frame(y)
    y <- head(y, top)
    y <- cbind(cluster = clusterName, y)
    y
}
resList <- lapply(levels(sce$seurat.ident), runGSEA, threshold=0, top=2)
res <- do.call(rbind, resList)
res <- as.data.frame(res)
show_columns <- setdiff(colnames(res), "core_enrichment")
kable(res[, show_columns])

	cluster	ID	setSize	enrichmentScore	NES	pvalue	p.adjust	qvalues	rank	leading_edge
CD4+ naive T-cells_IRIS_3	0	CD4+ naive T-cells_IRIS_3	35	0.9243340	1.330569	0.0019980	0.2171000	0.1970526	219	tags=37%, list=3%, signal=36%
CD4+ memory T-cells_FANTOM_1	0	CD4+ memory T-cells_FANTOM_1	113	0.8580016	1.209374	0.0039960	0.2171000	0.1970526	552	tags=41%, list=8%, signal=38%
Monocytes_FANTOM_2	1	Monocytes_FANTOM_2	146	0.8235345	1.193806	0.0039960	0.1468531	0.1346022	700	tags=50%, list=13%, signal=45%
Smooth muscle_FANTOM_3	1	Smooth muscle_FANTOM_3	13	0.9662616	1.423483	0.0040363	0.1468531	0.1346022	41	tags=23%, list=1%, signal=23%
B-cells_NOVERSHTERN_1	2	B-cells_NOVERSHTERN_1	46	0.9022696	1.393815	0.0009990	0.0253080	0.0194542	361	tags=63%, list=7%, signal=59%
B-cells_NOVERSHTERN_2	2	B-cells_NOVERSHTERN_2	64	0.8886044	1.368590	0.0009990	0.0253080	0.0194542	459	tags=53%, list=8%, signal=49%
CD4+ T-cells_HPCA_2	3	CD4+ T-cells_HPCA_2	28	0.8848310	1.530851	0.0009990	0.0134403	0.0093614	475	tags=57%, list=7%, signal=53%
CD8+ T-cells_HPCA_3	3	CD8+ T-cells_HPCA_3	22	0.9175060	1.581629	0.0009990	0.0134403	0.0093614	373	tags=68%, list=6%, signal=65%
aDC_IRIS_2	4	aDC_IRIS_2	96	0.7844809	1.262957	0.0009990	0.0293040	0.0261142	819	tags=34%, list=13%, signal=30%
Macrophages_BLUEPRINT_2	4	Macrophages_BLUEPRINT_2	71	0.7999167	1.285911	0.0009990	0.0293040	0.0261142	1056	tags=62%, list=16%, signal=52%
CD8+ Tem_BLUEPRINT_2	5	CD8+ Tem_BLUEPRINT_2	106	0.8379398	1.564130	0.0009990	0.0120036	0.0092408	444	tags=31%, list=6%, signal=30%
CD8+ Tem_BLUEPRINT_3	5	CD8+ Tem_BLUEPRINT_3	49	0.9179788	1.696262	0.0009990	0.0120036	0.0092408	349	tags=53%, list=5%, signal=51%
Melanocytes_FANTOM_3	6	Melanocytes_FANTOM_3	74	0.7773676	1.494800	0.0009990	0.0770186	0.0675602	277	tags=19%, list=5%, signal=18%
Monocytes_FANTOM_3	6	Monocytes_FANTOM_3	110	0.7108240	1.376296	0.0019980	0.0770186	0.0675602	801	tags=34%, list=13%, signal=30%
Platelets_HPCA_2	7	Platelets_HPCA_2	74	0.7431587	1.358691	0.0009990	0.0405812	0.0395528	341	tags=46%, list=15%, signal=40%
Astrocytes_FANTOM_1	7	Astrocytes_FANTOM_1	18	0.8523369	1.493779	0.0010020	0.0405812	0.0395528	98	tags=39%, list=4%, signal=38%

Enrichment of Gene Signatures in Positive Markers

Kevin Rue-Albrecht

3 July 2019