In this example, we use count data for 2,700 peripheral blood mononuclear cells (PBMC) obtained using the 10X Genomics platform, and process it following the Guided Clustering Tutorial of the Seurat package.
First, fetch the data as a SingleCellExperiment object using the TENxPBMCData package. The first time that the following code chunk is run, users should expect it to take additional time as it downloads data from the web and caches it on their local machine; subsequent evaluations of the same code chunk should only take a few seconds as the data set is then loaded from the local cache.
library(TENxPBMCData)
tenx_pbmc3k <- TENxPBMCData(dataset="pbmc3k")
colnames(tenx_pbmc3k) <- paste0("Cell", seq_len(ncol(tenx_pbmc3k)))
tenx_pbmc3k
## class: SingleCellExperiment
## dim: 32738 2700
## metadata(0):
## assays(1): counts
## rownames(32738): ENSG00000243485 ENSG00000237613 ...
## ENSG00000215616 ENSG00000215611
## rowData names(3): ENSEMBL_ID Symbol_TENx Symbol
## colnames(2700): Cell1 Cell2 ... Cell2699 Cell2700
## colData names(11): Sample Barcode ... Individual Date_published
## reducedDimNames(0):
## spikeNames(0):
Next, prepare a sparse matrix that emulates the first section of the Guided Clustering Tutorial.
library(Matrix)
pbmc.data <- as(counts(tenx_pbmc3k), "Matrix")
pbmc.data <- as(pbmc.data, "dgTMatrix")
From here on, follow the Guided Clustering Tutorial to the letter (code obtained on 2018-11-24).
library(Seurat)
library(dplyr)
# Initialize the Seurat object with the raw (non-normalized data). Keep all
# genes expressed in >= 3 cells (~0.1% of the data). Keep all cells with at
# least 200 detected genes
pbmc <- CreateSeuratObject(raw.data=pbmc.data, min.cells=3, min.genes=200, project="10X_PBMC")
pbmc
## An object of class seurat in project 10X_PBMC
## 13714 genes across 2700 samples.
# The number of genes and UMIs (nGene and nUMI) are automatically calculated
# for every object by Seurat. For non-UMI data, nUMI represents the sum of
# the non-normalized values within a cell We calculate the percentage of
# mitochondrial genes here and store it in percent.mito using AddMetaData.
# We use object@raw.data since this represents non-transformed and
# non-log-normalized counts The % of UMI mapping to MT-genes is a common
# scRNA-seq QC metric.
mito.genes <- subset(rowData(tenx_pbmc3k), grepl("^MT-", Symbol_TENx), "ENSEMBL_ID", drop=TRUE)
mito.genes <- intersect(mito.genes, rownames(pbmc@raw.data))
percent.mito <- Matrix::colSums(pbmc@raw.data[mito.genes, ])/Matrix::colSums(pbmc@raw.data)
# AddMetaData adds columns to object@meta.data, and is a great place to
# stash QC stats
pbmc <- AddMetaData(object=pbmc, metadata=percent.mito, col.name="percent.mito")
VlnPlot(object=pbmc, features.plot=c("nGene", "nUMI", "percent.mito"), nCol=3)
# GenePlot is typically used to visualize gene-gene relationships, but can
# be used for anything calculated by the object, i.e. columns in
# object@meta.data, PC scores etc. Since there is a rare subset of cells
# with an outlier level of high mitochondrial percentage and also low UMI
# content, we filter these as well
par(mfrow=c(1, 2))
GenePlot(object=pbmc, gene1="nUMI", gene2="percent.mito")
GenePlot(object=pbmc, gene1="nUMI", gene2="nGene")
# We filter out cells that have unique gene counts over 2,500 or less than
# 200 Note that low.thresholds and high.thresholds are used to define a
# 'gate'. -Inf and Inf should be used if you don't want a lower or upper
# threshold.
pbmc <- FilterCells(object=pbmc, subset.names=c("nGene", "percent.mito"),
low.thresholds=c(200, -Inf), high.thresholds=c(2500, 0.05))
pbmc <- NormalizeData(object=pbmc, normalization.method="LogNormalize",
scale.factor=10000)
pbmc <- FindVariableGenes(object=pbmc, mean.function=ExpMean, dispersion.function=LogVMR,
x.low.cutoff=0.0125, x.high.cutoff=3, y.cutoff=0.5)
length(x=pbmc@var.genes)
## [1] 1838
pbmc <- ScaleData(object=pbmc, vars.to.regress=c("nUMI", "percent.mito"))
## Regressing out: nUMI, percent.mito
##
## Time Elapsed: 10.7996120452881 secs
## Scaling data matrix
pbmc <- RunPCA(object=pbmc, pc.genes=pbmc@var.genes, do.print=TRUE, pcs.print=1:5,
genes.print=5)
## [1] "PC1"
## [1] "ENSG00000101439" "ENSG00000011600" "ENSG00000085265" "ENSG00000204482"
## [5] "ENSG00000204472"
## [1] ""
## [1] "ENSG00000213402" "ENSG00000008517" "ENSG00000227507" "ENSG00000116824"
## [5] "ENSG00000172543"
## [1] ""
## [1] ""
## [1] "PC2"
## [1] "ENSG00000105374" "ENSG00000100453" "ENSG00000180644" "ENSG00000077984"
## [5] "ENSG00000145649"
## [1] ""
## [1] "ENSG00000105369" "ENSG00000156738" "ENSG00000196735" "ENSG00000100721"
## [5] "ENSG00000179344"
## [1] ""
## [1] ""
## [1] "PC3"
## [1] "ENSG00000163737" "ENSG00000163736" "ENSG00000168497" "ENSG00000113140"
## [5] "ENSG00000127920"
## [1] ""
## [1] "ENSG00000051523" "ENSG00000231389" "ENSG00000223865" "ENSG00000196126"
## [5] "ENSG00000104894"
## [1] ""
## [1] ""
## [1] "PC4"
## [1] "ENSG00000008517" "ENSG00000179144" "ENSG00000165272" "ENSG00000082074"
## [5] "ENSG00000172005"
## [1] ""
## [1] "ENSG00000105369" "ENSG00000196735" "ENSG00000007312" "ENSG00000156738"
## [5] "ENSG00000179344"
## [1] ""
## [1] ""
## [1] "PC5"
## [1] "ENSG00000179639" "ENSG00000100079" "ENSG00000110077" "ENSG00000143546"
## [5] "ENSG00000132514"
## [1] ""
## [1] "ENSG00000203747" "ENSG00000261222" "ENSG00000142089" "ENSG00000108798"
## [5] "ENSG00000172216"
## [1] ""
## [1] ""
# Examine and visualize PCA results a few different ways
PrintPCA(object=pbmc, pcs.print=1:5, genes.print=5, use.full=FALSE)
## [1] "PC1"
## [1] "ENSG00000101439" "ENSG00000011600" "ENSG00000085265" "ENSG00000204482"
## [5] "ENSG00000204472"
## [1] ""
## [1] "ENSG00000213402" "ENSG00000008517" "ENSG00000227507" "ENSG00000116824"
## [5] "ENSG00000172543"
## [1] ""
## [1] ""
## [1] "PC2"
## [1] "ENSG00000105374" "ENSG00000100453" "ENSG00000180644" "ENSG00000077984"
## [5] "ENSG00000145649"
## [1] ""
## [1] "ENSG00000105369" "ENSG00000156738" "ENSG00000196735" "ENSG00000100721"
## [5] "ENSG00000179344"
## [1] ""
## [1] ""
## [1] "PC3"
## [1] "ENSG00000163737" "ENSG00000163736" "ENSG00000168497" "ENSG00000113140"
## [5] "ENSG00000127920"
## [1] ""
## [1] "ENSG00000051523" "ENSG00000231389" "ENSG00000223865" "ENSG00000196126"
## [5] "ENSG00000104894"
## [1] ""
## [1] ""
## [1] "PC4"
## [1] "ENSG00000008517" "ENSG00000179144" "ENSG00000165272" "ENSG00000082074"
## [5] "ENSG00000172005"
## [1] ""
## [1] "ENSG00000105369" "ENSG00000196735" "ENSG00000007312" "ENSG00000156738"
## [5] "ENSG00000179344"
## [1] ""
## [1] ""
## [1] "PC5"
## [1] "ENSG00000179639" "ENSG00000100079" "ENSG00000110077" "ENSG00000143546"
## [5] "ENSG00000132514"
## [1] ""
## [1] "ENSG00000203747" "ENSG00000261222" "ENSG00000142089" "ENSG00000108798"
## [5] "ENSG00000172216"
## [1] ""
## [1] ""
VizPCA(object=pbmc, pcs.use=1:2)
PCAPlot(object=pbmc, dim.1=1, dim.2=2)
# ProjectPCA scores each gene in the dataset (including genes not included
# in the PCA) based on their correlation with the calculated components.
# Though we don't use this further here, it can be used to identify markers
# that are strongly correlated with cellular heterogeneity, but may not have
# passed through variable gene selection. The results of the projected PCA
# can be explored by setting use.full=T in the functions above
pbmc <- ProjectPCA(object=pbmc, do.print=FALSE)
PCHeatmap(object=pbmc, pc.use=1, cells.use=500, do.balanced=TRUE, label.columns=FALSE)
## Warning in heatmap.2(data.use, Rowv = NA, Colv = NA, trace = "none", col =
## col.use, : Discrepancy: Rowv is FALSE, while dendrogram is `both'. Omitting
## row dendogram.
## Warning in heatmap.2(data.use, Rowv = NA, Colv = NA, trace = "none", col
## = col.use, : Discrepancy: Colv is FALSE, while dendrogram is `column'.
## Omitting column dendogram.
## Warning in plot.window(...): "dimTitle" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "dimTitle" is not a graphical parameter
## Warning in title(...): "dimTitle" is not a graphical parameter
PCHeatmap(object=pbmc, pc.use=1:12, cells.use=500, do.balanced=TRUE,
label.columns=FALSE, use.full=FALSE)
Small deviation from the tutorial. Skip the lengthy JackStraw computation.
# NOTE: This process can take a long time for big datasets, comment out for
# expediency. More approximate techniques such as those implemented in
# PCElbowPlot() can be used to reduce computation time
if (FALSE) {
pbmc <- JackStraw(object=pbmc, num.replicate=100, display.progress=FALSE)
JackStrawPlot(object=pbmc, PCs=1:12)
}
PCElbowPlot(object=pbmc)
# save.SNN=T saves the SNN so that the clustering algorithm can be rerun
# using the same graph but with a different resolution value (see docs for
# full details)
pbmc <- FindClusters(object=pbmc, reduction.type="pca", dims.use=1:10,
resolution=0.6, print.output=0, save.SNN=TRUE)
PrintFindClustersParams(object=pbmc)
## Parameters used in latest FindClusters calculation run on: 2019-02-28 21:22:03
## =============================================================================
## Resolution: 0.6
## -----------------------------------------------------------------------------
## Modularity Function Algorithm n.start n.iter
## 1 1 100 10
## -----------------------------------------------------------------------------
## Reduction used k.param prune.SNN
## pca 30 0.0667
## -----------------------------------------------------------------------------
## Dims used in calculation
## =============================================================================
## 1 2 3 4 5 6 7 8 9 10
pbmc <- RunTSNE(object=pbmc, dims.use=1:10, do.fast=TRUE)
# note that you can set do.label=T to help label individual clusters
TSNEPlot(object=pbmc)
Save the seurat
object
saveRDS(pbmc, file="pbmc3k_tutorial.rds")
Save the original SingleCellExperiment
object, after:
tenx_pbmc3k <- tenx_pbmc3k[, pbmc@cell.names]
colData(tenx_pbmc3k)[["seurat.ident"]] <- as.factor(pbmc@ident)
reducedDim(tenx_pbmc3k, "PCA") <- pbmc@dr$pca@cell.embeddings
reducedDim(tenx_pbmc3k, "TSNE") <- pbmc@dr$tsne@cell.embeddings
colnames(tenx_pbmc3k) <- NULL # remove the dummy cell names before exporting
saveRDS(tenx_pbmc3k, file="pbmc3k_tutorial.sce.rds")
Export the cluster identity vector. It will be included in the hancock package, and mapped using the dummy cell names defined at the start of this notebook.
ident <- pbmc@ident
names(ident) <- pbmc@cell.names
saveRDS(ident, "pbmc3k.ident.rds")