教程第二十六章:https://osca.bioconductor.org/unfiltered-human-pbmcs-10x-genomics.html
一、下载示例数据,并构建sce对象
library(BiocFileCache)
bfc <- BiocFileCache("raw_data", ask = FALSE)
raw.path <- bfcrpath(bfc, file.path("http://cf.10xgenomics.com/samples",
"cell-exp/2.1.0/pbmc4k/pbmc4k_raw_gene_bc_matrices.tar.gz"))
untar(raw.path, exdir=file.path(tempdir(), "pbmc4k"))
#三个文件的打包
fname <- file.path(tempdir(), "pbmc4k/raw_gene_bc_matrices/GRCh38")
library(DropletUtils)
#Creates a SingleCellExperiment from the CellRanger output directories for 10X Genomics data.
sce.pbmc <- read10xCounts(fname, col.names=TRUE)
sce.pbmc
counts(sce.pbmc)[1:4,1:4]
dim(sce.pbmc)
根据对sce的初步观察,可以初步了解到以下信息
- 1、共有73w+个cell,可判断是含有非常多的empty droplets,需要过滤;
- 2、assay slot的rownames为ensemble ID,一般后续分析使用symbol ID,需要转换;
- 3、rowData slot还提供有对应 symbol ID。但需要注意不同ensemble ID对应同一个symbol ID的情况。而一般sce后续分析行名不能重复。

二、更换行名 ensemble→symbol
- 为了避免symbol名有重复的情况,不是直接替换,而是使用
uniquifyFeatureNames()
函数
library(scater)
?uniquifyFeatureNames
length(rowData(sce.pbmc)$Symbol)
length(unique(rowData(sce.pbmc)$Symbol))
#可以看到有几十个symbol ID是一样的(ensemble ID 都已独一无二的)为了避免symbol ID作为行名重复
rownames(sce.pbmc) <- uniquifyFeatureNames(
rowData(sce.pbmc)$ID, rowData(sce.pbmc)$Symbol)
grep("ENSG",rownames(sce.pbmc), value = T)
#通过下图结果就可以知道uniquifyFeatureNames的作用

三、过滤empty droplet
set.seed(100)
e.out <- emptyDrops(counts(sce.pbmc))
e.out
summary(e.out$FDR)
dim(sce.pbmc) #737280 cells
sce.pbmc <- sce.pbmc[,which(e.out$FDR <= 0.001)]
#过滤empty drop
unfiltered <- sce.pbmc
dim(unfiltered) # 仅剩4233 cells
四、质控--线粒体基因高表达cell
a relaxed QC strategy
1、基因与染色体关系
library(EnsDb.Hsapiens.v86)
#根据ensemble ID 标记基因所在染色体。目的是找出线粒体基因
location <- mapIds(EnsDb.Hsapiens.v86, keys=rowData(sce.pbmc)$ID,
column="SEQNAME", keytype="GENEID")
class(location)
table(location) #有13个染色体基因
2、质控isOutlier()
stats <- perCellQCMetrics(sce.pbmc, subsets=list(Mito=which(location=="MT")))
head(stats)
# a relaxed QC strategy: 仅过滤异常含量线粒体基因指标
high.mito <- isOutlier(stats$subsets_Mito_percent, type="higher")
table(high.mito) #311个细胞被过滤
#high.mito
#FALSE TRUE
# 3922 311
sce.pbmc <- sce.pbmc[,!high.mito]
3、可视化
colData(unfiltered) <- cbind(colData(unfiltered), stats)
unfiltered$discard <- high.mito
gridExtra::grid.arrange(
plotColData(unfiltered, y="sum", colour_by="discard") +
scale_y_log10() + ggtitle("Total count"),
plotColData(unfiltered, y="detected", colour_by="discard") +
scale_y_log10() + ggtitle("Detected features"),
plotColData(unfiltered, y="subsets_Mito_percent",
colour_by="discard") + ggtitle("Mito percent"),
ncol=2
)

plotColData(unfiltered, x="sum", y="subsets_Mito_percent",
colour_by="discard") + scale_x_log10()

五、挑选高变基因
library(scran)
set.seed(1000)
#先进行标准化
sce.pbmc <- logNormCounts(sce.pbmc)
set.seed(1001)
dec.pbmc <- modelGeneVarByPoisson(sce.pbmc)
top.pbmc <- getTopHVGs(dec.pbmc, prop=0.1)
plot(dec.pbmc$mean, dec.pbmc$total, pch=16, cex=0.5,
xlab="Mean of log-expression", ylab="Variance of log-expression")
curfit <- metadata(dec.pbmc)
curve(curfit$trend(x), col='dodgerblue', add=TRUE, lwd=2)

六、降维聚类
- 降维
set.seed(10000)
#考虑到technical noise
sce.pbmc <- denoisePCA(sce.pbmc, subset.row=top.pbmc, technical=dec.pbmc)
set.seed(100000)
sce.pbmc <- runTSNE(sce.pbmc, dimred="PCA")
set.seed(1000000)
sce.pbmc <- runUMAP(sce.pbmc, dimred="PCA")
sce.pbmc
ncol(reducedDim(sce.pbmc, "PCA"))
- 聚类
g <- buildSNNGraph(sce.pbmc, k=10, use.dimred = 'PCA')
clust <- igraph::cluster_walktrap(g)$membership
sce.pbmc$clust <- factor(clust)
table(sce.pbmc$clust)
# 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
#199 517 412 373 126 279 462 938 45 47 152 42 24 156 81 54 15
plotTSNE(sce.pbmc, colour_by="clust")

- 每个clust的marker gene
markers <- findMarkers(sce.pbmc, pval.type="some",
direction="up", groups=sce.pbmc$clust)
marker.set <- markers[["7"]]
as.data.frame(marker.set[1:30,1:2])
#挑选clust7相比较其它clust较为显著的四个基因可视化
plotExpression(sce.pbmc, features=c("CD14", "CD68",
"MNDA", "FCGR3A"), x="clust", colour_by="clust")

网友评论