单细胞转录组基础分析

读取数据

1
2
3

library(Seurat)
library(tidyverse)
library(patchwork)

1
2
3

pbmc.data <- Read10X('filtered_gene_bc_matrices/hg19') # barcodes.tsv, genes.tsv, matrix.mtx三个文件所在文件夹的路径
pbmc <- CreateSeuratObject(counts = pbmc.data, project = 'pbmc3k', min.cells = 3, min.features = 200)
pbmc

细胞过滤

1	dir.create('QC')

1 2	# 计算线粒体的比例 pbmc[['percent.mt']] <- PercentageFeatureSet(pbmc, pattern = '^MT-') # 数据保存在 pbmc@meta.data

1
2
3

violin <- VlnPlot(pbmc, features = c('nFeature_RNA', 'nCount_RNA', 'percent.mt'), ncol = 3) + theme(axis.title.x = element_blank(), axis.text.x = element_blank(), axis.ticks.x = element_blank())
ggsave('QC/vlnplot_before_qc.pdf', plot = violin, width = 9, height = 6)
ggsave('QC/vlnplot_before_qc.png', plot = violin, width = 9, height = 6)

plot1 <- FeatureScatter(pbmc, feature1 = 'nCount_RNA', feature2 = 'nFeature_RNA')
plot2 <- FeatureScatter(pbmc, feature1 = 'nCount_RNA', feature2 = 'percent.mt')
scatter <- CombinePlots(plots = list(plot1, plot2), nrow = 1, legend = 'none')
ggsave('QC/scatterplot_before_qc.pdf', plot = scatter, width = 9, height = 6)
ggsave('QC/scatterplot_before_qc.png', plot = scatter, width = 9, height = 6)

# 根据小提琴图设置过滤标准
minGene = 200 # 一般选择200-500
maxGene = 2500
pctMT = 5

pbmc <- subset(pbmc, subset = nFeature_RNA > minGene & nFeature_RNA < maxGene & percent.mt < pctMT)

violin <- VlnPlot(pbmc, features = c('nFeature_RNA', 'nCount_RNA', 'percent.mt'), ncol = 3) + theme(axis.title.x = element_blank(), axis.text.x = element_blank(), axis.ticks.x = element_blank())
ggsave('QC/vlnplot_after_qc.pdf', plot = violin, width = 9, height = 6)
ggsave('QC/vlnplot_after_qc.png', plot = violin, width = 9, height = 6)

数据标准化

1	pbmc <- NormalizeData(pbmc, normalization.method = 'LogNormalize', scale.factor = 10000)

寻找高变基因

1	dir.create('cluster')

pbmc <- FindVariableFeatures(pbmc, selection.method = 'vst', nfeatures = 2000)
top10 <- head(VariableFeatures(pbmc), 10)

plot1 <- VariableFeaturePlot(pbmc)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
plot <- CombinePlots(plots = list(plot1, plot2), legend = 'bottom')
ggsave('cluster/VariableFeatures.pdf', plot = plot, width = 8, height = 6)
ggsave('cluster/VariableFeatures.png', width = 8, height = 6)

数据缩放

# 如果内存足够，对所有基因进行缩放
## all.genes <- rownames(pbmc)
## pbmc <- ScaleData(pbmc, features = all.genes)

# 如果内存不够，只对高变基因进行缩放
scale.genes <- VariableFeatures(pbmc)
pbmc <- ScaleData(pbmc, features = scale.genes)

PCA线性降维

pbmc <- RunPCA(pbmc, features = VariableFeatures(pbmc))

plot1 <- DimPlot(pbmc, reduction = 'pca')
plot2 <- ElbowPlot(pbmc, ndims = 20, reduction = 'pca')
plot <- plot1 + plot2
ggsave('cluster/pca.pdf', plot = plot, width = 8, height = 4)
ggsave('cluster/pca.png', plot = plot, width = 8, height = 4)

细胞聚类

pc.num = 1:10

pbmc <- FindNeighbors(pbmc, dims = pc.num)
pbmc <- FindClusters(pbmc, resolution = 0.5)

metadata <- pbmc@meta.data
cell_cluster <- data.frame(cell_ID = rownames(metadata), cluster_ID = metadata$seurat_clusters)
write.csv(cell_cluster, 'cluster/cell_cluster.csv', row.names = F)

非线性降维

# tSNE
pbmc <- RunTSNE(pbmc, dims = pc.num)
embed_tsne <- Embeddings(pbmc, 'tsne')
write.csv(embed_tsne, 'cluster/embed_tsne.csv')
plot1 <- DimPlot(pbmc, reduction = 'tsne')
ggsave('cluster/tSNE.pdf', plot = plot1, width = 8, height = 7)
ggsave('cluster/tSNE.png', plot = plot1, width = 8, height = 7)

#UMAP
pbmc <- RunUMAP(pbmc, dims = pc.num)
embed_umap <- Embeddings(pbmc, 'umap')
write.csv(embed_umap, 'cluster/embed_umap.csv')
plot2 <- DimPlot(pbmc, reduction = 'umap')
ggsave('cluster/UMAP.pdf', plot = plot2, width = 8, height = 7)
ggsave('cluster/UMAP.png', plot = plot2, width = 8, height = 7)

plot <- plot1 + plot2 + plot_layout(guides = 'collect')
ggsave('cluster/tSNE_UMAP.pdf', plot = plot, width = 10, height = 5)
ggsave('cluster/tSNE_UMAP.png', plot = plot, width = 10, height = 5)

保存数据

1	saveRDS(pbmc, file = 'pbmc_tutorial.rds')

Cluster标志基因

1	dir.create('cell_identify')

默认使用wilcox方法鉴定标志基因，可以通过test.use参数设置分析方法。MASK是专门针对单细胞数据差异分析设计的，DESeq2是传统bulkRNA数据差异分析的经典方法。如果选用 “negbinom”、”poisson”或”DESeq2”等方法，需要将slot参数设为”counts”。

wilcox.markers <- FindAllMarkers(pbmc, min.pct = 0.25, logfc.threshold = 0.25)
pbmc.markers <- wilcox.markers %>% select(gene, everything()) %>% subset(p_val < 0.05)
top10 <- pbmc.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_log2FC)

write.csv(pbmc.markers, 'cell_identify/wilcox_markers.csv', row.names = F)
write.csv(top10, 'cell_identify/top10_wilcox_markers.csv', row.names = F)

挑选标志基因可视化其在各个cluster内的分布

select_genes <- c('MS4A1', 'GNLY', 'CD3E', 'CD14', 'LYZ', 'FCGR3A')

plot1 <- VlnPlot(pbmc, features = select_genes, pt.size = 0, ncol = 2)
ggsave('cell_identify/selectgenes_VlnPlot.pdf', plot = plot1, width = 6, height = 8)
ggsave('cell_identify/selectgenes_VlnPlot.png', plot = plot1, width = 6, height = 8)

plot2 <- FeaturePlot(pbmc, features = select_genes, reduction = 'tsne', label = T, ncol = 2)
ggsave('cell_identify/selectgenes_FeaturePlot.pdf', plot = plot2, width = 8, height = 12)
ggsave('cell_identify/selectgenes_FeaturePlot.png', plot = plot2, width = 8, height = 12)

plot <- plot1 | plot2
ggsave('cell_identify/selectgenes.pdf', plot = plot, width = 10, height = 8)
ggsave('cell_identify/selectgenes.png', plot = plot, width = 10, height = 8)

鉴定细胞类型

自动注释

library(SingleR)
library(celldex)

refdata <- HumanPrimaryCellAtlasData()
testdata <- GetAssayData(pbmc, slot = 'data')
clusters <- pbmc@meta.data$seurat_clusters
cellpred <- SingleR(test = testdata, ref = refdata, labels = refdata$label.main, method = 'cluster', clusters = clusters, assay.type.test = 'logcounts', assay.type.ref = 'logcounts')
celltype <- data.frame(ClusterID = rownames(cellpred), celltype = cellpred$labels, stringsAsFactors = F)
write.csv(celltype, 'cell_identify/celltype_SingleR.csv', row.names = F)

pbmc@meta.data$celltype <- celltype[match(clusters, celltype$ClusterID), 'celltype']

plot1 <- DimPlot(pbmc, group.by = 'celltype', label = T, label.size = 5, reduction = 'tsne')
ggsave('cell_identify/SingleR_tSNE_celltype.pdf', plot = plot1, width = 7, height = 6)
ggsave('cell_identify/SingleR_tSNE_celltype.png', plot = plot1, width = 7, height = 6)

plot2 <- DimPlot(pbmc, group.by = 'celltype', label = T, label.size = 5, reduction = 'umap')
ggsave('cell_identify/SingleR_UMAP_celltype.pdf', plot = plot2, width = 7, height = 6)
ggsave('cell_identify/SingleR_UMAP_celltype.png', plot = plot2, width = 7, height = 6)

plot <- plot1 + plot2 + plot_layout(guides = 'collect')
ggsave('cell_identify/SingleR_celltype.pdf', plot = plot, width = 10, height = 5)
ggsave('cell_identify/SingleR_celltype.png', plot = plot, width = 10, height = 5)

根据先验知识手动注释细胞类型

new.cluster.ids <- c("Naive CD4 T", "CD14+ Mono", "Memory CD4 T", "B", "CD8 T", "FCGR3A+ Mono", "NK", "DC", "Platelet")
names(new.cluster.ids) <- levels(pbmc)
pbmc <- RenameIdents(pbmc, new.cluster.ids)

plot1 <- DimPlot(pbmc, label = T, label.size = 5, reduction = 'tsne')
ggsave('cell_identify/tSNE_celltype.pdf', plot = plot1, width = 7, height = 6)
ggsave('cell_identify/tSNE_celltype.png', plot = plot1, width = 7, height = 6)

plot2 <- DimPlot(pbmc, label = T, label.size = 5, reduction = 'umap')
ggsave('cell_identify/UMAP_celltype.pdf', plot = plot2, width = 7, height = 6)
ggsave('cell_identify/UMAP_celltype.png', plot = plot2, width = 7, height = 6)

plot <- plot1 + plot2 + plot_layout(guides = 'collect')
ggsave('cell_identify/celltype.pdf', plot = plot, width = 14, height = 6)
ggsave('cell_identify/celltype.png', plot = plot, width = 14, height = 6)

参考资料

Seurat - Guided Clustering Tutorial
单细胞转录组基础分析