Using multiple random starting seeds in FlowSOM - How?
I'm using the Nowicka et al (2017) workflow (which is excellent ) and they recommend rerunning analysis with multiple starting seeds, to see how robust the detected clusters are, and to ensure rare populations haven't been missed.
I see Hamers et al (2018) use 100 starting seeds.
I am an absolute novice at R programming, and am a bench biologist by training so I would love any advice (or useful links) on the following:
1) How to write the code to have SOM run with multiple starting seeds, and
2) How to determine that the clusters are robust?
I imagine you could use a random seed generator to run SOM multiple times. How would you do this?
Would you then put each run into an vector and then somehow generate an average heatmap of the clusters, or perhaps a graphic showing error bars?
What do CyTOF analysis experts normally do in this situation?
Cheers, Sheila.
This is the code in Nowicka et al (2017) from where they set the seed up to heatmap generation:
library(FlowSOM)
fsom <- ReadInput(fcs, transform = FALSE, scale = FALSE)
set.seed(1234)
som <- BuildSOM(fsom, colsToUse = lineage_markers)
## Get the cell clustering into 100 SOM codes
cell_clustering_som <- som$map$mapping[,1]
## Metaclustering into 20 clusters with ConsensusClusterPlus
library(ConsensusClusterPlus)
codes <- som$map$codes
plot_outdir <- "consensus_plots"
nmc <- 20
mc <- ConsensusClusterPlus(t(codes), maxK = nmc, reps = 100,
pItem = 0.9, pFeature = 1, title = plot_outdir, plot = "png",
clusterAlg = "hc", innerLinkage = "average", finalLinkage = "average",
distance = "euclidean", seed = 1234)
## Get cluster ids for each cell
code_clustering1 <- mc[[nmc]]$consensusClass
cell_clustering1 <- code_clustering1[cell_clustering_som]
# Define cluster colors (here there are 30 colors)
color_clusters <- c("#DC050C", "#FB8072", "#1965B0", "#7BAFDE", "#882E72",
"#B17BA6", "#FF7F00", "#FDB462", "#E7298A", "#E78AC3",
"#33A02C", "#B2DF8A", "#55A1B1", "#8DD3C7", "#A6761D",
"#E6AB02", "#7570B3", "#BEAED4", "#666666", "#999999",
"#aa8282", "#d4b7b7", "#8600bf", "#ba5ce3", "#808000",
"#aeae5c", "#1e90ff", "#00bfff", "#56ff0d", "#ffff00")
plot_clustering_heatmap_wrapper <- function(expr, expr01,
cell_clustering, color_clusters, cluster_merging = NULL){
# Calculate the median expression
expr_median <- data.frame(expr, cell_clustering = cell_clustering) %>%
group_by(cell_clustering) %>% summarize_all(funs(median))
expr01_median <- data.frame(expr01, cell_clustering = cell_clustering) %>%
group_by(cell_clustering) %>% summarize_all(funs(median))
# Calculate cluster frequencies
clustering_table <- as.numeric(table(cell_clustering))
clustering_prop <- round(clustering_table / sum(clustering_table) * 100, 2)
# Sort the cell clusters with hierarchical clustering
d <- dist(expr_median[, colnames(expr)], method = "euclidean")
cluster_rows <- hclust(d, method = "ward.D2")
expr_heat <- as.matrix(expr01_median[, colnames(expr01)])
rownames(expr_heat) <- expr01_median$cell_clustering
# Colors for the heatmap
color_heat <- colorRampPalette(rev(brewer.pal(n = 9, name = "RdYlBu")))(100)
legend_breaks = seq(from = 0, to = 1, by = 0.2)
labels_row <- paste0(expr01_median$cell_clustering, " (", clustering_prop ,
"%)")
# Annotation for the original clusters
annotation_row <- data.frame(Cluster = factor(expr01_median$cell_clustering))
rownames(annotation_row) <- rownames(expr_heat)
color_clusters1 <- color_clusters[1:nlevels(annotation_row$Cluster)]
names(color_clusters1) <- levels(annotation_row$Cluster)
annotation_colors <- list(Cluster = color_clusters1)
# Annotation for the merged clusters
if(!is.null(cluster_merging)){
cluster_merging$new_cluster <- factor(cluster_merging$new_cluster)
annotation_row$Cluster_merging <- cluster_merging$new_cluster
color_clusters2 <- color_clusters[1:nlevels(cluster_merging$new_cluster)]
names(color_clusters2) <- levels(cluster_merging$new_cluster)
annotation_colors$Cluster_merging <- color_clusters2
}
pheatmap(expr_heat, color = color_heat, cluster_cols = FALSE,
cluster_rows = cluster_rows, labels_row = labels_row,
display_numbers = TRUE, number_color = "black",
fontsize = 8, fontsize_number = 6, legend_breaks = legend_breaks,
annotation_row = annotation_row, annotation_colors = annotation_colors)
}
plot_clustering_heatmap_wrapper(expr = expr[, lineage_markers_ord],
expr01 = expr01[, lineage_markers_ord],
cell_clustering = cell_clustering1, color_clusters = color_clusters)
I see Hamers et al (2018) use 100 starting seeds.
I am an absolute novice at R programming, and am a bench biologist by training so I would love any advice (or useful links) on the following:
1) How to write the code to have SOM run with multiple starting seeds, and
2) How to determine that the clusters are robust?
I imagine you could use a random seed generator to run SOM multiple times. How would you do this?
Would you then put each run into an vector and then somehow generate an average heatmap of the clusters, or perhaps a graphic showing error bars?
What do CyTOF analysis experts normally do in this situation?
Cheers, Sheila.
This is the code in Nowicka et al (2017) from where they set the seed up to heatmap generation:
library(FlowSOM)
fsom <- ReadInput(fcs, transform = FALSE, scale = FALSE)
set.seed(1234)
som <- BuildSOM(fsom, colsToUse = lineage_markers)
## Get the cell clustering into 100 SOM codes
cell_clustering_som <- som$map$mapping[,1]
## Metaclustering into 20 clusters with ConsensusClusterPlus
library(ConsensusClusterPlus)
codes <- som$map$codes
plot_outdir <- "consensus_plots"
nmc <- 20
mc <- ConsensusClusterPlus(t(codes), maxK = nmc, reps = 100,
pItem = 0.9, pFeature = 1, title = plot_outdir, plot = "png",
clusterAlg = "hc", innerLinkage = "average", finalLinkage = "average",
distance = "euclidean", seed = 1234)
## Get cluster ids for each cell
code_clustering1 <- mc[[nmc]]$consensusClass
cell_clustering1 <- code_clustering1[cell_clustering_som]
# Define cluster colors (here there are 30 colors)
color_clusters <- c("#DC050C", "#FB8072", "#1965B0", "#7BAFDE", "#882E72",
"#B17BA6", "#FF7F00", "#FDB462", "#E7298A", "#E78AC3",
"#33A02C", "#B2DF8A", "#55A1B1", "#8DD3C7", "#A6761D",
"#E6AB02", "#7570B3", "#BEAED4", "#666666", "#999999",
"#aa8282", "#d4b7b7", "#8600bf", "#ba5ce3", "#808000",
"#aeae5c", "#1e90ff", "#00bfff", "#56ff0d", "#ffff00")
plot_clustering_heatmap_wrapper <- function(expr, expr01,
cell_clustering, color_clusters, cluster_merging = NULL){
# Calculate the median expression
expr_median <- data.frame(expr, cell_clustering = cell_clustering) %>%
group_by(cell_clustering) %>% summarize_all(funs(median))
expr01_median <- data.frame(expr01, cell_clustering = cell_clustering) %>%
group_by(cell_clustering) %>% summarize_all(funs(median))
# Calculate cluster frequencies
clustering_table <- as.numeric(table(cell_clustering))
clustering_prop <- round(clustering_table / sum(clustering_table) * 100, 2)
# Sort the cell clusters with hierarchical clustering
d <- dist(expr_median[, colnames(expr)], method = "euclidean")
cluster_rows <- hclust(d, method = "ward.D2")
expr_heat <- as.matrix(expr01_median[, colnames(expr01)])
rownames(expr_heat) <- expr01_median$cell_clustering
# Colors for the heatmap
color_heat <- colorRampPalette(rev(brewer.pal(n = 9, name = "RdYlBu")))(100)
legend_breaks = seq(from = 0, to = 1, by = 0.2)
labels_row <- paste0(expr01_median$cell_clustering, " (", clustering_prop ,
"%)")
# Annotation for the original clusters
annotation_row <- data.frame(Cluster = factor(expr01_median$cell_clustering))
rownames(annotation_row) <- rownames(expr_heat)
color_clusters1 <- color_clusters[1:nlevels(annotation_row$Cluster)]
names(color_clusters1) <- levels(annotation_row$Cluster)
annotation_colors <- list(Cluster = color_clusters1)
# Annotation for the merged clusters
if(!is.null(cluster_merging)){
cluster_merging$new_cluster <- factor(cluster_merging$new_cluster)
annotation_row$Cluster_merging <- cluster_merging$new_cluster
color_clusters2 <- color_clusters[1:nlevels(cluster_merging$new_cluster)]
names(color_clusters2) <- levels(cluster_merging$new_cluster)
annotation_colors$Cluster_merging <- color_clusters2
}
pheatmap(expr_heat, color = color_heat, cluster_cols = FALSE,
cluster_rows = cluster_rows, labels_row = labels_row,
display_numbers = TRUE, number_color = "black",
fontsize = 8, fontsize_number = 6, legend_breaks = legend_breaks,
annotation_row = annotation_row, annotation_colors = annotation_colors)
}
plot_clustering_heatmap_wrapper(expr = expr[, lineage_markers_ord],
expr01 = expr01[, lineage_markers_ord],
cell_clustering = cell_clustering1, color_clusters = color_clusters)