# 1_HeLa Heatmap.R ----
library("gplots")
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library("RColorBrewer")
library("matrixStats")
library("plyr")
##
## Attaching package: 'plyr'
## The following object is masked from 'package:matrixStats':
##
## count
library("dplyr")
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:matrixStats':
##
## count
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library("data.table")
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library("stringr")
library("ggplot2")
# Load the HeLa screen Z-score table.
all <- read.csv("All_Z_Scores.csv")

# Separate rows whose Gene.Symbol contains "Blank". `all_removed` is kept only
# as a sanity check on what was dropped (per the original note, NA rows were
# already removed at an earlier stage).
all_small <- all %>%
  filter(!grepl("Blank", Gene.Symbol))
all_removed <- all %>%
  filter(grepl("Blank", Gene.Symbol))
all <- all_small

# Duplicate gene entries (12 in total, per the original note): the first
# occurrence of an accession number keeps its ID, later occurrences get "_A".
# NOTE(review): an ID present three or more times would still produce repeated
# "_A" names - confirm that no ID occurs more than twice.
all <- all %>%
  group_by(RefSeq.Accession.Number) %>%
  mutate(
    RefSeq.Accession.Number = ifelse(
      row_number() == 1,
      RefSeq.Accession.Number,
      paste0(RefSeq.Accession.Number, "_A")
    )
  ) %>%
  ungroup() # do not carry the grouping into later steps
row_names <- all$RefSeq.Accession.Number

# Keep columns 6-42 only (drops the extra label columns).
all <- all[, 6:42]

# Convert to a numeric matrix with accession numbers as row names.
all_matrix <- as.matrix(all)
rownames(all_matrix) <- row_names

# Colour scale: 100 breakpoints in each of [-5, -1], [-1, 1] and [1, 5];
# unique() drops the shared endpoints. Yellow -> black -> black -> purple keeps
# the middle of the palette black.
breaks <- unique(c(
  seq(-5, -1, length.out = 100),
  seq(-1, 1, length.out = 100),
  seq(1, 5, length.out = 100)
))
my_palette <- colorRampPalette(
  c("yellow", "black", "black", "purple")
)(length(breaks) - 1)

# Heatmap of the transposed matrix, clustered on both axes with Euclidean
# distance and Ward (ward.D2) linkage. `hclustfun` is spelled out in full:
# `hclust =` only reached it through partial argument matching.
heatmap.2(
  t(all_matrix),
  Rowv = TRUE,
  Colv = TRUE,
  col = my_palette,
  breaks = breaks,
  density.info = "none",
  trace = "none",
  # main = "199 compounds; Length or Prev",
  dendrogram = "both",
  symm = FALSE, symkey = FALSE, symbreaks = TRUE,
  labRow = FALSE,
  labCol = FALSE,
  cexRow = 0.8,
  cexCol = 0.1,
  margins = c(8, 2),
  key.title = "1", key.xlab = "Z Score",
  # rowsep = c(0, 4, 12, 19, 31, 37, 43),
  # colsep = c(0, 233),
  sepcolor = "black", sepwidth = c(0.05, 0.05),
  # ColSideColors = condition_colors, scale = "none",
  distfun = function(x) dist(x, method = "euclidean"),
  hclustfun = function(x) hclust(x, method = "ward.D2")
)

# Nuclei count vs cell area scatter with a linear fit - might remove.
# (The former scale_fill_manual() layer was dropped: nothing maps to `fill`.)
plot <- ggplot(
  data = all,
  aes(x = Nuclei.Nuclei.Count.wv1, y = Cells.Area.wv3)
) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_bw() +
  labs(
    title = "Nuclei Count vs Cell Area",
    x = "Nuclei Count (Z-Score)",
    y = "Cell Area (Z-Score)"
  )

# Fit the same linear model explicitly to obtain R-squared.
lm_model <- lm(Cells.Area.wv3 ~ Nuclei.Nuclei.Count.wv1, data = all)
r_squared <- summary(lm_model)$r.squared

# Annotate with R-squared. The label is parsed as plotmath, where an equals
# sign is written `==`.
plot <- plot +
  annotate(
    "text",
    x = max(all$Nuclei.Nuclei.Count.wv1),
    y = max(all$Cells.Area.wv3),
    label = paste("R^2 ==", round(r_squared, 3)),
    parse = TRUE, hjust = 2, vjust = 10
  )
# plot

# Save plot
ggsave("Full_Count_vs_Area_Corr.png", plot = plot)
## Saving 7 x 5 in image
## `geom_smooth()` using formula = 'y ~ x'
# 2_Hits Nuclei Count and Area Only.R ----
library("gplots")
library("RColorBrewer")
library("matrixStats")
library("plyr")
library("dplyr")
library("data.table")
library("stringr")
library("ggplot2")
library("Rtsne")
# Load the HeLa screen Z-score table.
all <- read.csv("All_Z_Scores.csv")

# Hit definition (adjustable): nuclei-count Z-score below -3.
all <- all %>%
  filter(Nuclei.Nuclei.Count.wv1 < -3)

# Separate rows whose Gene.Symbol contains "Blank". `all_removed` is kept only
# as a sanity check (per the original note, NA rows were removed earlier).
all_small <- all %>%
  filter(!grepl("Blank", Gene.Symbol))
all_removed <- all %>%
  filter(grepl("Blank", Gene.Symbol))
all <- all_small

# Duplicate gene entries (12 in total, per the original note): the first
# occurrence keeps its ID, later occurrences get "_A".
# NOTE(review): an ID present three or more times would still produce repeated
# "_A" names - confirm that no ID occurs more than twice.
all <- all %>%
  group_by(RefSeq.Accession.Number) %>%
  mutate(
    RefSeq.Accession.Number = ifelse(
      row_number() == 1,
      RefSeq.Accession.Number,
      paste0(RefSeq.Accession.Number, "_A")
    )
  ) %>%
  ungroup() # do not carry the grouping into later steps
row_names <- all$RefSeq.Accession.Number

# Keep only columns 6 and 27.
# NOTE(review): presumably Nuclei.Nuclei.Count.wv1 and Cells.Area.wv3, since
# the plot below refers to those names - confirm against the CSV header.
all <- all[, c(6, 27)]

# Convert to a numeric matrix with accession numbers as row names.
all_matrix <- as.matrix(all)
rownames(all_matrix) <- row_names

# Colour scale: 100 breakpoints in each of [-5, -1], [-1, 1] and [1, 5];
# unique() drops the shared endpoints. The middle segment now runs to 1 (it
# previously stopped at 0.1, leaving 0.1..1 as a single oversized bin), which
# matches the scale used for the full-screen heatmap.
breaks <- unique(c(
  seq(-5, -1, length.out = 100),
  seq(-1, 1, length.out = 100),
  seq(1, 5, length.out = 100)
))
my_palette <- colorRampPalette(
  c("yellow", "black", "black", "purple")
)(length(breaks) - 1)

# Heatmap of the transposed matrix, clustered on both axes with Euclidean
# distance and Ward (ward.D2) linkage. `hclustfun` is spelled out in full:
# `hclust =` only reached it through partial argument matching.
heatmap.2(
  t(all_matrix),
  Rowv = TRUE,
  Colv = TRUE,
  col = my_palette,
  breaks = breaks,
  density.info = "none",
  trace = "none",
  # main = "199 compounds; Length or Prev",
  dendrogram = "both",
  symm = FALSE, symkey = FALSE, symbreaks = TRUE,
  labRow = FALSE,
  labCol = FALSE,
  cexRow = 0.8,
  cexCol = 0.1,
  margins = c(8, 2),
  key.title = "1", key.xlab = "Z Score",
  # rowsep = c(0, 4, 12, 19, 31, 37, 43),
  # colsep = c(0, 233),
  sepcolor = "black", sepwidth = c(0.05, 0.05),
  # ColSideColors = condition_colors, scale = "none",
  distfun = function(x) dist(x, method = "euclidean"),
  hclustfun = function(x) hclust(x, method = "ward.D2")
)

# Export the accession numbers of the hits.
Hits <- rownames(all_matrix)
write.csv(Hits, "Hits_Nuclei_Count_Only.csv")

# Nuclei count vs cell area scatter with a linear fit.
# (The former scale_fill_manual() layer was dropped: nothing maps to `fill`.)
plot <- ggplot(
  data = all,
  aes(x = Nuclei.Nuclei.Count.wv1, y = Cells.Area.wv3)
) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_bw() +
  labs(
    title = "Nuclei Count vs Cell Area",
    x = "Nuclei Count (Z-Score)",
    y = "Cell Area (Z-Score)"
  )

# Fit the same linear model explicitly to obtain R-squared.
lm_model <- lm(Cells.Area.wv3 ~ Nuclei.Nuclei.Count.wv1, data = all)
r_squared <- summary(lm_model)$r.squared

# Annotate with R-squared. The label is parsed as plotmath, where an equals
# sign is written `==`.
plot <- plot +
  annotate(
    "text",
    x = max(all$Nuclei.Nuclei.Count.wv1),
    y = max(all$Cells.Area.wv3),
    label = paste("R^2 ==", round(r_squared, 3)),
    parse = TRUE, hjust = 2, vjust = 10
  )
plot
## `geom_smooth()` using formula = 'y ~ x'

# Save plot
ggsave("Count_Threshold_Count_vs_Area_Corr.png", plot = plot)
## Saving 7 x 5 in image
## `geom_smooth()` using formula = 'y ~ x'
# 3_Filtered Heatmap Count and Area.R ----
library("gplots")
library("RColorBrewer")
library("matrixStats")
library("plyr")
library("dplyr")
library("data.table")
library("stringr")
library("ggplot2")
library("Rtsne")
# Load the screen Z-score table.
all <- read.csv("All_Z_Scores.csv")

# Hit definition (adjustable): nuclei-count Z-score below -3 AND cell-area
# Z-score above 3.
all <- all %>%
  filter(Nuclei.Nuclei.Count.wv1 < -3, Cells.Area.wv3 > 3)

# Separate rows whose Gene.Symbol contains "Blank". `all_removed` is kept only
# as a sanity check (per the original note, NA rows were removed earlier).
all_small <- all %>%
  filter(!grepl("Blank", Gene.Symbol))
all_removed <- all %>%
  filter(grepl("Blank", Gene.Symbol))
all <- all_small

# Duplicate gene entries (12 in total, per the original note): the first
# occurrence keeps its ID, later occurrences get "_A".
# NOTE(review): an ID present three or more times would still produce repeated
# "_A" names - confirm that no ID occurs more than twice.
all <- all %>%
  group_by(RefSeq.Accession.Number) %>%
  mutate(
    RefSeq.Accession.Number = ifelse(
      row_number() == 1,
      RefSeq.Accession.Number,
      paste0(RefSeq.Accession.Number, "_A")
    )
  ) %>%
  ungroup() # do not carry the grouping into later steps
row_names <- all$RefSeq.Accession.Number

# Keep columns 6-42 only (drops the extra label columns).
all <- all[, 6:42]

# Convert to a numeric matrix with accession numbers as row names.
all_matrix <- as.matrix(all)
rownames(all_matrix) <- row_names

# Colour scale: 100 breakpoints in each of [-5, -1], [-1, 1] and [1, 5];
# unique() drops the shared endpoints. The middle segment now runs to 1 (it
# previously stopped at 0.1, leaving 0.1..1 as a single oversized bin).
breaks <- unique(c(
  seq(-5, -1, length.out = 100),
  seq(-1, 1, length.out = 100),
  seq(1, 5, length.out = 100)
))
my_palette <- colorRampPalette(
  c("yellow", "black", "black", "purple")
)(length(breaks) - 1)

# Heatmap of the transposed matrix, clustered on both axes with Euclidean
# distance and Ward (ward.D2) linkage. `hclustfun` is spelled out in full:
# `hclust =` only reached it through partial argument matching.
heatmap.2(
  t(all_matrix),
  Rowv = TRUE,
  Colv = TRUE,
  col = my_palette,
  breaks = breaks,
  density.info = "none",
  trace = "none",
  # main = "199 compounds; Length or Prev",
  dendrogram = "both",
  symm = FALSE, symkey = FALSE, symbreaks = TRUE,
  labRow = FALSE,
  labCol = FALSE,
  cexRow = 0.8,
  cexCol = 0.1,
  margins = c(8, 2),
  key.title = "1", key.xlab = "Z Score",
  # rowsep = c(0, 4, 12, 19, 31, 37, 43),
  # colsep = c(0, 233),
  sepcolor = "black", sepwidth = c(0.05, 0.05),
  # ColSideColors = condition_colors, scale = "none",
  distfun = function(x) dist(x, method = "euclidean"),
  hclustfun = function(x) hclust(x, method = "ward.D2")
)

# Export the accession numbers of the hits.
Hits <- rownames(all_matrix)
write.csv(Hits, "Hits.csv")
# 4_Hits 1.96.R ----
library("gplots")
library("RColorBrewer")
library("matrixStats")
library("plyr")
library("dplyr")
library("data.table")
library("stringr")
library("ggplot2")
library("Rtsne")
# Load the screen Z-score table.
all <- read.csv("All_Z_Scores.csv")

# Hit definition (adjustable): nuclei-count Z-score below -1.96 AND cell-area
# Z-score above 1.96.
all <- all %>%
  filter(Nuclei.Nuclei.Count.wv1 < -1.96, Cells.Area.wv3 > 1.96)

# Separate rows whose Gene.Symbol contains "Blank". `all_removed` is kept only
# as a sanity check (per the original note, NA rows were removed earlier).
all_small <- all %>%
  filter(!grepl("Blank", Gene.Symbol))
all_removed <- all %>%
  filter(grepl("Blank", Gene.Symbol))
all <- all_small

# Duplicate gene entries (12 in total, per the original note): the first
# occurrence keeps its ID, later occurrences get "_A".
# NOTE(review): an ID present three or more times would still produce repeated
# "_A" names - confirm that no ID occurs more than twice.
all <- all %>%
  group_by(RefSeq.Accession.Number) %>%
  mutate(
    RefSeq.Accession.Number = ifelse(
      row_number() == 1,
      RefSeq.Accession.Number,
      paste0(RefSeq.Accession.Number, "_A")
    )
  ) %>%
  ungroup() # do not carry the grouping into later steps
row_names <- all$RefSeq.Accession.Number

# Keep columns 6-42 only (drops the extra label columns).
all <- all[, 6:42]

# Convert to a numeric matrix with accession numbers as row names.
all_matrix <- as.matrix(all)
rownames(all_matrix) <- row_names

# Colour scale: 100 breakpoints in each of [-5, -1], [-1, 1] and [1, 5];
# unique() drops the shared endpoints. The middle segment now runs to 1 (it
# previously stopped at 0.1, leaving 0.1..1 as a single oversized bin).
breaks <- unique(c(
  seq(-5, -1, length.out = 100),
  seq(-1, 1, length.out = 100),
  seq(1, 5, length.out = 100)
))
my_palette <- colorRampPalette(
  c("yellow", "black", "black", "purple")
)(length(breaks) - 1)

# Heatmap of the transposed matrix, clustered on both axes with Euclidean
# distance and Ward (ward.D2) linkage. `hclustfun` is spelled out in full:
# `hclust =` only reached it through partial argument matching.
heatmap.2(
  t(all_matrix),
  Rowv = TRUE,
  Colv = TRUE,
  col = my_palette,
  breaks = breaks,
  density.info = "none",
  trace = "none",
  # main = "199 compounds; Length or Prev",
  dendrogram = "both",
  symm = FALSE, symkey = FALSE, symbreaks = TRUE,
  labRow = FALSE,
  labCol = FALSE,
  cexRow = 0.8,
  cexCol = 0.1,
  margins = c(8, 2),
  key.title = "1", key.xlab = "Z Score",
  # rowsep = c(0, 4, 12, 19, 31, 37, 43),
  # colsep = c(0, 233),
  sepcolor = "black", sepwidth = c(0.05, 0.05),
  # ColSideColors = condition_colors, scale = "none",
  distfun = function(x) dist(x, method = "euclidean"),
  hclustfun = function(x) hclust(x, method = "ward.D2")
)

# Export the accession numbers of the hits.
Hits <- rownames(all_matrix)
write.csv(Hits, "Hits_1.96.csv")
# 5_Hits 1.R ----
library("gplots")
library("RColorBrewer")
library("matrixStats")
library("plyr")
library("dplyr")
library("data.table")
library("stringr")
library("ggplot2")
library("Rtsne")
# Load the screen Z-score table.
all <- read.csv("All_Z_Scores.csv")

# Hit definition (adjustable): nuclei-count Z-score below -1 AND cell-area
# Z-score above 1.
all <- all %>%
  filter(Nuclei.Nuclei.Count.wv1 < -1, Cells.Area.wv3 > 1)

# Separate rows whose Gene.Symbol contains "Blank". `all_removed` is kept only
# as a sanity check (per the original note, NA rows were removed earlier).
all_small <- all %>%
  filter(!grepl("Blank", Gene.Symbol))
all_removed <- all %>%
  filter(grepl("Blank", Gene.Symbol))
all <- all_small

# Duplicate gene entries (12 in total, per the original note): the first
# occurrence keeps its ID, later occurrences get "_A".
# NOTE(review): an ID present three or more times would still produce repeated
# "_A" names - confirm that no ID occurs more than twice.
all <- all %>%
  group_by(RefSeq.Accession.Number) %>%
  mutate(
    RefSeq.Accession.Number = ifelse(
      row_number() == 1,
      RefSeq.Accession.Number,
      paste0(RefSeq.Accession.Number, "_A")
    )
  ) %>%
  ungroup() # do not carry the grouping into later steps
row_names <- all$RefSeq.Accession.Number

# Keep columns 6-42 only (drops the extra label columns).
all <- all[, 6:42]

# Convert to a numeric matrix with accession numbers as row names.
all_matrix <- as.matrix(all)
rownames(all_matrix) <- row_names

# Colour scale: 100 breakpoints in each of [-5, -1], [-1, 1] and [1, 5];
# unique() drops the shared endpoints. The middle segment now runs to 1 (it
# previously stopped at 0.1, leaving 0.1..1 as a single oversized bin).
breaks <- unique(c(
  seq(-5, -1, length.out = 100),
  seq(-1, 1, length.out = 100),
  seq(1, 5, length.out = 100)
))
my_palette <- colorRampPalette(
  c("yellow", "black", "black", "purple")
)(length(breaks) - 1)

# Heatmap of the transposed matrix, clustered on both axes with Euclidean
# distance and Ward (ward.D2) linkage. `hclustfun` is spelled out in full:
# `hclust =` only reached it through partial argument matching.
heatmap.2(
  t(all_matrix),
  Rowv = TRUE,
  Colv = TRUE,
  col = my_palette,
  breaks = breaks,
  density.info = "none",
  trace = "none",
  # main = "199 compounds; Length or Prev",
  dendrogram = "both",
  symm = FALSE, symkey = FALSE, symbreaks = TRUE,
  labRow = FALSE,
  labCol = FALSE,
  cexRow = 0.8,
  cexCol = 0.1,
  margins = c(8, 2),
  key.title = "1", key.xlab = "Z Score",
  # rowsep = c(0, 4, 12, 19, 31, 37, 43),
  # colsep = c(0, 233),
  sepcolor = "black", sepwidth = c(0.05, 0.05),
  # ColSideColors = condition_colors, scale = "none",
  distfun = function(x) dist(x, method = "euclidean"),
  hclustfun = function(x) hclust(x, method = "ward.D2")
)

# Export the accession numbers of the hits.
Hits <- rownames(all_matrix)
write.csv(Hits, "Hits_1.csv")
# 6_SENCAN Down.R ----
library("gplots")
library("RColorBrewer")
library("matrixStats")
library("plyr")
library("dplyr")
library("data.table")
library("stringr")
library("ggplot2")
library("Rtsne")
# Load the screen Z-score table and the gene list used as the filter.
all <- read.csv("All_Z_Scores.csv")
Filter <- read.csv("SENCAN Down.csv")
Filter <- Filter$converted_alias

# Keep only rows whose accession number appears in the gene list.
all <- all %>%
  filter(RefSeq.Accession.Number %in% Filter)

# Separate rows whose Gene.Symbol contains "Blank". `all_removed` is kept only
# as a sanity check (per the original note, NA rows were removed earlier).
all_small <- all %>%
  filter(!grepl("Blank", Gene.Symbol))
all_removed <- all %>%
  filter(grepl("Blank", Gene.Symbol))
all <- all_small

# Duplicate gene entries (12 in total, per the original note): the first
# occurrence keeps its ID, later occurrences get "_A".
# NOTE(review): an ID present three or more times would still produce repeated
# "_A" names - confirm that no ID occurs more than twice.
all <- all %>%
  group_by(RefSeq.Accession.Number) %>%
  mutate(
    RefSeq.Accession.Number = ifelse(
      row_number() == 1,
      RefSeq.Accession.Number,
      paste0(RefSeq.Accession.Number, "_A")
    )
  ) %>%
  ungroup() # do not carry the grouping into later steps
row_names <- all$RefSeq.Accession.Number

# Keep columns 6-42 only (drops the extra label columns).
all <- all[, 6:42]

# Convert to a numeric matrix with accession numbers as row names.
all_matrix <- as.matrix(all)
rownames(all_matrix) <- row_names

# Colour scale: 100 breakpoints in each of [-5, -1], [-1, 1] and [1, 5];
# unique() drops the shared endpoints. The middle segment now runs to 1 (it
# previously stopped at 0.1, leaving 0.1..1 as a single oversized bin).
breaks <- unique(c(
  seq(-5, -1, length.out = 100),
  seq(-1, 1, length.out = 100),
  seq(1, 5, length.out = 100)
))
my_palette <- colorRampPalette(
  c("yellow", "black", "black", "purple")
)(length(breaks) - 1)

# Heatmap of the transposed matrix, clustered on both axes with Euclidean
# distance and Ward (ward.D2) linkage. `hclustfun` is spelled out in full:
# `hclust =` only reached it through partial argument matching.
heatmap.2(
  t(all_matrix),
  Rowv = TRUE,
  Colv = TRUE,
  col = my_palette,
  breaks = breaks,
  density.info = "none",
  trace = "none",
  # main = "199 compounds; Length or Prev",
  dendrogram = "both",
  symm = FALSE, symkey = FALSE, symbreaks = TRUE,
  labRow = FALSE,
  labCol = FALSE,
  cexRow = 0.8,
  cexCol = 0.1,
  margins = c(8, 2),
  key.title = "1", key.xlab = "Z Score",
  # rowsep = c(0, 4, 12, 19, 31, 37, 43),
  # colsep = c(0, 233),
  sepcolor = "black", sepwidth = c(0.05, 0.05),
  # ColSideColors = condition_colors, scale = "none",
  distfun = function(x) dist(x, method = "euclidean"),
  hclustfun = function(x) hclust(x, method = "ward.D2")
)

# Export the accession numbers that matched the gene list.
Hits <- rownames(all_matrix)
write.csv(Hits, "SENCAN_Down.csv")
# 7_Senescopedia.R ----
library("gplots")
library("RColorBrewer")
library("matrixStats")
library("plyr")
library("dplyr")
library("data.table")
library("stringr")
library("ggplot2")
library("Rtsne")
# Load the screen Z-score table and the gene list used as the filter.
all <- read.csv("All_Z_Scores.csv")
Filter <- read.csv("Senescopeida List.csv")
Filter <- Filter$converted_alias

# Keep only rows whose accession number appears in the gene list.
all <- all %>%
  filter(RefSeq.Accession.Number %in% Filter)

# Separate rows whose Gene.Symbol contains "Blank". `all_removed` is kept only
# as a sanity check (per the original note, NA rows were removed earlier).
all_small <- all %>%
  filter(!grepl("Blank", Gene.Symbol))
all_removed <- all %>%
  filter(grepl("Blank", Gene.Symbol))
all <- all_small

# Duplicate gene entries (12 in total, per the original note): the first
# occurrence keeps its ID, later occurrences get "_A".
# NOTE(review): an ID present three or more times would still produce repeated
# "_A" names - confirm that no ID occurs more than twice.
all <- all %>%
  group_by(RefSeq.Accession.Number) %>%
  mutate(
    RefSeq.Accession.Number = ifelse(
      row_number() == 1,
      RefSeq.Accession.Number,
      paste0(RefSeq.Accession.Number, "_A")
    )
  ) %>%
  ungroup() # do not carry the grouping into later steps
row_names <- all$RefSeq.Accession.Number

# Keep columns 6-42 only (drops the extra label columns).
all <- all[, 6:42]

# Convert to a numeric matrix with accession numbers as row names.
all_matrix <- as.matrix(all)
rownames(all_matrix) <- row_names

# Colour scale: 100 breakpoints in each of [-5, -1], [-1, 1] and [1, 5];
# unique() drops the shared endpoints. The middle segment now runs to 1 (it
# previously stopped at 0.1, leaving 0.1..1 as a single oversized bin).
breaks <- unique(c(
  seq(-5, -1, length.out = 100),
  seq(-1, 1, length.out = 100),
  seq(1, 5, length.out = 100)
))
my_palette <- colorRampPalette(
  c("yellow", "black", "black", "purple")
)(length(breaks) - 1)

# Heatmap of the transposed matrix, clustered on both axes with Euclidean
# distance and Ward (ward.D2) linkage. `hclustfun` is spelled out in full:
# `hclust =` only reached it through partial argument matching.
heatmap.2(
  t(all_matrix),
  Rowv = TRUE,
  Colv = TRUE,
  col = my_palette,
  breaks = breaks,
  density.info = "none",
  trace = "none",
  # main = "199 compounds; Length or Prev",
  dendrogram = "both",
  symm = FALSE, symkey = FALSE, symbreaks = TRUE,
  labRow = FALSE,
  labCol = FALSE,
  cexRow = 0.8,
  cexCol = 0.1,
  margins = c(8, 2),
  key.title = "1", key.xlab = "Z Score",
  # rowsep = c(0, 4, 12, 19, 31, 37, 43),
  # colsep = c(0, 233),
  sepcolor = "black", sepwidth = c(0.05, 0.05),
  # ColSideColors = condition_colors, scale = "none",
  distfun = function(x) dist(x, method = "euclidean"),
  hclustfun = function(x) hclust(x, method = "ward.D2")
)

# Export the accession numbers that matched the gene list.
Hits <- rownames(all_matrix)
write.csv(Hits, "Senescopeida.csv")
# 8_Cell Cycle.R ----
library("gplots")
library("RColorBrewer")
library("matrixStats")
library("plyr")
library("dplyr")
library("data.table")
library("stringr")
library("ggplot2")
library("Rtsne")
# Load the screen Z-score table and the gene list used as the filter.
all <- read.csv("All_Z_Scores.csv")
Filter <- read.csv("Cell Cycle Genes.csv")
Filter <- Filter$converted_alias

# Keep only rows whose accession number appears in the gene list.
all <- all %>%
  filter(RefSeq.Accession.Number %in% Filter)

# Separate rows whose Gene.Symbol contains "Blank". `all_removed` is kept only
# as a sanity check (per the original note, NA rows were removed earlier).
all_small <- all %>%
  filter(!grepl("Blank", Gene.Symbol))
all_removed <- all %>%
  filter(grepl("Blank", Gene.Symbol))
all <- all_small

# Duplicate gene entries (12 in total, per the original note): the first
# occurrence keeps its ID, later occurrences get "_A".
# NOTE(review): an ID present three or more times would still produce repeated
# "_A" names - confirm that no ID occurs more than twice.
all <- all %>%
  group_by(RefSeq.Accession.Number) %>%
  mutate(
    RefSeq.Accession.Number = ifelse(
      row_number() == 1,
      RefSeq.Accession.Number,
      paste0(RefSeq.Accession.Number, "_A")
    )
  ) %>%
  ungroup() # do not carry the grouping into later steps
row_names <- all$RefSeq.Accession.Number

# Keep columns 6-42 only (drops the extra label columns).
all <- all[, 6:42]

# Convert to a numeric matrix with accession numbers as row names.
all_matrix <- as.matrix(all)
rownames(all_matrix) <- row_names

# Colour scale: 100 breakpoints in each of [-5, -1], [-1, 1] and [1, 5];
# unique() drops the shared endpoints. The middle segment now runs to 1 (it
# previously stopped at 0.1, leaving 0.1..1 as a single oversized bin).
breaks <- unique(c(
  seq(-5, -1, length.out = 100),
  seq(-1, 1, length.out = 100),
  seq(1, 5, length.out = 100)
))
my_palette <- colorRampPalette(
  c("yellow", "black", "black", "purple")
)(length(breaks) - 1)

# Heatmap of the transposed matrix, clustered on both axes with Euclidean
# distance and Ward (ward.D2) linkage. `hclustfun` is spelled out in full:
# `hclust =` only reached it through partial argument matching.
heatmap.2(
  t(all_matrix),
  Rowv = TRUE,
  Colv = TRUE,
  col = my_palette,
  breaks = breaks,
  density.info = "none",
  trace = "none",
  # main = "199 compounds; Length or Prev",
  dendrogram = "both",
  symm = FALSE, symkey = FALSE, symbreaks = TRUE,
  labRow = FALSE,
  labCol = FALSE,
  cexRow = 0.8,
  cexCol = 0.1,
  margins = c(8, 2),
  key.title = "1", key.xlab = "Z Score",
  # rowsep = c(0, 4, 12, 19, 31, 37, 43),
  # colsep = c(0, 233),
  sepcolor = "black", sepwidth = c(0.05, 0.05),
  # ColSideColors = condition_colors, scale = "none",
  distfun = function(x) dist(x, method = "euclidean"),
  hclustfun = function(x) hclust(x, method = "ward.D2")
)

# Export the accession numbers that matched the gene list.
Hits <- rownames(all_matrix)
write.csv(Hits, "Cell Cycle.csv")
# 9_Senescence.R ----
library("gplots")
library("RColorBrewer")
library("matrixStats")
library("plyr")
library("dplyr")
library("data.table")
library("stringr")
library("ggplot2")
library("Rtsne")
# Load the screen Z-score table and the gene list used as the filter.
all <- read.csv("All_Z_Scores.csv")
Filter <- read.csv("Senescence Kegg.csv")
Filter <- Filter$converted_alias

# Keep only rows whose accession number appears in the gene list.
all <- all %>%
  filter(RefSeq.Accession.Number %in% Filter)

# Separate rows whose Gene.Symbol contains "Blank". `all_removed` is kept only
# as a sanity check (per the original note, NA rows were removed earlier).
all_small <- all %>%
  filter(!grepl("Blank", Gene.Symbol))
all_removed <- all %>%
  filter(grepl("Blank", Gene.Symbol))
all <- all_small

# Duplicate gene entries (12 in total, per the original note): the first
# occurrence keeps its ID, later occurrences get "_A".
# NOTE(review): an ID present three or more times would still produce repeated
# "_A" names - confirm that no ID occurs more than twice.
all <- all %>%
  group_by(RefSeq.Accession.Number) %>%
  mutate(
    RefSeq.Accession.Number = ifelse(
      row_number() == 1,
      RefSeq.Accession.Number,
      paste0(RefSeq.Accession.Number, "_A")
    )
  ) %>%
  ungroup() # do not carry the grouping into later steps
row_names <- all$RefSeq.Accession.Number

# Keep columns 6-42 only (drops the extra label columns).
all <- all[, 6:42]

# Convert to a numeric matrix with accession numbers as row names.
all_matrix <- as.matrix(all)
rownames(all_matrix) <- row_names

# Colour scale: 100 breakpoints in each of [-5, -1], [-1, 1] and [1, 5];
# unique() drops the shared endpoints. The middle segment now runs to 1 (it
# previously stopped at 0.1, leaving 0.1..1 as a single oversized bin).
breaks <- unique(c(
  seq(-5, -1, length.out = 100),
  seq(-1, 1, length.out = 100),
  seq(1, 5, length.out = 100)
))
my_palette <- colorRampPalette(
  c("yellow", "black", "black", "purple")
)(length(breaks) - 1)

# Heatmap of the transposed matrix, clustered on both axes with Euclidean
# distance and Ward (ward.D2) linkage. `hclustfun` is spelled out in full:
# `hclust =` only reached it through partial argument matching.
heatmap.2(
  t(all_matrix),
  Rowv = TRUE,
  Colv = TRUE,
  col = my_palette,
  breaks = breaks,
  density.info = "none",
  trace = "none",
  # main = "199 compounds; Length or Prev",
  dendrogram = "both",
  symm = FALSE, symkey = FALSE, symbreaks = TRUE,
  labRow = FALSE,
  labCol = FALSE,
  cexRow = 0.8,
  cexCol = 0.1,
  margins = c(8, 2),
  key.title = "1", key.xlab = "Z Score",
  # rowsep = c(0, 4, 12, 19, 31, 37, 43),
  # colsep = c(0, 233),
  sepcolor = "black", sepwidth = c(0.05, 0.05),
  # ColSideColors = condition_colors, scale = "none",
  distfun = function(x) dist(x, method = "euclidean"),
  hclustfun = function(x) hclust(x, method = "ward.D2")
)

# Export the accession numbers that matched the gene list.
Hits <- rownames(all_matrix)
write.csv(Hits, "Senescence.csv")
# 11_K-Means Clustering ----
library("gplots")
library("RColorBrewer")
library("matrixStats")
library("plyr")
library("dplyr")
library("data.table")
library("stringr")
library("ggplot2")
library(umap)
# Load the screen Z-score table.
all <- read.csv("All_Z_Scores.csv")

# Separate rows whose Gene.Symbol contains "Blank"; `all_removed` is kept only
# as a sanity check on what was dropped.
all_small <- all %>%
  filter(!grepl("Blank", Gene.Symbol))
all_removed <- all %>%
  filter(grepl("Blank", Gene.Symbol))
all <- all_small

# Duplicate gene entries (12 in total, per the original note): the first
# occurrence keeps its ID, later occurrences get "_A".
all <- all %>%
  group_by(RefSeq.Accession.Number) %>%
  mutate(
    RefSeq.Accession.Number = ifelse(
      row_number() == 1,
      RefSeq.Accession.Number,
      paste0(RefSeq.Accession.Number, "_A")
    )
  ) %>%
  ungroup() # do not carry the grouping into later steps
row_names <- all$RefSeq.Accession.Number

# Keep columns 6-42 only (drops the extra label columns).
all <- all[, 6:42]

# Seed the RNG for reproducibility (change the seed for different results).
set.seed(123)

# K-means with k clusters. The plot title below is built from the same `k`, so
# the label cannot drift from the fit (it previously read "K=3" for a 4-cluster
# fit). The manual colour scale below lists exactly four colours.
k <- 4
kmeans_result <- kmeans(all, centers = k, nstart = 25)

# Attach the cluster assignment to the data.
all$Cluster <- as.factor(kmeans_result$cluster)

# UMAP on the measurement columns only (the trailing Cluster column is
# excluded).
umap_result <- umap(all[, -ncol(all)])

# Assemble the plotting data: 2-D layout plus cluster label.
umap_data <- as.data.frame(umap_result$layout)
umap_data$Cluster <- all$Cluster

# Per-cluster sample counts, formatted as "<cluster> (n=<count>)".
# dplyr::count is named explicitly because plyr also exports count().
cluster_counts <- all %>%
  dplyr::count(Cluster) %>%
  mutate(Cluster = paste0(Cluster, " (n=", n, ")"))

# Relabel the clusters with their counts for the legend.
umap_data$Cluster <- factor(
  umap_data$Cluster,
  levels = levels(all$Cluster),
  labels = cluster_counts$Cluster
)

# UMAP scatter coloured by cluster, larger text, theme_bw.
umap_plot <- ggplot(umap_data, aes(x = V1, y = V2, color = Cluster)) +
  geom_point(size = 0.2) +
  labs(
    title = paste0("UMAP Plot with K-means Clusters (K=", k, ")"),
    x = "UMAP 1", y = "UMAP 2"
  ) +
  theme_bw() +
  theme(
    legend.position = "right", # adjust the legend position if needed
    text = element_text(size = 14) # make all text larger
  ) +
  scale_color_manual(
    # ensure these colours correspond to your clusters
    values = c("#E41A1C", "#377EB8", "pink", "#4DAF4A"),
    breaks = levels(umap_data$Cluster)
  )

# Save the plot as a PNG in the "k_means_umap" folder, creating the folder
# first so that ggsave() does not fail on a missing directory.
dir.create("k_means_umap", showWarnings = FALSE)
ggsave(
  filename = "k_means_umap/UMAP_Kmeans_Clusters.png",
  plot = umap_plot, width = 8, height = 6
)
# 12_K-Means Heatmaps ----
library(gplots)
library(RColorBrewer)
library(matrixStats)
library(plyr)
library(dplyr)
library(data.table)
library(stringr)
library(ggplot2)
# Load the screen Z-score table.
all <- read.csv("All_Z_Scores.csv")

# Separate rows whose Gene.Symbol contains "Blank"; `all_removed` is kept only
# as a sanity check on what was dropped.
all_small <- all %>%
  filter(!grepl("Blank", Gene.Symbol))
all_removed <- all %>%
  filter(grepl("Blank", Gene.Symbol))
all <- all_small

# Duplicate gene entries (12 in total, per the original note): the first
# occurrence keeps its ID, later occurrences get "_A".
# NOTE(review): rows are looked up by name further down; an ID present three
# or more times would still produce repeated "_A" names and the lookup would
# then return the first match each time - confirm no ID occurs more than twice.
all <- all %>%
  group_by(RefSeq.Accession.Number) %>%
  mutate(
    RefSeq.Accession.Number = ifelse(
      row_number() == 1,
      RefSeq.Accession.Number,
      paste0(RefSeq.Accession.Number, "_A")
    )
  ) %>%
  ungroup() # do not carry the grouping into later steps

# Save the accession numbers for use as matrix row names.
refseq_numbers <- all$RefSeq.Accession.Number

# Columns 6-42 are assumed to be the measures used for clustering.
all_for_clustering <- all[, 6:42]

# Seed the RNG for reproducibility.
set.seed(123)

# K-means clustering with k = 4 clusters.
k <- 4
kmeans_result <- kmeans(all_for_clustering, centers = k, nstart = 25)

# Attach the cluster assignment to the full table (label columns included).
all$Cluster <- as.factor(kmeans_result$cluster)

# Numeric matrix for the heatmaps, with accession numbers as row names.
all_matrix <- as.matrix(all_for_clustering)
rownames(all_matrix) <- refseq_numbers

# Output directory for the heatmap images.
dir.create("kmeans_heatmaps", showWarnings = FALSE)

# Colour scale, built once because it does not depend on the cluster:
# 100 breakpoints in each of [-5, -1], [-1, 1] and [1, 5]; unique() drops the
# shared endpoints. The middle segment now runs to 1 (it previously stopped at
# 0.1, leaving 0.1..1 as a single oversized bin).
breaks <- unique(c(
  seq(-5, -1, length.out = 100),
  seq(-1, 1, length.out = 100),
  seq(1, 5, length.out = 100)
))
my_palette <- colorRampPalette(
  c("yellow", "black", "black", "purple")
)(length(breaks) - 1)

# One heatmap PNG and one CSV per cluster.
for (i in seq_len(k)) {
  # Rows assigned to the current cluster and their accession numbers.
  current_cluster <- all %>% filter(Cluster == i)
  current_cluster_names <- current_cluster$RefSeq.Accession.Number

  # Subset by row name; drop = FALSE keeps a matrix even for a single row.
  current_cluster_HM <- all_matrix[current_cluster_names, , drop = FALSE]

  png_filename <- paste0("kmeans_heatmaps/KMeans_Cluster_", i, ".png")
  png(png_filename, width = 800, height = 800)

  # `finally` closes the PNG device even if heatmap.2() fails, so a failed
  # cluster does not leave a graphics device open. `hclustfun` is spelled out
  # in full: `hclust =` only reached it through partial argument matching.
  tryCatch(
    heatmap.2(
      t(current_cluster_HM),
      Rowv = TRUE,
      Colv = TRUE,
      col = my_palette,
      breaks = breaks,
      density.info = "none",
      trace = "none",
      dendrogram = "both",
      symm = FALSE,
      symkey = FALSE,
      symbreaks = TRUE,
      labRow = FALSE,
      labCol = FALSE,
      cexRow = 0.8,
      cexCol = 0.1,
      margins = c(8, 2),
      key.title = "1",
      key.xlab = "Z Score",
      sepcolor = "black",
      sepwidth = c(0.05, 0.05),
      distfun = function(x) dist(x, method = "euclidean"),
      hclustfun = function(x) hclust(x, method = "ward.D2")
    ),
    finally = dev.off()
  )

  # Cluster table goes to the working directory (no sub-folder).
  csv_filename <- paste0("KMeans_Cluster_", i, ".csv")
  write.csv(current_cluster, file = csv_filename, row.names = FALSE)
}
# 14_Use Common Hits to Label Senescence Ground Truth ----
library(gplots)
library(RColorBrewer)
library(dplyr)
# Read the Z-score table and the common-hits file.
all <- read.csv("All_Z_Scores.csv")
common_hits <- read.csv("common_senescence_hits.csv")

# Identifiers of the common hits (column name as in the current common_hits
# file).
common_hits_set <- common_hits$Final_Hits

# Drop rows whose Gene.Symbol contains "Blank".
all_small <- all %>%
  filter(!grepl("Blank", Gene.Symbol))

# Duplicate gene entries: the first occurrence keeps its ID, later occurrences
# get "_A".
# NOTE(review): rows are looked up by name further down; an ID present three
# or more times would still produce repeated "_A" names - confirm no ID occurs
# more than twice.
all <- all_small %>%
  group_by(RefSeq.Accession.Number) %>%
  mutate(
    RefSeq.Accession.Number = ifelse(
      row_number() == 1,
      RefSeq.Accession.Number,
      paste0(RefSeq.Accession.Number, "_A")
    )
  ) %>%
  ungroup() # do not carry the grouping into later steps

# Label each row "Sen" if its accession number is a common hit, else "NonSen".
all_labeled <- all %>%
  mutate(
    Senescence_Status = ifelse(
      RefSeq.Accession.Number %in% common_hits_set, "Sen", "NonSen"
    )
  )

# Save the accession numbers for use as matrix row names.
refseq_numbers <- all_labeled$RefSeq.Accession.Number

# Columns 6-42 are assumed to be the measures shown in the heatmap.
all_for_heatmap <- all_labeled[, 6:42]

# Numeric matrix for the heatmaps, with accession numbers as row names.
all_matrix <- as.matrix(all_for_heatmap)
rownames(all_matrix) <- refseq_numbers

# Output directory for the heatmap images.
dir.create("heatmaps", showWarnings = FALSE)

# Colour scale, built once because it does not depend on the label:
# 100 breakpoints in each of [-5, -1], [-1, 1] and [1, 5]; unique() drops the
# shared endpoints. The middle segment now runs to 1 (it previously stopped at
# 0.1, leaving 0.1..1 as a single oversized bin).
breaks <- unique(c(
  seq(-5, -1, length.out = 100),
  seq(-1, 1, length.out = 100),
  seq(1, 5, length.out = 100)
))
my_palette <- colorRampPalette(
  c("yellow", "black", "black", "purple")
)(length(breaks) - 1)

# One heatmap PNG per label.
for (label in c("Sen", "NonSen")) {
  # Rows carrying the current label and their accession numbers.
  current_label <- all_labeled %>% filter(Senescence_Status == label)
  current_label_names <- current_label$RefSeq.Accession.Number

  # Subset by row name; drop = FALSE keeps a matrix even for a single row.
  current_label_HM <- all_matrix[current_label_names, , drop = FALSE]

  png_filename <- paste0("heatmaps/", label, "_Heatmap.png")
  png(png_filename, width = 800, height = 800)

  # `finally` closes the PNG device even if heatmap.2() fails. `hclustfun` is
  # spelled out in full: `hclust =` only reached it through partial argument
  # matching.
  tryCatch(
    heatmap.2(
      t(current_label_HM),
      Rowv = TRUE,
      Colv = TRUE,
      col = my_palette,
      breaks = breaks,
      density.info = "none",
      trace = "none",
      dendrogram = "both",
      symm = FALSE,
      symkey = FALSE,
      symbreaks = TRUE,
      labRow = FALSE,
      labCol = FALSE,
      cexRow = 0.8,
      cexCol = 0.1,
      margins = c(8, 2),
      key.title = "1",
      key.xlab = "Z Score",
      sepcolor = "black",
      sepwidth = c(0.05, 0.05),
      distfun = function(x) dist(x, method = "euclidean"),
      hclustfun = function(x) hclust(x, method = "ward.D2")
    ),
    finally = dev.off()
  )
}

# Save the full table including the Senescence_Status labels.
write.csv(all_labeled, "All_Data_Clusters_Consensus.csv", row.names = TRUE)
# 15_Make full Venn Diagram ----
library(VennDiagram)
# Load the two K-means cluster tables and the two hierarchical cluster tables.
KMeans_Cluster_1 <- read.csv("KMeans_Cluster_1.csv")
KMeans_Cluster_4 <- read.csv("KMeans_Cluster_4.csv")
Heir_Cluster_1 <- read.csv("Cluster_1.csv")
Heir_Cluster_3 <- read.csv("Cluster_3.csv")

# K-means members are identified by the RefSeq.Accession.Number column.
KMeans_Cluster_1_names <- KMeans_Cluster_1$RefSeq.Accession.Number
KMeans_Cluster_4_names <- KMeans_Cluster_4$RefSeq.Accession.Number

# Hierarchical members are taken from the column names of their tables, minus
# the first column name (adjust if the file layout differs).
Heir_Cluster_1_names <- colnames(Heir_Cluster_1)[-1]
Heir_Cluster_3_names <- colnames(Heir_Cluster_3)[-1]

# De-duplicate each identifier vector so it behaves as a set.
KMeans_Cluster_1_set <- unique(KMeans_Cluster_1_names)
KMeans_Cluster_4_set <- unique(KMeans_Cluster_4_names)
Heir_Cluster_1_set <- unique(Heir_Cluster_1_names)
Heir_Cluster_3_set <- unique(Heir_Cluster_3_names)

# Four-set Venn diagram; filename = NULL returns the drawing objects instead
# of writing a file. The category labels are the names of the set list.
venn_sets <- list(
  "KMeans Cluster 1" = KMeans_Cluster_1_set,
  "KMeans Cluster 4" = KMeans_Cluster_4_set,
  "Hierarchical Cluster 1" = Heir_Cluster_1_set,
  "Hierarchical Cluster 3" = Heir_Cluster_3_set
)
venn.plot <- venn.diagram(
  x = venn_sets,
  category.names = names(venn_sets),
  filename = NULL,
  output = TRUE
)

# Render the diagram into an 800 x 800 PNG.
png(filename = "venn_diagram_4_clusters.png", width = 800, height = 800)
grid.draw(venn.plot)
dev.off()
## quartz_off_screen
## 2
# 16_Matching Thresholds Percentage.R ----
library("gplots")
library("RColorBrewer")
library("matrixStats")
library("plyr")
library("dplyr")
library("data.table")
library("stringr")
library("ggplot2")
library("Rtsne")
# Hit lists and gene-set lists, in the order they are unpacked below.
# NOTE(review): the count-only hit script visible in this file writes
# "Hits_Nuclei_Count_Only.csv", not "Hits_Nuc_Count_Only.csv" - confirm which
# file name is current.
file_paths <- c(
  "Hits_Nuc_Count_Only.csv",
  "Hits.csv",
  "Hits_1.96.csv",
  "Hits_1.csv",
  "SENCAN.csv",
  "SENCAN_Down.csv",
  "Senescopeida.csv",
  "Senescopeida_Sig.csv",
  "Cell Cycle.csv",
  "Senescence.csv"
)

# Read a CSV and return its second column as a vector.
load_and_extract_second_column <- function(file_path) {
  read.csv(file_path)[[2]]
}

# One identifier vector per file, in file_paths order.
vector_list <- lapply(file_paths, load_and_extract_second_column)

# Unpack into named vectors.
Count_Only <- vector_list[[1]]
Hits_High <- vector_list[[2]]
Hits_Med <- vector_list[[3]]
Hits_Low <- vector_list[[4]]
SENCAN <- vector_list[[5]]
SENCAN_Neg <- vector_list[[6]]
Senescopedia_Down <- vector_list[[7]]
Senescopedia_Sig <- vector_list[[8]]
Cell_Cycle <- vector_list[[9]]
Senescence <- vector_list[[10]]

# Consensus cluster hits. The file is read here so the script no longer
# depends on a `common_hits` object left behind by another script.
common_hits <- read.csv("common_senescence_hits.csv")
Sen_Hits <- common_hits$Final_Hits
library(dplyr)

# Gene sets, in the same order as the `Vector` labels below. The list is
# deliberately unnamed so the result keeps default row names.
gene_sets <- list(
  SENCAN, SENCAN_Neg, Senescopedia_Down, Senescopedia_Sig,
  Cell_Cycle, Senescence, Sen_Hits
)

# For each gene set: percentage of ITS members that appear in `hits`, rounded
# to one decimal place (the denominator is the gene-set size).
percent_of_set_in_hits <- function(hits) {
  vapply(
    gene_sets,
    function(gene_set) {
      round(sum(gene_set %in% hits) / length(gene_set) * 100, 1)
    },
    numeric(1)
  )
}

# One row per gene set, one column per hit threshold.
result <- data.frame(
  Vector = c(
    "SENCAN", "SENCAN_Neg", "Senescopedia_Down", "Senescopedia_Sig",
    "Cell_Cycle", "Senescence", "Sen_Hits"
  ),
  Count_Only = percent_of_set_in_hits(Count_Only),
  Hits_High = percent_of_set_in_hits(Hits_High),
  Hits_Med = percent_of_set_in_hits(Hits_Med),
  Hits_Low = percent_of_set_in_hits(Hits_Low)
)

# Print the result dataframe
Test <- result
print(Test)
## Vector Count_Only Hits_High Hits_Med Hits_Low
## 1 SENCAN 5.0 1.0 3.0 13.0
## 2 SENCAN_Neg 5.6 1.9 5.6 14.8
## 3 Senescopedia_Down 6.9 1.9 4.4 20.9
## 4 Senescopedia_Sig 7.7 2.2 5.5 16.5
## 5 Cell_Cycle 5.8 0.0 7.5 20.0
## 6 Senescence 4.1 0.0 5.4 21.8
## 7 Sen_Hits 19.6 2.0 9.3 43.1
# 17_Export Threshold Percentages.R ----
library("gplots")
library("RColorBrewer")
library("matrixStats")
library("plyr")
library("dplyr")
library("data.table")
library("stringr")
library("ggplot2")
library("Rtsne")
# List of file paths
file_paths <- c(
"Hits_Nuc_Count_Only.csv",
"Hits.csv",
"Hits_1.96.csv",
"Hits_1.csv",
"SENCAN.csv",
"SENCAN_Down.csv",
"Senescopeida.csv",
"Senescopeida_Sig.csv",
"Cell Cycle.csv",
"Senescence.csv"
)
# Read the CSV at `file_path` and hand back its second column as a bare vector
# (not a one-column data frame).
load_and_extract_second_column <- function(file_path) {
  csv_contents <- read.csv(file_path)
  # `[[` drops the data-frame wrapper, leaving just the column values
  csv_contents[[2]]
}
# Load every CSV; each list element is the second column of the matching file
vector_list <- lapply(file_paths, load_and_extract_second_column)
# Unpack the loaded columns by position (same order as `file_paths`)
Count_Only <- vector_list[[1]]
Hits_High <- vector_list[[2]]
Hits_Med <- vector_list[[3]]
Hits_Low <- vector_list[[4]]
SENCAN <- vector_list[[5]]
SENCAN_Neg <- vector_list[[6]]
Senescopedia_Down <- vector_list[[7]]
Senescopedia_Sig <- vector_list[[8]]
Cell_Cycle <- vector_list[[9]]
Senescence <- vector_list[[10]]
# Now for Clusters
# NOTE(review): `common_hits` is not read anywhere in this script; presumably
# it is still in the session from an earlier script -- confirm.
Sen_Hits <- common_hits$Final_Hits
library(dplyr)
# Gene sets compared against each threshold list. A single named list keeps
# the row labels and the data in step (previously retyped for every column).
comparison_sets <- list(
  SENCAN = SENCAN,
  SENCAN_Neg = SENCAN_Neg,
  Senescopedia_Down = Senescopedia_Down,
  Senescopedia_Sig = Senescopedia_Sig,
  Cell_Cycle = Cell_Cycle,
  Senescence = Senescence,
  Sen_Hits = Sen_Hits
)
# For every comparison set: the number of its members found in
# `threshold_hits`, as a percentage (1 d.p.) of the length of `threshold_hits`.
# The denominator is the threshold list, not the comparison set.
# vapply() guarantees one double per set; USE.NAMES = FALSE keeps the default
# 1..n row names in the data frame built below.
pct_of_threshold_in_sets <- function(threshold_hits) {
  vapply(
    comparison_sets,
    function(gene_set) {
      round(sum(gene_set %in% threshold_hits) / length(threshold_hits) * 100, 1)
    },
    numeric(1),
    USE.NAMES = FALSE
  )
}
# One row per comparison set, one column per threshold list
result <- data.frame(
  Vector = names(comparison_sets),
  Count_Only = pct_of_threshold_in_sets(Count_Only),
  Hits_High = pct_of_threshold_in_sets(Hits_High),
  Hits_Med = pct_of_threshold_in_sets(Hits_Med),
  Hits_Low = pct_of_threshold_in_sets(Hits_Low)
)
# Keep a working copy for the plotting step
Test <- result
library(ggplot2)
# Reshape the Test dataframe for ggplot2: keep only the Sen_Hits row ...
Test <- Test %>% filter(Vector %in% c("Sen_Hits"))
# ... and go wide -> long. melt() is called on a data.table explicitly:
# passing a plain data.frame relies on data.table's deprecated redirect to
# reshape2, which is announced to become an error. Converting back to a
# data.frame keeps the dplyr/ggplot2 steps below working on the same type as
# before.
Test_melted <- as.data.frame(
  data.table::melt(data.table::as.data.table(Test), id.vars = "Vector")
)
# Drop the Count_Only column; only the Hits_* thresholds are plotted
Test_melted <- Test_melted %>% filter(grepl("Hits", variable))
# Define a custom color palette
my_colors <- c("#E41A1C", "#377EB8", "#4DAF4A")
# Faceted bar chart, one panel per threshold, y-axis fixed at 0-100 %.
# The y limits are set once, in scale_y_continuous(); a separate ylim() call
# would only be replaced by it and trigger a "Scale for y is already present"
# message.
ggplot(Test_melted, aes(x = Vector, y = value, fill = Vector)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ variable, ncol = 2, scales = "free") +
  labs(
    title = "Percentage Comparisons",
    x = "Vector",
    y = "Percentage"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "right",
    text = element_text(face = "bold")
  ) +
  scale_fill_manual(values = my_colors) +
  guides(fill = guide_legend(title = "Vector")) +
  scale_y_continuous(
    labels = scales::percent_format(scale = 1),
    limits = c(0, 100)
  )

print(Test_melted)
## Vector variable value
## 1 Sen_Hits Hits_High 74.8
## 2 Sen_Hits Hits_Med 88.5
## 3 Sen_Hits Hits_Low 78.6
18_Look_at_GT_Count_and_Area
library(gplots)
library(RColorBrewer)
library(dplyr)
# Read the full Z-score table and the list of common senescence hits
all <- read.csv("All_Z_Scores.csv")
common_hits <- read.csv("common_senescence_hits.csv")
# Keep four columns by position: 1 and 2 are identifier columns
# (Gene.Symbol and RefSeq.Accession.Number are both used below), 6 and 27 are
# the two measures that go into the heatmap.
# NOTE(review): presumably 6 and 27 are the count and area Z-scores, going by
# the script title -- confirm against the CSV header.
all <- all[, c(1, 2, 6, 27)]
# Accessions that count as senescence hits
common_hits_set <- common_hits$Final_Hits
# Filter out rows with "Blank" in Gene.Symbol
all_small <- all %>%
  filter(!grepl("Blank", Gene.Symbol))
# Handle duplicate gene entries: the first occurrence keeps its accession,
# later occurrences get an "_A" suffix.
# NOTE(review): every repeat gets the same suffix, so an accession that occurs
# three or more times would still be duplicated afterwards.
all <- all_small %>%
  group_by(RefSeq.Accession.Number) %>%
  mutate(
    RefSeq.Accession.Number = ifelse(
      row_number() == 1,
      RefSeq.Accession.Number,
      paste0(RefSeq.Accession.Number, "_A")
    )
  )
# Label each row "Sen" / "NonSen" by membership in the common hits.
# ungroup() first: the grouping was only needed for the de-duplication above,
# and leaving it in place makes this mutate() run once per accession and
# carries the grouping into everything derived from all_labeled.
all_labeled <- all %>%
  ungroup() %>%
  mutate(
    Senescence_Status = ifelse(
      RefSeq.Accession.Number %in% common_hits_set,
      "Sen",
      "NonSen"
    )
  )
# Save RefSeq.Accession.Number for later use
refseq_numbers <- all_labeled$RefSeq.Accession.Number
# Keep only the two measure columns (positions 3 and 4 after the subset above)
all_for_heatmap <- all_labeled[, 3:4]
# Numeric matrix for heatmap.2, with accessions as row names so rows can be
# looked up by accession later
all_matrix <- as.matrix(all_for_heatmap)
rownames(all_matrix) <- refseq_numbers
# Create a directory for saving heatmaps
dir.create("heatmaps", showWarnings = FALSE)
# Colour scale shared by both heatmaps, built once because it does not depend
# on the label. Breaks are symmetric around zero: fine steps over [-5, -1],
# [-1, 1] and [1, 5]. The middle segment previously stopped at 0.1, leaving
# the breaks lopsided, unlike the seq(-1, 1, ...) scheme used for the main
# screen heatmap.
breaks <- unique(c(
  seq(-5, -1, length.out = 100),
  seq(-1, 1, length.out = 100),
  seq(1, 5, length.out = 100)
))
# yellow (negative) -> black (around zero) -> purple (positive)
my_palette <- colorRampPalette(
  c("yellow", "black", "black", "purple")
)(length(breaks) - 1)
# Generate one heatmap per category and write it to heatmaps/<label>_...png
for (label in c("Sen", "NonSen")) {
  # Rows belonging to the current label
  current_label <- all_labeled %>% filter(Senescence_Status == label)
  current_label_names <- current_label$RefSeq.Accession.Number
  # Look the rows up by accession; drop = FALSE keeps a matrix even for 1 row
  current_label_HM <- all_matrix[current_label_names, , drop = FALSE]
  # Clustering needs at least two genes; skip instead of failing half-way with
  # an open graphics device and an empty PNG on disk.
  if (nrow(current_label_HM) < 2) {
    warning(
      "Skipping '", label, "' heatmap: fewer than 2 genes to cluster",
      call. = FALSE
    )
    next
  }
  png_filename <- paste0("heatmaps/", label, "_Heatmap_Count_Area.png")
  png(png_filename, width = 800, height = 800)
  # `finally` closes the PNG device even when heatmap.2() errors, so a failure
  # does not leak the device into the rest of the session.
  tryCatch(
    heatmap.2(
      t(current_label_HM), # measures as rows, genes as columns
      Rowv = TRUE,
      Colv = TRUE,
      col = my_palette,
      breaks = breaks,
      density.info = "none",
      trace = "none",
      dendrogram = c("column"),
      symm = FALSE,
      symkey = FALSE,
      symbreaks = TRUE,
      labRow = FALSE,
      labCol = FALSE,
      cexRow = 0.8,
      cexCol = 0.1,
      margins = c(8, 2),
      key.title = "1",
      key.xlab = "Z Score",
      sepcolor = c("black"),
      sepwidth = c(0.05, 0.05),
      distfun = function(x) dist(x, method = "euclidean"),
      # full argument name; `hclust =` only worked through partial matching
      hclustfun = function(x) hclust(x, method = "ward.D2")
    ),
    finally = dev.off()
  )
}