閑言少敘,切入正題。感興趣的果友可以去看鏈接中對應的推文。我們想復現文獻中的Fig2C,這張圖展示的是興趣基因(這里是特定的趨化因子)與免疫細胞(CD8A是特異性表達在CD8 T 細胞的)的關系。這張圖最大的意義是,這里的興趣基因和免疫細胞都可以換成感興趣的基因的。
代碼如下,供大家參考。
######################DeepSeek復現代碼###############################
# 加載必要的包
library(tidyverse)
library(limma)
library(ggplot2)
library(forestplot)
library(hgu133plus2.db)
library(GEOquery)
# 1. 數據讀取
gse_data <- getGEO(filename = "GSE2109_series_matrix.txt.gz", getGPL = F)
expr_data <- exprs(gse_data)
pheno_data <- pData(gse_data)
# 對表達數據進行log2轉換(確保數據未log轉換)
if (max(expr_data, na.rm = TRUE) > 100) {
expr_data <- log2(expr_data + 1)
}
# 2. 數據清洗和樣本選擇
# 提取腫瘤類型信息并標準化
pheno_data$tumor_type <- str_extract(pheno_data$source_name_ch1, "(Breast|Lung|Colon|Kidney|Uterus|Ovary)")
pheno_data$tumor_type <- toupper(pheno_data$tumor_type)
pheno_data$tumor_type <- gsub("OVARY|OVARIAN", "OVARIAN", pheno_data$tumor_type)
pheno_data$tumor_type <- gsub("UTERUS|CORPUS UTERI|UTERINE CORPUS", "UTERUS", pheno_data$tumor_type)
# 篩選感興趣的腫瘤類型
selected_types <- c("KIDNEY", "LUNG", "COLON", "BREAST", "UTERUS", "OVARIAN")
pheno_data <- pheno_data[pheno_data$tumor_type %in% selected_types, ]
expr_data <- expr_data[, colnames(expr_data) %in% rownames(pheno_data)]
# 3. 探針ID轉換
# 定義興趣基因
interest_genes <- c("CD8A", "CCL3", "CCL4", "CCL5", "CCL8", "CCL13", "CCL18",
"CCL19", "CCL21", "CCL22", "CXCL1", "CXCL2", "CXCL3",
"CXCL9", "CXCL10", "CXCL11", "CXCL12", "CXCL13")
# 獲取平臺注釋信息
# 3. 使用hgu133plus2.db進行探針ID轉換
# 獲取探針到基因符號的映射
probe2gene <- AnnotationDbi::select(hgu133plus2.db,
keys = rownames(expr_data),
columns = c("SYMBOL"),
keytype = "PROBEID")
# 處理多對多映射(取每個探針對應的第一個基因符號)
probe2gene <- probe2gene %>%
filter(!is.na(SYMBOL)) %>%
group_by(PROBEID) %>%
dplyr::slice(1) %>%
ungroup()
# 轉換表達矩陣
expr_df <- expr_data %>%
as.data.frame() %>%
rownames_to_column(var = "PROBEID") %>%
inner_join(probe2gene, by = "PROBEID") %>%
dplyr::select(-PROBEID) %>%
filter(SYMBOL %in% interest_genes) %>%
group_by(SYMBOL) %>%
summarise(across(everything(), mean, na.rm = TRUE)) %>%
column_to_rownames(var = "SYMBOL")
# 4. 準備森林圖數據
# 計算各腫瘤類型的平均表達量
forest_data <- data.frame()
for (ttype in selected_types) {
samples <- rownames(pheno_data[pheno_data$tumor_type == ttype, ])
if (length(samples) > 0) {
ttype_expr <- expr_df[, colnames(expr_df) %in% samples]
means <- rowMeans(ttype_expr, na.rm = TRUE)
sds <- apply(ttype_expr, 1, sd, na.rm = TRUE)
n <- ncol(ttype_expr)
temp_df <- data.frame(
gene = rownames(expr_df),
tumor_type = ttype,
mean = means,
lower = means - 1.96 * sds/sqrt(n),
upper = means + 1.96 * sds/sqrt(n),
n = n
)
forest_data <- rbind(forest_data, temp_df)
}
}
# 5. 繪制森林圖
# 準備標簽
label_data <- forest_data %>%
group_by(gene) %>%
summarise() %>%
mutate(mean = NA, lower = NA, upper = NA, tumor_type = "Gene")
# 合并數據
plot_data <- bind_rows(label_data, forest_data) %>%
mutate(
tumor_type = factor(tumor_type, levels = c("Gene", selected_types)),
gene = factor(gene, levels = rev(interest_genes))
)
# 創建森林圖
ggplot(plot_data, aes(x = mean, y = gene, color = tumor_type)) +
geom_point(position = position_dodge(width = 0.7), size = 2) +
geom_errorbarh(aes(xmin = lower, xmax = upper),
height = 0.2, position = position_dodge(width = 0.7)) +
geom_vline(xintercept = 0, linetype = "dashed") +
facet_grid(. ~ tumor_type, scales = "free_x", space = "free_x") +
labs(x = "Expression Level (log2)", y = "",
title = "Gene Expression Across Tumor Types") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
strip.text = element_text(face = "bold"),
legend.position = "none"
) +
scale_color_manual(values = c("black", "red", "blue", "green", "purple", "orange", "brown"))
# 6. 繪制合并的森林圖
ggplot(forest_data, aes(x = mean, y = reorder(gene, mean),
color = tumor_type, shape = tumor_type)) +
geom_point(position = position_dodge(width = 0.8), size = 3) +
geom_errorbarh(aes(xmin = lower, xmax = upper),
height = 0.2, position = position_dodge(width = 0.8)) +
geom_vline(xintercept = mean(expr_df, na.rm = TRUE),
linetype = "dashed", color = "gray50") +
labs(x = "Log2 Expression Level", y = "",
title = "Cytokine Expression Across Tumor Types",
color = "Tumor Type", shape = "Tumor Type") +
theme_minimal(base_size = 14) +
theme(
axis.text.y = element_text(face = "italic", size = 12),
axis.text.x = element_text(size = 12),
legend.position = "right",
legend.text = element_text(size = 12),
panel.grid.major.y = element_blank(),
plot.title = element_text(face = "bold", hjust = 0.5, size = 16),
panel.border = element_rect(fill = NA, color = "gray80")
) +
scale_color_brewer(palette = "Set1") +
scale_shape_manual(values = c(15, 16, 17, 18, 19, 20))
# 7. 計算趨化因子與CD8A的相關性(按腫瘤類型分組)
cor_results <- pheno_data %>%
rownames_to_column(var = "sample_id") %>%
filter(tumor_type %in% selected_types) %>%
group_by(tumor_type) %>%
group_modify(~ {
# 確保樣本ID匹配表達矩陣的列名(expr_df是基因在行,樣本在列)
samples <- intersect(.x$sample_id, colnames(expr_df))
if(length(samples) == 0) return(data.frame())
# 轉置表達矩陣以便樣本在行,基因在列
sub_expr <- t(expr_df[, samples, drop = FALSE]) %>% as.data.frame()
# 檢查CD8A列是否存在
if(!"CD8A" %in% colnames(sub_expr)) {
return(data.frame()) # 跳過沒有CD8A的組
}
# 計算每個趨化因子與CD8A的相關性
map_dfr(setdiff(interest_genes, "CD8A"), function(chemokine) {
if(!chemokine %in% colnames(sub_expr)) {
return(data.frame(
chemokine = chemokine,
cor_estimate = NA,
cor_conf_low = NA,
cor_conf_high = NA,
p_value = NA,
stringsAsFactors = FALSE
))
}
# 移除NA值
complete_cases <- complete.cases(sub_expr[[chemokine]], sub_expr[["CD8A"]])
x <- sub_expr[complete_cases, chemokine]
y <- sub_expr[complete_cases, "CD8A"]
if(length(x) < 3) { # 至少需要3個觀測值
return(data.frame(
chemokine = chemokine,
cor_estimate = NA,
cor_conf_low = NA,
cor_conf_high = NA,
p_value = NA,
stringsAsFactors = FALSE
))
}
cor_test <- cor.test(x, y, method = "pearson")
data.frame(
chemokine = chemokine,
cor_estimate = cor_test$estimate,
cor_conf_low = cor_test$conf.int[1],
cor_conf_high = cor_test$conf.int[2],
p_value = cor_test$p.value,
stringsAsFactors = FALSE
)
})
}) %>%
ungroup() %>%
mutate(tumor_type = factor(tumor_type, levels = selected_types)) %>%
filter(!is.na(cor_estimate)) # 使用明確的列名過濾
# 8. 繪制相關性森林圖
ggplot(cor_results, aes(x = cor_estimate, y = reorder(chemokine, cor_estimate),
color = tumor_type, shape = tumor_type)) +
geom_point(position = position_dodge(width = 0.8), size = 3) +
geom_errorbarh(aes(xmin = cor_conf_low, xmax = cor_conf_high),
height = 0.2, position = position_dodge(width = 0.8)) +
geom_vline(xintercept = 0, linetype = "dashed", color = "gray50") +
labs(x = "Correlation Coefficient with CD8A", y = "",
title = "Chemokines Correlation with CD8A Expression",
subtitle = "Grouped by Tumor Type",
color = "Tumor Type", shape = "Tumor Type") +
theme_minimal(base_size = 14) +
theme(
axis.text.y = element_text(face = "italic", size = 12),
axis.text.x = element_text(size = 12),
legend.position = "right",
legend.text = element_text(size = 12),
panel.grid.major.y = element_blank(),
plot.title = element_text(face = "bold", hjust = 0.5, size = 16),
plot.subtitle = element_text(hjust = 0.5, color = "gray50"),
panel.border = element_rect(fill = NA, color = "gray80")
) +
scale_color_brewer(palette = "Set1") +
scale_shape_manual(values = c(15, 16, 17, 18, 19, 20)) +
scale_x_continuous(limits = c(-1, 1), breaks = seq(-1, 1, 0.5))
###################################美化森林圖#################################
# 8. 繪制美化后的相關性森林圖
# 首先計算每個趨化因子的平均相關性用于排序
chemokine_order <- cor_results %>%
group_by(chemokine) %>%
summarise(mean_cor = mean(cor_estimate, na.rm = TRUE)) %>%
arrange(mean_cor) %>%
pull(chemokine)
# 添加顯著性標記
cor_results <- cor_results %>%
mutate(
significance = case_when(
p_value < 0.001 ~ "***",
p_value < 0.01 ~ "**",
p_value < 0.05 ~ "*",
TRUE ~ ""
),
chemokine = factor(chemokine, levels = chemokine_order),
tumor_type = factor(tumor_type, levels = selected_types)
)
# 自定義顏色和形狀
tumor_colors <- c(
"BREAST" = "#E41A1C",
"LUNG" = "#377EB8",
"COLON" = "#4DAF4A",
"KIDNEY" = "#984EA3",
"UTERUS" = "#FF7F00",
"OVARIAN" = "#A65628"
)
tumor_shapes <- c(15, 16, 17, 18, 19, 20)
# 繪制圖形
ggplot(cor_results, aes(x = cor_estimate, y = chemokine,
color = tumor_type, shape = tumor_type)) +
# 參考線
geom_vline(xintercept = 0, linetype = "dashed", color = "gray60", linewidth = 0.5) +
geom_vline(xintercept = c(-0.5, 0.5), linetype = "dotted", color = "gray80", linewidth = 0.3) +
# 誤差條和點
geom_errorbarh(
aes(xmin = cor_conf_low, xmax = cor_conf_high),
height = 0.15, position = position_dodge(width = 0.7),
linewidth = 0.8, alpha = 0.7
) +
geom_point(
position = position_dodge(width = 0.7),
size = 3, fill = "white", stroke = 1.2
) +
# 顯著性標記
geom_text(
aes(label = significance, x = cor_conf_high + 0.05),
position = position_dodge(width = 0.7),
color = "black", size = 4, vjust = 0.7
) +
# 坐標軸和標簽
scale_x_continuous(
limits = c(-0.25, 1),
breaks = seq(-0.5, 1, 0.25),
expand = expansion(mult = 0.05)
) +
scale_y_discrete(labels = function(x) parse(text = paste0("italic('", x, "')"))) +
# 顏色和形狀
scale_color_manual(values = tumor_colors, name = "Tumor Type") +
scale_shape_manual(values = tumor_shapes, name = "Tumor Type") +
# 標題和圖例
labs(
x = "Pearson correlation coefficient with CD8A expression",
y = "Chemokine",
title = "Association Between Chemokines and CD8A+ T Cell Infiltration",
subtitle = "Correlation analysis across different tumor types",
caption = "Error bars represent 95% confidence intervals\n* p < 0.05, ** p < 0.01, *** p < 0.001"
) +
# 主題設置
theme_minimal(base_size = 14) +
theme(
plot.title = element_text(
face = "bold", hjust = 0.5, size = 16,
margin = margin(b = 10)
),
plot.subtitle = element_text(
hjust = 0.5, color = "gray40", size = 12,
margin = margin(b = 15)
),
plot.caption = element_text(
hjust = 0, color = "gray50", size = 10,
margin = margin(t = 10)
),
axis.title.x = element_text(
margin = margin(t = 10), size = 12
),
axis.text.y = element_text(
size = 12, color = "black", hjust = 1
),
axis.text.x = element_text(
size = 11, color = "black"
),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_line(
color = "gray90", linewidth = 0.3
),
panel.grid.minor.x = element_line(
color = "gray95", linewidth = 0.2
),
legend.position = "right",
legend.title = element_text(
face = "bold", size = 12
),
legend.text = element_text(
size = 11
),
plot.margin = margin(20, 20, 20, 20),
panel.background = element_rect(
fill = "white", color = NA
),
plot.background = element_rect(
fill = "white", color = NA
)
) +
# 添加小修飾
annotate(
"text", x = -0.9, y = length(chemokine_order) + 0.5,
label = "Negative correlation", size = 4, color = "gray40"
) +
annotate(
"text", x = 0.9, y = length(chemokine_order) + 0.5,
label = "Positive correlation", size = 4, color = "gray40"
)
# 9. 優化版再優化--------------------------
# 按趨化因子名稱字母順序排序
chemokine_order <- sort(unique(cor_results$chemokine))
# 準備右側標簽數據(每個趨化因子只顯示一次)
label_data <- distinct(cor_results, chemokine) %>%
mutate(
label_x = 1.15, # 標簽x坐標
y_position = as.numeric(factor(chemokine, levels = chemokine_order))
)
# 確保主數據中的y位置正確
cor_results <- cor_results %>%
mutate(
chemokine = factor(chemokine, levels = chemokine_order),
y_position = as.numeric(chemokine) # 轉換為數值位置用于繪圖
)
# 自定義顏色和形狀
tumor_colors <- c(
"BREAST" = "#E41A1C",
"LUNG" = "#377EB8",
"COLON" = "#4DAF4A",
"KIDNEY" = "#984EA3",
"UTERUS" = "#FF7F00",
"OVARIAN" = "#A65628"
)
tumor_shapes <- c(15, 16, 17, 18, 19, 20)
# 創建圖形
ggplot(cor_results, aes(x = cor_estimate, y = y_position)) +
# 參考線
geom_vline(xintercept = 0, linetype = "dashed", color = "gray60", linewidth = 0.5) +
geom_vline(xintercept = c(-0.5, 0.5), linetype = "dotted", color = "gray80", linewidth = 0.3) +
# 誤差條和點(在此層添加美學映射)
geom_errorbarh(
aes(xmin = cor_conf_low, xmax = cor_conf_high,
color = tumor_type),
height = 0.15, position = position_dodge(width = 0.7),
linewidth = 0.8, alpha = 0.7
) +
geom_point(
aes(shape = tumor_type, color = tumor_type),
position = position_dodge(width = 0.7),
size = 3, fill = "white", stroke = 1.2
) +
# 顯著性標記(放在右側)
geom_text(
aes(label = significance, x = 1.05),
position = position_dodge(width = 0.7),
color = "black", size = 4, vjust = 0.7
) +
# 右側趨化因子名稱標簽(每個名稱只顯示一次)
geom_text(
data = label_data,
aes(x = label_x, y = y_position, label = chemokine),
inherit.aes = FALSE, # 關鍵修復:不繼承全局美學映射
color = "black", size = 4, hjust = 0
) +
# 坐標軸設置
scale_x_continuous(
limits = c(0, 1.2), # 擴大范圍容納右側注釋
breaks = seq(0, 1, 0.25),
expand = expansion(mult = 0.05)
) +
scale_y_continuous(
breaks = label_data$y_position,
labels = NULL, # 隱藏y軸標簽
limits = c(0.5, max(label_data$y_position) + 0.5)
) +
# 顏色和形狀
scale_color_manual(values = tumor_colors, name = "Tumor Type") +
scale_shape_manual(values = tumor_shapes, name = "Tumor Type") +
# 標題和圖例
labs(
x = "Pearson correlation coefficient with CD8A expression",
y = NULL,
title = "Association Between Chemokines and CD8A+ T Cell Infiltration",
subtitle = "Correlation analysis across different tumor types",
caption = "Error bars represent 95% confidence intervals\n* p < 0.05, ** p < 0.01, *** p < 0.001"
) +
# 主題設置
theme_minimal(base_size = 14) +
theme(
plot.title = element_text(
face = "bold", hjust = 0.5, size = 16,
margin = margin(b = 10)
),
plot.subtitle = element_text(
hjust = 0.5, color = "gray40", size = 12,
margin = margin(b = 15)
),
plot.caption = element_text(
hjust = 0, color = "gray50", size = 10,
margin = margin(t = 10)
),
axis.title.x = element_text(
margin = margin(t = 10), size = 12
),
axis.text.y = element_blank(), # 隱藏y軸文本
axis.text.x = element_text(
size = 11, color = "black"
),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_line(
color = "gray90", linewidth = 0.3
),
panel.grid.minor.x = element_line(
color = "gray95", linewidth = 0.2
),
legend.position = "right",
legend.title = element_text(
face = "bold", size = 12
),
legend.text = element_text(
size = 11
),
plot.margin = margin(20, 60, 20, 20), # 增大右側邊距
panel.background = element_rect(
fill = "white", color = NA
),
plot.background = element_rect(
fill = "white", color = NA
)
)
這里不止是復現了原圖,而且是超越了原圖。如果我們對其他基因或者免疫細胞感興趣,還可以隨意替換成自己感興趣的,用于前期數據挖掘。
特別聲明:以上內容(如有圖片或視頻亦包括在內)為自媒體平臺“網易號”用戶上傳并發布,本平臺僅提供信息存儲服務。
Notice: The content above (including the pictures and videos if any) is uploaded and posted by a user of NetEase Hao, which is a social media platform and only provides information storage services.