網易首頁 > 網易號 > 正文申請入駐

太震撼！DeepSeek完成Cancer Cell文獻中免疫細胞浸潤的森林圖復現！

2025-06-13 08:12:26　來源: 芒果師兄

上海舉報

分享至

閑言少敘，切入正題。感興趣的果友可以去看鏈接中對應的推文。我們想復現文獻中的Fig2C，這張圖展示的是興趣基因（這里是特定的趨化因子）與免疫細胞（CD8A是特異性表達在CD8 T 細胞的）的關系。這張圖最大的意義是，這里的興趣基因和免疫細胞都可以換成感興趣的基因的。

代碼如下，供大家參考。

######################DeepSeek復現代碼###############################
# 加載必要的包
library(tidyverse)
library(limma)
library(ggplot2)
library(forestplot)
library(hgu133plus2.db)
library(GEOquery)

# 1. 數據讀取
gse_data <- getGEO(filename = "GSE2109_series_matrix.txt.gz", getGPL = F)
expr_data <- exprs(gse_data)
pheno_data <- pData(gse_data)

# 對表達數據進行log2轉換（確保數據未log轉換）
if (max(expr_data, na.rm = TRUE) > 100) {
  expr_data <- log2(expr_data + 1)
}

# 2. 數據清洗和樣本選擇
# 提取腫瘤類型信息并標準化
pheno_data$tumor_type <- str_extract(pheno_data$source_name_ch1, "(Breast|Lung|Colon|Kidney|Uterus|Ovary)")
pheno_data$tumor_type <- toupper(pheno_data$tumor_type)
pheno_data$tumor_type <- gsub("OVARY|OVARIAN", "OVARIAN", pheno_data$tumor_type)
pheno_data$tumor_type <- gsub("UTERUS|CORPUS UTERI|UTERINE CORPUS", "UTERUS", pheno_data$tumor_type)

# 篩選感興趣的腫瘤類型
selected_types <- c("KIDNEY", "LUNG", "COLON", "BREAST", "UTERUS", "OVARIAN")
pheno_data <- pheno_data[pheno_data$tumor_type %in% selected_types, ]
expr_data <- expr_data[, colnames(expr_data) %in% rownames(pheno_data)]

# 3. 探針ID轉換
# 定義興趣基因
interest_genes <- c("CD8A", "CCL3", "CCL4", "CCL5", "CCL8", "CCL13", "CCL18", 
                    "CCL19", "CCL21", "CCL22", "CXCL1", "CXCL2", "CXCL3", 
                    "CXCL9", "CXCL10", "CXCL11", "CXCL12", "CXCL13")

# 獲取平臺注釋信息
# 3. 使用hgu133plus2.db進行探針ID轉換

# 獲取探針到基因符號的映射
probe2gene <- AnnotationDbi::select(hgu133plus2.db, 
                                    keys = rownames(expr_data),
                                    columns = c("SYMBOL"),
                                    keytype = "PROBEID")

# 處理多對多映射（取每個探針對應的第一個基因符號）
probe2gene <- probe2gene %>%
  filter(!is.na(SYMBOL)) %>%
  group_by(PROBEID) %>%
  dplyr::slice(1) %>%
  ungroup()

# 轉換表達矩陣
expr_df <- expr_data %>%
  as.data.frame() %>%
  rownames_to_column(var = "PROBEID") %>%
  inner_join(probe2gene, by = "PROBEID") %>%
  dplyr::select(-PROBEID) %>%
  filter(SYMBOL %in% interest_genes) %>%
  group_by(SYMBOL) %>%
  summarise(across(everything(), mean, na.rm = TRUE)) %>%
  column_to_rownames(var = "SYMBOL")

# 4. 準備森林圖數據
# 計算各腫瘤類型的平均表達量
forest_data <- data.frame()
for (ttype in selected_types) {
  samples <- rownames(pheno_data[pheno_data$tumor_type == ttype, ])
  if (length(samples) > 0) {
    ttype_expr <- expr_df[, colnames(expr_df) %in% samples]
    means <- rowMeans(ttype_expr, na.rm = TRUE)
    sds <- apply(ttype_expr, 1, sd, na.rm = TRUE)
    n <- ncol(ttype_expr)

    temp_df <- data.frame(
      gene = rownames(expr_df),
      tumor_type = ttype,
      mean = means,
      lower = means - 1.96 * sds/sqrt(n),
      upper = means + 1.96 * sds/sqrt(n),
      n = n
    )
    forest_data <- rbind(forest_data, temp_df)
  }
}               

# 5. 繪制森林圖
# 準備標簽
label_data <- forest_data %>%
  group_by(gene) %>%
  summarise() %>%
  mutate(mean = NA, lower = NA, upper = NA, tumor_type = "Gene")

# 合并數據
plot_data <- bind_rows(label_data, forest_data) %>%
  mutate(
    tumor_type = factor(tumor_type, levels = c("Gene", selected_types)),
    gene = factor(gene, levels = rev(interest_genes))
  )

# 創建森林圖
ggplot(plot_data, aes(x = mean, y = gene, color = tumor_type)) +
  geom_point(position = position_dodge(width = 0.7), size = 2) +
  geom_errorbarh(aes(xmin = lower, xmax = upper), 
                 height = 0.2, position = position_dodge(width = 0.7)) +
  geom_vline(xintercept = 0, linetype = "dashed") +
  facet_grid(. ~ tumor_type, scales = "free_x", space = "free_x") +
  labs(x = "Expression Level (log2)", y = "", 
       title = "Gene Expression Across Tumor Types") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    strip.text = element_text(face = "bold"),
    legend.position = "none"
  ) +
  scale_color_manual(values = c("black", "red", "blue", "green", "purple", "orange", "brown"))               

# 6. 繪制合并的森林圖
ggplot(forest_data, aes(x = mean, y = reorder(gene, mean), 
                        color = tumor_type, shape = tumor_type)) +
  geom_point(position = position_dodge(width = 0.8), size = 3) +
  geom_errorbarh(aes(xmin = lower, xmax = upper), 
                 height = 0.2, position = position_dodge(width = 0.8)) +
  geom_vline(xintercept = mean(expr_df, na.rm = TRUE), 
             linetype = "dashed", color = "gray50") +
  labs(x = "Log2 Expression Level", y = "",
       title = "Cytokine Expression Across Tumor Types",
       color = "Tumor Type", shape = "Tumor Type") +
  theme_minimal(base_size = 14) +
  theme(
    axis.text.y = element_text(face = "italic", size = 12),
    axis.text.x = element_text(size = 12),
    legend.position = "right",
    legend.text = element_text(size = 12),
    panel.grid.major.y = element_blank(),
    plot.title = element_text(face = "bold", hjust = 0.5, size = 16),
    panel.border = element_rect(fill = NA, color = "gray80")
  ) +
  scale_color_brewer(palette = "Set1") +
  scale_shape_manual(values = c(15, 16, 17, 18, 19, 20))

# 7. 計算趨化因子與CD8A的相關性（按腫瘤類型分組）
cor_results <- pheno_data %>%
  rownames_to_column(var = "sample_id") %>%
  filter(tumor_type %in% selected_types) %>%
  group_by(tumor_type) %>%
  group_modify(~ {
    # 確保樣本ID匹配表達矩陣的列名（expr_df是基因在行，樣本在列）
    samples <- intersect(.x$sample_id, colnames(expr_df))
    if(length(samples) == 0) return(data.frame())

    # 轉置表達矩陣以便樣本在行，基因在列
    sub_expr <- t(expr_df[, samples, drop = FALSE]) %>% as.data.frame()

    # 檢查CD8A列是否存在
    if(!"CD8A" %in% colnames(sub_expr)) {
      return(data.frame())  # 跳過沒有CD8A的組
    }

    # 計算每個趨化因子與CD8A的相關性
    map_dfr(setdiff(interest_genes, "CD8A"), function(chemokine) {
      if(!chemokine %in% colnames(sub_expr)) {
        return(data.frame(
          chemokine = chemokine,
          cor_estimate = NA,
          cor_conf_low = NA,
          cor_conf_high = NA,
          p_value = NA,
          stringsAsFactors = FALSE
        ))
      }

      # 移除NA值
      complete_cases <- complete.cases(sub_expr[[chemokine]], sub_expr[["CD8A"]])
      x <- sub_expr[complete_cases, chemokine]
      y <- sub_expr[complete_cases, "CD8A"]

      if(length(x) < 3) {  # 至少需要3個觀測值
        return(data.frame(
          chemokine = chemokine,
          cor_estimate = NA,
          cor_conf_low = NA,
          cor_conf_high = NA,
          p_value = NA,
          stringsAsFactors = FALSE
        ))
      }

      cor_test <- cor.test(x, y, method = "pearson")
      data.frame(
        chemokine = chemokine,
        cor_estimate = cor_test$estimate,
        cor_conf_low = cor_test$conf.int[1],
        cor_conf_high = cor_test$conf.int[2],
        p_value = cor_test$p.value,
        stringsAsFactors = FALSE
      )
    })
  }) %>%
  ungroup() %>%
  mutate(tumor_type = factor(tumor_type, levels = selected_types)) %>%
  filter(!is.na(cor_estimate))  # 使用明確的列名過濾

# 8. 繪制相關性森林圖
ggplot(cor_results, aes(x = cor_estimate, y = reorder(chemokine, cor_estimate), 
                        color = tumor_type, shape = tumor_type)) +
  geom_point(position = position_dodge(width = 0.8), size = 3) +
  geom_errorbarh(aes(xmin = cor_conf_low, xmax = cor_conf_high), 
                 height = 0.2, position = position_dodge(width = 0.8)) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "gray50") +
  labs(x = "Correlation Coefficient with CD8A", y = "",
       title = "Chemokines Correlation with CD8A Expression",
       subtitle = "Grouped by Tumor Type",
       color = "Tumor Type", shape = "Tumor Type") +
  theme_minimal(base_size = 14) +
  theme(
    axis.text.y = element_text(face = "italic", size = 12),
    axis.text.x = element_text(size = 12),
    legend.position = "right",
    legend.text = element_text(size = 12),
    panel.grid.major.y = element_blank(),
    plot.title = element_text(face = "bold", hjust = 0.5, size = 16),
    plot.subtitle = element_text(hjust = 0.5, color = "gray50"),
    panel.border = element_rect(fill = NA, color = "gray80")
  ) +
  scale_color_brewer(palette = "Set1") +
  scale_shape_manual(values = c(15, 16, 17, 18, 19, 20)) +
  scale_x_continuous(limits = c(-1, 1), breaks = seq(-1, 1, 0.5))

###################################美化森林圖#################################
# 8. 繪制美化后的相關性森林圖

# 首先計算每個趨化因子的平均相關性用于排序
chemokine_order <- cor_results %>%
  group_by(chemokine) %>%
  summarise(mean_cor = mean(cor_estimate, na.rm = TRUE)) %>%
  arrange(mean_cor) %>%
  pull(chemokine)

# 添加顯著性標記
cor_results <- cor_results %>%
  mutate(
    significance = case_when(
      p_value < 0.001 ~ "***",
      p_value < 0.01 ~ "**",
      p_value < 0.05 ~ "*",
      TRUE ~ ""
    ),
    chemokine = factor(chemokine, levels = chemokine_order),
    tumor_type = factor(tumor_type, levels = selected_types)
  )

# 自定義顏色和形狀
tumor_colors <- c(
  "BREAST" = "#E41A1C", 
  "LUNG" = "#377EB8", 
  "COLON" = "#4DAF4A", 
  "KIDNEY" = "#984EA3",
  "UTERUS" = "#FF7F00", 
  "OVARIAN" = "#A65628"
)

tumor_shapes <- c(15, 16, 17, 18, 19, 20)

# 繪制圖形
ggplot(cor_results, aes(x = cor_estimate, y = chemokine, 
                        color = tumor_type, shape = tumor_type)) +
  # 參考線
  geom_vline(xintercept = 0, linetype = "dashed", color = "gray60", linewidth = 0.5) +
  geom_vline(xintercept = c(-0.5, 0.5), linetype = "dotted", color = "gray80", linewidth = 0.3) +

  # 誤差條和點
  geom_errorbarh(
    aes(xmin = cor_conf_low, xmax = cor_conf_high),
    height = 0.15, position = position_dodge(width = 0.7),
    linewidth = 0.8, alpha = 0.7
  ) +
  geom_point(
    position = position_dodge(width = 0.7), 
    size = 3, fill = "white", stroke = 1.2
  ) +

  # 顯著性標記
  geom_text(
    aes(label = significance, x = cor_conf_high + 0.05),
    position = position_dodge(width = 0.7),
    color = "black", size = 4, vjust = 0.7
  ) +

  # 坐標軸和標簽
  scale_x_continuous(
    limits = c(-0.25, 1), 
    breaks = seq(-0.5, 1, 0.25),
    expand = expansion(mult = 0.05)
  ) +
  scale_y_discrete(labels = function(x) parse(text = paste0("italic('", x, "')"))) +

  # 顏色和形狀
  scale_color_manual(values = tumor_colors, name = "Tumor Type") +
  scale_shape_manual(values = tumor_shapes, name = "Tumor Type") +

  # 標題和圖例
  labs(
    x = "Pearson correlation coefficient with CD8A expression",
    y = "Chemokine",
    title = "Association Between Chemokines and CD8A+ T Cell Infiltration",
    subtitle = "Correlation analysis across different tumor types",
    caption = "Error bars represent 95% confidence intervals\n* p < 0.05, ** p < 0.01, *** p < 0.001"
  ) +

  # 主題設置
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(
      face = "bold", hjust = 0.5, size = 16,
      margin = margin(b = 10)
    ),
    plot.subtitle = element_text(
      hjust = 0.5, color = "gray40", size = 12,
      margin = margin(b = 15)
    ),
    plot.caption = element_text(
      hjust = 0, color = "gray50", size = 10,
      margin = margin(t = 10)
    ),
    axis.title.x = element_text(
      margin = margin(t = 10), size = 12
    ),
    axis.text.y = element_text(
      size = 12, color = "black", hjust = 1
    ),
    axis.text.x = element_text(
      size = 11, color = "black"
    ),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank(),
    panel.grid.major.x = element_line(
      color = "gray90", linewidth = 0.3
    ),
    panel.grid.minor.x = element_line(
      color = "gray95", linewidth = 0.2
    ),
    legend.position = "right",
    legend.title = element_text(
      face = "bold", size = 12
    ),
    legend.text = element_text(
      size = 11
    ),
    plot.margin = margin(20, 20, 20, 20),
    panel.background = element_rect(
      fill = "white", color = NA
    ),
    plot.background = element_rect(
      fill = "white", color = NA
    )
  ) +

  # 添加小修飾
  annotate(
    "text", x = -0.9, y = length(chemokine_order) + 0.5,
    label = "Negative correlation", size = 4, color = "gray40"
  ) +
  annotate(
    "text", x = 0.9, y = length(chemokine_order) + 0.5,
    label = "Positive correlation", size = 4, color = "gray40"
  )

# 9. 優化版再優化--------------------------
# 按趨化因子名稱字母順序排序
chemokine_order <- sort(unique(cor_results$chemokine))

# 準備右側標簽數據（每個趨化因子只顯示一次）
label_data <- distinct(cor_results, chemokine) %>%
  mutate(
    label_x = 1.15,  # 標簽x坐標
    y_position = as.numeric(factor(chemokine, levels = chemokine_order))
  )

# 確保主數據中的y位置正確
cor_results <- cor_results %>%
  mutate(
    chemokine = factor(chemokine, levels = chemokine_order),
    y_position = as.numeric(chemokine)  # 轉換為數值位置用于繪圖
  )

# 自定義顏色和形狀
tumor_colors <- c(
  "BREAST" = "#E41A1C", 
  "LUNG" = "#377EB8", 
  "COLON" = "#4DAF4A", 
  "KIDNEY" = "#984EA3",
  "UTERUS" = "#FF7F00", 
  "OVARIAN" = "#A65628"
)

tumor_shapes <- c(15, 16, 17, 18, 19, 20)

# 創建圖形
ggplot(cor_results, aes(x = cor_estimate, y = y_position)) +
  # 參考線
  geom_vline(xintercept = 0, linetype = "dashed", color = "gray60", linewidth = 0.5) +
  geom_vline(xintercept = c(-0.5, 0.5), linetype = "dotted", color = "gray80", linewidth = 0.3) +

  # 誤差條和點（在此層添加美學映射）
  geom_errorbarh(
    aes(xmin = cor_conf_low, xmax = cor_conf_high,
        color = tumor_type),
    height = 0.15, position = position_dodge(width = 0.7),
    linewidth = 0.8, alpha = 0.7
  ) +
  geom_point(
    aes(shape = tumor_type, color = tumor_type),
    position = position_dodge(width = 0.7), 
    size = 3, fill = "white", stroke = 1.2
  ) +

  # 顯著性標記（放在右側）
  geom_text(
    aes(label = significance, x = 1.05),
    position = position_dodge(width = 0.7),
    color = "black", size = 4, vjust = 0.7
  ) +

  # 右側趨化因子名稱標簽（每個名稱只顯示一次）
  geom_text(
    data = label_data,
    aes(x = label_x, y = y_position, label = chemokine),
    inherit.aes = FALSE,  # 關鍵修復：不繼承全局美學映射
    color = "black", size = 4, hjust = 0
  ) +

  # 坐標軸設置
  scale_x_continuous(
    limits = c(0, 1.2),  # 擴大范圍容納右側注釋
    breaks = seq(0, 1, 0.25),
    expand = expansion(mult = 0.05)
  ) +
  scale_y_continuous(
    breaks = label_data$y_position,
    labels = NULL,  # 隱藏y軸標簽
    limits = c(0.5, max(label_data$y_position) + 0.5)
  ) +

  # 顏色和形狀
  scale_color_manual(values = tumor_colors, name = "Tumor Type") +
  scale_shape_manual(values = tumor_shapes, name = "Tumor Type") +

  # 標題和圖例
  labs(
    x = "Pearson correlation coefficient with CD8A expression",
    y = NULL,
    title = "Association Between Chemokines and CD8A+ T Cell Infiltration",
    subtitle = "Correlation analysis across different tumor types",
    caption = "Error bars represent 95% confidence intervals\n* p < 0.05, ** p < 0.01, *** p < 0.001"
  ) +

  # 主題設置
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(
      face = "bold", hjust = 0.5, size = 16,
      margin = margin(b = 10)
    ),
    plot.subtitle = element_text(
      hjust = 0.5, color = "gray40", size = 12,
      margin = margin(b = 15)
    ),
    plot.caption = element_text(
      hjust = 0, color = "gray50", size = 10,
      margin = margin(t = 10)
    ),
    axis.title.x = element_text(
      margin = margin(t = 10), size = 12
    ),
    axis.text.y = element_blank(),  # 隱藏y軸文本
    axis.text.x = element_text(
      size = 11, color = "black"
    ),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank(),
    panel.grid.major.x = element_line(
      color = "gray90", linewidth = 0.3
    ),
    panel.grid.minor.x = element_line(
      color = "gray95", linewidth = 0.2
    ),
    legend.position = "right",
    legend.title = element_text(
      face = "bold", size = 12
    ),
    legend.text = element_text(
      size = 11
    ),
    plot.margin = margin(20, 60, 20, 20),  # 增大右側邊距
    panel.background = element_rect(
      fill = "white", color = NA
    ),
    plot.background = element_rect(
      fill = "white", color = NA
    )
  )

這里不止是復現了原圖，而且是超越了原圖。如果我們對其他基因或者免疫細胞感興趣，還可以隨意替換成自己感興趣的，用于前期數據挖掘。

特別聲明：以上內容(如有圖片或視頻亦包括在內)為自媒體平臺“網易號”用戶上傳并發布，本平臺僅提供信息存儲服務。

Notice: The content above (including the pictures and videos if any) is uploaded and posted by a user of NetEase Hao, which is a social media platform and only provides information storage services.