040 复杂进化树 | content.zh



setwd("D:\\小杜的生信笔记\\2024\\20241217_系统发育树专栏二")

rm(list=ls())
#!加载R包
# BiocManager::install("ggtree")
# BiocManager::install("ggtreeExtra")
# BiocManager::install("treeio")
# install.packages("tidytree")
# install.packages("ggstar")
# install.packages("ggplot2")
# install.packages("ggnewscale")
######################1、加载包####################
library(ggtreeExtra)
library(ggtree)
library(treeio)
library(tidytree)
library(ggstar)
library(ggplot2)
library(ggnewscale)

#'@2.读取数据
# 读取树形结构文件 (Newick格式)
tree <- read.tree("data/kegg.nwk") # 读取 Newick 格式的树形数据 
# 读取各环的属性数据
dt1 <- read.csv("data/tippoint_attr.csv")  # 读取提示点的属性数据
dt2 <- read.csv("data/firstring_attr.csv")  # 读取第一环的属性数据
dt3 <- read.csv("data/secondring_attr.csv")  # 读取第二环的属性数据
dt4 <- read.csv("data/barplot_attr.csv")  # 读取条形图的属性数据

head(dt1)
head(dt2)
head(dt3)
head(dt4)

##'@数据处理
dt1 <- aggregate(.~ID, dt1, paste0, collapse="/") 
##'@重新排列门类列的顺序
dt1$Phyla <- factor(dt1$Phyla, levels=c("Actinobacteria","Aquificae","Bacteroidetes",
                                        "Chlamydiae","Chlorobi","Chloroflexi","Crenarchaeota",
                                        "Cyanobacteria","Euryarchaeota","Firmicutes",
                                        "Spirochaetes/Proteobacteria",
                                        "Tenericutes","Thermi","Thermotogae","Other"))

#'@对Type2重新排序
dt3$Type2 <- factor(dt3$Type2, levels=c("FA synth init", "FA synth elong",
                                        "acyl-CoA synth", "beta-Oxidation",
                                        "Ketone biosynth"))
#'@提取支系层的节点标签

nodelab <- tree$node.label[nchar(tree$node.label)>0]
nodeids <- nodeid(tree, nodelab)

#'@支系标签的位置
textex <- c(1.0, 0.4, 0.2, 1.4, 1.4, 0.4, 1.4, 1.4, 0.4, 0.4,
            0.8, 1, 0.6, 0.6, 0.4, 0.3, 0, 0.4, 0.1, 0.25,
            0.2, 0.3, 0.8, 0.8, 0.8, 0.6, 2.4)

#‘@设置层级标签的属性
cladelabels <- mapply(function(x, y, z){geom_cladelabel(node=x, label=y, barsize=NA, extend=0.3,
                                                        offset.text=z, fontsize=1.3, angle="auto",
                                                        hjust=0.5, horizontal=FALSE, fontface="italic")},
                      nodeids, nodelab, textex, SIMPLIFY=FALSE)


##'@设置高亮图层的颜色
fills <- c("#808080", "#808080", "#808080", "#808080", "#808080",
           "#191970", "#87CEFA", "#FFC125", "#B0171F", "#B0171F",
           "#B0171F", "#B0171F", "#B0171F", "#B0171F", "#B0171F",
           "#B0171F", "#B0171F", "#B0171F", "#B0171F", "#B0171F",
           "#B0171F", "#B0171F", "#9ACD32", "#9ACD32", "#9ACD32",
           "#006400", "#800000")

##'@设置节点的高亮
highlights <- mapply(function(x, y){geom_hilight(node=x, extendto=5.8, alpha=0.3,
                                                 fill=y, color=y, size=0.05)},
                     nodeids, fills, SIMPLIFY=FALSE)


##'@自定义颜色
colors <- c("#9ACD32", "#EE6A50", "#87CEFA", "#FFC125", "#D15FEE", "#8DEEEE", "#800000",
            "#006400", "#800080", "#808080", "#B0171F", "#191970", "#7B68EE",
            "#00CD00", "Black")

##------------------------------------------------------------------------------
##'@绘制图形
p1 <- ggtree(
  tree,
  layout="circular",
  size=0.1
)

p1

ggsave("./output/fig1.jpg",width = 6, height = 6)

##'@设置高亮
p1 <- p1 +
  highlights

p1

ggsave("./output/fig2.jpg",width = 6, height = 6)

##'
p2 <- p1 %<+% dt1 +
  geom_tippoint(
    mapping=aes(
      fill=Phyla  # 将数据列 "Phyla" 映射为点的填充颜色
    ),
    shape = 21,    # 点的形状，此处为带有边框的圆形
    size = 1.2,    # 点的大小
    stroke = 0.05, # 点边框的宽度
    position = "identity", # 直接绘制，无额外调整位置
    show.legend = FALSE    # 不在图例中显示这些点
  )+
  scale_fill_manual(values=colors)  # 手动设置点填充颜色的映射，`colors` 是一个向量

p2

ggsave("./output/fig3.jpg",width = 6, height = 6)

##'@添加额外的分支注释标签和新填充映射
p2 <- p2 +
  cladelabels +
  new_scale_fill()

p2

ggsave("./output/fig3-2.jpg", plot = p2, width = 6, height = 6)

p3 <- p2 + 
  geom_fruit(           # 在树旁添加图形（如热图）
    data = dt2,         # 关联的数据
    geom = geom_tile,   # 使用矩形块绘制热图
    mapping = aes(      # 映射数据到图形属性
      y = ID,           # 系统树节点 ID
      x = ring,         # 热图的列分组变量
      fill = Type1      # 热图填充颜色的变量
    ),
    offset = 0.01,      # 热图与树的水平偏移距离
    pwidth = 0.14       # 热图宽度占比
  ) +
  scale_fill_manual(     # 手动设置热图填充颜色
    name = "ATP synthesis", # 图例标题
    values = c("#339933", "#dfac03"), # 自定义颜色
    guide = guide_legend(             # 图例样式
      keywidth = 0.35,  # 图例键宽度
      keyheight = 0.35, # 图例键高度
      order = 1         # 图例显示顺序
    )
  ) +
  new_scale_fill()       # 再次引入新的填充映射

p3

ggsave("./output/fig4.jpg", plot = p3, width = 6, height = 6)

##'@在P3的基础上添加热图
##'
p4 <- p3 + 
  geom_fruit(
    data = dt3,         # 第二个数据集，包含系统树叶节点的相关信息
    geom = geom_tile,   # 热图样式，使用矩形块表示
    mapping = aes(      # 数据映射
      y = ID,           # 系统树节点的 ID
      alpha = Abundance, # 丰度值映射为透明度
      x = Type2,        # 分类变量，用于热图的列分组
      fill = Type2      # 分类变量，用于填充颜色
    ),
    offset = 0.001,     # 热图与树的距离
    pwidth = 0.18       # 热图宽度占比
  ) +
  scale_fill_manual(     # 手动设置分类变量的填充颜色
    name = "Fatty Acid metabolism", # 图例标题
    values = c("#b22222", "#005500", "#0000be", "#9f1f9f", "#793a07"), # 颜色
    guide = guide_legend(           # 自定义图例样式
      keywidth = 0.35, 
      keyheight = 0.35, 
      order = 2
    )
  ) +
  scale_alpha_continuous( # 透明度映射
    range = c(0, 0.4),    # 透明度范围
    guide = guide_legend( # 自定义透明度图例样式
      keywidth = 0.35, 
      keyheight = 0.35, 
      order = 3
    )
  ) +
  new_scale_fill()         # 新的填充映射，用于后续操作


p4

ggsave("./output/fig5.jpg", width = 7, height = 6)

###'@添加柱状图
p5 <- p4 +
  geom_fruit(
    data = dt4,         # 数据集，包含柱状图信息
    geom = geom_bar,    # 添加柱状图
    mapping = aes(
      y = ID,           # 与树节点对应
      x = Length,       # 表示定量变量
      fill = Phyla      # 分类变量，用颜色表示
    ),
    stat = "identity",  # 数据按原值绘制
    orientation = "y",  # 水平方向柱状图
    pwidth = 0.3,       # 柱状图宽度
    position = position_dodgex() # 横向分组排列
  ) +
  scale_fill_manual(
    values = colors,    # 设置分类变量颜色
    guide = guide_legend(
      keywidth = 0.35,  # 图例键宽度
      keyheight = 0.35, # 图例键高度
      order = 4         # 图例显示顺序
    )
  ) +
  geom_treescale(
    fontsize = 1.2,     # 比例尺文字大小
    linesize = 0.3      # 比例尺线条粗细
  ) +
  theme(
    legend.position = c(0.93, 0.76),      # 图例位置
    legend.background = element_rect(fill = NA), # 图例背景透明
    legend.title = element_text(size = 6),       # 图例标题字体大小
    legend.text = element_text(size = 4.5),      # 图例内容字体大小
    legend.spacing.y = unit(0.02, "cm")          # 图例项之间的垂直间距
  )

p5

ggsave("./output/fig6.jpg", plot = p5, width = 8, height = 6)

# 设置工作目录
setwd("F:\\生信代码复现\\系统发育树")
rm(list=ls())


# 安装必要的包
if (!require("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
BiocManager::install(c("DECIPHER", "Biostrings", "ggtree", "ggtreeExtra", "treeio", "ggmsa"))

# 加载所有需要的包
library(DECIPHER)
library(Biostrings)
library(ape)
library(ggtreeExtra)
library(ggtree)
library(treeio)
library(tidytree)
library(ggstar)
library(ggplot2)
library(ggnewscale)
library(ggmsa)

# 1. 处理FASTA文件生成进化树
seqs <- readDNAStringSet("进化树.fa")  # 替换为您的FASTA文件路径

# 清理序列名称
names(seqs) <- gsub(" ", "_", names(seqs))
names(seqs) <- make.unique(names(seqs))

# 序列比对
alignment <- AlignSeqs(seqs, 
                      iterations = 2, 
                      refinements = 2,
                      processors = 1,
                      gapOpening = -18,
                      gapExtension = -3)

# 转换为DNAbin对象
aligned_seqs <- as.DNAbin(alignment)

# 构建进化树
dist_matrix <- dist.dna(aligned_seqs, model = "K80")
tree <- nj(dist_matrix)

# 添加bootstrap支持率
tree_boot <- boot.phylo(tree, aligned_seqs, 
                       function(x) nj(dist.dna(x)),
                       B = 1000)

# 保存进化树
write.tree(tree, file = "data/kegg.nwk")

# 2. 创建必要的CSV文件
# tippoint_attr.csv
tip_data <- data.frame(
    ID = tree$tip.label,
    Phyla = "Ascomycota",
    Class = "Leotiomycetes",
    Order = "Helotiales",
    Family = "Helotiaceae",
    Genus = "Lunulospora"
)
write.csv(tip_data, "data/tippoint_attr.csv", row.names = FALSE)

# firstring_attr.csv
first_ring <- data.frame(
    ID = tree$tip.label,
    ring = rep(1:2, length.out = length(tree$tip.label)),
    Type1 = sample(c("Environmental", "Clinical"), 
                  length(tree$tip.label), replace = TRUE)
)
write.csv(first_ring, "data/firstring_attr.csv", row.names = FALSE)

# secondring_attr.csv
second_ring <- data.frame(
    ID = tree$tip.label,
    Type2 = sample(c("Morphotype A", "Morphotype B", "Morphotype C"),
                  length(tree$tip.label), replace = TRUE),
    Abundance = runif(length(tree$tip.label))
)
write.csv(second_ring, "data/secondring_attr.csv", row.names = FALSE)

# barplot_attr.csv
barplot_data <- data.frame(
    ID = tree$tip.label,
    Length = sapply(seqs, length),
    Phyla = "Ascomycota"
)
write.csv(barplot_data, "data/barplot_attr.csv", row.names = FALSE)

# 3. 数据处理
dt1 <- read.csv("data/tippoint_attr.csv")
dt2 <- read.csv("data/firstring_attr.csv")
dt3 <- read.csv("data/secondring_attr.csv")
dt4 <- read.csv("data/barplot_attr.csv")

dt1 <- aggregate(.~ID, dt1, paste0, collapse="/")

# 4. 设置颜色和其他视觉参数
colors <- c("#4DAF4A")  # 单一物种的颜色

# 提取支系层的节点标签
nodelab <- tree$node.label[nchar(tree$node.label)>0]
nodeids <- nodeid(tree, nodelab)

# 支系标签的位置
textex <- rep(0.5, length(nodeids))  # 简化的标签位置

# 设置层级标签的属性
cladelabels <- mapply(function(x, y, z){
    geom_cladelabel(node=x, 
                    label=y, 
                    barsize=NA, 
                    extend=0.3,
                    offset.text=z, 
                    fontsize=1.3, 
                    angle="auto",
                    hjust=0.5, 
                    horizontal=FALSE, 
                    fontface="italic")
}, nodeids, nodelab, textex, SIMPLIFY=FALSE)

# 5. 绘图
# 基础树
p1 <- ggtree(tree,
             layout="circular",
             size=0.1,
             aes(color=tree_boot)) +
      scale_color_gradient(
          low="red",
          high="blue",
          name="Bootstrap support"
      )

# 保存基础树
ggsave("./output/fig1.jpg", p1, width = 6, height = 6)

# 添加分类信息
p2 <- p1 %<+% dt1 +
      geom_tippoint(
          mapping=aes(fill=Phyla),
          shape = 21,
          size = 1.2,
          stroke = 0.05,
          position = "identity",
          show.legend = FALSE
      ) +
      scale_fill_manual(values=colors)

# 保存带分类信息的树
ggsave("./output/fig2.jpg", p2, width = 6, height = 6)

# 添加标签
p3 <- p2 +
      cladelabels +
      new_scale_fill()

# 保存带标签的树
ggsave("./output/fig3.jpg", p3, width = 6, height = 6)

# 添加第一环
p4 <- p3 +
      geom_fruit(
          data = dt2,
          geom = geom_tile,
          mapping = aes(
              y = ID,
              x = ring,
              fill = Type1
          ),
          offset = 0.01,
          pwidth = 0.14
      ) +
      scale_fill_manual(
          name = "Sample Source",
          values = c("#339933", "#dfac03"),
          guide = guide_legend(
              keywidth = 0.35,
              keyheight = 0.35,
              order = 1
          )
      ) +
      new_scale_fill()

# 保存带第一环的树
ggsave("./output/fig4.jpg", p4, width = 7, height = 6)

# 添加第二环
p5 <- p4 +
      geom_fruit(
          data = dt3,
          geom = geom_tile,
          mapping = aes(
              y = ID,
              alpha = Abundance,
              x = Type2,
              fill = Type2
          ),
          offset = 0.001,
          pwidth = 0.18
      ) +
      scale_fill_manual(
          name = "Morphotype",
          values = c("#b22222", "#005500", "#0000be"),
          guide = guide_legend(
              keywidth = 0.35,
              keyheight = 0.35,
              order = 2
          )
      ) +
      scale_alpha_continuous(
          range = c(0, 0.4),
          guide = guide_legend(
              keywidth = 0.35,
              keyheight = 0.35,
              order = 3
          )
      ) +
      new_scale_fill()

# 保存带第二环的树
ggsave("./output/fig5.jpg", p5, width = 7, height = 6)

# 最终图形：添加序列长度条形图
p6 <- p5 +
      geom_fruit(
          data = dt4,
          geom = geom_bar,
          mapping = aes(
              y = ID,
              x = Length,
              fill = Phyla
          ),
          stat = "identity",
          orientation = "y",
          pwidth = 0.3,
          position = position_dodgex()
      ) +
      scale_fill_manual(
          values = colors,
          guide = guide_legend(
              keywidth = 0.35,
              keyheight = 0.35,
              order = 4
          )
      ) +
      geom_treescale(
          fontsize = 1.2,
          linesize = 0.3
      ) +
      theme(
          legend.position = "right",
          legend.background = element_rect(fill = NA),
          legend.title = element_text(size = 8),
          legend.text = element_text(size = 6),
          legend.spacing.y = unit(0.02, "cm")
      )

# 保存最终图形
ggsave("./output/fig6.jpg", p6, width = 8, height = 6)

# 6. 添加序列比对可视化（可选）
aln_plot <- ggmsa(alignment, font = NULL, color = "Chemistry_NT") +
            theme_minimal()

# 保存序列比对图
ggsave("./output/alignment.jpg", aln_plot, width = 12, height = 6)