045 Wgcma分析

一、实验目的

二、具体代码

##'@WGCNA分析
##'@date:2024.1.17
##'@整理:小杜的生信笔记
##'@本教程的优点:可直接运行,但必要参数需要修改
##'@
##'@本次更新:
##'@1.添加hub基因筛选脚本;2.Hub基因网络图

# 设置工作目录
setwd("F:\\生信代码复现\\WGCNA分析\\20240117_WGCNA-六-更新")  ###💚💚💚这个我们需要修改本地的文件

# 清空环境变量
rm(list = ls())

##'@标准化【optional】
##'@对于数据标准化,根据自己的需求即可,非必要
# 对数据进行log2转换
# WGCNA.fpkm <- log2(WGCNA.fpkm +1 )
# 保存转换后的数据
# write.csv(WGCNA.fpkm, "ExpData_WGCNA_log2.csv")

#install.packages("WGCNA")
#BiocManager::install('WGCNA')
##'@安装所需R包
##'

# 安装BiocManager和其他依赖包
# if (!requireNamespace("BiocManager", quietly = TRUE))
#   install.packages("BiocManager")
# if (!require('devtools')) install.packages('devtools');
# if (!require("WGCNA")) BiocManager::install("WGCNA");
# if (!require("igraph")) install.packages("igraph");
# if (!require("tidygraph")) install.packages("tidygraph");
# if (!require("ggraph")) install.packages("ggraph");
# if (!require("linkET")) devtools::install_github("Hy4m/linkET", force = TRUE);
# if (!require("netET")) devtools::install_github("Hy4m/netET");

# 批量安装Bioconductor包
# Bioconductor_packages <- c("WGCNA","igraph","tidygraph","ggraph")
# 
# for (pkg in Bioconductor_packages){
#   if(!require(pkg, character.only = T)){
#     BiocManager::install(pkg, ask = F, update = F)
#     require(pkg, character.only = T)
#   }
# }

# 加载必要的R包
library(WGCNA)        # 用于WGCNA分析的主要包
library(ggraph)       # 用于绘制网络图
library(tidygraph)    # 用于网络数据处理
library(linkET)       # 用于网络可视化
library(igraph)       # 用于网络分析
library(netET)        # 用于网络可视化增强

# 设置字符串不作为因子
options(stringsAsFactors = FALSE)

# 读取表达数据
WGCNA.fpkm = read.csv("ExpData_WGCNA_log2.csv", header = T, check.names = F)
# 查看数据前10行和前10列
WGCNA.fpkm[1:10,1:10]

#'@[optional]快速浏览数据集
# 查看数据维度
dim(WGCNA.fpkm)
# 查看列名
names(WGCNA.fpkm)

# 转置数据矩阵并创建新的数据框
datExpr0 = as.data.frame(t(WGCNA.fpkm[,-1]))
# 设置行名和列名
names(datExpr0) = WGCNA.fpkm$sample;
rownames(datExpr0) = names(WGCNA.fpkm[,-1])
# 查看数据维度
dim(datExpr0)

#########检查缺失值并过滤##########
# 使用WGCNA包的goodSamplesGenes函数检查数据质量
gsg = goodSamplesGenes(datExpr0, verbose = 3)
gsg$allOK

# 如果数据中存在问题基因或样本
if (!gsg$allOK)
{
  # 输出需要移除的基因和样本
  if (sum(!gsg$goodGenes)>0)
    printFlush(paste("Removing genes:", paste(names(datExpr0)[!gsg$goodGenes], collapse = ", ")))
  if (sum(!gsg$goodSamples)>0)
    printFlush(paste("Removing samples:", paste(rownames(datExpr0)[!gsg$goodSamples], collapse = ", ")))
  # 移除问题基因和样本
  datExpr0 = datExpr0[gsg$goodSamples, gsg$goodGenes]
}
# 查看过滤后的数据维度
dim(datExpr0)

# 基于平均表达量进行过滤
meanFPKM= 0.5  # 设置过滤阈值
n=nrow(datExpr0)
# 计算每个基因的平均表达量
datExpr0[n+1,]=apply(datExpr0[c(1:nrow(datExpr0)),],2,mean)
# 根据阈值过滤基因
datExpr0=datExpr0[1:n,datExpr0[n+1,] > meanFPKM]
# 转置并格式化过滤后的数据
filtered_fpkm=t(datExpr0)
filtered_fpkm=data.frame(rownames(filtered_fpkm),filtered_fpkm)
names(filtered_fpkm)[1]="sample"
# 查看过滤后数据的前几行
head(filtered_fpkm)
# 查看过滤后的数据维度
dim(filtered_fpkm)
# 保存过滤后的数据
write.table(filtered_fpkm, file="WGCNA.过滤后数据.txt",,
            row.names=F, col.names=T,quote=FALSE,sep="\t")

#############样本聚类###########
# 计算样本间的距离并进行层次聚类
sampleTree = hclust(dist(datExpr0), method = "average")
# 绘制样本聚类树状图
pdf(file = "1.sampleClustering.pdf", width = 6, height = 4)
par(cex = 0.6)
par(mar = c(0,6,6,0))
plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="", cex.lab = 2,
     cex.axis = 1.5, cex.main = 2)
dev.off()

###'@去除离群值
# 绘制带有切割线的聚类树状图
pdf("2_sample clutering_delete_outliers.pdf", width = 6, height = )
par(cex = 0.6)
par(mar = c(0,6,6,0))
cutHeight = 15  # 设置切割高度
plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="", cex.lab = 2, 
     cex.axis = 1.5, cex.main = 2) +
  abline(h = cutHeight, col = "red")
dev.off()

##'@过滤离散样本
# 根据切割高度划分样本
clust = cutreeStatic(sampleTree, cutHeight = cutHeight, minSize = 10)
keepSamples = (clust==1)
# 保留非离群样本的数据
datExpr = datExpr0[keepSamples, ]
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)
# 查看过滤后的数据维度和内容
dim(datExpr)
head(datExpr)
datExpr[1:11,1:12]

###----------------------------------------------------------------------------
###############'@载入性状数据
# 读取性状数据
traitData = read.csv("TraitData.csv",row.names=1)
# 查看性状数据前几行
head(traitData)
allTraits = traitData
dim(allTraits)
names(allTraits)

# 将性状数据与表达数据的样本对应
fpkmSamples = rownames(datExpr)
traitSamples =rownames(allTraits)
traitRows = match(fpkmSamples, traitSamples)
datTraits = allTraits[traitRows,]
rownames(datTraits)

###----------------------------------------------------------------------------
##'@第一步的数据过滤就结束,进行下面的分析

#'@开启WGCNA多线程分析
enableWGCNAThreads(5)

#'@计算软阈值范围
powers = c(c(1:10), seq(from = 12, to=30, by=2))

#'@进行网络拓扑分析
sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)

#'@绘制软阈值筛选图
pdf(file="4_软阈值选择.pdf",width=12, height = 8)
par(mfrow = c(1,2))
cex1 = 0.85

# 绘制Scale-free拓扑拟合指数
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
     xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
     main = paste("Scale independence"));
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
     labels=powers,cex=cex1,col="red");
# 添加R^2阈值线
abline(h=0.85,col="red")

# 绘制平均连接度
plot(sft$fitIndices[,1], sft$fitIndices[,5],
     xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
     main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
dev.off()

######选择软阈值
##'@获取系统推荐的最佳软阈值
sft$powerEstimate

##'@手动设置软阈值
softPower = 20

# 计算邻接矩阵
adjacency = adjacency(datExpr, power = softPower)

##### 将邻接矩阵转换为拓扑重叠矩阵
TOM = TOMsimilarity(adjacency);
dissTOM = 1-TOM

# 进行层次聚类
geneTree = hclust(as.dist(dissTOM), method = "average");

# 绘制聚类树状图
pdf(file="4_Gene clustering on TOM-based dissimilarity.pdf",width=6,height=4)
plot(geneTree, xlab="", sub="", main = "Gene clustering on TOM-based dissimilarity",
     labels = FALSE, hang = 0.04)
dev.off()

# 设置模块最小基因数
minModuleSize = 30

# 使用动态剪切树方法识别模块
dynamicMods = cutreeDynamic(dendro = geneTree, distM = dissTOM,
                            deepSplit = 2, pamRespectsDendro = FALSE,
                            minClusterSize = minModuleSize);
# 查看模块分布
table(dynamicMods)

# 将数字标签转换为颜色标签
dynamicColors = labels2colors(dynamicMods)
table(dynamicColors)

# 绘制基因树和模块颜色
pdf(file="5_Dynamic Tree Cut.pdf",width=8,height=6)
plotDendroAndColors(geneTree, dynamicColors, "Dynamic Tree Cut",
                    dendroLabels = FALSE, hang = 0.03,
                    addGuide = TRUE, guideHang = 0.05,
                    main = "Gene dendrogram and module colors")
dev.off()

# 计算模块特征向量
MEList = moduleEigengenes(datExpr, colors = dynamicColors)
MEs = MEList$eigengenes

# 计算模块特征向量的相异度
MEDiss = 1-cor(MEs);

# 对模块特征向量进行聚类
METree = hclust(as.dist(MEDiss), method = "average")

# 绘制模块特征向量聚类图
pdf(file="6_Clustering of module eigengenes.pdf",width=7,height=6)
plot(METree, main = "Clustering of module eigengenes",
     xlab = "", sub = "")
MEDissThres = 0.2  # 设置合并阈值
# 添加切割线
abline(h=MEDissThres, col = "red")
dev.off()

##'@合并相似模块
merge = mergeCloseModules(datExpr, dynamicColors, cutHeight = MEDissThres, verbose = 3)
# 获取合并后的模块颜色
mergedColors = merge$colors
# 获取合并后的模块特征向量
mergedMEs = merge$newMEs
# 查看合并后的模块分布
table(mergedColors)

#导出所有模块的基因列表
color<-unique(mergedColors)
for (i  in 1:length(color)) {
  y=t(assign(paste(color[i],"expr",sep = "."),datExpr[mergedColors==color[i]]))
  write.csv(y,paste('6',color[i],"csv",sep = "."),quote = F)
}

##'@输出merge模块图形
pdf(file="7_merged dynamic.pdf", width = 8, height = 6)
plotDendroAndColors(geneTree, cbind(dynamicColors, mergedColors),
                    c("Dynamic Tree Cut", "Merged dynamic"),
                    dendroLabels = FALSE, hang = 0.03,
                    addGuide = TRUE, guideHang = 0.05)
dev.off()

# 重命名为moduleColors用于后续分析
moduleColors = mergedColors
# 构建颜色对应的数字标签
colorOrder = c("grey", standardColors(50))
moduleLabels = match(moduleColors, colorOrder)-1
MEs = mergedMEs

####分析模块与外部临床性状的关系###########
# 定义基因和样本数量
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)
# 计算模块与性状的相关性
moduleTraitCor = cor(MEs, datTraits, use = "p")
# 计算相关性的p值
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)

# 绘制模块-性状关系热图
pdf(file="8_Module-trait relationships.pdf",width=8,height=5)
# 准备相关性和p值的显示文本
textMatrix = paste(signif(moduleTraitCor, 2), "\n(",
                   signif(moduleTraitPvalue, 1), ")", sep = "")

dim(textMatrix) = dim(moduleTraitCor)
par(mar = c(6, 8.5, 3, 3))

# 绘制热图
labeledHeatmap(Matrix = moduleTraitCor,
               xLabels = names(datTraits),
               yLabels = names(MEs),
               ySymbols = names(MEs),
               colorLabels = FALSE,
               colors = blueWhiteRed(50),
               textMatrix = textMatrix,
               setStdMargins = FALSE,
               cex.text = 0.5,
               zlim = c(-1,1),
               main = paste("Module-trait relationships"))
dev.off()

###'@绘制相关性网络图
# 构建网络相关性分析
link_cor <- mantel_test(
  ###'@若你需要全部输出,请直接使用性状数据即可
  datTraits,
  mergedMEs,
  ### 修改你的性状数据信息
  spec_select =list(Heat =1,
                    Cold =2,
                    UAO =3,
                    TT =4,
                    AUC =5)) %>% 
  mutate(rd=cut(r,breaks=c(-Inf,0.2,0.6,Inf),
                labels=c("<0.2","0.2-0.6",">=0.6")),
         pd=cut(p,breaks=c(-Inf,0.01,0.5,Inf),
                labels=c("<0.01","0.01-0.05",">=0.05")))

# 绘制相关性网络图
pdf("8-2.相关性网络图.pdf",width = 8,height = 6)
qcorrplot(correlate(mergedMEs),type = "upper",   ## "upper"or "lower"
          diag = FALSE) +
  # 设置图形形状
  geom_square() +
  # 添加连接线
  geom_couple(aes(colour = pd, size = rd), data = link_cor, 
              curvature = nice_curvature()) +
  # 设置颜色渐变
  scale_fill_gradientn(colours = rev(RColorBrewer::brewer.pal(11, "RdBu")))+
  # 设置图例参数
  scale_size_manual(values = c(0.5, 1, 2)) +
  scale_colour_manual(values = color_pal(3)) +
  guides(size = guide_legend(title = "Mantel's r",
                             override.aes = list(colour = "grey35"), 
                             order = 2),
         colour = guide_legend(title = "Mantel's p", 
                               override.aes = list(size = 3), 
                               order = 1),
         fill = guide_colorbar(title = "Pearson's r",
                               order = 3))+
  # 设置图形边距和主题
  theme(plot.margin = unit(c(0,0,0,0),units="cm"),
        legend.background = element_blank(),
        legend.key = element_blank())
dev.off()

###'@计算模块成员度(MM)和基因显著性(GS)
# 获取模块名称
modNames = substring(names(MEs), 3)
# 计算基因模块成员度
geneModuleMembership = as.data.frame(cor(datExpr, MEs, use = "p"))
# 计算MM的p值
MMPvalue = as.data.frame(corPvalueStudent(as.matrix(geneModuleMembership), nSamples))
# 设置列名
names(geneModuleMembership) = paste("MM", modNames, sep="")
names(MMPvalue) = paste("p.MM", modNames, sep="")

# 获取性状名称
traitNames=names(datTraits)
# 计算基因与性状的相关性
geneTraitSignificance = as.data.frame(cor(datExpr, datTraits, use = "p"))
# 计算GS的p值
GSPvalue = as.data.frame(corPvalueStudent(as.matrix(geneTraitSignificance), nSamples))
# 设置列名
names(geneTraitSignificance) = paste("GS.", traitNames, sep="")
names(GSPvalue) = paste("p.GS.", traitNames, sep="")

####为每个性状和模块绘制MM vs GS图
for (trait in traitNames){
  traitColumn=match(trait,traitNames)
  for (module in modNames){
    column = match(module, modNames)
    moduleGenes = moduleColors==module
    if (nrow(geneModuleMembership[moduleGenes,]) > 1){
      pdf(file=paste("9_", trait, "_", module,"_Module membership vs gene significance.pdf",sep=""),width=7,height=7)
      par(mfrow = c(1,1))
      verboseScatterplot(abs(geneModuleMembership[moduleGenes, column]),
                         abs(geneTraitSignificance[moduleGenes, traitColumn]),
                         xlab = paste("Module Membership in", module, "module"),
                         ylab = paste("Gene significance for ",trait),
                         main = paste("Module membership vs. gene significance\n"),
                         cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = module)
      dev.off()
    }
  }
}

# 获取基因名称
names(datExpr)
probes = names(datExpr)

############导出GS和MM信息#########
# 创建包含基因信息的数据框
geneInfo0 = data.frame(probes= probes,
                       moduleColor = moduleColors)

# 添加基因-性状相关性信息
for (Tra in 1:ncol(geneTraitSignificance))
{
  oldNames = names(geneInfo0)
  geneInfo0 = data.frame(geneInfo0, geneTraitSignificance[,Tra],
                         GSPvalue[, Tra])
  names(geneInfo0) = c(oldNames,names(geneTraitSignificance)[Tra],
                       names(GSPvalue)[Tra])
}

# 添加基因-模块成员度信息
for (mod in 1:ncol(geneModuleMembership))
{
  oldNames = names(geneInfo0)
  geneInfo0 = data.frame(geneInfo0, geneModuleMembership[,mod],
                         MMPvalue[, mod])
  names(geneInfo0) = c(oldNames,names(geneModuleMembership)[mod],
                       names(MMPvalue)[mod])
}

# 按模块颜色排序基因信息
geneOrder =order(geneInfo0$moduleColor)
geneInfo = geneInfo0[geneOrder, ]
# 导出GS和MM信息
write.table(geneInfo, file = "10_GS_and_MM.xls",sep="\t",row.names=F)

#######可视化基因网络###########
# 设置参数
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)
nSelect = 1000

# 随机选择基因进行可视化
set.seed(10)
select = sample(nGenes, size = nSelect)
selectTOM = dissTOM[select, select]
# 对选定的基因进行聚类
selectTree = hclust(as.dist(selectTOM), method = "average")
selectColors = moduleColors[select]

# 调整TOM矩阵以优化可视化效果
plotDiss = selectTOM^7
diag(plotDiss) = NA

# 绘制网络热图
library(gplots)
pdf(file="13_Network heatmap plot_selected genes.pdf",width=8, height=8)
mycol = colorpanel(250,'red','orange','lemonchiffon')
TOMplot(plotDiss, selectTree, selectColors, col=mycol ,main = "Network heatmap plot, selected genes")
dev.off()

#######可视化特征基因网络#############
# 绘制特征基因树状图和邻接矩阵热图
pdf(file="14_Eigengene dendrogram and Eigengene adjacency heatmap.pdf", width=5, height=4)
par(cex = 0.9)
plotEigengeneNetworks(MEs, "", marDendro = c(0,4,1,2), marHeatmap = c(3,4,1,2), cex.lab = 0.8, xLabelsAngle= 90)
dev.off()

# 单独绘制特征基因树状图
pdf(file="15_Eigengene dendrogram_1.pdf",width=4, height=3)
par(cex = 1.0)
plotEigengeneNetworks(MEs, "Eigengene dendrogram", marDendro = c(0,4,2,0), plotHeatmaps = FALSE)
dev.off()

# 单独绘制特征基因邻接矩阵热图
pdf(file="15_Eigengene adjacency heatmap_2.pdf",width=4, height=3)
par(cex = 1.0)
plotEigengeneNetworks(MEs, "Eigengene adjacency heatmap", marHeatmap = c(3,4,2,2), plotDendrograms = FALSE, xLabelsAngle = 90)
dev.off()

###-----------------=============-------------------------------------------------
##'@导出模块网络数据用于Cytoscape可视化
# 创建输出目录
dir.create('11_cytoscape', recursive = TRUE)
moduleColorsWW <- mergedColors

# 为每个模块导出网络数据
for (mod in 1:nrow(table(moduleColorsWW))) {
  modules <- names(table(moduleColorsWW))[mod]
  probes <- colnames(datExpr)
  inModule <- (moduleColorsWW == modules)
  modProbes <- probes[inModule]
  modGenes <- modProbes
  # 提取模块内基因的TOM矩阵
  modtom_sim <- TOM[inModule, inModule]
  dimnames(modtom_sim) <- list(modProbes, modProbes)
  # 设置输出文件路径
  outEdge <- paste('11_cytoscape/', modules, '.edge_list.txt',sep = '')
  outNode <- paste('11_cytoscape/', modules, '.node_list.txt', sep = '')
  # 导出网络数据
  exportNetworkToCytoscape(modtom_sim,
                           edgeFile = outEdge,
                           nodeFile = outNode,
                           weighted = TRUE,
                           threshold = 0.3,  # 设置权重阈值
                           nodeNames = modProbes,
                           altNodeNames = modGenes,
                           moduleColorsWW[inModule])
}

###'@Hub基因的筛选
# 选择感兴趣的模块
module = "magenta"   # 设置目标模块
column = match(module, modNames)
moduleGenes = moduleColors==module
# 提取模块内的基因
hub_module<-as.data.frame(dimnames(data.frame(datExpr))[[2]][moduleGenes])
names(hub_module)="genename"
# 计算MM和GS值
MM<-abs(geneModuleMembership[moduleGenes,column])
GS<-abs(geneTraitSignificance[moduleGenes, 1])
hub_MMGS<-as.data.frame(cbind(MM,GS))
rownames(hub_MMGS)=hub_module$genename

# 筛选Hub基因
key_hub_b<-subset(hub_MMGS, abs(hub_MMGS$MM)>0.6 & abs(hub_MMGS$GS)>0.2)
dim(key_hub_b)
head(key_hub_b)
# 导出Hub基因信息
write.csv(key_hub_b, "19.Hub_genes.csv")

###'@绘制Hub基因网络图
# 读取网络数据
net <- read.table("11_cytoscape/magenta.edge_list.txt",header = T)
info <- read.table("11_cytoscape/magenta.node_list.txt",header = T)
# 创建网络图对象
g <- tbl_graph(nodes = info,edges = net,directed = F)
# 计算节点度数
V(g)$degree <- degree(g)
# 设置颜色方案
mycol <- c("#FF8901","#00C5FF","#FF5485")

# 绘制环形网络图
ggraph(g, layout = 'linear',circular = TRUE)+
  geom_edge_arc(colour="grey50",width=1,alpha=0.3)+
  geom_node_point(aes(color=class),size=6,alpha=0.8)+
  scale_colour_manual(values = mycol)+
  geom_node_text(aes(x = 1.05 * x,y = 1.05 * y,
                     angle = -((-node_angle(x, y) + 90) %% 180) + 90,
                     label=name),
                 nudge_y = 0,
                 hjust = 'outward',
                 repel = F,
                 size=2.5)+
  coord_fixed(clip = "off")+
  theme_graph()


我来帮你分析这段WGCNA代码,如果运行这段代码需要准备以下内容:

  1. 必需的输入文件:
- ExpData_WGCNA_log2.csv - 基因表达数据文件
- TraitData.csv - 性状数据文件
  1. 需要修改的代码部分:
# 1. 工作目录路径需要修改为本地路径
setwd("D:\\BioinfoFile\\小杜的生信笔记\\2024\\20240117_WGCNA-六-更新")  

# 2. 软阈值可能需要根据实际数据调整
softPower = 20  

# 3. 模块最小基因数可根据需要调整
minModuleSize = 30

# 4. Hub基因筛选的模块颜色需要根据实际结果选择
module = "magenta"   

# 5. Hub基因筛选阈值可以根据需要调整
key_hub_b <- subset(hub_MMGS, abs(hub_MMGS$MM)>0.6 & abs(hub_MMGS$GS)>0.2)
  1. 需要安装的R包:
- WGCNA
- ggraph
- tidygraph
- linkET
- igraph
- netET
- BiocManager
  1. 输入数据格式要求:

ExpData_WGCNA_log2.csv 格式:

  • 第一列为基因名/ID
  • 其他列为样本的表达值
  • 需要进行log2转换

TraitData.csv 格式:

  • 行名为样本名(需要与表达数据的样本名一致)
  • 列为各个性状数据
  1. 系统要求:
  • R版本 ≥ 4.0
  • 足够的内存(根据数据量大小,建议至少8GB)
  • 建议使用Windows或Linux系统
  1. 注意事项:
  • 确保输入数据没有缺失值
  • 样本名在表达数据和性状数据中需要完全一致
  • 建议先用小数据集测试代码运行是否正常

这样你的同学就可以在自己的电脑上运行这个WGCNA分析流程了。如果遇到具体问题,可以根据错误信息进行针对性解决。