一、实验目的
二、具体代码
##'@WGCNA分析
##'@date:2024.1.17
##'@整理:小杜的生信笔记
##'@本教程的优点:可直接运行,但必要参数需要修改
##'@
##'@本次更新:
##'@1.添加hub基因筛选脚本;2.Hub基因网络图
# 设置工作目录
setwd("F:\\生信代码复现\\WGCNA分析\\20240117_WGCNA-六-更新") ###💚💚💚这个我们需要修改本地的文件
# 清空环境变量
rm(list = ls())
##'@标准化【optional】
##'@对于数据标准化,根据自己的需求即可,非必要
# 对数据进行log2转换
# WGCNA.fpkm <- log2(WGCNA.fpkm +1 )
# 保存转换后的数据
# write.csv(WGCNA.fpkm, "ExpData_WGCNA_log2.csv")
#install.packages("WGCNA")
#BiocManager::install('WGCNA')
##'@安装所需R包
##'
# 安装BiocManager和其他依赖包
# if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
# if (!require('devtools')) install.packages('devtools');
# if (!require("WGCNA")) BiocManager::install("WGCNA");
# if (!require("igraph")) install.packages("igraph");
# if (!require("tidygraph")) install.packages("tidygraph");
# if (!require("ggraph")) install.packages("ggraph");
# if (!require("linkET")) devtools::install_github("Hy4m/linkET", force = TRUE);
# if (!require("netET")) devtools::install_github("Hy4m/netET");
# 批量安装Bioconductor包
# Bioconductor_packages <- c("WGCNA","igraph","tidygraph","ggraph")
#
# for (pkg in Bioconductor_packages){
# if(!require(pkg, character.only = T)){
# BiocManager::install(pkg, ask = F, update = F)
# require(pkg, character.only = T)
# }
# }
# 加载必要的R包
library(WGCNA) # 用于WGCNA分析的主要包
library(ggraph) # 用于绘制网络图
library(tidygraph) # 用于网络数据处理
library(linkET) # 用于网络可视化
library(igraph) # 用于网络分析
library(netET) # 用于网络可视化增强
# 设置字符串不作为因子
options(stringsAsFactors = FALSE)
# 读取表达数据
WGCNA.fpkm = read.csv("ExpData_WGCNA_log2.csv", header = T, check.names = F)
# 查看数据前10行和前10列
WGCNA.fpkm[1:10,1:10]
#'@[optional]快速浏览数据集
# 查看数据维度
dim(WGCNA.fpkm)
# 查看列名
names(WGCNA.fpkm)
# 转置数据矩阵并创建新的数据框
datExpr0 = as.data.frame(t(WGCNA.fpkm[,-1]))
# 设置行名和列名
names(datExpr0) = WGCNA.fpkm$sample;
rownames(datExpr0) = names(WGCNA.fpkm[,-1])
# 查看数据维度
dim(datExpr0)
#########检查缺失值并过滤##########
# 使用WGCNA包的goodSamplesGenes函数检查数据质量
gsg = goodSamplesGenes(datExpr0, verbose = 3)
gsg$allOK
# 如果数据中存在问题基因或样本
if (!gsg$allOK)
{
# 输出需要移除的基因和样本
if (sum(!gsg$goodGenes)>0)
printFlush(paste("Removing genes:", paste(names(datExpr0)[!gsg$goodGenes], collapse = ", ")))
if (sum(!gsg$goodSamples)>0)
printFlush(paste("Removing samples:", paste(rownames(datExpr0)[!gsg$goodSamples], collapse = ", ")))
# 移除问题基因和样本
datExpr0 = datExpr0[gsg$goodSamples, gsg$goodGenes]
}
# 查看过滤后的数据维度
dim(datExpr0)
# 基于平均表达量进行过滤
meanFPKM= 0.5 # 设置过滤阈值
n=nrow(datExpr0)
# 计算每个基因的平均表达量
datExpr0[n+1,]=apply(datExpr0[c(1:nrow(datExpr0)),],2,mean)
# 根据阈值过滤基因
datExpr0=datExpr0[1:n,datExpr0[n+1,] > meanFPKM]
# 转置并格式化过滤后的数据
filtered_fpkm=t(datExpr0)
filtered_fpkm=data.frame(rownames(filtered_fpkm),filtered_fpkm)
names(filtered_fpkm)[1]="sample"
# 查看过滤后数据的前几行
head(filtered_fpkm)
# 查看过滤后的数据维度
dim(filtered_fpkm)
# 保存过滤后的数据
write.table(filtered_fpkm, file="WGCNA.过滤后数据.txt",,
row.names=F, col.names=T,quote=FALSE,sep="\t")
#############样本聚类###########
# 计算样本间的距离并进行层次聚类
sampleTree = hclust(dist(datExpr0), method = "average")
# 绘制样本聚类树状图
pdf(file = "1.sampleClustering.pdf", width = 6, height = 4)
par(cex = 0.6)
par(mar = c(0,6,6,0))
plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="", cex.lab = 2,
cex.axis = 1.5, cex.main = 2)
dev.off()
###'@去除离群值
# 绘制带有切割线的聚类树状图
pdf("2_sample clutering_delete_outliers.pdf", width = 6, height = )
par(cex = 0.6)
par(mar = c(0,6,6,0))
cutHeight = 15 # 设置切割高度
plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="", cex.lab = 2,
cex.axis = 1.5, cex.main = 2) +
abline(h = cutHeight, col = "red")
dev.off()
##'@过滤离散样本
# 根据切割高度划分样本
clust = cutreeStatic(sampleTree, cutHeight = cutHeight, minSize = 10)
keepSamples = (clust==1)
# 保留非离群样本的数据
datExpr = datExpr0[keepSamples, ]
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)
# 查看过滤后的数据维度和内容
dim(datExpr)
head(datExpr)
datExpr[1:11,1:12]
###----------------------------------------------------------------------------
###############'@载入性状数据
# 读取性状数据
traitData = read.csv("TraitData.csv",row.names=1)
# 查看性状数据前几行
head(traitData)
allTraits = traitData
dim(allTraits)
names(allTraits)
# 将性状数据与表达数据的样本对应
fpkmSamples = rownames(datExpr)
traitSamples =rownames(allTraits)
traitRows = match(fpkmSamples, traitSamples)
datTraits = allTraits[traitRows,]
rownames(datTraits)
###----------------------------------------------------------------------------
##'@第一步的数据过滤就结束,进行下面的分析
#'@开启WGCNA多线程分析
enableWGCNAThreads(5)
#'@计算软阈值范围
powers = c(c(1:10), seq(from = 12, to=30, by=2))
#'@进行网络拓扑分析
sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)
#'@绘制软阈值筛选图
pdf(file="4_软阈值选择.pdf",width=12, height = 8)
par(mfrow = c(1,2))
cex1 = 0.85
# 绘制Scale-free拓扑拟合指数
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
main = paste("Scale independence"));
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
labels=powers,cex=cex1,col="red");
# 添加R^2阈值线
abline(h=0.85,col="red")
# 绘制平均连接度
plot(sft$fitIndices[,1], sft$fitIndices[,5],
xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
dev.off()
######选择软阈值
##'@获取系统推荐的最佳软阈值
sft$powerEstimate
##'@手动设置软阈值
softPower = 20
# 计算邻接矩阵
adjacency = adjacency(datExpr, power = softPower)
##### 将邻接矩阵转换为拓扑重叠矩阵
TOM = TOMsimilarity(adjacency);
dissTOM = 1-TOM
# 进行层次聚类
geneTree = hclust(as.dist(dissTOM), method = "average");
# 绘制聚类树状图
pdf(file="4_Gene clustering on TOM-based dissimilarity.pdf",width=6,height=4)
plot(geneTree, xlab="", sub="", main = "Gene clustering on TOM-based dissimilarity",
labels = FALSE, hang = 0.04)
dev.off()
# 设置模块最小基因数
minModuleSize = 30
# 使用动态剪切树方法识别模块
dynamicMods = cutreeDynamic(dendro = geneTree, distM = dissTOM,
deepSplit = 2, pamRespectsDendro = FALSE,
minClusterSize = minModuleSize);
# 查看模块分布
table(dynamicMods)
# 将数字标签转换为颜色标签
dynamicColors = labels2colors(dynamicMods)
table(dynamicColors)
# 绘制基因树和模块颜色
pdf(file="5_Dynamic Tree Cut.pdf",width=8,height=6)
plotDendroAndColors(geneTree, dynamicColors, "Dynamic Tree Cut",
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05,
main = "Gene dendrogram and module colors")
dev.off()
# 计算模块特征向量
MEList = moduleEigengenes(datExpr, colors = dynamicColors)
MEs = MEList$eigengenes
# 计算模块特征向量的相异度
MEDiss = 1-cor(MEs);
# 对模块特征向量进行聚类
METree = hclust(as.dist(MEDiss), method = "average")
# 绘制模块特征向量聚类图
pdf(file="6_Clustering of module eigengenes.pdf",width=7,height=6)
plot(METree, main = "Clustering of module eigengenes",
xlab = "", sub = "")
MEDissThres = 0.2 # 设置合并阈值
# 添加切割线
abline(h=MEDissThres, col = "red")
dev.off()
##'@合并相似模块
merge = mergeCloseModules(datExpr, dynamicColors, cutHeight = MEDissThres, verbose = 3)
# 获取合并后的模块颜色
mergedColors = merge$colors
# 获取合并后的模块特征向量
mergedMEs = merge$newMEs
# 查看合并后的模块分布
table(mergedColors)
#导出所有模块的基因列表
color<-unique(mergedColors)
for (i in 1:length(color)) {
y=t(assign(paste(color[i],"expr",sep = "."),datExpr[mergedColors==color[i]]))
write.csv(y,paste('6',color[i],"csv",sep = "."),quote = F)
}
##'@输出merge模块图形
pdf(file="7_merged dynamic.pdf", width = 8, height = 6)
plotDendroAndColors(geneTree, cbind(dynamicColors, mergedColors),
c("Dynamic Tree Cut", "Merged dynamic"),
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05)
dev.off()
# 重命名为moduleColors用于后续分析
moduleColors = mergedColors
# 构建颜色对应的数字标签
colorOrder = c("grey", standardColors(50))
moduleLabels = match(moduleColors, colorOrder)-1
MEs = mergedMEs
####分析模块与外部临床性状的关系###########
# 定义基因和样本数量
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)
# 计算模块与性状的相关性
moduleTraitCor = cor(MEs, datTraits, use = "p")
# 计算相关性的p值
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)
# 绘制模块-性状关系热图
pdf(file="8_Module-trait relationships.pdf",width=8,height=5)
# 准备相关性和p值的显示文本
textMatrix = paste(signif(moduleTraitCor, 2), "\n(",
signif(moduleTraitPvalue, 1), ")", sep = "")
dim(textMatrix) = dim(moduleTraitCor)
par(mar = c(6, 8.5, 3, 3))
# 绘制热图
labeledHeatmap(Matrix = moduleTraitCor,
xLabels = names(datTraits),
yLabels = names(MEs),
ySymbols = names(MEs),
colorLabels = FALSE,
colors = blueWhiteRed(50),
textMatrix = textMatrix,
setStdMargins = FALSE,
cex.text = 0.5,
zlim = c(-1,1),
main = paste("Module-trait relationships"))
dev.off()
###'@绘制相关性网络图
# 构建网络相关性分析
link_cor <- mantel_test(
###'@若你需要全部输出,请直接使用性状数据即可
datTraits,
mergedMEs,
### 修改你的性状数据信息
spec_select =list(Heat =1,
Cold =2,
UAO =3,
TT =4,
AUC =5)) %>%
mutate(rd=cut(r,breaks=c(-Inf,0.2,0.6,Inf),
labels=c("<0.2","0.2-0.6",">=0.6")),
pd=cut(p,breaks=c(-Inf,0.01,0.5,Inf),
labels=c("<0.01","0.01-0.05",">=0.05")))
# 绘制相关性网络图
pdf("8-2.相关性网络图.pdf",width = 8,height = 6)
qcorrplot(correlate(mergedMEs),type = "upper", ## "upper"or "lower"
diag = FALSE) +
# 设置图形形状
geom_square() +
# 添加连接线
geom_couple(aes(colour = pd, size = rd), data = link_cor,
curvature = nice_curvature()) +
# 设置颜色渐变
scale_fill_gradientn(colours = rev(RColorBrewer::brewer.pal(11, "RdBu")))+
# 设置图例参数
scale_size_manual(values = c(0.5, 1, 2)) +
scale_colour_manual(values = color_pal(3)) +
guides(size = guide_legend(title = "Mantel's r",
override.aes = list(colour = "grey35"),
order = 2),
colour = guide_legend(title = "Mantel's p",
override.aes = list(size = 3),
order = 1),
fill = guide_colorbar(title = "Pearson's r",
order = 3))+
# 设置图形边距和主题
theme(plot.margin = unit(c(0,0,0,0),units="cm"),
legend.background = element_blank(),
legend.key = element_blank())
dev.off()
###'@计算模块成员度(MM)和基因显著性(GS)
# 获取模块名称
modNames = substring(names(MEs), 3)
# 计算基因模块成员度
geneModuleMembership = as.data.frame(cor(datExpr, MEs, use = "p"))
# 计算MM的p值
MMPvalue = as.data.frame(corPvalueStudent(as.matrix(geneModuleMembership), nSamples))
# 设置列名
names(geneModuleMembership) = paste("MM", modNames, sep="")
names(MMPvalue) = paste("p.MM", modNames, sep="")
# 获取性状名称
traitNames=names(datTraits)
# 计算基因与性状的相关性
geneTraitSignificance = as.data.frame(cor(datExpr, datTraits, use = "p"))
# 计算GS的p值
GSPvalue = as.data.frame(corPvalueStudent(as.matrix(geneTraitSignificance), nSamples))
# 设置列名
names(geneTraitSignificance) = paste("GS.", traitNames, sep="")
names(GSPvalue) = paste("p.GS.", traitNames, sep="")
####为每个性状和模块绘制MM vs GS图
for (trait in traitNames){
traitColumn=match(trait,traitNames)
for (module in modNames){
column = match(module, modNames)
moduleGenes = moduleColors==module
if (nrow(geneModuleMembership[moduleGenes,]) > 1){
pdf(file=paste("9_", trait, "_", module,"_Module membership vs gene significance.pdf",sep=""),width=7,height=7)
par(mfrow = c(1,1))
verboseScatterplot(abs(geneModuleMembership[moduleGenes, column]),
abs(geneTraitSignificance[moduleGenes, traitColumn]),
xlab = paste("Module Membership in", module, "module"),
ylab = paste("Gene significance for ",trait),
main = paste("Module membership vs. gene significance\n"),
cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = module)
dev.off()
}
}
}
# 获取基因名称
names(datExpr)
probes = names(datExpr)
############导出GS和MM信息#########
# 创建包含基因信息的数据框
geneInfo0 = data.frame(probes= probes,
moduleColor = moduleColors)
# 添加基因-性状相关性信息
for (Tra in 1:ncol(geneTraitSignificance))
{
oldNames = names(geneInfo0)
geneInfo0 = data.frame(geneInfo0, geneTraitSignificance[,Tra],
GSPvalue[, Tra])
names(geneInfo0) = c(oldNames,names(geneTraitSignificance)[Tra],
names(GSPvalue)[Tra])
}
# 添加基因-模块成员度信息
for (mod in 1:ncol(geneModuleMembership))
{
oldNames = names(geneInfo0)
geneInfo0 = data.frame(geneInfo0, geneModuleMembership[,mod],
MMPvalue[, mod])
names(geneInfo0) = c(oldNames,names(geneModuleMembership)[mod],
names(MMPvalue)[mod])
}
# 按模块颜色排序基因信息
geneOrder =order(geneInfo0$moduleColor)
geneInfo = geneInfo0[geneOrder, ]
# 导出GS和MM信息
write.table(geneInfo, file = "10_GS_and_MM.xls",sep="\t",row.names=F)
#######可视化基因网络###########
# 设置参数
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)
nSelect = 1000
# 随机选择基因进行可视化
set.seed(10)
select = sample(nGenes, size = nSelect)
selectTOM = dissTOM[select, select]
# 对选定的基因进行聚类
selectTree = hclust(as.dist(selectTOM), method = "average")
selectColors = moduleColors[select]
# 调整TOM矩阵以优化可视化效果
plotDiss = selectTOM^7
diag(plotDiss) = NA
# 绘制网络热图
library(gplots)
pdf(file="13_Network heatmap plot_selected genes.pdf",width=8, height=8)
mycol = colorpanel(250,'red','orange','lemonchiffon')
TOMplot(plotDiss, selectTree, selectColors, col=mycol ,main = "Network heatmap plot, selected genes")
dev.off()
#######可视化特征基因网络#############
# 绘制特征基因树状图和邻接矩阵热图
pdf(file="14_Eigengene dendrogram and Eigengene adjacency heatmap.pdf", width=5, height=4)
par(cex = 0.9)
plotEigengeneNetworks(MEs, "", marDendro = c(0,4,1,2), marHeatmap = c(3,4,1,2), cex.lab = 0.8, xLabelsAngle= 90)
dev.off()
# 单独绘制特征基因树状图
pdf(file="15_Eigengene dendrogram_1.pdf",width=4, height=3)
par(cex = 1.0)
plotEigengeneNetworks(MEs, "Eigengene dendrogram", marDendro = c(0,4,2,0), plotHeatmaps = FALSE)
dev.off()
# 单独绘制特征基因邻接矩阵热图
pdf(file="15_Eigengene adjacency heatmap_2.pdf",width=4, height=3)
par(cex = 1.0)
plotEigengeneNetworks(MEs, "Eigengene adjacency heatmap", marHeatmap = c(3,4,2,2), plotDendrograms = FALSE, xLabelsAngle = 90)
dev.off()
###-----------------=============-------------------------------------------------
##'@导出模块网络数据用于Cytoscape可视化
# 创建输出目录
dir.create('11_cytoscape', recursive = TRUE)
moduleColorsWW <- mergedColors
# 为每个模块导出网络数据
for (mod in 1:nrow(table(moduleColorsWW))) {
modules <- names(table(moduleColorsWW))[mod]
probes <- colnames(datExpr)
inModule <- (moduleColorsWW == modules)
modProbes <- probes[inModule]
modGenes <- modProbes
# 提取模块内基因的TOM矩阵
modtom_sim <- TOM[inModule, inModule]
dimnames(modtom_sim) <- list(modProbes, modProbes)
# 设置输出文件路径
outEdge <- paste('11_cytoscape/', modules, '.edge_list.txt',sep = '')
outNode <- paste('11_cytoscape/', modules, '.node_list.txt', sep = '')
# 导出网络数据
exportNetworkToCytoscape(modtom_sim,
edgeFile = outEdge,
nodeFile = outNode,
weighted = TRUE,
threshold = 0.3, # 设置权重阈值
nodeNames = modProbes,
altNodeNames = modGenes,
moduleColorsWW[inModule])
}
###'@Hub基因的筛选
# 选择感兴趣的模块
module = "magenta" # 设置目标模块
column = match(module, modNames)
moduleGenes = moduleColors==module
# 提取模块内的基因
hub_module<-as.data.frame(dimnames(data.frame(datExpr))[[2]][moduleGenes])
names(hub_module)="genename"
# 计算MM和GS值
MM<-abs(geneModuleMembership[moduleGenes,column])
GS<-abs(geneTraitSignificance[moduleGenes, 1])
hub_MMGS<-as.data.frame(cbind(MM,GS))
rownames(hub_MMGS)=hub_module$genename
# 筛选Hub基因
key_hub_b<-subset(hub_MMGS, abs(hub_MMGS$MM)>0.6 & abs(hub_MMGS$GS)>0.2)
dim(key_hub_b)
head(key_hub_b)
# 导出Hub基因信息
write.csv(key_hub_b, "19.Hub_genes.csv")
###'@绘制Hub基因网络图
# 读取网络数据
net <- read.table("11_cytoscape/magenta.edge_list.txt",header = T)
info <- read.table("11_cytoscape/magenta.node_list.txt",header = T)
# 创建网络图对象
g <- tbl_graph(nodes = info,edges = net,directed = F)
# 计算节点度数
V(g)$degree <- degree(g)
# 设置颜色方案
mycol <- c("#FF8901","#00C5FF","#FF5485")
# 绘制环形网络图
ggraph(g, layout = 'linear',circular = TRUE)+
geom_edge_arc(colour="grey50",width=1,alpha=0.3)+
geom_node_point(aes(color=class),size=6,alpha=0.8)+
scale_colour_manual(values = mycol)+
geom_node_text(aes(x = 1.05 * x,y = 1.05 * y,
angle = -((-node_angle(x, y) + 90) %% 180) + 90,
label=name),
nudge_y = 0,
hjust = 'outward',
repel = F,
size=2.5)+
coord_fixed(clip = "off")+
theme_graph()
我来帮你分析这段WGCNA代码,如果运行这段代码需要准备以下内容:
- 必需的输入文件:
- ExpData_WGCNA_log2.csv - 基因表达数据文件
- TraitData.csv - 性状数据文件
- 需要修改的代码部分:
# 1. 工作目录路径需要修改为本地路径
setwd("D:\\BioinfoFile\\小杜的生信笔记\\2024\\20240117_WGCNA-六-更新")
# 2. 软阈值可能需要根据实际数据调整
softPower = 20
# 3. 模块最小基因数可根据需要调整
minModuleSize = 30
# 4. Hub基因筛选的模块颜色需要根据实际结果选择
module = "magenta"
# 5. Hub基因筛选阈值可以根据需要调整
key_hub_b <- subset(hub_MMGS, abs(hub_MMGS$MM)>0.6 & abs(hub_MMGS$GS)>0.2)
- 需要安装的R包:
- WGCNA
- ggraph
- tidygraph
- linkET
- igraph
- netET
- BiocManager
- 输入数据格式要求:
ExpData_WGCNA_log2.csv 格式:
- 第一列为基因名/ID
- 其他列为样本的表达值
- 需要进行log2转换
TraitData.csv 格式:
- 行名为样本名(需要与表达数据的样本名一致)
- 列为各个性状数据
- 系统要求:
- R版本 ≥ 4.0
- 足够的内存(根据数据量大小,建议至少8GB)
- 建议使用Windows或Linux系统
- 注意事项:
- 确保输入数据没有缺失值
- 样本名在表达数据和性状数据中需要完全一致
- 建议先用小数据集测试代码运行是否正常
这样你的同学就可以在自己的电脑上运行这个WGCNA分析流程了。如果遇到具体问题,可以根据错误信息进行针对性解决。