一、生信分析的目的
前面已经介绍了从差异表达、生存分析和ROC分析三个方面对基因进行初步筛选。
但是在实际操作中会发现,在可视化和进一步分析过程中,如果想研究多个基因,就要不停的在代码中更改基因名称,还要慢慢地去数基因所在的列数,并且难免会有数错的可能。
那能不能直接让代码自己判断然后将所有符合条件的基因进行批量可视化呢,这次就分享对基因集进行快速的批量化筛选一条龙(无脑运行代码就能出结果)的方法。
二、具体的代码
remove(list = ls()) ##清空当前环境
setwd("D:/生信代码复现/6.番外:快速批量化/6.番外:快速批量化") ##💚💚💚设置路径
library(data.table)
library(tidyverse)
library(ggsignif)
library(RColorBrewer)
library(future.apply)
library(survival)
library(survminer)
library(survivalROC)
library(survival)
library(survminer)
library(timeROC)
library(limma)
rt1=read.table("combined_RNAseq_FPKM.txt",sep="\t",header=T,check.names=F) #💛💛💛基因表达矩阵文件,整理得到
#多行取平均值
rt1=as.matrix(rt1)
rownames(rt1)=rt1[,1]
exp=rt1[,2:ncol(rt1)]
dimnames=list(rownames(exp),colnames(exp))
data=matrix(as.numeric(as.matrix(exp)),nrow=nrow(exp),dimnames=dimnames)
data=avereps(data)
data=data[rowMeans(data)>0,]
data2=as.data.frame(data)
#以01A和11A分组,正常放前面,肿瘤放后面
exp_data_T = data2%>% dplyr::select(str_which(colnames(.), "-01A")) # 匹配列名或用下示写法
nT = ncol(exp_data_T)
exp_data_N = data2%>% dplyr::select(str_which(colnames(.), "-11A"))
nN = ncol(exp_data_N)
rt1= cbind(exp_data_N, exp_data_T)
#write.table(rt1,file = "groupout.txt",sep="\t",quote=F) ###💜💜💜输出分组文件
#加载limma包,用于校正和比较差异
#rt=read.table("groupout.txt",sep="\t",header=T,check.names=F,row.names = 1)
library(limma)
#normalize以下为对数据的校正
#画箱线图
pdf(file="rawBox.pdf") #💜💜💜
boxplot(rt1,col = "blue",xaxt = "n",outline = F)
dev.off()
#校正
rt1=normalizeBetweenArrays(rt1)
rt0=rt1
data1=t(rt1)
#画校正后箱线图
pdf(file="normalBox.pdf") #💜💜💜
boxplot(rt1,col = "red",xaxt = "n",outline = F)
dev.off()
group1=sapply(strsplit(colnames(rt1),"\\-"), "[", 4)
group1=sapply(strsplit(group1,""), "[", 1)
group1=gsub("2", "1", group1)
conNum=length(group1[group1==1]) #正常组样品数目
#view(conNum)#查看正常组样品数目
treatNum=length(group1[group1==0]) #肿瘤组样品数目
#view(treatNum)#查看肿瘤组样品数目
Type=c(rep(1,conNum), rep(2,treatNum))
#样品分组
exp=cbind(data1, Type)
exp=as.data.frame(exp)
#输出目标基因的表达量
outTab=exp
#colnames(outTab)=c(gene, "Type")
outTab=cbind(ID=row.names(outTab), outTab)
#differential差异分析
class <- c(rep("con",conNum),rep("treat",treatNum))
design <- model.matrix(~factor(class)+0)
colnames(design) <- c("con","treat")
#算方差
df.fit <- lmFit(rt1,design)
df.matrix<- makeContrasts(con - treat,levels=design)
fit<- contrasts.fit(df.fit,df.matrix)
#贝叶斯检验
fit2 <- eBayes(fit)
#输出基因
allDiff=topTable(fit2,adjust='fdr',n=Inf) #选择20万是为了输出所有基因
#写入表格
write.table(allDiff,file="limmaTab.xls",sep="\t",quote=F) #💜💜💜
#找出差异两倍以上,pvalue小于0.05,写入表格
diffLab <- allDiff[with(allDiff, ((logFC> 1 | logFC< (-1)) & adj.P.Val < 0.05 )), ]
write.table(diffLab,file="diffExp.xls",sep="\t",quote=F) #💜💜💜
#差异基因表达水平,用于共表达
diffExpLevel <- rt1[rownames(diffLab),]
#qvalue=allDiff[rownames(diffLab),]$adj.P.Val
#diffExpQvalue=cbind(qvalue,diffExpLevel)
write.table(diffExpLevel,file="diffExpLevel.xls",sep="\t",quote=F) #💜💜💜
write.csv(diffExpLevel,file="diffGeneExp.csv",quote=F,row.names = T) #💜💜💜
expFile="diffGeneExp.csv" #表达数据文件
cliFile="time.txt" #💛💛💛临床数据文件,这个临床存活时间需要本地存在这个文件
#读取表达数据文件
rt2=read.csv(expFile, header=T, check.names=F, row.names=1)
exp_data_T = rt2%>% dplyr::select(str_which(colnames(.), "-01A")) # 匹配列名或用下示写法
nT = ncol(exp_data_T)
exp_data_N = rt2%>% dplyr::select(str_which(colnames(.), "-11A"))
nN = ncol(exp_data_N)
rt2= cbind(exp_data_N, exp_data_T)
#读取生存数据文件
cli=read.table(cliFile, header=T, sep="\t", check.names=F, row.names=1)
cli$time=cli$time/365
group1=sapply(strsplit(colnames(rt2),"\\-"), "[", 4)
group1=sapply(strsplit(group1,""), "[", 1)
group1=gsub("2", "1", group1)
conNum=length(group1[group1==1]) #正常组样品数目
#view(conNum)#查看正常组样品数目
treatNum=length(group1[group1==0]) #肿瘤组样品数目
#view(treatNum)#查看肿瘤组样品数目
Type=c(rep(1,conNum), rep(2,treatNum))
#删掉正常样品
tumorData=exp_data_T
tumorData=as.matrix(tumorData)
tumorData=t(tumorData)
rownames(tumorData)=gsub("(.*?)\\-(.*?)\\-(.*?)\\-.*", "\\1\\-\\2\\-\\3", rownames(tumorData))
data=avereps(tumorData)
#数据合并并输出结果
sameSample=intersect(row.names(data), row.names(cli))
data=data[sameSample,,drop=F]
cli=cli[sameSample,,drop=F]
rt2=cbind(cli, data)
#输出合并后的数据
outTab=cbind(ID=row.names(rt2), rt2)
outTab=outTab[,-ncol(outTab)]
write.table(outTab, file="expTime.txt", sep="\t", quote=F, row.names=F)
rt2=read.table("expTime.txt",header=T,sep="\t",check.names=F,row.names=1)
sigGenes=c("time","event")
time=rt2$time
data=rt2[1]
exp=rt2[,3:ncol(rt2)]
time=cbind(time,exp)
rt2=cbind(data, time)
#生存分析
res3 <- data.frame()
genes <- colnames(rt2)[-c(1:2)]
plan(multisession)
system.time(res3 <- future_lapply(1:length(genes), function(i){
group = ifelse(rt2[,genes[i]]>median(rt2[,genes[i]]),'high','low')
if(length(table(group))==1) return(NULL)
surv =as.formula(paste('Surv(time, event)~', 'group'))
data = cbind(rt2[,1:2],group)
x = survdiff(surv, data = data)
pValue=1-pchisq(x$chisq,df=1)
return(c(genes[i],pValue))
}))
res3 <- data.frame(do.call(rbind,res3))
names(res3 ) <- c('ID','pValue_log')
res3 <- res3[with(res3, (pValue_log < 0.05 )), ]
write.table(res3,file="OS.xls",sep="\t",quote=F)
surSigExp <- cbind(rt2[sigGenes],rt2[res3$ID])
write.table(surSigExp,file="surSigExp.txt",sep="\t",row.names=T,quote=F)
#ROC分析
rocFilter=0.7 #ROC过滤值
rt3=read.table("surSigExp.txt",header=T,sep="\t",check.names=F,row.names=1) #读取输入文件
outTab=data.frame()
sigGenes=c("time","event")
for(i in colnames(rt3[,3:ncol(rt3)])){
roc=timeROC(T=rt3$time,
delta=rt3$event,
marker = rt3[,i],
cause=1,
weighting='aalen',times=c(1,3,5,10),ROC=TRUE)
if(roc$AUC[1]>rocFilter & roc$AUC[2]>rocFilter & roc$AUC[3]>rocFilter){
sigGenes=c(sigGenes,i)
outTab=rbind(outTab,cbind(gene=i,AUC1year=roc$AUC[1],AUC3year=roc$AUC[2],AUC5year=roc$AUC[3]))
}
}
write.table(outTab,file="ROC.xls",sep="\t",row.names=F,quote=F) #输出基因和p值表格文件
rocSigExp=rt3[,sigGenes]
rocSigExp=cbind(id=row.names(rocSigExp),rocSigExp)
write.table(rocSigExp,file="rocSigExp.txt",sep="\t",row.names=F,quote=F)
#可视化
genes=(rocSigExp[,4:ncol(rocSigExp)])
gene2=outTab$gene
for(i in 1:length(genes)){
print(i)
ROC_rt3=timeROC(T=rocSigExp$time, delta=rocSigExp$event,
marker=rocSigExp[,3+i], cause=1,
weighting='aalen',
times=c(1,3,5,10), ROC=TRUE)
pdf(paste0(colnames(genes[i]), "_ROC.pdf"), width=5, height=5)
plot(ROC_rt3,time=1,col='green',title=FALSE,lwd=2)
plot(ROC_rt3,time=3,col='blue',add=TRUE,title=FALSE,lwd=2)
plot(ROC_rt3,time=5,col='red',add=TRUE,title=FALSE,lwd=2)
plot(ROC_rt3,time=10,col='yellow',add=TRUE,title=FALSE,lwd=2)
legend('bottomright',
c(paste0('AUC at 1 years: ',sprintf("%.03f",ROC_rt3$AUC[1])),
paste0('AUC at 3 years: ',sprintf("%.03f",ROC_rt3$AUC[2])),
paste0('AUC at 3 years: ',sprintf("%.03f",ROC_rt3$AUC[3])),
paste0('AUC at 10 years: ',sprintf("%.03f",ROC_rt3$AUC[4]))),
col=c("green",'blue','red','yellow'),lwd=2,bty = 'n')
dev.off()
gene=gene2[i]
data=t(rt1[gene,,drop=F])
Type=c(rep(1,conNum), rep(2,treatNum))
exp=cbind(data, Type)
exp=as.data.frame(exp)
colnames(exp)=c("gene", "Type")
exp$Type=ifelse(exp$Type==1, "Normal", "Tumor")
group=levels(factor(exp$Type))
exp$Type=factor(exp$Type, levels=group)
comp=combn(group,2)
my_comparisons=list()
for(i in 1:ncol(comp)){my_comparisons[[i]]<-comp[,i]}
boxplot=ggboxplot(exp, x="Type", y="gene", color="Type",
xlab="",
ylab=paste0(gene, " expression"),
legend.title="Type",
palette = c("blue","red"),
add = "jitter")+
stat_compare_means(comparisons=my_comparisons,symnum.args=list(cutpoints = c(0, 0.001, 0.01, 0.05, 1), symbols = c("***", "**", "*", "ns")),label = "p.signif")
pdf(file=paste0(gene,".diff.pdf"), width=5, height=4.5)
print(boxplot)
dev.off()
group = ifelse(rt2[,gene]>median(rt2[,gene]),'high','low')
p=ggsurvplot(survfit(Surv(time, event)~group,
data=rt2), conf.int=F, pval=TRUE)
pdf(paste0(gene, "_surv.pdf"),width = 5, height = 5)
print(p, newpage = FALSE)
dev.off()
}
三、最终的效果
①R语言中的效果:
②文件效果: