一、目的
二、代码
library(limma)
setwd("E:/微生物-赵建组/课题进展/生信挖掘相关基因/03.TCGA胃癌数据/01.TCGA下载的胃癌的数据")
merge_TCGA <- function(metadata, path, data.type, mRNA_expr_type="TPM", symbol=T, RNA_type=T){
filenames <- file.path(path, metadata$file_id, metadata$file_name,
fsep = .Platform$file.sep)
if (data.type=='RNAseq') {
message ('############### 正在进行提取,请稍后 ################\n',
' ### 正在进行提取 ###\n',
' ### 正在进行提取 ###\n',
' ### 正在进行提取 ###\n')
if(mRNA_expr_type=="TPM"){
column=4
}else if(mRNA_expr_type=="TPM"){
column=7
}else if(mRNA_expr_type=="FPKM"){
column=8
}else if(mRNA_expr_type=="FPKM_UQ"){
column=9
}
rnaMatrix <- do.call("cbind", lapply(filenames, function(fl)
read.table(fl,skip=6,sep="\t")[,column]))
ensembl <- read.table(filenames[1],skip=6,sep="\t",stringsAsFactors = F)$V1
gene_symbol <- read.table(filenames[1],skip=6,sep="\t",stringsAsFactors = F)$V2
type <- read.table(filenames[1],skip=6,sep="\t",stringsAsFactors = F)$V3
index=grepl("^\\d+$",sapply(strsplit(ensembl, '.', fixed=TRUE), '[',2))
rnaMatrix=rnaMatrix[index,]
rownames(rnaMatrix) <- sapply(strsplit(ensembl[index], '.', fixed=TRUE), '[',1)
gene_symbol=gene_symbol[index]
type=type[index]
colnames(rnaMatrix) <- metadata$sample
nSamples = ncol(rnaMatrix)
nGenes = nrow(rnaMatrix)
if(RNA_type){
rnaMatrix=data.frame(type,rnaMatrix,stringsAsFactors = F,check.names = F)
}
if(symbol){
rnaMatrix=data.frame(gene_symbol,rnaMatrix,stringsAsFactors = F,check.names = F)
}
message (paste('Number of samples: ', nSamples, '\n', sep=''),
paste('Number of genes: ', nGenes, '\n', sep=''))
return (rnaMatrix)
}else if (data.type=='miRNAs') {
message ('############### Merging miRNAs data ###############\n')
mirMatrix <- lapply(filenames, function(fl) filtermir(fl))
mirs <- mirbase$V1
mirMatrix <- do.call('cbind', lapply(mirMatrix,
function(expr) expr[mirs]))
rownames(mirMatrix) <- mirbase$V2
colnames(mirMatrix) <- metadata$sample
mirMatrix[is.na(mirMatrix)] <- 0
nSamples = ncol(mirMatrix)
nGenes = nrow(mirMatrix)
message (paste('Number of samples: ', nSamples, '\n', sep=''),
paste('Number of miRNAs: ', nGenes, '\n', sep=''))
return (mirMatrix)
}else{
stop('data type error!')
}
}
filtermir <- function(fl) {
expr <- read.table(fl, header=TRUE, stringsAsFactors = FALSE)
expr <- expr[TPMtsWith(expr$miRNA_region, "mature"),]
expr <- aggregate(expr$read_count, list(expr$miRNA_region), sum)
mirs <- sapply(strsplit(expr$Group.1, ',', fixed=TRUE),'[',2)
expr <- expr[,-1]
names(expr) <- mirs
return(expr)
}
FilterDuplicate <- function(metadata) {
filter <- which(duplicated(metadata[,'sample']))
if (length(filter) != 0) {
metadata <- metadata[-filter,]
}
message (paste('Removed', length(filter), 'samples', sep=' '))
return (metadata)
}
FilterSampleType <- function(metadata) {
filter <- which(! metadata$sample_type %in%
c('PrimaryTumor', 'SolidTissueNormal'))
if (length(filter) != 0) {
metadata <- metadata[-filter,]
}
message (paste('Removed', length(filter), 'samples', sep=' '))
return (metadata)
}
metaMatrix.RNA=read.table("RNAseq_sample_sheet.tsv",sep="\t",header=T)
names(metaMatrix.RNA)=gsub("sample_id","sample",gsub("\\.","_",tolower(names(metaMatrix.RNA))))
metaMatrix.RNA$sample_type=gsub(" ","",metaMatrix.RNA$sample_type)
metaMatrix.RNA <- FilterDuplicate(metaMatrix.RNA)
metaMatrix.RNA <- FilterSampleType(metaMatrix.RNA)
RNA_TPM=merge_TCGA(metadata=metaMatrix.RNA,
path="RNAseq",
data.type="RNAseq",
mRNA_expr_type="TPM",
symbol = T,
RNA_type=T
)
RNA_TPM=as.matrix(RNA_TPM)
rownames(RNA_TPM)=RNA_TPM[,1]
RNA_TPM_exp=RNA_TPM[,3:ncol(RNA_TPM)]
RNA_TPM_dimnames=list(rownames(RNA_TPM_exp),colnames(RNA_TPM_exp))
RNA_TPM_data=matrix(as.numeric(as.matrix(RNA_TPM_exp)),nrow=nrow(RNA_TPM_exp),dimnames=RNA_TPM_dimnames)
RNA_TPM_data=avereps(RNA_TPM_data)
write.table(file="combined_RNAseq_TPM.txt",RNA_TPM_data,sep="\t",quote=F)
RNA_TPM=as.data.frame(RNA_TPM)
RNA_TPM_lnc=RNA_TPM[RNA_TPM$type=="lncRNA",]
RNA_TPM_lnc=as.matrix(RNA_TPM_lnc)
rownames(RNA_TPM_lnc)=RNA_TPM_lnc[,1]
RNA_TPM_lnc_exp=RNA_TPM_lnc[,3:ncol(RNA_TPM_lnc)]
RNA_TPM_lnc_dimnames=list(rownames(RNA_TPM_lnc_exp),colnames(RNA_TPM_lnc_exp))
RNA_TPM_lnc_data=matrix(as.numeric(as.matrix(RNA_TPM_lnc_exp)),nrow=nrow(RNA_TPM_lnc_exp),dimnames=RNA_TPM_lnc_dimnames)
RNA_TPM_lnc_data=avereps(RNA_TPM_lnc_data)
write.table(file="combined_RNAseq_TPM_lncRNA.txt",RNA_TPM_lnc_data,sep="\t",quote=F)