037 Pca分析

代码目的

主成分分析 (PCA) 是一种无监督学习的多元统计分析方法,用于将高维数据投影到较低维空间,提取多元事物的主要因素,揭示其本质特征。它被广泛应用于很多领域,比如理论物理学、气象学、心理学、生物学、化学、工程学等。

代码


#代码在微信公众号,不懂绘图中,请自己翻历史图文消息。
#相关运行问题请加qq群687752308
#也接代做的
rm(list=ls())
library(ade4)
library(vegan)
library(gclus)
library(ape)
library(missMDA)
library(FactoMineR)
library(ggplot2)
library(ggrepel)
library(ggthemes)
library(ggExtra)
#导入数据
env<-read.csv("D:/生信代码复现/PCA分析/env.csv",header = T,row.names = 1)

# 使用RDA函数执行pca分析
#scale = TRUE代表进行标准化
env.pca <- rda(env, scale = TRUE)
env.pca
# scaling参数代表为标尺类型
#默认为2型标尺
summary(env.pca)
#可以切换为1型标尺
summary(env.pca, scaling = 1)
#提取特征值
ev <- env.pca$CA$eig
ev
#kaiser-Guttman准则选取排序轴
ev[ev>mean(ev)]
#断棍模型
n<-length(ev)
bsm<-data.frame(j=seq(1:n),p=0)
bsm$p[1]<-1/n          
for(i in 2:n){
  bsm$p[i]=bsm$p[i-1]+(1/(n+1-i))
}
bsm$p<-100*bsm$p/n
bsm
#特征值
barplot(ev,main="Eigenvalues",col="bisque",las=2)
abline(h=mean(ev),col="red")#特征根平均值
legend("topright","Average eigenvalue",lwd=1,col=2,bty="n")
#方差百分比
barplot(t(cbind(100*ev/sum(ev),bsm$p[n:1])),beside=TRUE,
        main="%variance",col=c("bisque",2),las=2)
legend("topright",c("%eigenvalue","Broken stick model"),
       pch=15,col=c("bisque",2),bty="n")
#1型标尺默认图
biplot(env.pca, scaling = 1, main = "PCA - scaling 1")
#2型标尺默认图
biplot(env.pca, main = "PCA - scaling 2")
#使用之前的代码进行绘图
#样方坐标
site<-env.pca$CA$u[,1:2]
head(site)
#环境坐标
data<-env.pca$CA$v[,1:2]
head(data)
#解释度
pca1<-round(env.pca$CA$eig[1]/sum(env.pca$CA$eig)*100,2)
pca1
pca2<-round(env.pca$CA$eig[2]/sum(env.pca$CA$eig)*100,2)
pca2
#绘图
grp=as.data.frame(c(rep("A",6),rep("B",6),
                    rep("C",6),rep("D",6),rep("E",6)))###对样方分组
colnames(grp)="group"###重命名列名
p<-ggplot() +
  geom_point(data = site,aes(PC1,PC2,color=grp$group),size=4)+
  scale_colour_manual(values=c("#99e5f3","#67a8cd","#ffc17f"
                               ,"#cf9f88","#6fb3a8"))+#颜色可修改
  geom_segment(data = data,aes(x = 0, y = 0, xend = PC1, yend = PC2), 
               arrow = arrow(angle=22.5,length = unit(0.35,"cm"),
                             type = "closed"),linetype=1, size=0.6,colour = "red")+#颜色可修改
  geom_text_repel(data = data,aes(PC1,PC2,label=row.names(data)))+
  labs(x="PCA1 54.26%",y="PCA2 19.67%")+#修改为自己解释率
  geom_hline(yintercept=0,linetype=3,size=1) + 
  geom_vline(xintercept=0,linetype=3,size=1)+
  guides(shape=guide_legend(title=NULL,color="black"),
         fill=guide_legend(title=NULL))+
  theme_bw()+theme(panel.grid=element_blank())
p
#有分组文件时,添加置信椭圆
p1<-ggplot() +
  geom_point(data = site,aes(PC1,PC2,color=grp$group),size=4)+
  scale_colour_manual(values=c("#99e5f3","#67a8cd","#ffc17f"
                               ,"#cf9f88","#6fb3a8"))+#颜色可修改
  stat_ellipse(data = site, geom = "polygon", level = 0.9,
               linetype = 1, linewidth = 0.1,
               aes(fill = grp$group, x = PC1, y = PC2),
               alpha = 0.3, show.legend = T) +
  scale_fill_manual(values = c("#99e5f3","#67a8cd","#ffc17f"
                               ,"#cf9f88","#6fb3a8"))+
  geom_segment(data = data,aes(x = 0, y = 0, xend = PC1, yend = PC2), 
               arrow = arrow(angle=22.5,length = unit(0.35,"cm"),
                             type = "closed"),linetype=1, size=0.6,colour = "red")+#颜色可修改
  geom_text_repel(data = data,aes(PC1,PC2,label=row.names(data)))+
  labs(x="PCA1 54.26%",y="PCA2 19.67%")+#修改为自己解释率
  geom_hline(yintercept=0,linetype=3,size=1) + 
  geom_vline(xintercept=0,linetype=3,size=1)+
  guides(shape=guide_legend(title=NULL,color="black"),
         fill=guide_legend(title=NULL))+
  theme_bw()+theme(panel.grid=element_blank())
p1
#添加边际密度曲线
ggMarginal(
  p1,
  type=c('density'),
  margins='both',
  size=3.5,
  groupColour=F,
  groupFill=T
)
#将PCA结果和聚类结果同时放在一起
#对数据进行标准化,然后计算欧式距离进行聚类分析
env.w <- hclust(dist(scale(env)), "ward.D")
#剪裁聚类树,只保留4个聚类簇
gr <- cutree(env.w, k = 4)
grl <- levels(factor(gr))
#提取样方坐标。1型标尺
sit.sc1 <- scores(env.pca, display = "wa", scaling = 1)
# 按照聚类分析的结果对样方进行上色(1型)
p <- plot(
  env.pca,
  display = "wa",
  scaling = 1,
  type = "n",
  main = "PCA correlation + clusters"
)
abline(v = 0, lty = "dotted")
abline(h = 0, lty = "dotted")
for (i in 1:length(grl)) {
  points(sit.sc1[gr == i, ],
         pch = (14 + i ) ,
         cex = 2,
         col = i + 1)
}
text(sit.sc1, row.names(env), cex = 0.7, pos = 3)
# PCA中添加聚类树
ordicluster(p, env.w, col = "dark grey")
# 添加图例
legend(
  locator(1),
  paste("Cluster", c(1:length(grl))),
  pch = 14 + c(1:length(grl)),
  col = 1 + c(1:length(grl)),
  pt.cex = 2
)
#tips如果是物种数据进行Hellinger转换

复现图片

image.png