# 高级生物统计学-数据清洗代码整理
# 高级生物统计学数据清洗相关代码整理-自学留用
# Start from a clean workspace and release unused memory.
rm(list = ls())
gc()
# Uninstall the "rTG" package from the machine-specific library, but only when
# it is actually installed there. An unguarded remove.packages() stops the
# script with an error whenever the package cannot be found, i.e. on every run
# after the first one and on any other machine.
local({
  lib_dir <- file.path("G:/R1216/R-4.2.1/library")
  if (nzchar(system.file(package = "rTG", lib.loc = lib_dir))) {
    remove.packages(c("rTG"), lib = lib_dir)
  }
})
# Background reading on apply, lapply, sapply, tapply:
# https://www.jianshu.com/p/59fb24ca2ea7
# ---- Data structures ----
# Vector
myvector <- c(1, 2, 4, 5)
myvector
# Matrix: the values 1..20 arranged in 4 rows and 5 columns.
mymatrix <- matrix(1:20, nrow = 4, ncol = 5)
mymatrix
# Array: 3 x 4 x 2, with one vector of dimension names per dimension.
arr1 <- c("a", "b")
arr2 <- c("tree", "grass", "sea")
arr3 <- c("apple", "pear", "strawberry", "cherry")
myarray <- array(1:24, dim = c(3, 4, 2), dimnames = list(arr2, arr3, arr1))
myarray
# Data frame built from three vectors of equal length.
name <- c("jackson", "anna", "lily")
age <- c(29, 26, 31)
height <- c(183, 165, 159)
mydata <- data.frame(name, age, height)
mydata
# Create a1 as an empty vector, then assign into it element by element.
a1 <- NULL
a1[1] <- 0.2
a1
a1[2] <- -0.2
a1
a1[100] <- 0.7 # after this assignment positions 3-99 are all NA
a1
# ---- stringr package ----
# Overview of the stringr helpers:
#   str_sub:         extract a substring by position
#   str_split:       split a string
#   str_split_fixed: split a string, like str_split
#   str_subset:      return the strings that match
#   str_extract:     extract the matching characters from a string
#   str_to_upper:    convert a string to upper case
#   str_to_lower:    convert a string to lower case
library(stringr)
# str_split_fixed
str_split_fixed("hello-jack", "-", n = 2) # split on "-"
str_split_fixed("Apple-Banana-Orange", "-", n = 2)[1, 1]
str_split_fixed("2022-10-19", "-", n = 3) # split on "-" into three parts
str_split_fixed("2022-10-19", "-", n = 3)[1, 2] # row 1, column 2
b1 <- str_split_fixed("2022-10-19", "-", n = 3)[, 1] # first column, stored in b1
b1
as.numeric(paste(b1)) # convert to numeric
as.numeric(b1)
# str_split
str_split("a-b-c", "-", n = 3)
fruits <- c(
  "apples and oranges and pears and bananas",
  "pineapples and mangos and guavas"
)
str_split(fruits, " and ")
str_split(fruits, " and ", simplify = TRUE)
# data(mtcars) was fused onto the end of the previous call (a syntax error);
# it is a statement of its own.
data(mtcars)
abc <- mtcars$wt
abc
abcd <- as.character(abc) # convert to character strings
abcd
# str_sub() picks part of a string by start and end position: here the
# characters at positions 2 and 3 of the first element of abcd.
str_sub(abcd[1], 2, 3)
# Last two characters of the second element of abcd.
str_sub(abcd[2], -2, -1)
# Replacement form: positions 2..2 of the third element are replaced by "-".
str_sub(abcd[3], 2, 2) <- "-"
abcd
# Split the fourth element into the parts before and after the decimal point.
str_split(abcd[4], "[.]")
# ------------------------------------------------------------------------------
# Split into single characters; returns a list.
str_split(string = "banana", pattern = "")
# Split into single characters; returns a matrix.
str_split(string = "banana", pattern = "", simplify = TRUE)
# Split into a fixed number of pieces.
str_split_fixed(string = "banana", pattern = "", n = 3)
# ------------------------------------------------------------------------------
# Extract parts of a string and show them in new columns.
set.seed(1)
date1 <- seq(as.Date("2000/1/1"), as.Date("2000/5/1"), by = "day")
length(date1)
v1 <- c(
  runif(22, min = 0, max = 1),
  runif(50, min = 10, max = 20),
  runif(50, min = 20, max = 30)
)
df <- data.frame(t1 = date1, v2 = v1)
x1 <- head(df)
x1
str_split_fixed(x1$t1, "-", n = 3)
# str_sub(x1$t1, 1, 4) is another way to get the year.
x1$year <- str_split_fixed(x1$t1, "-", n = 3)[, 1]
x1$month <- str_split_fixed(x1$t1, "-", n = 3)[, 2]
x1$day <- str_split_fixed(x1$t1, "-", n = 3)[, 3]
x1
# Convert columns 3-5 to numeric.
x1[, 3:5] <- lapply(x1[, 3:5], function(column) as.numeric(paste(column)))
# Join the strings of several columns and show the result in a new column.
data("airquality")
head(airquality)
airquality$Year <- 2005
airquality$new <- seq_len(nrow(airquality))
# The pieces are joined without any separator. Compare with
# paste(airquality$Year, "-", airquality$Month, "-", airquality$Day),
# which inserts the default separator between the pieces.
airquality$new <- paste0(
  airquality$Year, "-", airquality$Month, "-", airquality$Day
)
head(airquality)
# ------------------------------------------------------------------------------
# Loops in R
ab <- c(1, 3, 5, 7) # build the vector ab

# Cube every element of a numeric vector with an explicit for loop.
#
# x: a numeric vector (may be empty).
# Returns a numeric vector of the same length as x holding x[i]^3.
forloop <- function(x) {
  n <- length(x) # number of iterations
  vec <- numeric(n) # preallocate the result instead of growing it
  # seq_len(n) is empty when n == 0, whereas 1:n would iterate over c(1, 0).
  for (i in seq_len(n)) {
    vec[i] <- x[i]^3 # core calculation
  }
  return(vec)
}
forloop(ab) # run the loop on the vector ab
# The same kind of batch calculation can be done without a loop.
v1 <- c(1, 3, 5, 7)
v1^2 + 1
# ------------------------------------------------------------------------------
data(iris)
head(iris)
# Batch calculation on the iris data: the length/width ratio goes into a new
# column named new.
iris$new <- iris$Sepal.Length / iris$Sepal.Width
# log10 of the sepal width goes into a new column named log_width. This
# statement had been absorbed into the trailing comment of the previous line,
# so it never ran.
iris$log_width <- log10(iris$Sepal.Width)
head(iris)
# This assignment was fused onto the head(iris) call above (a syntax error).
iris$t1 <- iris$Sepal.Length^2 - 1
head(iris)

# Square every element of a numeric vector with an explicit for loop.
#
# x: a numeric vector (may be empty).
# Returns a numeric vector of the same length as x holding x[i]^2.
forloop1 <- function(x) {
  # n must be the length itself. The original stored the vector 1:length(x)
  # in n and then looped over 1:n, which uses only the first element of n,
  # so only x[1] was ever squared.
  n <- length(x)
  vec <- numeric(n)
  for (i in seq_len(n)) {
    vec[i] <- x[i]^2
  }
  return(vec)
}
forloop(iris$t1)
# This assignment was fused onto the forloop() call above (a syntax error).
aa1 <- abs(rnorm(100))
aa1
forloop(aa1)
# ------------------------------------------------------------------------------
# simple data frame tibble
rm(airquality)
# Renaming columns
data("airquality")
head(airquality)
# Method 1: through names()
names(airquality)[which(names(airquality) == "Ozone")] <- "xx"
# Method 2: through colnames()
colnames(airquality)[which(colnames(airquality) == "xx")] <- "yy"
colnames(airquality)[1] <- "Ozone"
# Changing the case of the names
colnames(airquality) <- tolower(colnames(airquality)) # every name in lower case
head(airquality)
colnames(airquality) <- toupper(colnames(airquality)) # every name in upper case
head(airquality)
# ------------------------------------------------------------------------------
# reshape2 melt and dcast: converting between wide and long formats
# Wide to long
# Example 1
library(reshape2)
data("airquality")
head(airquality)
names(airquality) <- toupper(names(airquality))
colnames(airquality) <- tolower(colnames(airquality)) # every name in lower case
# id.vars are the columns treated as identifiers; each keeps its own column
# in the result.
air_long <- melt(airquality, id.vars = c("month", "day"))
head(air_long)
unique(air_long$variable) # see which variables were kept
# Example 2: also set the names of the variable and value columns
aql <- reshape2::melt(airquality,
  id.vars = c("month", "day"),
  variable.name = "climate_variable",
  value.name = "climate_value"
)
head(aql)
# Column names can also be changed this way. This assignment was fused onto
# the head(aql) call above (a syntax error).
colnames(aql)[1] <- "month"
head(aql)
# Long to wide
aqw <- reshape2::dcast(aql, month + day ~ climate_variable,
  value.var = "climate_value"
)
# month and day are kept as they are; climate_value is spread over one column
# per climate_variable.
head(aqw)
# ------------------------------------------------------------------------------
# Converting between long and wide data
# Wide to long
library(reshape2)
data("iris")
head(iris)
iris_long <- melt(iris,
  id.vars = c("Species"),
  variable.name = "traits",
  value.name = "measurement"
)
# Species is the identifier; the variable names go into the column traits and
# the matching values into the column measurement.
iris_long
dim(iris_long)
str(iris)
# This assignment was fused onto the str(iris) call above (a syntax error).
# iris has 150 obs. of 5 variables with 3 species, i.e. 50 observations per
# species for each of Sepal.Length, Sepal.Width, Petal.Length and Petal.Width;
# the observations are numbered accordingly.
code <- rep(1:50, 12)
iris_frame <- data.frame(iris_long, code)
head(iris_frame)
# Rename the column
colnames(iris_frame)[4] <- "ID"
head(iris_frame)
# ------------------------------------------------------------------------------
# Long to wide
library(reshape2)
iris_width <- dcast(
  iris_frame,
  Species + ID ~ traits,
  value.var = "measurement"
)
head(iris_width)
iris_width
dim(iris_width)
# Subset of iris_long whose traits value is "Sepal.Length"
abc1 <- subset(iris_long, traits == "Sepal.Length")
head(abc1)
dim(abc1)
# ------------------------------------------------------------------------------
# The same numbering done with a function and a loop

# Add a within-species running index to a data frame.
#
# iris: a data frame with a Species column.
# Returns the rows of `iris` stacked species by species (in order of first
# appearance) as a data.table, with an extra ID column counting 1..n within
# each species.
fadd <- function(iris) {
  fp <- unique(iris$Species)
  a1 <- vector("list", length(fp))
  # seq_along()/seq_len() stay empty for zero groups or zero rows, where
  # 1:n would count down over c(1, 0).
  for (i in seq_along(fp)) {
    abc1 <- subset(iris, Species == fp[i])
    abc1$ID <- seq_len(nrow(abc1))
    a1[[i]] <- abc1
  }
  # Namespace-qualified because, in the visible part of the script,
  # library(data.table) only appears further down, so a bare rbindlist()
  # would not be found here.
  a2 <- data.table::rbindlist(a1)
  return(a2)
}
# tail(abc1) was fused onto the closing brace of fadd (a syntax error).
tail(abc1)
head(abc1)
dim(abc1)
# ------------------------------------------------------------------------------
# Sorting with arrange(): ascending by default, desc() for descending
x1 <- head(iris)
x1
library(plyr)
x2 <- arrange(x1, Sepal.Length) # ascending, the default
x2 <- arrange(x1, desc(Sepal.Length)) # descending
print(x2)
# ------------------------------------------------------------------------------
# ddply
# Example 1
# The original had a bare ddply() call here, which stops with a
# missing-argument error, so it is left out.
set.seed(1)
d <- data.frame(
  year = rep(2000:2002, each = 3), # note the difference from rep(2000:2002, 3)
  count = round(runif(9, 0, 20))
)
# "dddply(" in the original was the print of d fused with the ddply() call.
d
ddply(d, "year", summarise, mean_count = mean(count), sd_count = sd(count))
# Example 2
c1 <- data.frame(
  year = rep(2000:2001, each = 5),
  # The original passed `n = 5`, which is not one of rep()'s arguments
  # (times, each, length.out); the alternating 24/25 column then came from
  # data.frame() recycling. `times = 5` states that result explicitly.
  age = rep(24:25, times = 5),
  # 10 random numbers between 0 and 20, rounded by round()
  count = round(runif(10, 0, 20)),
  content = runif(10, 0, 1)
)
c1
# Means etc. by year and age
# Method 1
ddply(c1, .(year, age), summarise,
  mean = mean(count), total = sum(count), number = length(count)
)
# Method 2
ddply(c1, c("year", "age"), summarise,
  mean = mean(count), total = sum(count), number = length(count)
)
# Mean and total by year, followed by a second calculation on those results
c01 <- ddply(c1, "year", summarise,
  mean = mean(count), total = sum(count), number = length(count),
  a1 = mean / total
)
# the second calculation works
c01
c11 <- ddply(c1, "year", mutate,
  mean = mean(count), total = sum(count), number = length(count),
  a1 = mean / total
)
# the second calculation works; compare the result with c01
c11
# With transform the second calculation is not possible and the call fails.
# The error is caught so that this demonstration does not abort a sourced
# script; c12 then holds the error message.
c12 <- tryCatch(
  ddply(c1, "year", transform,
    mean = mean(count), total = sum(count), number = length(count),
    a2 = mean / total
  ),
  error = function(e) conditionMessage(e)
)
c12
# Error noted by the original author:
#   Error in mean/total : non-numeric argument to binary operator
# mutate versus transform: mutate can do both the first and the second
# calculation, i.e. after computing mean and total it can go on to compute
# with them (add, subtract, multiply, divide).
# practice2
c011 <- ddply(c1, c("year", "age"), summarise, mean = mean(count))
c011
c111 <- ddply(c1, c("year", "age"), mutate,
  mean = mean(count), total = sum(count), a1 = mean / total
)
c111
# Same transform limitation as for c12; the error is caught here as well.
c122 <- tryCatch(
  ddply(c1, c("year", "age"), transform,
    mean = mean(count), total = sum(count), a2 = mean / total
  ),
  error = function(e) conditionMessage(e)
)
c122
# Objects you create are best given readable names.
# Example 3: data containing NA values
# Reload the dataset first: earlier sections of this script change the case of
# its column names, while the calls below use Month, Solar.R, Temp and Day.
data("airquality")
head(airquality)
solar <- ddply(airquality, "Month", summarise,
  mean_Solar.R = mean((Solar.R), na.rm = TRUE)
)
# The next three prints/assignments were fused pairwise in the original
# ("solarST <-", "head(ST)A1 <-", "A1A2 <-"), which broke the script.
solar
ST <- ddply(airquality, "Month", mutate, ST = Solar.R / Temp)
head(ST)
A1 <- ddply(airquality, "Month", mutate,
  aa1 = mean(Solar.R, na.rm = TRUE),
  aa2 = mean(Temp, na.rm = TRUE),
  aa3 = aa1 / aa2
)
# Author's note: na.rm should really not be used here, so that one can see
# which values cannot be calculated.
A1
A2 <- ddply(airquality, "Day", mutate, aa3 = Solar.R / Temp)
A2
# ------------------------------------------------------------------------------
# Converting columns to factors
head(mtcars)
str(mtcars)
mtcars[["gear"]] <- factor(mtcars[["gear"]])
mtcars[["carb"]] <- factor(mtcars[["carb"]])
ddply(mtcars, "gear", summarise, mean_disp = mean(disp, na.rm = TRUE))
# ------------------------------------------------------------------------------
# Reading and saving data (csv format)
# Method 1: read the csv with fread() from the data.table package
getwd()
setwd("F:/02 博士课程相关/统计学课程/") # remember to change this path each time
library(data.table)
data <- fread("data20221028.csv", header = TRUE)
head(data)
dim(data)
# Method 2: read the csv with read.csv()
data2 <- read.csv("data20221028.csv", header = TRUE)
head(data2)
dim(data2)
# Keep only columns 2 to 9
data3 <- data[, 2:9]
dim(data3)
head(data3)
# Renaming columns
# Method 1: rename() from the plyr package
library(plyr)
data3 <- rename(data3, c(day = "doy", sp = "species"))
# Method 2: names()
names(data3)[names(data3) == "pep_id"] <- "ID" # or names(data3)[1] <- "ID"
# Method 3
colnames(data3)[2] <- "latitude"
colnames(data3)[3] <- "longitude"
head(data3)
# Method 4
head(iris)
# ignore.case = TRUE matches the pattern regardless of case
colnames(iris) <- gsub("sepal", "SEP", colnames(iris), ignore.case = TRUE)
head(iris)
# Calculations
# Mean doy for every species/bbch combination. The result is not stored under
# the name `mean`, which would mask the function name used in the same call.
doy_means <- ddply(data3, c("species", "bbch"), summarise, doy_mean = mean(doy))
doy_means
# Saving
fwrite(data3, "my_data.csv") # written to the file my_data.csv
getwd()
# The next statements were fused in the original ("getwd()rm(list=ls())" and
# "gc()??fwrite"), which is a syntax error.
rm(list = ls())
gc()
??fwrite
# ------------------------------------------------------------------------------
# tibble format
library(nycflights13)
flights
dim(flights)
head(flights)
unique(flights$month)
# Filtering the data on particular values of a column
# Method 1: filter() from the dplyr package picks rows
library(dplyr)
res1 <- filter(flights, month == 1, day == 1) ### or ###
res1
# Method 2: subset() also works (it is a base R function, not part of dplyr);
# conditions can be joined with & or |
res2 <- subset(flights, month == 1 & day == 1)
# Method 3: logical indexing
res3 <- flights[flights$month == 1 & flights$day == 1, ] ### and ###
# nrow(res1) had been absorbed into the trailing comment of the previous line,
# so it never ran.
nrow(res1)
nrow(res2)
# select() picks columns; here year, month and dep_time
res4 <- select(flights, year, month, dep_time)
head(res4)
# ------------------------------------------------------------------------------
# sample_n() and sample_frac() draw random samples.
# Example 1: sampling from a vector
# set.seed(200) had been absorbed into the comment line above it in the
# original, so the seed was never set.
set.seed(200)
# The main purpose of set.seed() is to make a simulation reproducible:
# random numbers are needed often, and without a seed the results differ each
# time the code is run. When the same simulated result has to come out again,
# use set.seed().
# The number in the parentheses can be thought of as just a label:
# set.seed(100) should not be read as "one hundred" but as
# "the random number stream labelled one-zero-zero".
# Another simulation can use 200 or 111 or any other label;
# the choice of label is essentially arbitrary.
x <- 15:20
x
sample(x, 9, replace = TRUE)
# replace = FALSE (the default) samples without replacement, so size may not
# exceed the length of x;
# replace = TRUE samples with replacement, so size may exceed the length of x.
# Randomly sample rows with sample_n() and sample_frac()
# 1. sample_n()
library(nycflights13)
head(flights)
library(dplyr)
k <- sample_n(flights, 20) # sample_n() draws rows from a data frame
k
dim(k)
# 2. sample_frac()
sample_frac(flights, 0.01) # keep a 0.01 fraction of the rows
# ------------------------------------------------------------------------------
# Calculations on the data
# Method 1: ddply() from the plyr package
getwd()
setwd("F:/02 博士课程相关/统计学课程/")
library(data.table)
data <- fread("data20221028.csv", header = TRUE)
head(data)
# library(plyr) was fused onto the head(data) call above (a syntax error).
library(plyr)
d1 <- ddply(data, c("year", "sp"), summarise,
  bbch_mean = mean(bbch), day_sum = sum(day)
)
head(d1)
# Method 2: group the data first, then calculate; every calculation is then
# done per group. Both verbs are namespace-qualified because plyr is attached
# as well and exports a summarise() of its own.
group_1 <- dplyr::group_by(data, year, sp)
d2 <- dplyr::summarise(group_1, bbch_mean = mean(bbch), day_sum = sum(day))
head(d2)
# Method 3: calculate directly; note that no grouping is involved
d0 <- c(
  mean(data$bbch, na.rm = TRUE),
  mean(data$day, na.rm = TRUE)
)
d0
# ------------------------------------------------------------------------------
# Standardising with scale(): mean 0, standard deviation 1. mutate is chosen
# so that later calculations can use the new columns; c() is a placeholder
# for the grouping variables.
# Example 1: standardise to remove the effect of the units of Wind and Temp
# on Ozone
z_air <- ddply(
  airquality, c(), mutate,
  ZOzone = scale(airquality$Ozone),
  ZWind = scale(airquality$Wind),
  ZTemp = scale(airquality$Temp)
)
head(z_air)
dim(z_air)
dim(airquality)
# Without standardisation: judge the size of the effects of wind and temp
# on ozone
z1 <- lm(Ozone ~ Wind + Temp, data = z_air)
# With standardisation: judge the size of the effects of wind and temp
# on ozone
z2 <- lm(ZOzone ~ ZWind + ZTemp, data = z_air)
summary(z1)
# Estimates noted by the original author: -3.0555 and 1.8402
summary(z2)
# Estimates noted by the original author: -0.32632 and 0.52801; standardising
# this way removes the effect of the units and makes comparison easier.
# Example 2: remove the effect of the grouping variable (gear in the call
# below) on disp so that disp values can be compared
# (similar to removing the effect of different sites (space) in phenology
# studies so that the DOY of different sites can be compared)
head(mtcars)
z_geartodisp <- ddply(mtcars, "gear", mutate, scale_disp = scale(disp))
head(z_geartodisp)
# ------------------------------------------------------------------------------
# Merging data frames
# Method 1: merge()
dfA <- data.frame(
  id = c("age", "height", "width"),
  value1 = c(20, 180, 130)
)
dfB <- data.frame(
  id = c("age", "height", "weight"),
  value2 = c(30, 165, 120)
)
merge(dfA, dfB, by = "id", all.x = TRUE) # merge on the ids of x
merge(dfA, dfB, by = "id") # rows whose id is in both data frames
# Method 2: join() from the plyr package
x <- data.frame(id = as.character(1:4), value = c(3, 3, 4, 4))
y <- data.frame(id = c("1", "2", "3", "5"), value1 = c(3, 3, 4, 4))
x
y
plyr::join(x, y, by = "id", type = "left")
plyr::join(x, y, by = "id", type = "right")
plyr::join(x, y, by = "id", type = "full") # union
plyr::join(x, y, by = "id", type = "inner") # intersection
# ------------------------------------------------------------------------------
# tidyverse is a collection package that includes dplyr, ggplot2, tibble etc.
library(tidyverse)
head(iris)
# Randomly draw 2 rows from columns 5 and 1 of iris
d1 <- sample_n(iris[c(5, 1)], size = 2)
# Randomly draw 3 rows from columns 5, 1 and 2 of iris
d2 <- sample_n(iris[c(5, 1, 2)], size = 3)
d1
d2
# Merge d1 and d2; the by columns must be present in both d1 and d2
d3 <- plyr::join(
  d1, d2,
  by = c("Species", "Sepal.Length"),
  type = "full"
)
d3
nrow(d3)
# ------------------------------------------------------------------------------
# subset() picks the rows that are needed, select = picks the columns
head(iris)
# %in% tests every row against the whole set of values. The original used ==,
# which recycles c(1.3, 1.4, 1.7) along the rows, so each row was compared
# with only one of the three values and most matching rows were dropped.
d4 <- subset(iris, Petal.Length %in% c(1.3, 1.4, 1.7),
  select = c("Sepal.Length", "Sepal.Width", "Petal.Length")
)
head(d4)
dim(d4)
# Sums with apply & lapply
# apply() takes a data frame or matrix as input and returns a vector, list or
# array.
# 1 means: the sum of every row
apply(d4, 1, sum)
# 2 means: the sum of every column
apply(d4, 2, sum)
# lapply() performs an operation on a list object and returns a list of the
# same length as the original collection.
lapply(d4, function(column) column + 3)
# %>% (the forward-pipe operator) is the most commonly used pipe operator:
# it passes the data or expression prepared on its left to the function call
# or expression on its right, so operations can be chained one after another.
lapply(d4, function(column) column + 3) %>%
  as.data.frame()

# 魔乐社区(Modelers.cn) 是一个中立、公益的人工智能社区,提供人工智能工具、模型、数据的托管、展示与应用协同服务,为人工智能开发及爱好者搭建开放的学习交流平台。社区通过理事会方式运作,由全产业链共同建设、共同运营、共同享有,推动国产AI生态繁荣发展。
# 更多推荐
# 所有评论(0)