data.table的使用(与dplyr包比较)

library(dplyr)
library(data.table)
library(lubridate)
library(jsonlite)
library(tidyr)
library(ggplot2)
library(compare)

spending=fromJSON("https://data.medicare.gov/api/views/nrth-mfg3/rows.json?accessType=DOWNLOAD")
names(spending)
meta=spending$meta
hospital_spending=data.frame(spending$data)
colnames(hospital_spending)=make.names(meta$view$columns$name)
hospital_spending=select(hospital_spending,-c(sid:meta))
#dplyr包,Get a glimpse of your data
glimpse(hospital_spending)

cols = 6:11; # 需要改变数据类型的列
hospital_spending[,cols] <- lapply(hospital_spending[,cols],as.character)
hospital_spending[,cols] <- lapply(hospital_spending[,cols],as.numeric)

cols = 12:13; # 需要纠正的两列
hospital_spending[,cols] <- lapply(hospital_spending[,cols],as.POSIXct)

#检查下数据列是否是我们想要的数据类型
sapply(hospital_spending, class)

#使用data.table函数创建data.table类型数据
hospital_spending_DT = data.table(hospital_spending)
class(hospital_spending_DT)

#选取一个变量
from_dplyr = select(hospital_spending, Hospital.Name)
from_data_table = hospital_spending_DT[,.(Hospital.Name)]

#删除一个变量
from_dplyr = select(hospital_spending, -Hospital.Name)
from_data_table = hospital_spending_DT[,!c("Hospital.Name"),with=FALSE]

DT=copy(hospital_spending_DT)
DT=DT[,Hospital.Name:=NULL]
"Hospital.Name"%in%names(DT)

#选取多个变量
from_dplyr = select(hospital_spending, Hospital.Name,State,Measure.Start.Date,Measure.End.Date)
from_data_table = hospital_spending_DT[,.(Hospital.Name,State,Measure.Start.Date,Measure.End.Date)]

#删除多个变量
from_dplyr = select(hospital_spending, -c(Hospital.Name,State,Measure.Start.Date,Measure.End.Date))
from_data_table = hospital_spending_DT[,!c("Hospital.Name","State","Measure.Start.Date","Measure.End.Date"),with=FALSE]

DT=copy(hospital_spending_DT)
DT=DT[,c("Hospital.Name","State","Measure.Start.Date","Measure.End.Date"):=NULL]
c("Hospital.Name","State","Measure.Start.Date","Measure.End.Date") %in% names(DT)

#将选取所有列名包含字符“Date”的列
from_dplyr = select(hospital_spending,contains("Date"))
from_data_table = subset(hospital_spending_DT,select=grep("Date",names(hospital_spending_DT)))

#重命名列名
setnames(hospital_spending_DT,c("Hospital.Name", "Measure.Start.Date","Measure.End.Date"), c("Hospital","Start_Date","End_Date"))
hospital_spending = rename(hospital_spending,Hospital= Hospital.Name, Start_Date=Measure.Start.Date,End_Date=Measure.End.Date)

#对单个变量进行筛选
from_dplyr = filter(hospital_spending,State=='CA') # selecting rows for California
from_data_table = hospital_spending_DT[State=='CA']

#对多个变量进行筛选
from_dplyr = filter(hospital_spending,State %in% c('CA','MA',"TX"))
from_data_table = hospital_spending_DT[State %in% c('CA','MA',"TX")]

#升序
from_dplyr = arrange(hospital_spending, State)
from_data_table = setorder(hospital_spending_DT, State)

#降序
from_dplyr = arrange(hospital_spending, desc(State))
from_data_table = setorder(hospital_spending_DT, -State)

#对多变量进行排序
#State变量升序,End_Date变量降序排序:
from_dplyr = arrange(hospital_spending, State,desc(End_Date))
from_data_table = setorder(hospital_spending_DT, State,-End_Date)

#添加或更新列
from_dplyr = mutate(hospital_spending, diff=Avg.Spending.Per.Episode..State. - Avg.Spending.Per.Episode..Nation.)
from_data_table = copy(hospital_spending_DT)
from_data_table = from_data_table[,diff := Avg.Spending.Per.Episode..State. - Avg.Spending.Per.Episode..Nation.]

#数据汇总
summarize(hospital_spending,mean=mean(Avg.Spending.Per.Episode..Nation.))
hospital_spending_DT[,.(mean=mean(Avg.Spending.Per.Episode..Nation.))]
summarize(hospital_spending,mean=mean(Avg.Spending.Per.Episode..Nation.),
          maximum=max(Avg.Spending.Per.Episode..Nation.),
          minimum=min(Avg.Spending.Per.Episode..Nation.),
          median=median(Avg.Spending.Per.Episode..Nation.))
hospital_spending_DT[,.(mean=mean(Avg.Spending.Per.Episode..Nation.),
                        maximum=max(Avg.Spending.Per.Episode..Nation.),
                        minimum=min(Avg.Spending.Per.Episode..Nation.),
                        median=median(Avg.Spending.Per.Episode..Nation.))]


head(hospital_spending_DT[,.(mean=mean(Avg.Spending.Per.Episode..Hospital.)),by=.(Hospital)])

mygroup= group_by(hospital_spending,Hospital)
from_dplyr = summarize(mygroup,mean=mean(Avg.Spending.Per.Episode..Hospital.))
from_data_table=hospital_spending_DT[,.(mean=mean(Avg.Spending.Per.Episode..Hospital.)), by=.(Hospital)]

mygroup= group_by(hospital_spending,Hospital,State)
from_dplyr = summarize(mygroup,mean=mean(Avg.Spending.Per.Episode..Hospital.))

#链式操作
#在data.table中,我们可以使用%>%或[来实现链式操作
from_dplyr=hospital_spending %>% group_by(Hospital,State) %>% summarize(mean=mean(Avg.Spending.Per.Episode..Hospital.))
from_data_table=hospital_spending_DT[,.(mean=mean(Avg.Spending.Per.Episode..Hospital.)), by=.(Hospital,State)]


评论

此博客中的热门博文

V2ray websocket(ws)+tls+nginx分流

Rstudio 使用代理