data.table的使用(与dplyr包比较)
library(dplyr)
library(data.table)
library(lubridate)
library(jsonlite)
library(tidyr)
library(ggplot2)
library(compare)
spending=fromJSON("https://data.medicare.gov/api/views/nrth-mfg3/rows.json?accessType=DOWNLOAD")
names(spending)
meta=spending$meta
hospital_spending=data.frame(spending$data)
colnames(hospital_spending)=make.names(meta$view$columns$name)
hospital_spending=select(hospital_spending,-c(sid:meta))
#dplyr包,Get a glimpse of your data
glimpse(hospital_spending)
cols = 6:11; # 需要改变数据类型的列
hospital_spending[,cols] <- lapply(hospital_spending[,cols],as.character)
hospital_spending[,cols] <- lapply(hospital_spending[,cols],as.numeric)
cols = 12:13; # 需要纠正的两列
hospital_spending[,cols] <- lapply(hospital_spending[,cols],as.POSIXct)
#检查下数据列是否是我们想要的数据类型
sapply(hospital_spending, class)
#使用data.table函数创建data.table类型数据
hospital_spending_DT = data.table(hospital_spending)
class(hospital_spending_DT)
#选取一个变量
from_dplyr = select(hospital_spending, Hospital.Name)
from_data_table = hospital_spending_DT[,.(Hospital.Name)]
#删除一个变量
from_dplyr = select(hospital_spending, -Hospital.Name)
from_data_table = hospital_spending_DT[,!c("Hospital.Name"),with=FALSE]
DT=copy(hospital_spending_DT)
DT=DT[,Hospital.Name:=NULL]
"Hospital.Name"%in%names(DT)
#选取多个变量
from_dplyr = select(hospital_spending, Hospital.Name,State,Measure.Start.Date,Measure.End.Date)
from_data_table = hospital_spending_DT[,.(Hospital.Name,State,Measure.Start.Date,Measure.End.Date)]
#删除多个变量
from_dplyr = select(hospital_spending, -c(Hospital.Name,State,Measure.Start.Date,Measure.End.Date))
from_data_table = hospital_spending_DT[,!c("Hospital.Name","State","Measure.Start.Date","Measure.End.Date"),with=FALSE]
DT=copy(hospital_spending_DT)
DT=DT[,c("Hospital.Name","State","Measure.Start.Date","Measure.End.Date"):=NULL]
c("Hospital.Name","State","Measure.Start.Date","Measure.End.Date") %in% names(DT)
#将选取所有列名包含字符“Date”的列
from_dplyr = select(hospital_spending,contains("Date"))
from_data_table = subset(hospital_spending_DT,select=grep("Date",names(hospital_spending_DT)))
#重命名列名
setnames(hospital_spending_DT,c("Hospital.Name", "Measure.Start.Date","Measure.End.Date"), c("Hospital","Start_Date","End_Date"))
hospital_spending = rename(hospital_spending,Hospital= Hospital.Name, Start_Date=Measure.Start.Date,End_Date=Measure.End.Date)
#对单个变量进行筛选
from_dplyr = filter(hospital_spending,State=='CA') # selecting rows for California
from_data_table = hospital_spending_DT[State=='CA']
#对多个变量进行筛选
from_dplyr = filter(hospital_spending,State %in% c('CA','MA',"TX"))
from_data_table = hospital_spending_DT[State %in% c('CA','MA',"TX")]
#升序
from_dplyr = arrange(hospital_spending, State)
from_data_table = setorder(hospital_spending_DT, State)
#降序
from_dplyr = arrange(hospital_spending, desc(State))
from_data_table = setorder(hospital_spending_DT, -State)
#对多变量进行排序
#State变量升序,End_Date变量降序排序:
from_dplyr = arrange(hospital_spending, State,desc(End_Date))
from_data_table = setorder(hospital_spending_DT, State,-End_Date)
#添加或更新列
from_dplyr = mutate(hospital_spending, diff=Avg.Spending.Per.Episode..State. - Avg.Spending.Per.Episode..Nation.)
from_data_table = copy(hospital_spending_DT)
from_data_table = from_data_table[,diff := Avg.Spending.Per.Episode..State. - Avg.Spending.Per.Episode..Nation.]
#数据汇总
summarize(hospital_spending,mean=mean(Avg.Spending.Per.Episode..Nation.))
hospital_spending_DT[,.(mean=mean(Avg.Spending.Per.Episode..Nation.))]
summarize(hospital_spending,mean=mean(Avg.Spending.Per.Episode..Nation.),
maximum=max(Avg.Spending.Per.Episode..Nation.),
minimum=min(Avg.Spending.Per.Episode..Nation.),
median=median(Avg.Spending.Per.Episode..Nation.))
hospital_spending_DT[,.(mean=mean(Avg.Spending.Per.Episode..Nation.),
maximum=max(Avg.Spending.Per.Episode..Nation.),
minimum=min(Avg.Spending.Per.Episode..Nation.),
median=median(Avg.Spending.Per.Episode..Nation.))]
head(hospital_spending_DT[,.(mean=mean(Avg.Spending.Per.Episode..Hospital.)),by=.(Hospital)])
mygroup= group_by(hospital_spending,Hospital)
from_dplyr = summarize(mygroup,mean=mean(Avg.Spending.Per.Episode..Hospital.))
from_data_table=hospital_spending_DT[,.(mean=mean(Avg.Spending.Per.Episode..Hospital.)), by=.(Hospital)]
mygroup= group_by(hospital_spending,Hospital,State)
from_dplyr = summarize(mygroup,mean=mean(Avg.Spending.Per.Episode..Hospital.))
#链式操作
#在data.table中,我们可以使用%>%或[来实现链式操作
from_dplyr=hospital_spending %>% group_by(Hospital,State) %>% summarize(mean=mean(Avg.Spending.Per.Episode..Hospital.))
from_data_table=hospital_spending_DT[,.(mean=mean(Avg.Spending.Per.Episode..Hospital.)), by=.(Hospital,State)]
library(data.table)
library(lubridate)
library(jsonlite)
library(tidyr)
library(ggplot2)
library(compare)
spending=fromJSON("https://data.medicare.gov/api/views/nrth-mfg3/rows.json?accessType=DOWNLOAD")
names(spending)
meta=spending$meta
hospital_spending=data.frame(spending$data)
colnames(hospital_spending)=make.names(meta$view$columns$name)
hospital_spending=select(hospital_spending,-c(sid:meta))
#dplyr包,Get a glimpse of your data
glimpse(hospital_spending)
cols = 6:11; # 需要改变数据类型的列
hospital_spending[,cols] <- lapply(hospital_spending[,cols],as.character)
hospital_spending[,cols] <- lapply(hospital_spending[,cols],as.numeric)
cols = 12:13; # 需要纠正的两列
hospital_spending[,cols] <- lapply(hospital_spending[,cols],as.POSIXct)
#检查下数据列是否是我们想要的数据类型
sapply(hospital_spending, class)
#使用data.table函数创建data.table类型数据
hospital_spending_DT = data.table(hospital_spending)
class(hospital_spending_DT)
#选取一个变量
from_dplyr = select(hospital_spending, Hospital.Name)
from_data_table = hospital_spending_DT[,.(Hospital.Name)]
#删除一个变量
from_dplyr = select(hospital_spending, -Hospital.Name)
from_data_table = hospital_spending_DT[,!c("Hospital.Name"),with=FALSE]
DT=copy(hospital_spending_DT)
DT=DT[,Hospital.Name:=NULL]
"Hospital.Name"%in%names(DT)
#选取多个变量
from_dplyr = select(hospital_spending, Hospital.Name,State,Measure.Start.Date,Measure.End.Date)
from_data_table = hospital_spending_DT[,.(Hospital.Name,State,Measure.Start.Date,Measure.End.Date)]
#删除多个变量
from_dplyr = select(hospital_spending, -c(Hospital.Name,State,Measure.Start.Date,Measure.End.Date))
from_data_table = hospital_spending_DT[,!c("Hospital.Name","State","Measure.Start.Date","Measure.End.Date"),with=FALSE]
DT=copy(hospital_spending_DT)
DT=DT[,c("Hospital.Name","State","Measure.Start.Date","Measure.End.Date"):=NULL]
c("Hospital.Name","State","Measure.Start.Date","Measure.End.Date") %in% names(DT)
#将选取所有列名包含字符“Date”的列
from_dplyr = select(hospital_spending,contains("Date"))
from_data_table = subset(hospital_spending_DT,select=grep("Date",names(hospital_spending_DT)))
#重命名列名
setnames(hospital_spending_DT,c("Hospital.Name", "Measure.Start.Date","Measure.End.Date"), c("Hospital","Start_Date","End_Date"))
hospital_spending = rename(hospital_spending,Hospital= Hospital.Name, Start_Date=Measure.Start.Date,End_Date=Measure.End.Date)
#对单个变量进行筛选
from_dplyr = filter(hospital_spending,State=='CA') # selecting rows for California
from_data_table = hospital_spending_DT[State=='CA']
#对多个变量进行筛选
from_dplyr = filter(hospital_spending,State %in% c('CA','MA',"TX"))
from_data_table = hospital_spending_DT[State %in% c('CA','MA',"TX")]
#升序
from_dplyr = arrange(hospital_spending, State)
from_data_table = setorder(hospital_spending_DT, State)
#降序
from_dplyr = arrange(hospital_spending, desc(State))
from_data_table = setorder(hospital_spending_DT, -State)
#对多变量进行排序
#State变量升序,End_Date变量降序排序:
from_dplyr = arrange(hospital_spending, State,desc(End_Date))
from_data_table = setorder(hospital_spending_DT, State,-End_Date)
#添加或更新列
from_dplyr = mutate(hospital_spending, diff=Avg.Spending.Per.Episode..State. - Avg.Spending.Per.Episode..Nation.)
from_data_table = copy(hospital_spending_DT)
from_data_table = from_data_table[,diff := Avg.Spending.Per.Episode..State. - Avg.Spending.Per.Episode..Nation.]
#数据汇总
summarize(hospital_spending,mean=mean(Avg.Spending.Per.Episode..Nation.))
hospital_spending_DT[,.(mean=mean(Avg.Spending.Per.Episode..Nation.))]
summarize(hospital_spending,mean=mean(Avg.Spending.Per.Episode..Nation.),
maximum=max(Avg.Spending.Per.Episode..Nation.),
minimum=min(Avg.Spending.Per.Episode..Nation.),
median=median(Avg.Spending.Per.Episode..Nation.))
hospital_spending_DT[,.(mean=mean(Avg.Spending.Per.Episode..Nation.),
maximum=max(Avg.Spending.Per.Episode..Nation.),
minimum=min(Avg.Spending.Per.Episode..Nation.),
median=median(Avg.Spending.Per.Episode..Nation.))]
head(hospital_spending_DT[,.(mean=mean(Avg.Spending.Per.Episode..Hospital.)),by=.(Hospital)])
mygroup= group_by(hospital_spending,Hospital)
from_dplyr = summarize(mygroup,mean=mean(Avg.Spending.Per.Episode..Hospital.))
from_data_table=hospital_spending_DT[,.(mean=mean(Avg.Spending.Per.Episode..Hospital.)), by=.(Hospital)]
mygroup= group_by(hospital_spending,Hospital,State)
from_dplyr = summarize(mygroup,mean=mean(Avg.Spending.Per.Episode..Hospital.))
#链式操作
#在data.table中,我们可以使用%>%或[来实现链式操作
from_dplyr=hospital_spending %>% group_by(Hospital,State) %>% summarize(mean=mean(Avg.Spending.Per.Episode..Hospital.))
from_data_table=hospital_spending_DT[,.(mean=mean(Avg.Spending.Per.Episode..Hospital.)), by=.(Hospital,State)]
评论
发表评论