forcats包简介

forcats

安装

# Development version of forcats, installed from GitHub through devtools.
# Uncomment the next line first if devtools itself is not installed.
# NOTE(review): forcats is presumably also available from CRAN via
# install.packages("forcats") -- confirm before relying on the GitHub route.
# install.packages("devtools")
devtools::install_github("hadley/forcats")

## Skipping install of 'forcats' from a github remote, the SHA1 (004279de) has not changed since last install.
##   Use `force = TRUE` to force installation

使用

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(forcats)
# Preview the first rows of the gss_cat example data.
head(gss_cat)

## # A tibble: 6 × 9
##    year       marital   age   race        rincome            partyid
##   <int>        <fctr> <int> <fctr>         <fctr>             <fctr>
## 1  2000 Never married    26  White  $8000 to 9999       Ind,near rep
## 2  2000      Divorced    48  White  $8000 to 9999 Not str republican
## 3  2000       Widowed    67  White Not applicable        Independent
## 4  2000 Never married    39  White Not applicable       Ind,near rep
## 5  2000      Divorced    25  White Not applicable   Not str democrat
## 6  2000       Married    25  White $20000 - 24999    Strong democrat
## # ... with 3 more variables: relig <fctr>, denom <fctr>, tvhours <int>

改变因子的值(Change level values)

# Tally the rows for each partyid level.
count(gss_cat, partyid)

## # A tibble: 10 × 2
##               partyid     n
##                <fctr> <int>
## 1           No answer   154
## 2          Don't know     1
## 3         Other party   393
## 4   Strong republican  2314
## 5  Not str republican  3032
## 6        Ind,near rep  1791
## 7         Independent  4119
## 8        Ind,near dem  2499
## 9    Not str democrat  3690
## 10    Strong democrat  3490

# Give six partyid levels clearer names (new name = old name); levels that are
# not listed keep their current name. Then tally the rows per level.
count(
  mutate(
    gss_cat,
    partyid = fct_recode(
      partyid,
      "Republican, strong" = "Strong republican",
      "Republican, weak" = "Not str republican",
      "Independent, near rep" = "Ind,near rep",
      "Independent, near dem" = "Ind,near dem",
      "Democrat, weak" = "Not str democrat",
      "Democrat, strong" = "Strong democrat"
    )
  ),
  partyid
)

## # A tibble: 10 × 2
##                  partyid     n
##                   <fctr> <int>
## 1              No answer   154
## 2             Don't know     1
## 3            Other party   393
## 4     Republican, strong  2314
## 5       Republican, weak  3032
## 6  Independent, near rep  1791
## 7            Independent  4119
## 8  Independent, near dem  2499
## 9         Democrat, weak  3690
## 10      Democrat, strong  3490

# Merge the ten partyid levels into four groups (new level = old levels),
# then tally the rows per group.
count(
  mutate(
    gss_cat,
    partyid = fct_collapse(
      partyid,
      other = c("No answer", "Don't know", "Other party"),
      rep = c("Strong republican", "Not str republican"),
      ind = c("Ind,near rep", "Independent", "Ind,near dem"),
      dem = c("Not str democrat", "Strong democrat")
    )
  ),
  partyid
)

## # A tibble: 4 × 2
##   partyid     n
##    <fctr> <int>
## 1   other   548
## 2     rep  5346
## 3     ind  8409
## 4     dem  7180

合并因子水平，只保留出现次数最多的 n 个水平

# Tally the rows for each relig level.
count(gss_cat, relig)

## # A tibble: 15 × 2
##                      relig     n
##                     <fctr> <int>
## 1                No answer    93
## 2               Don't know    15
## 3  Inter-nondenominational   109
## 4          Native american    23
## 5                Christian   689
## 6       Orthodox-christian    95
## 7             Moslem/islam   104
## 8            Other eastern    32
## 9                 Hinduism    71
## 10                Buddhism   147
## 11                   Other   224
## 12                    None  3523
## 13                  Jewish   388
## 14                Catholic  5124
## 15              Protestant 10846

# With neither n nor prop given, fct_lump() merges the least frequent levels
# into "Other" for as long as "Other" stays the smallest level; here that
# leaves only Protestant unmerged.
gss_cat %>% 
  mutate(relig = fct_lump(relig)) %>% 
  count(relig)

## # A tibble: 2 × 2
##        relig     n
##       <fctr> <int>
## 1 Protestant 10846
## 2      Other 10637

# Keep the five most frequent relig levels and merge the rest into "Other",
# then tally the rows per level.
count(
  mutate(gss_cat, relig = fct_lump(relig, n = 5)),
  relig
)

## # A tibble: 6 × 2
##        relig     n
##       <fctr> <int>
## 1  Christian   689
## 2       None  3523
## 3     Jewish   388
## 4   Catholic  5124
## 5 Protestant 10846
## 6      Other   913

# prop is a share of all rows; a negative value flips the direction, so the
# common levels (above a 10% share) are the ones merged into "Other" and the
# rarer levels are kept.
gss_cat %>% 
  mutate(relig = fct_lump(relig, prop = -0.10)) %>% 
  count(relig)

## # A tibble: 12 × 2
##                      relig     n
##                     <fctr> <int>
## 1                No answer    93
## 2               Don't know    15
## 3  Inter-nondenominational   109
## 4          Native american    23
## 5                Christian   689
## 6       Orthodox-christian    95
## 7             Moslem/islam   104
## 8            Other eastern    32
## 9                 Hinduism    71
## 10                Buddhism   147
## 11                  Jewish   388
## 12                   Other 19717

Change order of levels:

fct_relevel(): move specified level up front.

fct_inorder(): order by first appearance of each level.

fct_reorder(): order by summary of another value (same as stats::reorder()).

fct_infreq(): order by frequency.

fct_shuffle(): randomly shuffle order of levels.

fct_rev(): reverse order of levels.

fct_shift(): shift levels to the left/right.

Change value of levels:

fct_anon(): anonymise factor levels.

fct_lump(): lump rarest (or most common) levels into “other”.

fct_recode(): manually recode levels.

Add new levels:

fct_expand(): add new levels to a factor.

fct_explicit_na(): turn missing values into an explicit factor.

A few other helpers:

fct_c(): concatenate factors using union of levels.

fct_count(): count occurrences of levels, optionally sorting by frequency.

fct_unify(): ensure list of factors share the same levels.

fct_unique(): compute the unique values of a factor from its levels.

fct_drop(): drop levels without data (same as base::droplevels()).

lvls_union(): finds union of levels from list of factors.

# One row per relig level: mean age and mean tvhours (NAs ignored) plus the
# number of rows in that level.
relig <- summarise(
  group_by(gss_cat, relig),
  age = mean(age, na.rm = TRUE),
  tvhours = mean(tvhours, na.rm = TRUE),
  n = n()
)

# Mean tvhours per relig level, with the levels in their stored order.
ggplot(relig, aes(x = tvhours, y = relig)) +
  geom_point()

# Same scatter, but with the relig levels sorted by their tvhours value so the
# points read in order along the y axis.
ggplot(relig, aes(x = tvhours, y = fct_reorder(relig, tvhours))) +
  geom_point()

# Share of each marital status within every age (rows with missing age
# dropped).
# count() on an already-grouped tibble keeps (age, marital) as the grouping in
# current dplyr, which makes n / sum(n) equal 1 on every row. Count on the
# ungrouped data first, then group by age alone so sum(n) is the per-age total.
by_age <- gss_cat %>%
  filter(!is.na(age)) %>%
  count(age, marital) %>%
  group_by(age) %>%
  mutate(prop = n / sum(n)) %>%
  ungroup()

# prop against age, one coloured line per marital status.
ggplot(by_age, aes(x = age, y = prop)) +
  geom_line(mapping = aes(colour = marital))

# Same plot with the colour levels reordered by fct_reorder2(marital, age,
# prop); the legend title is set back to "marital".
ggplot(by_age, aes(x = age, y = prop)) +
  geom_line(mapping = aes(colour = fct_reorder2(marital, age, prop))) +
  labs(colour = "marital")

搜索此博客

xuefliang