tidyverse 体系下的整合与重构:R base cheatsheet.pdf
drop_na()
tidyr::drop_na()
可以对数据框指定一到多个变量,删去指定的变量有缺失值的行。不指定变量时有任何变量缺失的行都会被删去,此时功能同 na.omit()
。
tidyr::fill(data, ..., direction=c("down", "up"))
,填充缺失值,填充值根据参数是缺失值附近的数值
tidyr::replace_na(data, replace=list(), ...)
,将数据中的缺失值替换为别的值
tidyr::separate()
,有时应该放在不同列的数据用分隔符分隔后放在同一列中了,应拆开。一般可用于日志数据或日期时间型数据的拆分。
separate(data, col, into, sep = "[^[:alnum:]]+", remove = TRUE, convert = FALSE, extra = "warn", fill = "warn", ...)
<- read_csv("testid, succ/total
d.sep 1, 1/10
2, 3/5
3, 2/8
")
d.sep#> # A tibble: 3 x 2
#> testid `succ/total`
#> <dbl> <chr>
#> 1 1 1/10
#> 2 2 3/5
#> 3 3 2/8
%>%
d.sep separate(
`succ/total`,
into = c("succ", "total"),
sep = "/",
convert = TRUE
)#> # A tibble: 3 x 3
#> testid succ total
#> <dbl> <int> <int>
#> 1 1 1 10
#> 2 2 3 5
#> 3 3 2 8
tidyr::separate_rows(data, ..., sep)
,将某列数据中的每个值拆分为同一列的两行
tidyr::unite()
函数可以将同一行的两列或多列的内容合并成一列。是 separate()
的反向操作
unite(data, col, …, sep = “_”, remove = TRUE)
%>%
d.sep separate(`succ/total`, into = c("succ", "total"),
sep = "/", convert = TRUE) %>%
unite(ratio, succ, total, sep = ":")
#> # A tibble: 3 x 2
#> testid ratio
#> <dbl> <chr>
#> 1 1 1:10
#> 2 2 3:5
#> 3 3 2:8
## 另外一个例子
# 先虚构一数据框
set.seed(1)
<- as.Date('2016-11-01') + 0:14
date <- sample(1:24, 15)
hour <- sample(1:60, 15)
min <- sample(1:60, 15)
second <- sample(letters, 15)
event <- data.table(date, hour, min, second, event)
data
# 把date,hour,min和second列合并为新列datetime
# R中的日期时间格式为"Year-Month-Day Hour:Min:Second"
<- data %>% unite(datehour, date, hour, sep = ' ') %>% unite(datetime, datehour, min, second, sep = ':')
dataNew
dataNew#> datetime event
#> 1: 2016-11-01 4:15:35 w
#> 2: 2016-11-02 7:21:6 x
#> 3: 2016-11-03 1:37:10 f
#> 4: 2016-11-04 2:41:42 g
#> 5: 2016-11-05 11:25:38 s
#> 6: 2016-11-06 14:46:47 j
#> 7: 2016-11-07 18:58:20 y
#> 8: 2016-11-08 22:54:28 n
#> 9: 2016-11-09 5:34:54 b
#> 10: 2016-11-10 16:42:44 m
#> 11: 2016-11-11 10:56:23 r
#> 12: 2016-11-12 6:44:59 t
#> 13: 2016-11-13 19:60:40 v
#> 14: 2016-11-14 23:33:51 o
#> 15: 2016-11-15 9:20:25 a
# input data
<- read.table(header = TRUE, sep = " ", text = "
mydata ID Time X1 X2
1 1 5 6
1 2 3 5
2 1 6 1
2 2 2 4
")
mydata#> ID Time X1 X2
#> 1 1 1 5 6
#> 2 1 2 3 5
#> 3 2 1 6 1
#> 4 2 2 2 4
melt()
将宽数据转换长数据,id 参数接受一个列名向量,其中的列会保留。其余列的列名会变为新variable列的值,值会变成新value列的值。
# id参数中的变量维持属性地位,其他变量的值都成为value,因此其他变量被合并成一列variable
<- melt(mydata, id = c("ID", "Time")) #or md <- melt(mydata, id=1:2)
md
md#> ID Time variable value
#> 1 1 1 X1 5
#> 2 1 2 X1 3
#> 3 2 1 X1 6
#> 4 2 2 X1 2
#> 5 1 1 X2 6
#> 6 1 2 X2 5
#> 7 2 1 X2 1
#> 8 2 2 X2 4
*cast()
dcast()
返回一个数据框,acast()
返回一个向量/矩阵/数组
熔化与重铸
# dcast(md, formula, fun.aggregate)
# formula参数中,~左边的变量保持不变,右边的变量作为一个因子,各取值水平分别成为一个变量名。
dcast(md, ID+Time~variable)
#> ID Time X1 X2
#> 1 1 1 5 6
#> 2 1 2 3 5
#> 3 2 1 6 1
#> 4 2 2 2 4
dcast(md, ID+variable~Time)
#> ID variable 1 2
#> 1 1 X1 5 3
#> 2 1 X2 6 5
#> 3 2 X1 6 2
#> 4 2 X2 1 4
dcast(md, ID~variable+Time)
#> ID X1_1 X1_2 X2_1 X2_2
#> 1 1 5 3 6 5
#> 2 2 6 2 1 4
dcast(md, ID~Time+variable)
#> ID 1_X1 1_X2 2_X1 2_X2
#> 1 1 5 6 3 5
#> 2 2 6 1 2 4
# formula中没有涉及的变量将从数据中消失,由fun.aggregate参数执行整合功能。
dcast(md, ID~Time,mean)
#> ID 1 2
#> 1 1 5.5 4
#> 2 2 3.5 3
dcast(md, ID~variable,mean)
#> ID X1 X2
#> 1 1 4 5.5
#> 2 2 4 2.5
tidyr::pivot_longer(cols, names_to, values_to)
,宽转长tidyr::pivot_wider(names_from, values_from)
,长转宽library(tidyverse)
<- data.frame(
plant_height Day = 1:5,
A = c(0.7, 1.0, 1.5, 1.8, 2.2),
B = c(0.5, 0.7, 0.9, 1.3, 1.8),
C = c(0.3, 0.6, 1.0, 1.2, 2.2),
D = c(0.4, 0.7, 1.2, 1.5, 3.2)
)
plant_height#> Day A B C D
#> 1 1 0.7 0.5 0.3 0.4
#> 2 2 1.0 0.7 0.6 0.7
#> 3 3 1.5 0.9 1.0 1.2
#> 4 4 1.8 1.3 1.2 1.5
#> 5 5 2.2 1.8 2.2 3.2
<- plant_height %>%
long pivot_longer(
cols = A:D,
names_to = "plant",
values_to = "height"
)
long#> # A tibble: 20 x 3
#> Day plant height
#> <int> <chr> <dbl>
#> 1 1 A 0.7
#> 2 1 B 0.5
#> 3 1 C 0.3
#> 4 1 D 0.4
#> 5 2 A 1
#> 6 2 B 0.7
#> 7 2 C 0.6
#> 8 2 D 0.7
#> 9 3 A 1.5
#> 10 3 B 0.9
#> 11 3 C 1
#> 12 3 D 1.2
#> 13 4 A 1.8
#> 14 4 B 1.3
#> 15 4 C 1.2
#> 16 4 D 1.5
#> 17 5 A 2.2
#> 18 5 B 1.8
#> 19 5 C 2.2
#> 20 5 D 3.2
%>%
long ggplot(aes(x = Day, y = height, color = plant)) +
geom_line()
<- long %>%
wide pivot_wider(
names_from = "plant",
values_from = "height"
)
wide#> # A tibble: 5 x 5
#> Day A B C D
#> <int> <dbl> <dbl> <dbl> <dbl>
#> 1 1 0.7 0.5 0.3 0.4
#> 2 2 1 0.7 0.6 0.7
#> 3 3 1.5 0.9 1 1.2
#> 4 4 1.8 1.3 1.2 1.5
#> 5 5 2.2 1.8 2.2 3.2
<- data.frame(
plant_record day = c(1L, 2L, 3L, 4L, 5L),
A_height = c(1.1, 1.2, 1.3, 1.4, 1.5),
A_width = c(2.1, 2.2, 2.3, 2.4, 2.5),
A_depth = c(3.1, 3.2, 3.3, 3.4, 3.5),
B_height = c(4.1, 4.2, 4.3, 4.4, 4.5),
B_width = c(5.1, 5.2, 5.3, 5.4, 5.5),
B_depth = c(6.1, 6.2, 6.3, 6.4, 6.5),
C_height = c(7.1, 7.2, 7.3, 7.4, 7.5),
C_width = c(8.1, 8.2, 8.3, 8.4, 8.5),
C_depth = c(9.1, 9.2, 9.3, 9.4, 9.5)
)
plant_record#> day A_height A_width A_depth B_height B_width B_depth C_height C_width
#> 1 1 1.1 2.1 3.1 4.1 5.1 6.1 7.1 8.1
#> 2 2 1.2 2.2 3.2 4.2 5.2 6.2 7.2 8.2
#> 3 3 1.3 2.3 3.3 4.3 5.3 6.3 7.3 8.3
#> 4 4 1.4 2.4 3.4 4.4 5.4 6.4 7.4 8.4
#> 5 5 1.5 2.5 3.5 4.5 5.5 6.5 7.5 8.5
#> C_depth
#> 1 9.1
#> 2 9.2
#> 3 9.3
#> 4 9.4
#> 5 9.5
%>%
plant_record ::pivot_longer(
tidyrcols = !day,
names_to = c("species", "parameter"),
names_pattern = "(.*)_(.*)",
values_to = "value"
)#> # A tibble: 45 x 4
#> day species parameter value
#> <int> <chr> <chr> <dbl>
#> 1 1 A height 1.1
#> 2 1 A width 2.1
#> 3 1 A depth 3.1
#> 4 1 B height 4.1
#> 5 1 B width 5.1
#> 6 1 B depth 6.1
#> 7 1 C height 7.1
#> 8 1 C width 8.1
#> 9 1 C depth 9.1
#> 10 2 A height 1.2
#> # ... with 35 more rows
<- plant_record %>%
plant_record_longer ::pivot_longer(
tidyrcols = !day,
# 注意时 .value, 不是一个列
names_to = c("species", ".value"),
names_pattern = "(.*)_(.*)"
)
plant_record_longer#> # A tibble: 15 x 5
#> day species height width depth
#> <int> <chr> <dbl> <dbl> <dbl>
#> 1 1 A 1.1 2.1 3.1
#> 2 1 B 4.1 5.1 6.1
#> 3 1 C 7.1 8.1 9.1
#> 4 2 A 1.2 2.2 3.2
#> 5 2 B 4.2 5.2 6.2
#> 6 2 C 7.2 8.2 9.2
#> 7 3 A 1.3 2.3 3.3
#> 8 3 B 4.3 5.3 6.3
#> 9 3 C 7.3 8.3 9.3
#> 10 4 A 1.4 2.4 3.4
#> 11 4 B 4.4 5.4 6.4
#> 12 4 C 7.4 8.4 9.4
#> 13 5 A 1.5 2.5 3.5
#> 14 5 B 4.5 5.5 6.5
#> 15 5 C 7.5 8.5 9.5
%>%
us_rent_income pivot_wider(
names_from = variable,
names_glue = "{variable}_{.value}",
values_from = c(estimate, moe)
)#> # A tibble: 52 x 6
#> GEOID NAME income_estimate rent_estimate income_moe rent_moe
#> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 01 Alabama 24476 747 136 3
#> 2 02 Alaska 32940 1200 508 13
#> 3 04 Arizona 27517 972 148 4
#> 4 05 Arkansas 23789 709 165 5
#> 5 06 California 29454 1358 109 3
#> 6 08 Colorado 32401 1125 109 5
#> 7 09 Connecticut 35326 1123 195 5
#> 8 10 Delaware 31560 1076 247 10
#> 9 11 District of Columbia 43198 1424 681 17
#> 10 12 Florida 25952 1077 70 3
#> # ... with 42 more rows
%>%
plant_record_longer ::pivot_wider(
tidyrnames_from = species,
values_from = c(height, width, depth),
names_glue = "{species}_{.value}"
)#> # A tibble: 5 x 10
#> day A_height B_height C_height A_width B_width C_width A_depth B_depth
#> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1.1 4.1 7.1 2.1 5.1 8.1 3.1 6.1
#> 2 2 1.2 4.2 7.2 2.2 5.2 8.2 3.2 6.2
#> 3 3 1.3 4.3 7.3 2.3 5.3 8.3 3.3 6.3
#> 4 4 1.4 4.4 7.4 2.4 5.4 8.4 3.4 6.4
#> 5 5 1.5 4.5 7.5 2.5 5.5 8.5 3.5 6.5
#> # ... with 1 more variable: C_depth <dbl>
tidyr::gather()
gather(data, key, value, …, na.rm = FALSE, convert = FALSE)
<- head(datasets::mtcars)
mtcars
# 为方便处理,在数据集中增加一列car
$car <- rownames(mtcars)
mtcars#将添加的一列从最后一列移到最前列
<- mtcars[, c(12, 1:11)]
mtcars head(mtcars)
#> car mpg cyl disp hp drat wt qsec vs am
#> Mazda RX4 Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1
#> Mazda RX4 Wag Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1
#> Datsun 710 Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1
#> Hornet 4 Drive Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0
#> Hornet Sportabout Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0
#> Valiant Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0
#> gear carb
#> Mazda RX4 4 4
#> Mazda RX4 Wag 4 4
#> Datsun 710 4 1
#> Hornet 4 Drive 3 1
#> Hornet Sportabout 3 2
#> Valiant 3 1
# 除了car列外,其余列gather成两列,分别命名为attribute和value
<- mtcars %>%
mtcarsNew gather(key = attribute, value = value, -car)
head(mtcarsNew)
#> car attribute value
#> 1 Mazda RX4 mpg 21.0
#> 2 Mazda RX4 Wag mpg 21.0
#> 3 Datsun 710 mpg 22.8
#> 4 Hornet 4 Drive mpg 21.4
#> 5 Hornet Sportabout mpg 18.7
#> 6 Valiant mpg 18.1
# gather在map和gear之间的所有列,从而保持carb和car列不变
<- mtcars %>% gather(attribute, value, mpg:gear)
mtcarsNew head(mtcarsNew)
#> car carb attribute value
#> 1 Mazda RX4 4 mpg 21.0
#> 2 Mazda RX4 Wag 4 mpg 21.0
#> 3 Datsun 710 1 mpg 22.8
#> 4 Hornet 4 Drive 1 mpg 21.4
#> 5 Hornet Sportabout 2 mpg 18.7
#> 6 Valiant 1 mpg 18.1
# 只gather() `gear`和`carb`两列
<- mtcars %>%
mtcarsNew gather(key = attribute, value = value, gear, carb)
head(mtcarsNew)
#> car mpg cyl disp hp drat wt qsec vs am attribute value
#> 1 Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 gear 4
#> 2 Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 gear 4
#> 3 Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 gear 4
#> 4 Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 gear 3
#> 5 Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 gear 3
#> 6 Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 gear 3
unique(mtcarsNew$attribute)
#> [1] "gear" "carb"
## 或用上一节的例子
<- mydata %>% gather(variable, value, X1, X2)
md
md#> ID Time variable value
#> 1 1 1 X1 5
#> 2 1 2 X1 3
#> 3 2 1 X1 6
#> 4 2 2 X1 2
#> 5 1 1 X2 6
#> 6 1 2 X2 5
#> 7 2 1 X2 1
#> 8 2 2 X2 4
tidyr::spread()
spread(data, key, value, fill = NA, convert = FALSE, drop = TRUE)
<- mtcarsNew %>%
mtcarsSpread spread(key = attribute, value = value)
head(mtcarsSpread)
#> car mpg cyl disp hp drat wt qsec vs am carb gear
#> 1 Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 1 4
#> 2 Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 1 3
#> 3 Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 2 3
#> 4 Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
#> 5 Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
#> 6 Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 1 3
## 或用上一节的例子
%>% spread(variable, value)
md #> ID Time X1 X2
#> 1 1 1 5 6
#> 2 1 2 3 5
#> 3 2 1 6 1
#> 4 2 2 2 4
bind_cols(table1, table2)
纵向合并bind_rows(table1, table2)
横向合并
bind_rows("tableName1" = table1, "tableName2" = table2, .id = NULL)
,可选参数 .id
不为NULL而接收一个字符串时,可以添加一列,列名为 .id
的参数,用以标识各行数据的来源。新添加的这一列的各行值为 tableName1
和 tableName2