由单月求累计及计算同比增长率
Humoon / 2019-08-30
现有一个月度贸易数据框,名为total,各列为 year, month, export, import, trade surplus
1 由单月值求累计值
(1) 使用sqldf() 和 SQL 语句
# 对于单个变量而言,增加一列累计值的代码如下
sqldf(
"SELECT *, SUM(export) OVER (PARTITION BY year ORDER BY month) AS acc_export
FROM total")
# 对于多个要累计的变量,便需要合并查询字符串。在此基础上封装代码,追求通用性。
joint <- function(string){
str_c('SUM(', string, ') OVER (PARTITION BY year ORDER BY month) AS acc_', string)
}
add_acc <- function(df){
vars <- colnames(df %>% select(-(year:month))) # 要用到的变量名
string_acc <- joint(vars) %>% str_c(collapse = ', ') # joint(vars)使用了向量化运算
df_name <- deparse(substitute(df)) # 解析任一对象的名字为字符串
query <- str_c('SELECT *, ', string_acc, ' FROM ', df_name) # 组建SQL查询字符串
sqldf(query) %>% return()
}
total_sql <- add_acc(total)
(2) 使用 R 基本包内置的 cumsum() 函数
total_cumsum <- total %>% arrange(year, month) %>% group_by(year) %>%
mutate(acc_ex = cumsum(export), acc_im = cumsum(import),
acc_tra = cumsum(trade), acc_sur = cumsum(surplus)) %>%
ungroup()
不难发现,当表中所含列比较少时,用 group_by() + cumsum() 比较方便;而当表的列很多时,就会变得很繁琐,还是 sqldf() + SQL 语句的扩展性更强。
2 由累计值求单月值
# 自定义求单月值的差分函数
dif_mon <- function(acc) {
c(acc[1], diff(acc)) %>% return()
}
total_month <- total_cumsum %>% arrange(year, month) %>% group_by(year) %>%
mutate(export = dif_mon(acc_ex), import = dif_mon(acc_im),
trade = dif_mon(acc_tra), surplus = dif_mon(acc_sur)) %>%
ungroup()
3 求同比增长率
(1) 使用差分法
这种方法高度依赖数据的完整性,1年12个月不能有任何缺失,故此方法其实是不可靠的。
# 自定义同比增长率函数,a为时间序列, n为一个大周期的阶段数
growth <- function(a, n) {
Fai <- rep(NA, n)
b <- c(Fai, diff(a,n))
round(100*b/(a - b), 2) %>% return()
}
# apply()函数族或map()函数族
# 下面四种写法的结果一样,即使返回list,也可以一步转化为数据框
temp <- total_cumsum[,-(1:2)] %>% apply(2, growth, 12) %>% as.data.frame()
temp <- total_cumsum[,-(1:2)] %>% sapply(growth, 12) %>% as.data.frame()
temp <- total_cumsum[,-(1:2)] %>% map_dfc(growth, 12)
temp <- total_cumsum[,-(1:2)] %>% map(growth,12) %>% as.data.frame()
# growth前面的参数是可变的,后面的参数是不变的, n = 12
colnames(temp) <- str_c("g_", colnames(total_cumsum)[-(1:2)]) # 向量化运算
total_g <- cbind(total_cumsum, temp)
(2) 使用sqldf() 和 SQL 语句
对数据框total中的单个变量而言,求同比增长率的查询语句如下
SELECT a.*, 100*(a.export/b.export-1) AS g_export
FROM total AS a LEFT OUTER JOIN total AS b
ON a.year = b.year + 1 AND a.month = b.month;
在此基础上封装代码,追求通用性。
## 辅助字符串函数,参数为求同比的关键词
joint <- function(string){
str_c('100*(a.', string, '/b.', string, '-1) AS g_', string) %>% return()
}
## 求同比函数,输入输出均为一个数据框,输入中没有同比列,输出中补全同比列
add_g <- function(df){
vars <- colnames(df %>% select(-(year:month)))
string_g <- joint(vars) %>% str_c(collapse = ', ')
df_name <- deparse(substitute(df))
query <- str_c(
'SELECT a.*,', string_g,
' FROM ', df_name, ' AS a LEFT OUTER JOIN ', df_name,
' AS b ON a.year = b.year + 1 AND a.month = b.month')
sqldf(query) %>% return()
}
total_g <- add_g(total_cumsum)