万水千山

Across Mountains and Seas

由单月求累计及计算同比增长率

Humoon / 2019-08-30


现有一个月度贸易数据框,名为total,各列为 year, month, export, import, trade surplus

1 由单月值求累计值

(1) 使用sqldf() 和 SQL 语句

# 对于单个变量而言,增加一列累计值的代码如下
sqldf(
"SELECT *, SUM(export) OVER (PARTITION BY year ORDER BY month) AS acc_export
FROM total")

# 对于多个要累计的变量,便需要合并查询字符串。在此基础上封装代码,追求通用性。
joint <- function(string){
  str_c('SUM(', string, ') OVER (PARTITION BY year ORDER BY month) AS acc_', string)
}

add_acc <- function(df){
  vars <- colnames(df %>% select(-(year:month))) # 要用到的变量名
  string_acc <- joint(vars) %>% str_c(collapse = ', ') # joint(vars)使用了向量化运算
  df_name <- deparse(substitute(df)) # 解析任一对象的名字为字符串
  query <- str_c('SELECT *, ', string_acc, ' FROM ', df_name) # 组建SQL查询字符串
  sqldf(query) %>% return()
}

total_sql <- add_acc(total)

(2) 使用 R 基本包内置的 cumsum() 函数

total_cumsum <- total %>% arrange(year, month) %>% group_by(year) %>% 
  mutate(acc_ex = cumsum(export), acc_im = cumsum(import),
         acc_tra = cumsum(trade), acc_sur = cumsum(surplus)) %>% 
  ungroup()

不难发现,当表中所含列比较少时,用 group_by() + cumsum() 比较方便;而当表的列很多时,就会变得很繁琐,还是 sqldf() + SQL 语句的扩展性更强。

2 由累计值求单月值

# 自定义求单月值的差分函数
dif_mon <- function(acc) {
  c(acc[1], diff(acc)) %>% return()
}

total_month <- total_cumsum %>% arrange(year, month) %>% group_by(year) %>% 
  mutate(export = dif_mon(acc_ex), import = dif_mon(acc_im),
         trade = dif_mon(acc_tra), surplus = dif_mon(acc_sur)) %>% 
  ungroup()

3 求同比增长率

(1) 使用差分法

这种方法高度依赖数据的完整性,1年12个月不能有任何缺失,故此方法其实是不可靠的。

# 自定义同比增长率函数,a为时间序列, n为一个大周期的阶段数
growth <- function(a, n) {
  Fai <- rep(NA, n)
  b <- c(Fai, diff(a,n))
  round(100*b/(a - b), 2) %>% return()
}

# apply()函数族或map()函数族
# 下面四种写法的结果一样,即使返回list,也可以一步转化为数据框
temp <- total_cumsum[,-(1:2)] %>% apply(2, growth, 12) %>% as.data.frame()
temp <- total_cumsum[,-(1:2)] %>% sapply(growth, 12) %>% as.data.frame()
temp <- total_cumsum[,-(1:2)] %>% map_dfc(growth, 12)
temp <- total_cumsum[,-(1:2)] %>% map(growth,12) %>% as.data.frame()

# growth前面的参数是可变的,后面的参数是不变的, n = 12
colnames(temp) <- str_c("g_", colnames(total_cumsum)[-(1:2)]) # 向量化运算
total_g <- cbind(total_cumsum, temp)

(2) 使用sqldf() 和 SQL 语句

对数据框total中的单个变量而言,求同比增长率的查询语句如下

SELECT a.*, 100*(a.export/b.export-1) AS g_export
FROM total AS a LEFT OUTER JOIN total AS b
  ON a.year = b.year + 1 AND a.month = b.month;

在此基础上封装代码,追求通用性。

## 辅助字符串函数,参数为求同比的关键词
joint <- function(string){
    str_c('100*(a.', string, '/b.', string, '-1) AS g_', string) %>% return()
}

## 求同比函数,输入输出均为一个数据框,输入中没有同比列,输出中补全同比列
add_g <- function(df){
  vars <- colnames(df %>% select(-(year:month)))
  string_g <- joint(vars) %>% str_c(collapse = ', ') 
  df_name <- deparse(substitute(df))
  query <- str_c(
    'SELECT a.*,', string_g, 
    ' FROM ', df_name, ' AS a LEFT OUTER JOIN ', df_name, 
    ' AS b ON a.year = b.year + 1 AND a.month = b.month')
  sqldf(query) %>% return()
}

total_g <- add_g(total_cumsum)