Skip to content

Commit

Permalink
Faster rs_matrix()
Browse files Browse the repository at this point in the history
  • Loading branch information
marberts committed Aug 26, 2023
1 parent 7e35619 commit 271b57d
Show file tree
Hide file tree
Showing 7 changed files with 459 additions and 59 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
Package: rsmatrix
Title: Matrices for Repeat-Sales Price Indexes
Version: 0.2.7
Version: 0.2.7.9001
Authors@R: c(
person(given = "Steve", family = "Martin", role = c("aut", "cre", "cph"), email = "[email protected]", comment = c(ORCID = "0000-0003-2544-9480"))
)
Description: A small package for calculating the matrices in Shiller (1991, <doi:10.1016/S1051-1377(05)80028-2>) that serve as the foundation for many repeat-sales price indexes.
Depends: R (>= 4.0)
Imports: Matrix (>= 1.5-0), methods, stats
Imports: Matrix (>= 1.5-0)
Suggests:
knitr,
rmarkdown,
Expand Down
5 changes: 1 addition & 4 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,7 +1,4 @@
export(rs_matrix, rs_pairs, rs_var)

importFrom(Matrix, sparse.model.matrix, rowSums)
importFrom(Matrix, sparseMatrix, rowSums)
importMethodsFrom(Matrix, solve, crossprod, tcrossprod)
importFrom(stats, model.matrix)
importFrom(methods, as)
importFrom(utils, packageVersion)
4 changes: 3 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
## Changes in version 0.2.7
## Changes in version 0.2.8

- Added a vignette.

- `rs_matrix()` is about twice as fast now.

## Changes in version 0.2.6

- Updated to work with Matrix >= 1.5-0.
Expand Down
61 changes: 33 additions & 28 deletions R/rs_matrix.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ different_lengths <- function(...) {
any(res != res[1L])
}

#---- Z matrix (internal) ----
rs_z_ <- function(t2, t1, f = NULL, sparse = FALSE) {
# coerce t2 and t1 into characters prior to taking the union
# so that both dates and factors are treated the same
Expand All @@ -13,52 +12,56 @@ rs_z_ <- function(t2, t1, f = NULL, sparse = FALSE) {
lev <- sort.int(unique(c(lev2, lev1))) # usually faster than base::union()
t2 <- factor(t2, lev)
t1 <- factor(t1, lev)
if (any(as.numeric(t2) <= as.numeric(t1))) {
if (any(unclass(t2) <= unclass(t1))) {
warning("all elements of 't2' should be greater than the corresponding ",
"elements in 't1'")
}

# make row names before interacting with f
nm <- if (!is.null(names(t2))) {
names(t2)
} else if (!is.null(names(t1))) {
names(t1)
} else if (!is.null(names(f))) {
names(f)
} else {
seq_along(t2)
}
names(t2)
} else if (!is.null(names(t1))) {
names(t1)
} else if (!is.null(names(f))) {
names(f)
} else {
seq_along(t2)
}

if (!is.null(f)) {
f <- as.factor(f)
t2 <- interaction(f, t2)
t1 <- interaction(f, t1)
lev <- levels(t2)
}

# calculate Z
if (nlevels(t2) < 2L) {
# return a nx1 matrix of 0's if there's only one level
# return a 0x0 matrix if there are no levels
z <- matrix(rep.int(0, length(t2)), ncol = nlevels(t2))
if (sparse) {
z <- as(as(z, "generalMatrix"), "CsparseMatrix")
}
dims <- c(length(nm), length(lev))
attributes(t2) <- NULL
attributes(t1) <- NULL
non_zero <- which(t2 != t1)
i <- seq_along(t2)[non_zero]
t2 <- t2[non_zero]
t1 <- t1[non_zero]
if (sparse) {
res <- sparseMatrix(rep.int(i, 2), c(t2, t1),
x = rep(c(1, -1), each = length(i)),
dims = dims)
} else {
# model matrix otherwise
mm <- if (sparse) sparse.model.matrix else model.matrix
z <- mm(~ t2 - 1, contrasts.arg = list(t2 = "contr.treatment")) -
mm(~ t1 - 1, contrasts.arg = list(t1 = "contr.treatment"))
res <- rep.int(0, prod(dims))
res[(t2 - 1L) * dims[1L] + i] <- 1
res[(t1 - 1L) * dims[1L] + i] <- -1
dim(res) <- dims
}
if (nlevels(t2) > 0L) {
colnames(z) <- levels(t2)
rownames(z) <- nm

if (length(lev) > 0L) {
colnames(res) <- lev
rownames(res) <- nm
}
# remove model.matrix attributes
attributes(z)[c("assign", "contrasts")] <- NULL
z

res
}

#---- X matrix (internal) ----
rs_x_ <- function(z, p2, p1) (z > 0) * p2 - (z < 0) * p1

#---- All matrices ----
Expand All @@ -80,6 +83,8 @@ rs_matrix <- function(t2, t1, p2, p1, f = NULL, sparse = FALSE) {
}
}
z <- rs_z_(t2, t1, f, sparse)
p2 <- as.numeric(p2)
p1 <- as.numeric(p1)
# number of columns that need to be removed for base period
n <- max(1L, nlevels(f)) * (ncol(z) > 0)
# return value
Expand Down
79 changes: 55 additions & 24 deletions tests/testthat/test-rs_matrix.R
Original file line number Diff line number Diff line change
@@ -1,11 +1,3 @@
dense_to_sparse <- function(x) {
if (packageVersion("Matrix") < package_version("1.5-0")) {
as(x, "dgCMatrix")
} else {
as(as(x, "generalMatrix"), "CsparseMatrix")
}
}

# data for computations
x <- data.frame(date = c(3, 2, 3, 2, 3, 3),
date_prev = c(1, 1, 2, 1, 2, 1),
Expand All @@ -16,7 +8,8 @@ x <- data.frame(date = c(3, 2, 3, 2, 3, 3),

mat <- with(x, rs_matrix(date, date_prev, price, price_prev))
mats <- with(x, rs_matrix(date, date_prev, price, price_prev, sparse = TRUE))
matg <- with(x, rs_matrix(date, date_prev, price, price_prev, id2))
matg <- with(x, rs_matrix(date, date_prev, price, price_prev, id2,
sparse = TRUE))
mata <- with(subset(x, id2 == "a"),
rs_matrix(date, date_prev, price, price_prev))
matb <- with(subset(x, id2 == "b"),
Expand Down Expand Up @@ -53,16 +46,48 @@ test_that("corner cases work", {
factor(integer(0), letters),
sparse = TRUE
)
expect_identical(ms("Z"), sparseMatrix(numeric(0), numeric(0), x = 0))
expect_identical(ms("X"), sparseMatrix(numeric(0), numeric(0), x = 0))
expect_identical(ms("Y"), double(0))
expect_identical(ms("y"), double(0))
})

test_that("matrices are correct for a simple grouped case", {
m <- rs_matrix(c(2, 3, 2, 2, 4), c(1, 1, 1, 1, 3), 1:5, 1:5,
c("a", "b", "a", "b", "a"))
expect_identical(
ms("Z"),
as(as(matrix(double(0), ncol = 0), "generalMatrix"), "CsparseMatrix")
m("Z"),
matrix(c(1, 0, 1, 0, 0,
0, 0, 0, 1, 0,
0, 0, 0, 0, -1,
0, 1, 0, 0, 0,
0, 0, 0, 0, 1,
0, 0, 0, 0, 0),
ncol = 6,
dimnames = list(1:5, c("a.2", "b.2", "a.3", "b.3", "a.4", "b.4")))
)
expect_identical(
ms("X"),
as(as(matrix(double(0), ncol = 0), "generalMatrix"), "CsparseMatrix")
)
expect_identical(ms("Y"), double(0))
expect_identical(ms("y"), double(0))
m("X"),
matrix(c(1, 0, 3, 0, 0,
0, 0, 0, 4, 0,
0, 0, 0, 0, -5,
0, 2, 0, 0, 0,
0, 0, 0, 0, 5,
0, 0, 0, 0, 0),
ncol = 6,
dimnames = list(1:5, c("a.2", "b.2", "a.3", "b.3", "a.4", "b.4")))
)
expect_identical(m("Y"), c("1" = 1, "2" = 2, "3" = 3, "4" = 4, "5" = 0))
expect_identical(m("y"), c("1" = log(1), "2" = log(1), "3" = log(1),
"4" = log(1), "5" = log(1)))

ms <- rs_matrix(c(2, 3, 2, 2, 4), c(1, 1, 1, 1, 3), 1:5, 1:5,
c("a", "b", "a", "b", "a"), TRUE)
expect_identical(as.matrix(ms("X")), m("X"))
expect_identical(as.matrix(ms("Z")), m("Z"))
expect_identical(ms("Y"), c("1" = 1, "2" = 2, "3" = 3, "4" = 4, "5" = 0))
expect_identical(ms("y"), c("1" = log(1), "2" = log(1), "3" = log(1),
"4" = log(1), "5" = log(1)))
})

test_that("matrices are correct for a simple case", {
Expand All @@ -77,7 +102,7 @@ test_that("matrices are correct for a simple case", {
)
expect_identical(m("Y"), c("1" = 1, "2" = 0))
expect_identical(m("y"), c("1" = log(2), "2" = log(5 / 2)))

ms <- rs_matrix(c(2, 4), 1:2, c(2, 5), 1:2, sparse = TRUE)
expect_identical(as.matrix(ms("X")), m("X"))
expect_identical(as.matrix(ms("Z")), m("Z"))
Expand Down Expand Up @@ -144,21 +169,27 @@ test_that("Z matrix works correctly", {

test_that("sparse matrices work correctly", {
expect_identical(rsmatrix:::rs_z_(integer(0), integer(0), sparse = TRUE),
dense_to_sparse(matrix(integer(0), ncol = 0)))
sparseMatrix(numeric(0), numeric(0), x = 0))
expect_identical(
suppressWarnings(rsmatrix:::rs_z_(1, 1, sparse = TRUE)),
dense_to_sparse(matrix(0, ncol = 1, dimnames = list(1, 1)))
sparseMatrix(numeric(0), numeric(0), x = 0, dims = c(1, 1),
dimnames = list(1, 1))
)
expect_identical(
suppressWarnings(rsmatrix:::rs_z_(c(a = "a"), "a", sparse = TRUE)),
dense_to_sparse(matrix(0, ncol = 1, dimnames = list("a", "a")))
sparseMatrix(numeric(0), numeric(0), x = 0, dims = c(1, 1),
dimnames = list("a", "a"))
)
expect_identical(
rsmatrix:::rs_z_(c(2, 2), c(1, 1), c("a", "b"), TRUE),
dense_to_sparse(
matrix(c(-1, 0, 0, -1, 1, 0, 0, 1), ncol = 4,
dimnames = list(1:2, c("a.1", "b.1", "a.2", "b.2"))))
)
sparseMatrix(c(1, 2, 1, 2), 1:4, x = c(-1, -1, 1, 1),
dimnames = list(1:2, c("a.1", "b.1", "a.2", "b.2")))
)
expect_identical(
suppressWarnings(rsmatrix:::rs_z_(2:1, c(1, 1), sparse = TRUE)),
sparseMatrix(c(1, 1), c(1, 2), x = c(-1, 1), dims = c(2, 2),
dimnames = list(1:2, 1:2))
)
})

test_that("grouped indexes work", {
Expand Down
135 changes: 135 additions & 0 deletions tests/vignette.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
## ----setup--------------------------------------------------------------------
library(rsmatrix)
library(Matrix)

## ----data---------------------------------------------------------------------
set.seed(15243)

periods <- seq(as.Date("2010-01-01"), as.Date("2019-12-31"), "day")

prices <- data.frame(sale = sample(periods, 5e5, TRUE),
property = factor(sprintf("%05d", sample(1:5e4, 5e5, TRUE))),
city = factor(sample(1:5, 1e5, TRUE)),
price = round(rlnorm(5e5) * 5e5, -3))

prices <- prices[order(prices$city, prices$property, prices$sale), ]
row.names(prices) <- NULL

head(prices)

## ----duplicates---------------------------------------------------------------
interaction(prices$city, prices$property, drop = TRUE) |>
tabulate() |>
quantile()

## ----yearmon------------------------------------------------------------------
prices$period <- cut(prices$sale, "month")

## ----pairs--------------------------------------------------------------------
sales_pairs <- rs_pairs(prices$sale, interaction(prices$city, prices$property))
prices[c("price_prev", "period_prev")] <- prices[sales_pairs, c("price", "period")]

head(prices)

## ----removal1-----------------------------------------------------------------
prices$holding_period <- with(prices, as.numeric(period) - as.numeric(period_prev))

prices <- subset(prices, holding_period > 2)

## ----removal2-----------------------------------------------------------------
library(gpindex)
monthly_return <- with(prices, (price / price_prev)^(1 / holding_period))

robust_z <- grouped(robust_z)
prices <- subset(prices, !robust_z(monthly_return, group = city))

head(prices)

## ----matrices-----------------------------------------------------------------
matrices <- with(
prices,
rs_matrix(period, period_prev, price, price_prev, city, sparse = TRUE)
)

## ----grs----------------------------------------------------------------------
Z <- matrices("Z")
y <- matrices("y")

grs <- exp(solve(crossprod(Z), crossprod(Z, y)))
head(grs)

## ----weights------------------------------------------------------------------
grs_resid <- y - Z %*% log(grs)

mdl <- lm(as.numeric(grs_resid)^2 ~ prices$holding_period)
W <- Diagonal(x = 1 / fitted.values(mdl))

grs_cs <- exp(solve(crossprod(Z, W %*% Z), crossprod(Z, W %*% y)))
head(grs_cs)

## ----ars----------------------------------------------------------------------
X <- matrices("X")
Y <- matrices("Y")

ars <- 1 / solve(crossprod(Z, X), crossprod(Z, Y))
head(ars)

## ----weights2-----------------------------------------------------------------
ars_resid <- Y - X %*% (1 / ars)

mdl <- lm(as.numeric(ars_resid)^2 ~ prices$holding_period)
W <- Diagonal(x = 1 / fitted.values(mdl))

ars_cs <- 1 / solve(crossprod(Z, W %*% X), crossprod(Z, W %*% Y))
head(ars_cs)

## ----weights3-----------------------------------------------------------------
ars_ew <- with(
prices,
1 / solve(crossprod(Z, X / price_prev), crossprod(Z, Y / price_prev))
)

head(ars_ew)

## ----contrib------------------------------------------------------------------
grs <- c(setNames(rep(1, 5), paste(1:5, "2010-01-01", sep = ".")), grs[, 1])
ars <- c(setNames(rep(1, 5), paste(1:5, "2010-01-01", sep = ".")), ars[, 1])

## ---- contrib_grs-------------------------------------------------------------
grs_contributions <- Map(
\(df, df_prev) {
impute_back <- with(df, price_prev / grs[paste(city, period_prev, sep = ".")])
names(impute_back) <- row.names(df)
impute_forward <- with(df_prev, price / grs[paste(city, period, sep = ".")])
names(impute_forward) <- row.names(df_prev)
geometric_contributions(
c(df$price / impute_back, df_prev$price_prev / impute_forward)
)
},
split(prices, interaction(prices$city, prices$period)),
split(prices, interaction(prices$city, prices$period_prev))
)

all.equal(sapply(grs_contributions, sum) + 1, grs)

range(unlist(grs_contributions))

## ---- contrib_ars-------------------------------------------------------------
ars_contributions <- Map(
\(df, df_prev) {
impute_back <- with(df, price_prev / ars[paste(city, period_prev, sep = ".")])
names(impute_back) <- row.names(df)
impute_forward <- with(df_prev, price / ars[paste(city, period, sep = ".")])
names(impute_forward) <- row.names(df_prev)
arithmetic_contributions(
c(df$price / impute_back, df_prev$price_prev / impute_forward),
c(impute_back, impute_forward))
},
split(prices, interaction(prices$city, prices$period)),
split(prices, interaction(prices$city, prices$period_prev))
)

all.equal(sapply(ars_contributions, sum) + 1, ars)

range(unlist(ars_contributions))

Loading

0 comments on commit 271b57d

Please sign in to comment.