fixes #92

dselivanov · Feb 25, 2019 · a5d7ce6 · a5d7ce6
1 parent 5c8ea92
commit a5d7ce6
Show file tree

Hide file tree

Showing 3 changed files with 98 additions and 148 deletions.
diff --git a/R/distance_RWMD.R b/R/distance_RWMD.R
@@ -1,21 +1,3 @@
-# we assume wv matrix is already normalized. In this case L2 normalized
-# wv - word vectors matrix (WORDS = COLUMNS, because faster subsetting!)
-cosine_dist_internal = function(m_i, m_j) {
-  1 - crossprod(m_i, m_j)
-}
-
-# we assume wv matrix is already normalized. In this case L2 normalized
-# wv - word vectors matrix (WORDS = COLUMNS, because faster subsetting!)
-euclidean_dist_internal = function(m_i, m_j) {
-  euclidean_dist(m_i, m_j)
-}
-
-dist_internal = function(m_i, m_j, method) {
-  switch(method,
-         cosine = cosine_dist_internal(m_i, m_j),
-         euclidean = euclidean_dist_internal(m_i, m_j))
-}
-
 text2vec_dist = R6::R6Class(
   classname = "distance_model",
   public = list(
@@ -27,45 +9,43 @@ text2vec_dist = R6::R6Class(
     internal_matrix_format = NULL
   )
 )
+
 #' @name RelaxedWordMoversDistance
-#' @title Creates model which can be used for calculation of "relaxed word movers distance".
-#' @description Relaxed word movers distance tries to measure distance between documents by
-#' calculating how hard is to transform words from first document into words from second document
-#' and vice versa. For more detail see original article: \url{http://mkusner.github.io/publications/WMD.pdf}.
+#' @title Creates Relaxed Word Movers Distance (RWMD) model
+#' @description RWMD model can be used to query the "relaxed word movers distance" from a document to a
+#' collection of documents. RWMD tries to measure distance between query document and collection of documents by
+#' calculating how hard is to transform words from query document into words from each document in collection.
+#' For more detail see following article: \url{http://mkusner.github.io/publications/WMD.pdf}.
+#' However in contrast to the article above we calculate "easiness" of the convertion of one word into another
+#' by using \bold{cosine} similarity (but not a euclidean distance).
+#' Also here in text2vec we've implemented effiient RWMD using the tricks from the
+#' \href{https://arxiv.org/abs/1711.07227}{Linear-Complexity Relaxed Word Mover's Distance with GPU Acceleration} article.
 #' @section Usage:
 #' For usage details see \bold{Methods, Arguments and Examples} sections.
 #' \preformatted{
-#' rwmd = RelaxedWordMoversDistance$new(wv, method = c("cosine", "euclidean"), normalize = TRUE, progressbar = interactive())
-#' rwmd$dist2(x, y)
-#' rwmd$pdist2(x, y)
+#' rwmd = RelaxedWordMoversDistance$new(x, embeddings)
+#' rwmd$sim2(x)
 #' }
 #' @format \code{\link{R6Class}} object.
 #' @section Methods:
 #' \describe{
-#'   \item{\code{$new(wv, method = c("cosine", "euclidean"))}}{Constructor for RWMD model
-#'         For description of arguments see \bold{Arguments} section}
-#'   \item{\code{$dist2(x, y)}}{Computes distance between each row of sparse matrix
-#'         \code{x} and each row of sparse matrix \code{y}}
-#'   \item{\code{$pdist2(x, y)}}{Computes "parallel" distance between rows of
-#'         sparse matrix \code{x} and corresponding rows of the sparse matrix \code{y}}
-#' }
-#' @field progressbar \code{logical = TRUE} whether to display progressbar
-#' @section Arguments:
-#' \describe{
-#'  \item{rwmd}{\code{RWMD} object}
-#'  \item{x}{\code{x} sparse document term matrix}
-#'  \item{y}{\code{y = NULL} sparse document term matrix.
-#'          If \code{y = NULL} (as by default), we will assume \code{y = x} }
-#'  \item{wv}{word vectors. Numeric matrix which contains word embeddings. Rows - words,
-#'            columns - corresponding vectors. Rows should have word names.}
-#'  \item{method}{name of the distance for measuring similarity between two word vectors.
-#'                In original paper authors use \code{"euclidean"},
-#'                however we use \code{"cosine"} by default (better from our experience).
-#'                This means \code{distance = 1 - cosine_angle_betwen_wv}}
+#'   \item{\code{$new(x, embeddings)}}{Constructor for RWMD model.
+#'         \code{x} - docuent-term matrix which represents collection of
+#'         documents against which you want to perform queries. \code{embeddings} -
+#'         matrix of word embeddings which will be used to calculate similarities
+#'         between words (each row represents a word vector).}
+#'   \item{\code{$sim(x)}}{calculates similarity from a collection of documents
+#'   to collection query documents \code{x}.
+#'   \code{x} here is a document-term matrix which represents the set of query documents}
+#'   \item{\code{$dist(x)}}{calculates distance from a collection of documents
+#'   to collection query documents \code{x}
+#'   \code{x} here is a document-term matrix which represents the set of query documents}
 #' }
 #' @export
 #' @examples
 #' \dontrun{
+#' library(text2vec)
+#' library(rsparse)
 #' data("movie_review")
 #' tokens = word_tokenizer(tolower(movie_review$review))
 #' v = create_vocabulary(itoken(tokens))
@@ -74,128 +54,79 @@ text2vec_dist = R6::R6Class(
 #' vectorizer = vocab_vectorizer(v)
 #' dtm = create_dtm(it, vectorizer)
 #' tcm = create_tcm(it, vectorizer, skip_grams_window = 5)
-#' glove_model = GloVe$new(word_vectors_size = 50, vocabulary = v, x_max = 10)
-#' wv = glove_model$fit_transform(tcm, n_iter = 10)
+#' glove_model = GloVe$new(word_vectors_size = 50, x_max = 10)
+#' wv = glove_model$fit_transform(tcm, n_iter = 5)
 #' # get average of main and context vectors as proposed in GloVe paper
 #' wv = wv + t(glove_model$components)
-#' rwmd_model = RWMD$new(wv)
-#' rwmd_dist = dist2(dtm[1:100, ], dtm[1:10, ], method = rwmd_model, norm = 'none')
-#' head(rwmd_dist)
+#' rwmd_model = RelaxedWordMoversDistance$new(dtm, wv)
+#' rwms = rwmd_model$sim2(dtm[1:10, ])
+#' head(sort(rwms[1, ], decreasing = T))
 #'}
 RelaxedWordMoversDistance = R6::R6Class(
   classname = "RWMD",
-  inherit = text2vec_dist,
   public = list(
-    initialize = function(wv, method = c('cosine', 'euclidean'), normalize = TRUE, progressbar = interactive()) {
-      stopifnot(is.matrix(wv))
-      stopifnot(is.numeric(wv))
-      stopifnot(is.logical(normalize) && is.logical(progressbar))
+    x = NULL,
+    embedding_ids = NULL,
+    item_ids = NULL,
+    embeddings = NULL,
+    initialize = function(x, embeddings) {
+      stopifnot(is.matrix(embeddings))
+      stopifnot(is.numeric(embeddings))
+
+      self$embedding_ids = intersect(colnames(x), rownames(embeddings))
+      self$item_ids = rownames(x)
+
+      embeddings = embeddings[self$embedding_ids, , drop = FALSE]
+      x = x[, self$embedding_ids, drop = FALSE]
+      x = text2vec:::transform_rows_unit_norm(x, 1)
 
       private$internal_matrix_format = 'RsparseMatrix'
-      private$method = match.arg(method)
-      self$progressbar = progressbar
+      self$x = as(x, private$internal_matrix_format)
       # make shure  that word vectors are L2 normalized
       # and transpose them for faster column subsetting
       # R stores matrices in column-major format
-      private$wv = t(as.matrix(normalize(wv, "l2")))
+      self$embeddings = t(text2vec:::transform_rows_unit_norm(embeddings, 2))
     },
-    dist2 = function(x, y) {
-      stopifnot( inherits(x, "sparseMatrix") && inherits(y, "sparseMatrix"))
-      stopifnot( colnames(x) == colnames(y) )
-      # take only words that appear both in word vectors
-      terms = intersect(colnames(x), colnames(private$wv))
-      # make sure we don't have empty string - matrices doesn't allow subsetting by empty string
-      terms = setdiff(terms, "")
-      wv_internal = private$wv[, terms, drop = FALSE]
-      # convert matrices in row-major format
-      x_csr =  normalize(x[, terms, drop = FALSE], "l1")
-      x_csr =  as(x_csr, private$internal_matrix_format)
+    sim2 = function(x) {
+      stopifnot(identical(colnames(x), self$embedding_ids))
+      x = as(x, private$internal_matrix_format)
 
-      y_csr = normalize(y[, terms, drop = FALSE], "l1")
-      y_csr = as(y_csr, private$internal_matrix_format)
-
-      if (self$progressbar)
-        pb = txtProgressBar(initial = 1L, min = 2L, max = length(x_csr@p), style = 3)
-      # preallocate resulting matrix
-      res = matrix(Inf, nrow = nrow(x_csr), ncol = nrow(y_csr))
       # main loop
-      for (j in 2L:(length(x_csr@p))) {
-        if (self$progressbar) setTxtProgressBar(pb, j)
-        i1 = (x_csr@p[[j - 1]] + 1L):x_csr@p[[j]]
-        j1 = x_csr@j[i1] + 1L
-        m_j1 = wv_internal[, j1, drop = FALSE]
-        x1 = x_csr@x[i1]
+      res = vector("list", nrow(x))
 
-        dist_matrix = dist_internal(m_j1, wv_internal, private$method)
-        for (i in 2L:(length(y_csr@p))) {
-          # document offsets
-          i2 = (y_csr@p[[i - 1L]] + 1L):y_csr@p[[i]]
-          # word indices
-          j2 = y_csr@j[i2] + 1L
-          # nbow values
-          x2 = y_csr@x[i2]
-          res[j - 1L, i - 1L] = private$rwmd_cache(dist_matrix[, j2, drop = FALSE], x1, x2)
-        }
-      }
-      if (self$progressbar) close(pb)
-      res
-    },
-    pdist2 = function(x, y) {
-      stopifnot( inherits(x, "sparseMatrix") && inherits(y, "sparseMatrix"))
-      stopifnot( ncol(x) == ncol(y) )
-      stopifnot( colnames(x) == colnames(y) )
-      stopifnot( nrow(x) == nrow(y) )
-      # take only words that appear both in word vectors
-      terms = intersect(colnames(x), colnames(private$wv))
-      # make sure we don't have empty string - matrices doesn't allow subsetting by empty string
-      terms = setdiff(terms, "")
-      wv_internal = private$wv[, terms, drop = FALSE]
-
-      x_csr = normalize(x[, terms, drop = FALSE], "l1")
-      x_csr = as(x_csr, private$internal_matrix_format)
+      for (j in 2L:(length(x@p))) {
+        row_number = j - 1L
+        # futile.logger::flog.debug(sprintf("row %d", row_number))
+        i1 = (x@p[[row_number]] + 1L):x@p[[j]]
+        j1 = x@j[i1] + 1L
+        m_j1 = self$embeddings[, j1, drop = FALSE]
 
-      y_csr = normalize(y[, terms, drop = FALSE], "l1")
-      y_csr = as(y_csr, private$internal_matrix_format)
+        d = crossprod(m_j1, self$embeddings)
+        # calculates how easy is to transform each word in vocabulary into words from query
+        d = matrix(text2vec:::colMaxs(d), ncol = 1)
 
-
-      if (self$progressbar)
-        pb = txtProgressBar(initial = 1L, min = 2L, max = length(x_csr@p), style = 3)
-      # preallocate space for result
-      res = rep(Inf,  nrow(x_csr))
-      for (j in 2L:(length(x_csr@p))) {
-        if (self$progressbar) setTxtProgressBar(pb, j)
-        i1 = (x_csr@p[[j - 1]] + 1L):x_csr@p[[j]]
-        j1 = x_csr@j[i1] + 1L
-        m_j1 = wv_internal[ , j1, drop = FALSE]
-        x1 = x_csr@x[i1]
-        i2 = (y_csr@p[[j - 1L]] + 1L):y_csr@p[[j]]
-        j2 = y_csr@j[i2] + 1L
-        m_j2 = wv_internal[ , j2, drop = FALSE]
-        x2 = y_csr@x[i2]
-        res[j - 1L] = private$rwmd(m_j1, m_j2, x1, x2)
+        # matrix mult from rsparse
+        # calculates the cost of the best transformation from each of the
+        # documents from collection into query document. We transform each word from
+        # each document into closest word in the query
+        d = self$x %*% d
+        res[[row_number]]= d[, 1]
       }
-      if (self$progressbar) close(pb)
+      res = do.call(rbind, res)
+      colnames(res) = self$item_ids
+      rownames(res) = rownames(x)
       res
+    },
+    dist2 = function(x) {
+      1 - self$sim2(x)
     }
   ),
   private = list(
-    wv = NULL,
-    method = NULL,
-    # workhorse for rwmd calculation
-    rwmd = function(m_i, m_j, weight_i, weight_j) {
-      dist_matrix = dist_internal(m_i, m_j, private$method)
-      d1 = sum( text2vec:::rowMins(dist_matrix) * weight_i)
-      d2 = sum( text2vec:::colMins(dist_matrix) * weight_j)
-      max(d1, d2)
-    },
-    rwmd_cache = function(dist_matrix, weight_i, weight_j) {
-      d1 = sum( rowMins(dist_matrix) * weight_i)
-      d2 = sum( colMins(dist_matrix) * weight_j)
-      max(d1, d2)
-    }
+    internal_matrix_format = NULL
   )
 )
 
+
 #' @rdname RelaxedWordMoversDistance
 #' @export
 RWMD = RelaxedWordMoversDistance
diff --git a/R/utils_matrix.R b/R/utils_matrix.R
@@ -14,6 +14,28 @@
 #   // You should have received a copy of the GNU General Public License
 # // along with text2vec.  If not, see <http://www.gnu.org/licenses/>.
 
+
+transform_rows_unit_norm = function(x, norm = 1) {
+  if(!inherits(x, "matrix") && !inherits(x, "sparseMatrix"))
+    stop("x should inherit from `matrix`` or `Matrix::sparseMatrix`")
+  if(!is.numeric(norm) || length(norm) != 1)
+    stop("`norm` should be numeric of length 1")
+  rs = rowSums(x ^ norm)
+
+  if(isTRUE(all.equal(rep(1, length(rs)), rs, tolerance = 1e-5, check.attributes = FALSE)))
+    return(x)
+
+  norm_vec = 1 / rs ^ (1 / norm)
+
+  # case when sum row elements == 0
+  norm_vec[is.infinite(norm_vec)] = 0
+
+  if(inherits(x, "sparseMatrix"))
+    Diagonal(x = norm_vec) %*% x
+  else
+    x * norm_vec
+}
+
 #' @name normalize
 #' @title Matrix normalization
 #' @description normalize matrix rows using given norm

diff --git a/tests/testthat/test-distances.R b/tests/testthat/test-distances.R
@@ -68,13 +68,10 @@ test_that("euclidean", {
 
 test_that("relaxed word mover distance", {
   glove = GlobalVectors$new(word_vectors_size = 50, x_max = 10)
-  wv = glove$fit_transform(tcm, n_iter = 10)
-  rwmd_model = RWMD$new(wv)
-  rwmd_dist = dist2(dtm[i1, ], dtm[i2, ], method = rwmd_model, norm = "none")
+  wv = glove$fit_transform(tcm, n_iter = 5)
+  rwmd_model = RWMD$new(dtm[i2, ], wv)
+  rwmd_dist = rwmd_model$dist2(dtm[i1, ])
   expect_equal(nrow(rwmd_dist), length(i1))
   expect_equal(ncol(rwmd_dist), length(i2))
   expect_equal(rwmd_dist[1,1], 0, tol = tol)
-  expect_equal(dist2(dtm[i1, ], method = rwmd_model, norm = "none"),
-               dist2(dtm[i1, ], dtm[i1, ], method = rwmd_model, norm = "none"))
-
 })