README.Rmd

---
output: github_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, collapse = TRUE, comment = "#>", cache = TRUE)
library(textfeatures)
library(magrittr)
options(width = 100)
skimrskim <- function(x) {
  skimr::skim(x[-1]) %>% 
    dplyr::filter(stat %in% c("p0", "p25", "p50", "p75", "p100", "hist", "n")) %>%
    dplyr::select(-value, -level, -type) %>%
    tidyr::spread(stat, formatted) %>%
    dplyr::select(variable, `min` = p0, `25%` = p25, `mid` = p50, `75%` = p75, `max` = p100, hist) %>% 
    knitr::kable()
}
```

# 👷 textfeatures 👷<img src="man/figures/logo.png" width="160px" align="right" /> 

[![Build status](https://travis-ci.org/mkearney/textfeatures.svg?branch=master)](https://travis-ci.org/mkearney/textfeatures)
[![AppVeyor build status](https://ci.appveyor.com/api/projects/status/github/mkearney/textfeatures?branch=master&svg=true)](https://ci.appveyor.com/project/mkearney/textfeatures)
[![CRAN status](https://www.r-pkg.org/badges/version/textfeatures)](https://cran.r-project.org/package=textfeatures)
[![Coverage Status](https://codecov.io/gh/mkearney/textfeatures/branch/master/graph/badge.svg)](https://codecov.io/gh/mkearney/textfeatures?branch=master)
[![DOI](https://zenodo.org/badge/123046986.svg)](https://zenodo.org/badge/latestdoi/123046986)

![Downloads](https://cranlogs.r-pkg.org/badges/textfeatures)
![Downloads](https://cranlogs.r-pkg.org/badges/grand-total/textfeatures)
[![lifecycle](https://img.shields.io/badge/lifecycle-experimental-orange.svg)](https://www.tidyverse.org/lifecycle/#experimental)

> Easily extract useful features from character objects.

## Install

Install from CRAN.

```{r cran, eval=FALSE}
## download from CRAN
install.packages("textfeatures")
```

Or install the development version from Github.

```{r github, eval=FALSE}
## install from github
devtools::install_github("mkearney/textfeatures")
```

## Usage

### `textfeatures()`

Input a character vector.

```{r chr}
## vector of some text
x <- c(
  "this is A!\t sEntence https://github.com about #rstats @github",
  "and another sentence here", "THe following list:\n- one\n- two\n- three\nOkay!?!"
)

## get text features
textfeatures(x, verbose = FALSE)
```

Or input a data frame with a column named `text`.

```{r df}
## data frame with rstats tweets
rt <- rtweet::search_tweets("rstats", n = 2000, verbose = FALSE)

## get text features
tf <- textfeatures(rt, verbose = FALSE)

## preview data
tf
```

Compare across multiple authors.

```{r news, eval = FALSE}
## data frame tweets from multiple news media accounts
news <- rtweet::get_timelines(
  c("cnn", "nytimes", "foxnews", "latimes", "washingtonpost"), 
  n = 2000)

## get text features (including ests for 20 word dims) for all observations
news_features <- textfeatures(news, word_dims = 20, verbose = FALSE)
```


```{r news_features, echo = FALSE, eval = FALSE}
## override id with screen names
news_features$user_id <- news$screen_name

## load the tidyverse
suppressPackageStartupMessages(library(tidyverse))

## convert to long (tidy) form and plot
p <- news_features %>%
  scale_count() %>%
  scale_standard() %>%
  group_by(user_id) %>%
  summarise_if(is.numeric, mean) %>%
  gather(var, val, -user_id) %>%
  arrange(-val) %>%
  mutate(var = factor(var, levels = unique(var)), 
    user_id = paste0("@", user_id)) %>%
  ggplot(aes(x = var, y = val, fill = user_id)) + 
  geom_col(width = .15, fill = "#000000bb") +
  geom_point(size = 2.5, shape = 21) + 
  tfse::theme_mwk(light = "#ffffff") + 
  facet_wrap(~ user_id, nrow = 1) + 
  coord_flip() + 
  theme(legend.position = "none",
    axis.text = element_text(colour = "black"),
    axis.text.x = element_text(size = rel(.7)),
    plot.title = element_text(face = "bold", size = rel(1.6)),
    panel.grid.major = element_line(colour = "#333333", size = rel(.05)),
    panel.grid.minor = element_line(colour = "#333333", size = rel(.025))) + 
  labs(y = NULL, x = NULL,
    title = "{textfeatures}: Extract Features from Text",
    subtitle = "Features extracted from text of the most recent 2,000 tweets posted by each news media account")

## save plot
ggsave("tools/readme/readme.png", p, width = 9, height = 6, units = "in")
```

<p style='align:center'><img src='tools/readme/readme.png' max-width="600px" /></p>


## Fast version

If you're looking for something faster try setting `sentiment = FALSE` and `word2vec = 0`.

```{r fast}
## get non-substantive text features
textfeatures(rt, sentiment = FALSE, word_dims = 0, verbose = FALSE)
```


## Example: NASA meta data

Extract text features from NASA meta data:


```{r, include=FALSE}
if (!file.exists(".nasa.rds")) {
  ## read NASA meta data
  nasa <- jsonlite::fromJSON("https://data.nasa.gov/data.json")
  
  ## identify non-public or restricted data sets
  nonpub <- grepl("Not publicly available|must register", 
    nasa$data$rights, ignore.case = TRUE) | 
    nasa$dataset$accessLevel %in% c("restricted public", "non-public")
  
  ## create data frame with ID, description (name it "text"), and nonpub
  nd <- data.frame(text = nasa$dataset$description, nonpub = nonpub, 
    stringsAsFactors = FALSE)
  
  ## drop duplicates (truncate text to ensure more distinct obs)
  nd <- nd[!duplicated(tolower(substr(nd$text, 1, 100))), ]
  
  ## filter via sampling to create equal number of pub/nonpub
  nd <- nd[c(sample(which(!nd$nonpub), sum(nd$nonpub)), which(nd$nonpub)), ]
  saveRDS(nd, ".nasa.rds")
} else {
  nd <- readRDS(".nasa.rds")
}
```


```{r nd, eval=FALSE}
## read NASA meta data
nasa <- jsonlite::fromJSON("https://data.nasa.gov/data.json")

## identify non-public or restricted data sets
nonpub <- grepl("Not publicly available|must register", 
  nasa$data$rights, ignore.case = TRUE) | 
  nasa$dataset$accessLevel %in% c("restricted public", "non-public")

## create data frame with ID, description (name it "text"), and nonpub
nd <- data.frame(text = nasa$dataset$description, nonpub = nonpub, 
  stringsAsFactors = FALSE)

## drop duplicates (truncate text to ensure more distinct obs)
nd <- nd[!duplicated(tolower(substr(nd$text, 1, 100))), ]

## filter via sampling to create equal number of pub/nonpub
nd <- nd[c(sample(which(!nd$nonpub), sum(nd$nonpub)), which(nd$nonpub)), ]
```


```{r nasafinal}
## get text features
nasa_tf <- textfeatures(nd, word_dims = 20, normalize = FALSE, verbose = FALSE)

## drop columns with little to no variance
min_var <- function(x, min = 1) {
  is_num <- vapply(x, is.numeric, logical(1))
  non_num <- names(x)[!is_num]
  yminvar <- names(x[is_num])[vapply(x[is_num], function(.x) stats::var(.x, 
      na.rm = TRUE) >= min, logical(1))]
  x[c(non_num, yminvar)]
}
nasa_tf <- min_var(nasa_tf)

## view summary
skimrskim(nasa_tf)

## add nonpub variable
nasa_tf$nonpub <- nd$nonpub

## run model predicting whether data is restricted
m1 <- glm(nonpub ~ ., data = nasa_tf[-1], family = binomial)

## view model summary
summary(m1)

## how accurate was the model?
table(predict(m1, type = "response") > .5, nasa_tf$nonpub)
```