## Code to reproduce examples in Section 2
library("tm")
data("acq")
acq[[6]]
stemmed <- stemDocument(acq[[6]])
stemmed
removed <- tm_map(acq, removeWords, stopwords("english"))
removed[[6]]
dtm <- DocumentTermMatrix(acq, list(removePunctuation = TRUE, stemming = TRUE))
dtm
library("slam")
head(sort(col_sums(dtm), decreasing = TRUE))

## Code to reproduce examples in Section 4
library("DSL")
ds <- DStorage(type = "HDFS", base_dir = tempdir(),
               chunksize = 10 * 1024^2)
## Note: if the Hadoop framework is not yet running, wait 10 seconds
## and retry the command above
ds
dl <- DList(letters = letters, numbers = 0:9)
dl
l <- as.list(letters)
names(l) <- LETTERS
dl <- as.DList(l)
identical(as.list(dl), l)
dl <- DList(letters = letters, numbers = 0:9, DStorage = ds)
dl <- DList(letters = letters, numbers = 0:9)
DL_storage(dl)
DL_storage(dl) <- ds
DL_storage(dl)
library("tm.plugin.dc")
storage <- DStorage(type = "HDFS", base_dir = "/tmp/dc")
data("acq")
dc <- as.DCorpus(acq, storage)
dc
dc <- tm_map(dc, stemDocument)
stemmed <- tm_map(acq, stemDocument)
all(sapply(seq_along(acq), function(x) identical(dc[[x]], stemmed[[x]])))
revs <- getRevisions(dc)
revs
dc <- setRevision(dc, revs[length(revs)])
all(sapply(seq_along(acq), function(x) identical(dc[[x]], acq[[x]])))
keepRevisions(dc) <- FALSE
DocumentTermMatrix(dc, list(stemming = TRUE, removePunctuation = TRUE))

## Code to reproduce examples in Section 6

## apply FUN to each column of a simple triplet matrix, materializing
## only one dense column at a time
col_apply <- function(x, FUN, ...)
    UseMethod("col_apply")
col_apply.simple_triplet_matrix <- function(x, FUN, ...) {
    i <- x$i
    v <- x$v
    nr <- x$nrow
    nc <- x$ncol
    pos <- split(seq_along(x$j), factor(x$j, levels = seq_len(nc)))
    ini <- vector(typeof(v), nr)
    out <- lapply(pos, function(p) {
        y <- ini
        y[i[p]] <- v[p]
        FUN(y, ...)
    })
    names(out) <- x$dimnames[[2L]]
    out
}

## apply FUN to each row of a simple triplet matrix, materializing
## only one dense row at a time
row_apply <- function(x, FUN, ...)
    UseMethod("row_apply")
row_apply.simple_triplet_matrix <- function(x, FUN, ...) {
    j <- x$j
    v <- x$v
    nr <- x$nrow
    nc <- x$ncol
    pos <- split(seq_along(x$i), factor(x$i, levels = seq_len(nr)))
    ini <- vector(typeof(v), nc)
    out <- lapply(pos, function(p) {
        y <- ini
        y[j[p]] <- v[p]
        FUN(y, ...)
    })
    names(out) <- x$dimnames[[1L]]
    out
}

## calculate quantiles of term coverage, number of unique terms, and
## summed term frequency per given date
daily_stats2 <- function(dtm) {
    s <- row_apply(dtm, function(x) cumsum(sort(x[x > 0], decreasing = TRUE)))
    out <- matrix(unlist(lapply(s, function(x) {
        n <- length(x)
        c(min(which(x > 0.8 * x[n])),
          min(which(x > 0.9 * x[n])),
          min(which(x > 0.95 * x[n])),
          min(which(x > 0.975 * x[n])),
          n,
          x[n])
    })), ncol = 6, byrow = TRUE)
    rownames(out) <- rownames(dtm)
    colnames(out) <- c("80%", "90%", "95%", "97.5%", "n", "sum")
    out
}
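## Illustrative check (not part of the original analysis): the helpers
## defined above can be verified on a small toy matrix, where their
## results should agree with a dense apply(); the toy data below are
## made up purely for illustration.
library("slam")
toy <- matrix(c(10, 5, 3, 1, 1,
                 8, 8, 2, 1, 0),
              nrow = 2, byrow = TRUE,
              dimnames = list(c("day1", "day2"), paste0("t", 1:5)))
stm <- as.simple_triplet_matrix(toy)
## column/row sums computed sparsely vs. densely
identical(unlist(col_apply(stm, sum)), apply(toy, 2, sum))
identical(unlist(row_apply(stm, sum)), apply(toy, 1, sum))
## coverage quantiles on the toy matrix: e.g., for "day2" the two most
## frequent terms already cover more than 80% of that day's tokens
daily_stats2(stm)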
## load/generate stats dataset
## WARNING: run the following code block only on systems with more
## than 4 GB of RAM.
f <- "daily_stats.rda"
if (file.exists(f)) {
    load(f)
} else {
    ## NYT_DTM, daily term frequencies
    load("NYT_DTM2.rda")
    stats <- daily_stats2(NYT_DTM2)
    save(stats, file = f, compress = TRUE)
}

## calculate quarters for axis description
months <- unlist(lapply(lapply(rownames(stats), strsplit, "-"),
                        function(s) sprintf("%s-%s", s[[1]][1], s[[1]][2])))
quarters <- unique(months)[seq(from = 1, to = length(unique(months)), by = 3)]
quarters <- sapply(quarters, function(x) min(which(months == x)))

f <- "corpus_stats.rda"
if (file.exists(f)) {
    load(f)
} else {
    load("NYT_meta.rda")
    load("NYT_DTM2.rda")
    n_tok <- sum(row_sums(NYT_DTM2))
    corpus_stats <- data.frame(Docs = dim(NYT_meta)[1],
                               Terms = dim(NYT_DTM2)[2],
                               Tokens = n_tok,
                               AvgLength = n_tok / dim(NYT_meta)[1],
                               Chars = sum(as.numeric(NYT_meta[["nChar"]]),
                                           na.rm = TRUE) / n_tok)
    rownames(corpus_stats) <- "NYT"
    save(corpus_stats, file = f, compress = TRUE)
}

f <- "vocabulary.rda"
if (file.exists(f)) {
    load(f)
} else {
    vocabulary <- data.frame(size = c(1:6 * 1000, 15851),
                             coverage = c(0.72, 0.797, 0.84, 0.868, 0.887,
                                          0.899, 0.978))
    ## NYT vocabulary coverage
    load("NYT_DTM2.rda")
    vocab_distrib <- function(dtm, n = 1000) {
        hot_n <- order(col_sums(dtm), decreasing = TRUE)[1:n]
        sum(row_sums(dtm[, hot_n])) / sum(row_sums(dtm))
    }
    vocabulary <- cbind(vocabulary,
                        unlist(lapply(vocabulary[["size"]],
                                      function(x) vocab_distrib(NYT_DTM2, x))))
    colnames(vocabulary) <- c("size", "coverage", "coverage_NYT")
    save(vocabulary, file = f, compress = TRUE)
}
vocabulary

f <- "plot_monthly_coverage.rda"
if (file.exists(f)) {
    load(f)
} else {
    load("NYT_DTM2_monthly.rda")
    cs <- col_sums(NYT_DTM2)
    o <- order(cs, decreasing = TRUE)
    rf1000 <- row_sums(NYT_DTM2[, o[1:1000]]) / row_sums(NYT_DTM2)
    rf2000 <- row_sums(NYT_DTM2[, o[1:2000]]) / row_sums(NYT_DTM2)
    rf4000 <- row_sums(NYT_DTM2[, o[1:4000]]) / row_sums(NYT_DTM2)
    coverage <- list(rf1000 = rf1000, rf2000 = rf2000, rf4000 = rf4000,
                     rownames = rownames(NYT_DTM2))
    save(coverage, file = f, compress = TRUE)
}

par(mfrow = c(2, 1))
## 1000 most frequent terms over time
x <- seq_along(coverage$rf1000)
y <- coverage$rf1000
plot(x, y, ylab = "Coverage", xlab = "Month", xaxt = "n")
quat <- sapply(names(quarters), function(x) min(which(coverage$rownames == x)))
axis(side = 1, at = x[quat], labels = coverage$rownames[quat],
     cex.axis = 0.6, las = 3)
abline(lm(y ~ x))
lines(lowess(x, y), col = "red")
## 4000 most frequent terms over time
x <- seq_along(coverage$rf4000)
y <- coverage$rf4000
plot(x, y, ylab = "Coverage", xlab = "Month", xaxt = "n")
quat <- sapply(names(quarters), function(x) min(which(coverage$rownames == x)))
axis(side = 1, at = x[quat], labels = coverage$rownames[quat],
     cex.axis = 0.6, las = 3)
abline(lm(y ~ x))
lines(lowess(x, y), col = "red")
par(mfrow = c(1, 1))

## monthly coverage of (stemmed) English stopwords
load("NYT_DTM2_monthly.rda")
sw <- stemDocument(stopwords("en"))
ind <- colnames(NYT_DTM2) %in% sw
rf2 <- row_sums(NYT_DTM2[, ind]) / row_sums(NYT_DTM2)
plot(x, rf2, ylab = "Coverage", xlab = "Month", xaxt = "n")
quat <- sapply(names(quarters), function(x) min(which(rownames(NYT_DTM2) == x)))
axis(side = 1, at = x[quat], labels = rownames(NYT_DTM2)[quat],
     cex.axis = 0.6, las = 3)
lines(lowess(x, rf2), col = "red")
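## Illustrative companion to the plots above (not part of the original
## analysis): the slope of the linear trend drawn with abline(lm(y ~ x))
## can also be inspected numerically.
coef(lm(coverage$rf1000 ~ seq_along(coverage$rf1000)))
coef(lm(coverage$rf4000 ~ seq_along(coverage$rf4000)))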
## Code to reproduce examples in Appendix
library("hive")
hadoop_home <- Sys.getenv("HADOOP_HOME")
hive(hive_create(hadoop_home))
hive()
summary(hive())
hive_is_available()
hive_start()
hive_is_available()
## wait until the namenode has finished initializing
Sys.sleep(10)
DFS_list("/")
DFS_dir_create("/tmp/test")
DFS_write_lines(c("Hello HDFS", "Bye Bye HDFS"), "/tmp/test/hdfs.txt")
DFS_list("/tmp/test")
DFS_read_lines("/tmp/test/hdfs.txt")
DFS_dir_remove("/tmp/test")
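## Optional cleanup (not part of the original script): assuming the
## Hadoop framework started above should be shut down again, hive_stop()
## from the hive package can be used.
hive_stop()
hive_is_available()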