## Code to reproduce examples in Section 2
library("tm")
data("acq")
acq[[6]]
stemmed <- stemDocument(acq[[6]])
stemmed
removed <- tm_map(acq, removeWords, stopwords("english"))
removed[[6]]
dtm <- DocumentTermMatrix(acq, list(removePunctuation = TRUE, stemming = TRUE))
dtm
library("slam")
head(sort(col_sums(dtm), decreasing = TRUE))

## Code to reproduce examples in Section 4
library("DSL")
ds <- DStorage(type = "HDFS", base_dir = tempdir(),
               chunksize = 10 * 1024^2)
## Note: if the Hadoop framework is not yet running, wait 10 seconds
## and retry the command above
ds
dl <- DList(letters = letters, numbers = 0:9)
dl
l <- as.list(letters)
names(l) <- LETTERS
dl <- as.DList(l)
identical(as.list(dl), l)
dl <- DList(letters = letters, numbers = 0:9, DStorage = ds)
dl <- DList(letters = letters, numbers = 0:9)
DL_storage(dl)
DL_storage(dl) <- ds
DL_storage(dl)
library("tm.plugin.dc")
storage <- DStorage(type = "HDFS", base_dir = "/tmp/dc")
data("acq")
dc <- as.DCorpus(acq, storage)
dc
dc <- tm_map(dc, stemDocument)
stemmed <- tm_map(acq, stemDocument)
all(sapply(seq_along(acq), function(x) identical(dc[[x]], stemmed[[x]])))
revs <- getRevisions(dc)
revs
dc <- setRevision(dc, revs[length(revs)])
all(sapply(seq_along(acq), function(x) identical(dc[[x]], acq[[x]])))
keepRevisions(dc) <- FALSE
DocumentTermMatrix(dc, list(stemming = TRUE, removePunctuation = TRUE))

## Code to reproduce examples in Section 6

## apply FUN to each column of a simple triplet matrix, materializing
## only one dense column at a time
col_apply <- function(x, FUN, ...)
    UseMethod("col_apply")
col_apply.simple_triplet_matrix <- function(x, FUN, ...) {
    i <- x$i
    v <- x$v
    nr <- x$nrow
    nc <- x$ncol
    pos <- split(seq_along(x$j), factor(x$j, levels = seq_len(nc)))
    ini <- vector(typeof(v), nr)
    out <- lapply(pos, function(p) {
        y <- ini
        y[i[p]] <- v[p]
        FUN(y, ...)
    })
    names(out) <- x$dimnames[[2L]]
    out
}

## apply FUN to each row of a simple triplet matrix, materializing
## only one dense row at a time
row_apply <- function(x, FUN, ...)
    UseMethod("row_apply")
row_apply.simple_triplet_matrix <- function(x, FUN, ...) {
    j <- x$j
    v <- x$v
    nr <- x$nrow
    nc <- x$ncol
    pos <- split(seq_along(x$i), factor(x$i, levels = seq_len(nr)))
    ini <- vector(typeof(v), nc)
    out <- lapply(pos, function(p) {
        y <- ini
        y[j[p]] <- v[p]
        FUN(y, ...)
    })
    names(out) <- x$dimnames[[1L]]
    out
}

## calculate quantiles of term coverage, number of unique terms, and
## summed term frequency per given date
daily_stats2 <- function(dtm) {
    s <- row_apply(dtm, function(x) cumsum(sort(x[x > 0], decreasing = TRUE)))
    out <- matrix(unlist(lapply(s, function(x) {
        n <- length(x)
        c(min(which(x > 0.8 * x[n])),
          min(which(x > 0.9 * x[n])),
          min(which(x > 0.95 * x[n])),
          min(which(x > 0.975 * x[n])),
          n,
          x[n])
    })), ncol = 6, byrow = TRUE)
    rownames(out) <- rownames(dtm)
    colnames(out) <- c("80%", "90%", "95%", "97.5%", "n", "sum")
    out
}
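## Illustrative check (not part of the original analysis): the helpers
## defined above can be verified on a small toy matrix, where their
## results should agree with a dense apply(); the toy data below are
## made up purely for illustration.
library("slam")
toy <- matrix(c(10, 5, 3, 1, 1,
                 8, 8, 2, 1, 0),
              nrow = 2, byrow = TRUE,
              dimnames = list(c("day1", "day2"), paste0("t", 1:5)))
stm <- as.simple_triplet_matrix(toy)
## column/row sums computed sparsely vs. densely
identical(unlist(col_apply(stm, sum)), apply(toy, 2, sum))
identical(unlist(row_apply(stm, sum)), apply(toy, 1, sum))
## coverage quantiles on the toy matrix: e.g., for "day2" the two most
## frequent terms already cover more than 80% of that day's tokens
daily_stats2(stm)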
## load/generate stats dataset
## WARNING: run the following code block only on systems with more
## than 4 GB of RAM.
f <- "daily_stats.rda"
if (file.exists(f)) {
    load(f)
} else {
    ## NYT_DTM, daily term frequencies
    load("NYT_DTM2.rda")
    stats <- daily_stats2(NYT_DTM2)
    save(stats, file = f, compress = TRUE)
}

## calculate quarters for axis description
months <- unlist(lapply(lapply(rownames(stats), strsplit, "-"),
                        function(s) sprintf("%s-%s", s[[1]][1], s[[1]][2])))
quarters <- unique(months)[seq(from = 1, to = length(unique(months)), by = 3)]
quarters <- sapply(quarters, function(x) min(which(months == x)))

f <- "corpus_stats.rda"
if (file.exists(f)) {
    load(f)
} else {
    load("NYT_meta.rda")
    load("NYT_DTM2.rda")
    n_tok <- sum(row_sums(NYT_DTM2))
    corpus_stats <- data.frame(Docs = dim(NYT_meta)[1],
                               Terms = dim(NYT_DTM2)[2],
                               Tokens = n_tok,
                               AvgLength = n_tok / dim(NYT_meta)[1],
                               Chars = sum(as.numeric(NYT_meta[["nChar"]]),
                                           na.rm = TRUE) / n_tok)
    rownames(corpus_stats) <- "NYT"
    save(corpus_stats, file = f, compress = TRUE)
}

f <- "vocabulary.rda"
if (file.exists(f)) {
    load(f)
} else {
    vocabulary <- data.frame(size = c(1:6 * 1000, 15851),
                             coverage = c(0.72, 0.797, 0.84, 0.868, 0.887,
                                          0.899, 0.978))
    ## NYT vocabulary coverage
    load("NYT_DTM2.rda")
    vocab_distrib <- function(dtm, n = 1000) {
        hot_n <- order(col_sums(dtm), decreasing = TRUE)[1:n]
        sum(row_sums(dtm[, hot_n])) / sum(row_sums(dtm))
    }
    vocabulary <- cbind(vocabulary,
                        unlist(lapply(vocabulary[["size"]],
                                      function(x) vocab_distrib(NYT_DTM2, x))))
    colnames(vocabulary) <- c("size", "coverage", "coverage_NYT")
    save(vocabulary, file = f, compress = TRUE)
}
vocabulary

f <- "plot_monthly_coverage.rda"
if (file.exists(f)) {
    load(f)
} else {
    load("NYT_DTM2_monthly.rda")
    cs <- col_sums(NYT_DTM2)
    o <- order(cs, decreasing = TRUE)
    rf1000 <- row_sums(NYT_DTM2[, o[1:1000]]) / row_sums(NYT_DTM2)
    rf2000 <- row_sums(NYT_DTM2[, o[1:2000]]) / row_sums(NYT_DTM2)
    rf4000 <- row_sums(NYT_DTM2[, o[1:4000]]) / row_sums(NYT_DTM2)
    coverage <- list(rf1000 = rf1000, rf2000 = rf2000, rf4000 = rf4000,
                     rownames = rownames(NYT_DTM2))
    save(coverage, file = f, compress = TRUE)
}

par(mfrow = c(2, 1))
## 1000 most frequent terms over time
x <- seq_along(coverage$rf1000)
y <- coverage$rf1000
plot(x, y, ylab = "Coverage", xlab = "Month", xaxt = "n")
quat <- sapply(names(quarters), function(x) min(which(coverage$rownames == x)))
axis(side = 1, at = x[quat], labels = coverage$rownames[quat],
     cex.axis = 0.6, las = 3)
abline(lm(y ~ x))
lines(lowess(x, y), col = "red")
## 4000 most frequent terms over time
x <- seq_along(coverage$rf4000)
y <- coverage$rf4000
plot(x, y, ylab = "Coverage", xlab = "Month", xaxt = "n")
quat <- sapply(names(quarters), function(x) min(which(coverage$rownames == x)))
axis(side = 1, at = x[quat], labels = coverage$rownames[quat],
     cex.axis = 0.6, las = 3)
abline(lm(y ~ x))
lines(lowess(x, y), col = "red")
par(mfrow = c(1, 1))

## monthly coverage of (stemmed) English stopwords
load("NYT_DTM2_monthly.rda")
sw <- stemDocument(stopwords("en"))
ind <- colnames(NYT_DTM2) %in% sw
rf2 <- row_sums(NYT_DTM2[, ind]) / row_sums(NYT_DTM2)
plot(x, rf2, ylab = "Coverage", xlab = "Month", xaxt = "n")
quat <- sapply(names(quarters), function(x) min(which(rownames(NYT_DTM2) == x)))
axis(side = 1, at = x[quat], labels = rownames(NYT_DTM2)[quat],
     cex.axis = 0.6, las = 3)
lines(lowess(x, rf2), col = "red")
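## Illustrative companion to the plots above (not part of the original
## analysis): the slope of the linear trend drawn with abline(lm(y ~ x))
## can also be inspected numerically.
coef(lm(coverage$rf1000 ~ seq_along(coverage$rf1000)))
coef(lm(coverage$rf4000 ~ seq_along(coverage$rf4000)))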
## Code to reproduce examples in Appendix
library("hive")
hadoop_home <- Sys.getenv("HADOOP_HOME")
hive(hive_create(hadoop_home))
hive()
summary(hive())
hive_is_available()
hive_start()
hive_is_available()
## wait until the namenode has finished initializing
Sys.sleep(10)
DFS_list("/")
DFS_dir_create("/tmp/test")
DFS_write_lines(c("Hello HDFS", "Bye Bye HDFS"), "/tmp/test/hdfs.txt")
DFS_list("/tmp/test")
DFS_read_lines("/tmp/test/hdfs.txt")
DFS_dir_remove("/tmp/test")
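## Optional cleanup (not part of the original script): assuming the
## Hadoop framework started above should be shut down again, hive_stop()
## from the hive package can be used.
hive_stop()
hive_is_available()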