###################################################
### Preliminaries
###################################################
options(width = 65, prompt = "R> ", continue = "+ ", useFancyQuotes = FALSE)
library("lattice")
library("topicmodels")
ltheme <- canonical.theme("postscript", FALSE)
lattice.options(default.theme = ltheme)

###################################################
### Section: Application: Main functions LDA() and CTM()
###################################################
## Display the usage of the two main fitting functions.
cat(paste("R> ", prompt(LDA, filename = NA)$usage[[2]], "\n",
          "R> ", prompt(CTM, filename = NA)$usage[[2]], sep = ""))
## Default control lists for the different estimation methods:
## control_LDA_VEM <-
##   list(estimate.alpha = TRUE, alpha = 50/k, estimate.beta = TRUE,
##        verbose = 0, prefix = tempfile(), save = 0, keep = 0,
##        seed = as.integer(Sys.time()), nstart = 1, best = TRUE,
##        var = list(iter.max = 500, tol = 10^-6),
##        em = list(iter.max = 1000, tol = 10^-4),
##        initialize = "random")

## control_LDA_Gibbs <-
##   list(alpha = 50/k, estimate.beta = TRUE,
##        verbose = 0, prefix = tempfile(), save = 0, keep = 0,
##        seed = as.integer(Sys.time()), nstart = 1, best = TRUE,
##        delta = 0.1,
##        iter = 2000, burnin = 0, thin = 2000)

## control_CTM_VEM <-
##   list(estimate.beta = TRUE,
##        verbose = 0, prefix = tempfile(), save = 0, keep = 0,
##        seed = as.integer(Sys.time()), nstart = 1L, best = TRUE,
##        var = list(iter.max = 500, tol = 10^-6),
##        em = list(iter.max = 1000, tol = 10^-4),
##        initialize = "random",
##        cg = list(iter.max = 500, tol = 10^-5))

###################################################
### Section: Illustrative example: Abstracts of JSS papers
###################################################
## Obtain the JSS abstracts, either from the pre-built corpus package or
## by harvesting the OAI repository directly.
if (!("corpus.JSS.papers" %in% installed.packages()[, "Package"])) {
  try(install.packages("corpus.JSS.papers",
                       repos = "http://datacube.wu.ac.at/",
                       type = "source"))
}
if (!("corpus.JSS.papers" %in% installed.packages()[, "Package"])) {
  library("OAIHarvester")
  x <- oaih_list_records("http://www.jstatsoft.org/oai")
  JSS_papers <- oaih_transform(x[, "metadata"])
  JSS_papers <- JSS_papers[order(as.Date(unlist(JSS_papers[, "date"]))), ]
  JSS_papers <- JSS_papers[grep("Abstract:", JSS_papers[, "description"]), ]
  JSS_papers[, "description"] <- sub(".*\nAbstract:\n", "",
                                     unlist(JSS_papers[, "description"]))
} else {
  data("JSS_papers", package = "corpus.JSS.papers")
}
## Restrict to papers published before 2010-08-05 and omit abstracts
## containing non-ASCII characters.
JSS_papers <- JSS_papers[JSS_papers[, "date"] < "2010-08-05", ]
JSS_papers <- JSS_papers[sapply(JSS_papers[, "description"],
                                Encoding) == "unknown", ]

library("topicmodels")
library("XML")
## tm provides Corpus()/DocumentTermMatrix(); slam provides the sparse
## matrix utilities row_sums()/col_sums().
library("tm")
library("slam")

remove_HTML_markup <- function(s) {
  doc <- htmlTreeParse(s, asText = TRUE, trim = FALSE)
  xmlValue(xmlRoot(doc))
}
corpus <- Corpus(VectorSource(sapply(JSS_papers[, "description"],
                                     remove_HTML_markup)))
Sys.setlocale("LC_COLLATE", "C")
JSS_dtm <- DocumentTermMatrix(corpus,
  control = list(stemming = TRUE, stopwords = TRUE, minWordLength = 3,
                 removeNumbers = TRUE, removePunctuation = TRUE))
dim(JSS_dtm)
summary(col_sums(JSS_dtm))
## Keep only terms with a mean tf-idf of at least 0.1 and drop documents
## left empty by this filtering.
term_tfidf <-
  tapply(JSS_dtm$v/row_sums(JSS_dtm)[JSS_dtm$i], JSS_dtm$j, mean) *
  log2(nDocs(JSS_dtm)/col_sums(JSS_dtm > 0))
summary(term_tfidf)
JSS_dtm <- JSS_dtm[, term_tfidf >= 0.1]
JSS_dtm <- JSS_dtm[row_sums(JSS_dtm) > 0, ]
summary(col_sums(JSS_dtm))
dim(JSS_dtm)
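## A minimal sketch (toy data, not part of the original analysis) of the
## term score used above: the mean relative frequency of a term within the
## documents containing it, times the log2 inverse document frequency
## (nrow() stands in for nDocs() on a plain triplet matrix).
toy <- as.simple_triplet_matrix(
  matrix(c(2, 0,
           1, 1), nrow = 2, byrow = TRUE,
         dimnames = list(c("d1", "d2"), c("common", "rare"))))
tapply(toy$v/row_sums(toy)[toy$i], toy$j, mean) *
  log2(nrow(toy)/col_sums(toy > 0))
## "rare" occurs in only one of the two documents and scores 0.5;
## "common" occurs in both and scores 0, so it would be filtered out.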
## Fit the four topic models with a common number of topics and seed.
k <- 30
SEED <- 2010
jss_TM <-
  list(VEM = LDA(JSS_dtm, k = k, control = list(seed = SEED)),
       VEM_fixed = LDA(JSS_dtm, k = k,
                       control = list(estimate.alpha = FALSE, seed = SEED)),
       Gibbs = LDA(JSS_dtm, k = k, method = "Gibbs",
                   control = list(seed = SEED, burnin = 1000,
                                  thin = 100, iter = 1000)),
       CTM = CTM(JSS_dtm, k = k,
                 control = list(seed = SEED, var = list(tol = 10^-4),
                                em = list(tol = 10^-3))))
sapply(jss_TM[1:2], slot, "alpha")
## Histograms of the probability of assignment to the most likely topic.
methods <- c("VEM", "VEM_fixed", "Gibbs", "CTM")
DF <- data.frame(posterior = unlist(lapply(jss_TM, function(x)
                   apply(posterior(x)$topics, 1, max))),
                 method = factor(rep(methods,
                   each = nrow(posterior(jss_TM$VEM)$topics)), methods))
print(histogram(~ posterior | method, data = DF, col = "white",
                as.table = TRUE,
                xlab = "Probability of assignment to the most likely topic",
                ylab = "Percent of total", layout = c(4, 1)))
## Mean entropy of the posterior topic distributions over documents.
sapply(jss_TM, function(x)
       mean(apply(posterior(x)$topics, 1, function(z) - sum(z * log(z)))))
Topic <- topics(jss_TM[["VEM"]], 1)
Terms <- terms(jss_TM[["VEM"]], 5)
Terms[, 1:5]
(topics_v24 <-
   topics(jss_TM[["VEM"]])[grep("/v24/", JSS_papers[, "identifier"])])
most_frequent_v24 <- which.max(tabulate(topics_v24))
terms(jss_TM[["VEM"]], 10)[, most_frequent_v24]

###################################################
### Section: Associated Press data
###################################################
data("AssociatedPress", package = "topicmodels")
dim(AssociatedPress)
fold <- 1
range(col_sums(AssociatedPress))
## Assign the documents to 10 folds for cross-validation.
set.seed(0908)
folding <-
  sample(rep(seq_len(10),
             ceiling(nrow(AssociatedPress)/10))[seq_len(nrow(AssociatedPress))])
testing <- which(folding == fold)
training <- which(folding != fold)
topics <- 10 * c(1:5, 10, 20)
## Example training/test calls for a single fold:
## train <- LDA(AssociatedPress[training,], k = k,
##   control = list(verbose = 100))
## test <- LDA(AssociatedPress[testing,], model = train,
##   control = list(estimate.beta = FALSE))
## train <- LDA(AssociatedPress[training,], k = k,
##   control = list(verbose = 100, estimate.alpha = FALSE))
## test <- LDA(AssociatedPress[testing,], model = train,
##   control = list(estimate.beta = FALSE))
## train <- LDA(AssociatedPress[training,], k = k, method = "Gibbs",
##   control = list(burnin = 1000, thin = 100,
##     iter = 1000, best = FALSE))
## test <- LDA(AssociatedPress[testing,],
##   model = train@fitted[[which.max(logLik(train))]],
##   control = list(estimate.beta = FALSE, burnin = 1000,
##     thin = 100, iter = 1000, best = FALSE))
## Load the precomputed cross-validation results (see the appendix code
## at the end of this script).
load("AP.rda")
k <- 40
DF <- data.frame(logLik = unlist(lapply(c("VEM", "VEM_fixed"), function(x)
                   AP_test[[x]][seq_along(topics),])),
                 k = topics,
                 Fold = rep(1:10, each = length(topics)),
                 Method = factor(rep(c("estimated", "fixed"),
                                     each = 10 * length(topics))))
print(xyplot(logLik ~ k | Method, group = Fold, data = DF, type = "b",
             xlab = "Number of topics", ylab = "Perplexity",
             col = 1, lty = 1))
print(xyplot(as.vector(AP_alpha[["VEM"]][seq_along(topics),]) ~
               rep(topics, 10),
             groups = rep(1:10, each = length(topics)), type = "b",
             xlab = "Number of topics", ylab = expression(alpha),
             col = 1, lty = 1),
      split = c(1, 1, 2, 1), more = TRUE)
mllh <- apply(AP_test[["Gibbs"]], 2, sapply, max)
print(xyplot(as.vector(mllh[seq_along(topics),]) ~ rep(topics, 10),
             groups = rep(1:10, each = length(topics)), type = "b",
             xlab = "Number of topics", ylab = "Perplexity",
             col = 1, lty = 1),
      split = c(2, 1, 2, 1), more = FALSE)
load("AP-40.rda")
## Match the topics of the VEM and Gibbs fits by minimizing the total
## Hellinger distance between their term distributions.
dist_models <- distHellinger(posterior(AP[["VEM"]])$terms,
                             posterior(AP[["Gibbs"]])$terms)
library("clue")
matching <- solve_LSAP(dist_models)
dist_models <- dist_models[, matching]
d <- mean(diag(dist_models))
best_match <- order(diag(dist_models))
terms(AP$VEM, 8)[, best_match[1:4]]
terms(AP$Gibbs, 8)[, matching[best_match[1:4]]]
worst_match <- order(diag(dist_models), decreasing = TRUE)
terms(AP$VEM, 8)[, worst_match[1:4]]
terms(AP$Gibbs, 8)[, matching[worst_match[1:4]]]
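## A minimal sketch (toy cost matrix, not part of the original analysis)
## of what solve_LSAP() did above: it returns the column permutation that
## minimizes the sum of the matched entries.
cost <- matrix(c(2, 9, 9,
                 9, 9, 3,
                 9, 1, 9), nrow = 3, byrow = TRUE)
perm <- solve_LSAP(cost)
perm                                             # rows 1, 2, 3 -> columns 1, 3, 2
sum(cost[cbind(seq_len(nrow(cost)), as.vector(perm))])  # minimal cost: 2 + 3 + 1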
## Image of the Hellinger distances between the matched topics.
par(mar = rep(0, 4))
o <- order(diag(dist_models))
library("seriation")
pimage(dist_models[o, o], col = gray.colors(64), axes = FALSE)

###################################################
### Section: Extending to new fit methods
###################################################
## LDA specified in BUGS: one latent topic z per token, with Dirichlet
## priors on the document-topic distributions theta and the topic-term
## distributions beta.
BUGS_MODEL <-
"model {
  for (i in 1:length(W)) {
    z[i] ~ dcat(theta[D[i],]);
    W[i] ~ dcat(beta[z[i],]);
  }
  for (j in 1:n) {
    theta[j,1:k] ~ ddirch(alpha);
  }
  for (K in 1:k) {
    beta[K,1:V] ~ ddirch(delta);
  }
}"

LDA_rjags.fit <- function(x, k, control = NULL, model = NULL, call, ...) {
  if (!require("rjags"))
    stop("\nThis method requires package 'rjags'")
  control <- as(control, "LDA_Gibbscontrol")
  if (length(control@alpha) == 0)
    control@alpha <- if (!is.null(model)) model@alpha else 50/k
  ## Unroll the sparse document-term matrix into one entry per token.
  DATA <- list(D = rep(x$i, x$v), W = rep(x$j, x$v),
               n = nrow(x), k = k, V = ncol(x),
               alpha = rep(control@alpha, k),
               delta = rep(control@delta, ncol(x)))
  FILE <- file()
  cat(BUGS_MODEL, file = FILE)
  model <- jags.model(FILE, data = DATA,
                      inits = list(.RNG.name = "base::Wichmann-Hill",
                                   .RNG.seed = control@seed))
  close(FILE)
  if (control@burnin > 0)
    update(model, iter = control@burnin)
  SAMPLES <- coda.samples(model, c("theta", "beta", "z"),
                          thin = control@thin, n.iter = control@iter)[[1]]
  index_beta <- seq_len(k * ncol(x))
  index_gamma <- k * ncol(x) + seq_len(nrow(x) * k)
  ## Return one "LDA_Gibbs" object per kept sample.
  obj <- lapply(seq_len(nrow(SAMPLES)), function(i)
    new("LDA_Gibbs", call = call, Dim = dim(x), k = as.integer(k),
        control = control, alpha = control@alpha, delta = control@delta,
        terms = colnames(x), documents = rownames(x),
        beta = matrix(SAMPLES[i, index_beta], nrow = k),
        gamma = matrix(SAMPLES[i, index_gamma], ncol = k)))
  if (nrow(SAMPLES) == 1)
    obj <- obj[[1]]
  obj
}
## Use a small subset of the AP data to keep the JAGS run feasible.
AP_small <- AssociatedPress
AP_small <- AP_small[row_sums(AP_small) > 500,]
AP_small <- AP_small[, col_sums(AP_small) > 10]
dim(AP_small)
system.time({
  lda <- LDA(AP_small, k = 5, method = "Gibbs",
             control = list(burnin = 0, iter = 20, seed = 2010))
})
system.time({
  lda_rjags <- LDA(AP_small, k = 5, method = LDA_rjags.fit,
                   control = list(burnin = 0, iter = 20, seed = 2010))
})
terms(lda_rjags, 4)
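## A minimal sketch (toy matrix, not part of the original analysis) of the
## rep(x$i, x$v) / rep(x$j, x$v) idiom used in LDA_rjags.fit() above: it
## unrolls a sparse document-term matrix into one entry per token, the
## format the BUGS model expects for D and W.
library("slam")
toy_dtm <- simple_triplet_matrix(i = c(1, 1, 2), j = c(1, 3, 2),
                                 v = c(2, 1, 3), nrow = 2, ncol = 3)
rep(toy_dtm$i, toy_dtm$v)  # document index per token: 1 1 1 2 2 2
rep(toy_dtm$j, toy_dtm$v)  # term index per token:     1 1 3 2 2 2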
###################################################
### Section: R code for the Associated Press analysis
###################################################

###################################################
### Subsection: Simulation using 10-fold cross-validation
###################################################
## set.seed(0908)
## topics <- 10 * c(1:5, 10, 20)
## SEED <- 20080809
## library("topicmodels")
## data("AssociatedPress", package = "topicmodels")
## D <- nrow(AssociatedPress)
## folding <-
##   sample(rep(seq_len(10), ceiling(D/10))[seq_len(D)])
## for (k in topics) {
##   for (chain in seq_len(10)) {
##     FILE <- paste("VEM_", k, "_", chain, ".rda", sep = "")
##     training <- LDA(AssociatedPress[folding != chain,], k = k,
##       control = list(seed = SEED))
##     testing <- LDA(AssociatedPress[folding == chain,], model = training,
##       control = list(estimate.beta = FALSE, seed = SEED))
##     save(training, testing, file = file.path("results", FILE))
##     FILE <- paste("VEM_fixed_", k, "_", chain, ".rda", sep = "")
##     training <- LDA(AssociatedPress[folding != chain,], k = k,
##       control = list(seed = SEED, estimate.alpha = FALSE))
##     testing <- LDA(AssociatedPress[folding == chain,], model = training,
##       control = list(estimate.beta = FALSE, seed = SEED))
##     save(training, testing, file = file.path("results", FILE))
##     FILE <- paste("Gibbs_", k, "_", chain, ".rda", sep = "")
##     training <- LDA(AssociatedPress[folding != chain,], k = k,
##       control = list(seed = SEED, burnin = 1000, thin = 100,
##         iter = 1000, best = FALSE), method = "Gibbs")
##     best_training <- training@fitted[[which.max(logLik(training))]]
##     testing <- LDA(AssociatedPress[folding == chain,],
##       model = best_training,
##       control = list(estimate.beta = FALSE, seed = SEED,
##         burnin = 1000, thin = 100, iter = 1000, best = FALSE))
##     save(training, testing, file = file.path("results", FILE))
##   }
## }

###################################################
### Subsection: Summarizing the cross-validation simulation results
###################################################
## set.seed(0908)
## topics <- 10 * c(1:5, 10, 20)
## library("topicmodels")
## data("AssociatedPress", package = "topicmodels")
## D <- nrow(AssociatedPress)
## folding <-
##   sample(rep(seq_len(10), ceiling(D/10))[seq_len(D)])
## AP_test <- AP_alpha <- list()
## for (method in c("VEM", "VEM_fixed", "Gibbs")) {
##   AP_alpha[[method]] <- AP_test[[method]] <-
##     matrix(NA, nrow = length(topics), ncol = 10,
##            dimnames = list(topics, seq_len(10)))
##   for (fold in seq_len(10)) {
##     for (i in seq_along(topics)) {
##       T <- topics[i]
##       FILE <- paste(method, "_", T, "_", fold, ".rda", sep = "")
##       load(file.path("results", FILE))
##       AP_alpha[[method]][paste(T),fold] <-
##         if (is(training, "Gibbs_list")) training@fitted[[1]]@alpha
##         else training@alpha
##       AP_test[[method]][paste(T),fold] <-
##         perplexity(testing, AssociatedPress[folding == fold,],
##                    use_theta = FALSE)
##     }
##   }
## }
## save(AP_alpha, AP_test, file = "AP.rda")

###################################################
### Subsection: Fitting the LDA model to the complete data set using
### 40 topics
###################################################
## library("topicmodels")
## data("AssociatedPress", package = "topicmodels")
## topics <- 10 * c(1:5, 10, 20)
## k <- 40
## load("AP.rda")
## alpha <- mean(AP_alpha[["VEM"]][which(topics == k),])
## rm(AP_alpha, AP_test)
## SEED <- 20080806
## AP <-
##   list(VEM = LDA(AssociatedPress, k = k,
##          control = list(alpha = alpha, seed = SEED)),
##        VEM_fixed = LDA(AssociatedPress, k = k,
##          control = list(alpha = alpha, estimate.alpha = FALSE,
##                         seed = SEED)),
##        Gibbs = LDA(AssociatedPress, k = k, method = "Gibbs",
##          control = list(alpha = alpha, seed = SEED, burnin = 1000,
##                         thin = 100, iter = 1000)))
## save(AP, file = "AP-40.rda")
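## Sketch (assumes the "AP-40.rda" file produced above): the saved fits
## can be reloaded and inspected with the standard accessor functions, e.g.
## load("AP-40.rda")
## sapply(AP, slot, "alpha")
## terms(AP[["VEM"]], 5)[, 1:5]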