###################################################
### Preparations
###################################################
## Replication script for mixtures of von Mises-Fisher (vMF) distributions
## fitted with the movMF package; presumably the supplementary code of a
## journal article (figures/tables are referenced below) -- TODO confirm.
## NOTE(review): this script relies on fixed seeds and on-disk caches
## ("cluto.rda", "artificial.rda") for reproducibility; do not reorder
## statements.
library("slam")
library("lattice")
library("latticeExtra")
library("movMF")
library("reshape")
library("tm")
library("clue")
library("skmeans")

## Use the grey-scale-friendly pdf theme for all lattice plots and an
## HCL-based heat palette for base graphics.
ltheme <- canonical.theme("pdf", FALSE)
lattice.options(default.theme = ltheme)
palette(colorspace::heat_hcl(4))

###################################################
### Household expenses
###################################################
### Helper functions
###################################################

## rotationMatrix: build a 3x3 rotation matrix from a unit direction vector
## mu (length 3).  Angles are derived so that (together with getIsolines
## below) a circle around the pole is mapped to a circle around mu --
## NOTE(review): exact orientation convention not verified here, confirm
## against the accompanying text.
rotationMatrix <- function(mu) {
  beta <- asin(mu[1])
  ## The sign correction term handles mu[3] < 0, where atan alone would
  ## land in the wrong branch.
  alpha <- atan( - mu[2] / mu[3]) + sign(mu[2] * mu[3]) * (mu[3] < 0) * pi
  cosa <- cos(alpha); cosb <- cos(beta)
  sina <- sin(alpha); sinb <- sin(beta)
  ## Filled column-major, nrow = 3.
  matrix(c(cosb, sina * sinb, - cosa * sinb,
           0, cosa, sina,
           sinb, - sina * cosb, cosa * cosb), nrow = 3)
}

## getIsolines: points on the circle of constant "height" d (i.e. constant
## inner product with the pole) on the unit sphere, rotated towards mean
## direction mu.  Returns a length x 3 matrix of 3D coordinates.
getIsolines <- function(d, mu, length = 100) {
  theta <- seq(0, 2*pi, length = length)
  sinphi <- sin(acos(d))               # radius of the circle at height d
  tcrossprod(cbind(cos(theta) * sinphi, sin(theta) * sinphi, d),
             rotationMatrix(mu))
}

## plotGlobe: draw data points x (rows normalized to the unit sphere) on a
## perspective "globe", colored by class.  If pars (a matrix of vMF
## parameter vectors theta = kappa * mu, one row per component) is given,
## additionally draw the mean directions and simulated iso-density lines
## covering the probability masses in Q.
##   theta, phi ... viewing angles passed to persp()
##   n          ... number of rmovMF() draws used to estimate the quantiles
##   ...        ... further arguments passed on to persp()
## Returns the perspective transformation matrix invisibly.
plotGlobe <- function(x, class, pars = NULL, main = "", theta = 0, phi = 0,
                      Q = c(0.5, 0.95), n = 100000,
                      xlim = c(-1, 1), ylim = c(-1, 1), zlim = c(-1, 1), ...) {
  ## Empty persp() call just to set up the 3D coordinate system.
  pmat <- persp(0:1, 0:1, matrix(, 2, 2), theta = theta, phi = phi,
                xlim = xlim, ylim = ylim, zlim = zlim, scale = FALSE,
                xlab="x", ylab="y", zlab="z", main = main, ...)
  ## Project 3D coordinates into the 2D plotting plane via pmat.
  trans3d <- function(x, y, z) {
    tr <- cbind(x, y, z, 1) %*% pmat
    list(x = tr[, 1]/tr[, 4], y = tr[, 2]/tr[, 4])
  }
  x <- x / row_norms(x)                # project data onto the unit sphere
  x3D <- trans3d(x[,1], x[,2], x[,3])
  ## NOTE: theta, phi, x are reused below as the wireframe grid of the
  ## (upper half) sphere, shadowing the arguments / data above.
  theta <- seq(0, 2*pi, length = 41)
  phi <- seq(0, pi/2, length = 21)
  x <- cos(theta) %o% sin(phi)
  y <- sin(theta) %o% sin(phi)
  z <- rep(1, length(theta)) %o% cos(phi)
  ## Draw the wireframe one facet at a time.
  for (j in seq(phi)[-1])
    for (i in seq(theta)[-1]) {
      idx <- rbind(c(i-1,j-1), c(i,j-1), c(i,j), c(i-1,j))
      polygon(trans3d(x[idx], y[idx], z[idx]), border = "grey")
    }
  points(x3D$x, x3D$y, col = class, pch = 20)
  if (!is.null(pars)) {
    kappa <- row_norms(pars)           # concentration parameters
    ## Simulate the inner product with the mean direction for each kappa
    ## (pole used as reference direction) to get empirical quantiles.
    d <- sapply(kappa, function(K) rmovMF(n, K * c(1, 0, 0))[,1])
    mu <- pars / kappa                 # unit mean directions
    for (i in seq_along(Q)) {
      M <- apply(d, 2, quantile, 1-Q[i])
      isos <- lapply(seq_len(nrow(mu)), function(i) getIsolines(M[i], mu[i,]))
      isostrans <- lapply(isos, function(x) trans3d(x[,1], x[,2], x[,3]))
      ## One isoline per component (border = component color, lty = Q level).
      lapply(seq_along(isostrans),
             function(j) polygon(isostrans[[j]]$x, isostrans[[j]]$y,
                                 border = j, lwd = 2, lty = i))
    }
    mu3D <- trans3d(mu[,1], mu[,2], mu[,3])
    points(mu3D$x, mu3D$y, pch = 4, lwd = 2, col = seq_len(nrow(mu)))
  }
  invisible(pmat)
}

###################################################
## Household expenses data (HSAUR2): columns 1, 2 and 4 of the expenses,
## clustered on the sphere after normalization.
data("household", package = "HSAUR2")
x <- as.matrix(household[, c(1:2, 4)])
gender <- household$gender
## Reference fit: one vMF per known gender group.
theta <- rbind(female = movMF(x[gender == "female",], k = 1)$theta,
               male = movMF(x[gender == "male",], k = 1)$theta)
set.seed(2008)
## Mixtures with K = 1, ..., 5 components, best of 20 EM runs each.
vMFs <- lapply(1:5, function(K)
  movMF(x, k = K, control= list(nruns = 20)))
sapply(vMFs, BIC)
kappa <- row_norms(theta)
## Cross-tabulate fitted cluster assignments against gender;
## mc2 = misclassification count for K = 2 (whichever label matching is
## better).
tab2 <- table(max.col(vMFs[[2]]$P), household$gender)
mc2 <- min(c(sum(diag(tab2)), sum(tab2) - sum(diag(tab2))))
tab3 <- table(max.col(vMFs[[3]]$P), household$gender)

## Figure 1
par(mar = 0.1 + c(0, 0.5, 2, 0), mfrow = c(2, 2))
plotGlobe(x, household$gender, main = "Data",
          theta = -30, phi = 10, zlim = c(0, 1))
plotGlobe(x, household$gender, theta, main = "Known group membership",
          theta = -30, phi = 10, zlim = c(0, 1))
## Reorder components of each fit by the first coordinate of the mean
## direction so colors/labels are stable across K.
mu <- lapply(vMFs, function(x) x$theta /
             row_norms(x$theta))
ordering <- lapply(mu, function(x) order(x[,1], decreasing = TRUE))
class <- alpha <- kappa <- theta <- vector("list", length(ordering))
for (i in seq_along(ordering)) {
  alpha[[i]] <- vMFs[[i]]$alpha[ordering[[i]]]
  mu[[i]] <- mu[[i]][ordering[[i]],,drop=FALSE]
  theta[[i]] <- vMFs[[i]]$theta[ordering[[i]],,drop=FALSE]
  kappa[[i]] <- row_norms(vMFs[[i]]$theta[ordering[[i]],,drop=FALSE])
  class[[i]] <- max.col(vMFs[[i]]$P[,ordering[[i]]])
}
for (k in 2:3) {
  plotGlobe(x, class[[k]], theta[[k]],
            main = paste("Mixtures of vMFs with K =", k),
            theta = -30, phi = 10, zlim = c(0, 1))
}

###################################################
### Comparison to spherical k-means
###################################################
### Helper functions
###################################################

## entropy: weighted average entropy of the class distribution within each
## cluster, normalized by log(number of classes); 0 = pure clusters.
entropy <- function(cluster, class) {
  tab <- table(class, cluster)
  tab2 <- colSums(tab)                 # cluster sizes
  E_S <- apply(tab, 2, function(x)
    sum(x / sum(x) * ifelse(x > 0, log(x / sum(x)), 0)) *
      (-1 / log(length(x))))
  sum(E_S * tab2 / sum(tab2))
}

## purity: weighted average of the majority-class share per cluster;
## 1 = perfectly pure clusters.
purity <- function(cluster, class) {
  tab <- table(class, cluster)
  tab2 <- colSums(tab)
  sum(apply(tab, 2, function(x) max(x) / sum(x)) * tab2 / sum(tab2))
}

## nmi: normalized mutual information between cluster and class, via clue.
nmi <- function(cluster, class) {
  cl_agreement(as.cl_partition(cluster), as.cl_partition(class),
               method = "NMI")
}

###################################################
### Using text mining corpora with known classifications
###################################################
## Benchmark document-term matrices with known categories, distributed as
## data package from the WU data repository.
if (!require("tm.dtm.CLUTO", quietly = TRUE)) {
  install.packages("tm.dtm.CLUTO", repos = "http://datacube.wu.ac.at/",
                   type = "source")
}
library("tm.dtm.CLUTO")
files <- list.files(file.path(path.package("tm.dtm.CLUTO"), "data"))
files <- files[grep("\\.rda", files)]
## Drop the largest corpora -- presumably for runtime reasons, confirm.
omit <- paste(c("cacmcisi", "cranmed", "la1", "la2"), "rda", sep = ".")
files <- files[!files %in% omit]
LINE <- vector("character", length(files))

## Table 1: LaTeX table rows with corpus name, #docs, #terms, #classes.
for (i in seq_along(files)) {
  FILE <- files[i]
  DATASET <- gsub("\\.rda", "", FILE)
  data(list = DATASET, package = "tm.dtm.CLUTO")
  x <- get(DATASET)
  K <- length(unique(attr(x, "Category")))
  x <- x[,col_sums(x) > 1]             # drop (near-)empty term columns
  LINE[i] <- paste(paste(DATASET, nrow(x), ncol(x), K, sep = "&"),
                   "\\\\ \n", sep = "")
}

###################################################
## Full benchmark: skmeans (CLUTO + pclust), movMF with free kappa, and
## movMF with common kappa, each on raw TF and TF-IDF weighted matrices,
## each both with 20 random restarts and started from the true classes.
## Results are cached in cluto.rda since this loop is expensive.
if ("cluto.rda" %in% list.files()) {
  load("cluto.rda")
} else {
  criteria <- criteria2 <- list()
  for (FILE in files) {
    DATASET <- gsub(".rda", "", FILE, fixed = TRUE)
    cat(DATASET, "\n")
    data(list = DATASET, package = "tm.dtm.CLUTO")
    x <- get(DATASET)
    K <- length(unique(attr(x, "Category")))
    class <- as.integer(factor(attr(x, "Category")))
    x <- x[, slam::col_sums(x) > 1]
    kd <- slam::row_sums(x) > 0        # keep only non-empty documents
    x <- x[kd, ]
    class <- class[kd]
    x_tfidf <- weightTfIdf(x)
    ## Overwrite x / x_tfidf with a list of fitted models per method:
    ## element 1 = random restarts, element 2 = started from true classes.
    for (X in c("x", "x_tfidf")) {
      assign(X,
             list(skmeans =
                    list(skmeans(get(X), k = K,
                                 control = list(verbose = TRUE,
                                   control = "-ntrials=20 -seed=20111212"),
                                 method = "CLUTO"),
                         skmeans(get(X), k = K, method = "pclust",
                                 control = list(start = list(class)))),
                  movMF_general =
                    list({
                      set.seed(20111212)
                      movMF(get(X), k = K, nruns = 20)},
                         movMF(get(X), start = list(class))),
                  movMF_common =
                    list({
                      set.seed(20111212)
                      movMF(get(X), k = K, nruns = 20,
                            kappa = list(common = TRUE))},
                         movMF(get(X), start = list(class),
                               kappa = list(common = TRUE)))))
    }
    ## External validation criteria for every fitted model ...
    criteria[[DATASET]] <-
      sapply(c("entropy", "purity", "nmi"), function(criterion) {
        lapply(c("x", "x_tfidf"), function(X) {
          sapply(get(X), sapply,
                 function(z) get(criterion)(cl_predict(z), class))})},
             simplify = FALSE)
    ## ... and likelihood-based criteria for the movMF fits only (the
    ## skmeans element is skipped via [2:3]).
    criteria2[[DATASET]] <-
      sapply(c("logLik", "BIC"), function(criterion) {
        lapply(c("x", "x_tfidf"), function(X) {
          sapply(get(X)[2:3], sapply,
                 function(z) get(criterion)(z))})},
             simplify = FALSE)
  }
  save(criteria, criteria2, file = "cluto.rda")
}

## Collect results for the fits started from the TRUE classes
## (element 2 of each method's list).
Entropy <- do.call("rbind",
                   lapply(lapply(criteria, "[[", "entropy"),
                          function(x) as.vector(sapply(x, "[", 2, TRUE))))
Purity <- do.call("rbind",
                  lapply(lapply(criteria, "[[", "purity"),
                         function(x) as.vector(sapply(x, "[", 2, TRUE))))
Nmi <- do.call("rbind",
               lapply(lapply(criteria, "[[", "nmi"),
                      function(x) as.vector(sapply(x, "[", 2, TRUE))))
colnames(Entropy) <- colnames(Purity) <- colnames(Nmi) <-
  rep(c("TF", "TF-IDF"), each = 3)
mentropy <- melt(Entropy); mpurity <- melt(Purity); mnmi <- melt(Nmi)
mentropy$X3 <- mpurity$X3 <- mnmi$X3 <-
  factor(rep(c("skmeans", "movMF\ngeneral", "movMF\ncommon"),
             each = nrow(Entropy)))
eval <- rbind(cbind(mentropy, Criterion = "Entropy"),
              cbind(mpurity, Criterion = "Purity"),
              cbind(mnmi, Criterion = "NMI"))
## Figure 2
print(bwplot(value ~ X3 | Criterion + X2, data = eval,
             ylim = c(-0.05, 1.05), pch = "|", ylab = "Criterion"))

## Same, for the fits with 20 RANDOM restarts (element 1).
Entropy <- do.call("rbind",
                   lapply(lapply(criteria, "[[", "entropy"),
                          function(x) as.vector(sapply(x, "[", 1, TRUE))))
Purity <- do.call("rbind",
                  lapply(lapply(criteria, "[[", "purity"),
                         function(x) as.vector(sapply(x, "[", 1, TRUE))))
Nmi <- do.call("rbind",
               lapply(lapply(criteria, "[[", "nmi"),
                      function(x) as.vector(sapply(x, "[", 1, TRUE))))
colnames(Entropy) <- colnames(Purity) <- colnames(Nmi) <-
  rep(c("TF", "TF-IDF"), each = 3)
mentropy <- melt(Entropy); mpurity <- melt(Purity); mnmi <- melt(Nmi)
mentropy$X3 <- mpurity$X3 <- mnmi$X3 <-
  factor(rep(c("skmeans", "movMF\ngeneral", "movMF\ncommon"),
             each = nrow(Entropy)))
eval <- rbind(cbind(mentropy, Criterion = "Entropy"),
              cbind(mpurity, Criterion = "Purity"),
              cbind(mnmi, Criterion = "NMI"))
## Figure 3
print(bwplot(value ~ X3 | Criterion + X2, data = eval,
             ylim = c(-0.05, 1.05), pch = "|", ylab = "Criterion"))

###################################################
### Using artificial data
###################################################
## Simulation: 3-component vMF mixtures with equal/unequal prior weights
## and equal/unequal concentrations.  Cached in artificial.rda.
pis <- list(equal = prop.table(rep(1, 3)),
            unequal = prop.table(c(1, 9, 10)))
kappas <- list(equal = rep(200, 3),
               unequal = c(10, 200, 100))
if ("artificial.rda" %in% list.files()) {
  load("artificial.rda")
} else {
  N <- 100                             # replications per scenario
  ## Sparse 300-dim mean directions: each component has 10 private
  ## dimensions plus a shared block.
  mu <- matrix(0, nrow = 3, ncol = 300)
  for (h in 1:3) {
    mu[h, 100 * (h-1) + 1:10] <- 1
    mu[, 100 * (h-1) + 90:100] <- 1
  }
  mu <- skmeans:::row_normalize(mu)
  ## Result containers: [[pi scenario]][[kappa scenario]] -> 3 x N matrix
  ## (rows = methods, columns = replications).
  Nmi <- Entropy <- Purity <-
    rep(list(rep(list(matrix(nrow = 3, ncol = N)), length(kappas))),
        length(pis))
  for (i in seq_along(pis)) {
    for (j in seq_along(kappas)) {
      theta <- sweep(mu, 1, kappas[[j]], "*")
      set.seed(2012)
      for (n in seq_len(N)) {
        x <- rmovMF(400, theta, pis[[i]])
        CLASS <- attr(x, "z")          # true component labels
        m1 <- movMF(x, k = 3, nruns = 20)
        m2 <- movMF(x, k = 3, nruns = 20, kappa = list(common = TRUE))
        m3 <- skmeans(x, k = 3, method = "CLUTO",
                      control = list(control = "-ntrials=20 -seed=20111212",
                        verbose = FALSE))
        P <- list(cl_predict(m1), cl_predict(m2), cl_predict(m3))
        Entropy[[i]][[j]][,n] <- sapply(P, entropy, CLASS)
        Purity[[i]][[j]][,n] <- sapply(P, purity, CLASS)
        Nmi[[i]][[j]][,n] <- sapply(P, nmi, CLASS)
      }
    }
  }
  save(Entropy, Nmi, Purity, file = "artificial.rda")
}
names(Entropy) <- c("Equal size", "Unequal size")
for (i in seq_along(Entropy))
  names(Entropy[[i]]) <- c("Common", "General")
ent <- melt(Entropy)
## Reorder methods for plotting: common-kappa movMF, general movMF, skmeans.
ent$X1 <- factor(ent$X1, c(2, 1, 3),
                 c("movMF\ncommon", "movMF\ngeneral", "skmeans"))
ent$L2 <- factor(ent$L2, c("General", "Common"))
bw <- bwplot(value ~ X1 | L1 + L2, data = ent, pch = "|",
             ylab = "Entropy", scales = list(y = "free", rot = 0),
             ## Custom strip so the kappa-scenario panels get math labels.
             strip = function(...) {
               args <- list(...)
               if (args$which.given == 2)
                 args$factor.levels <-
                   expression(paste("Free ", kappa), paste("Common ", kappa))
               do.call("strip.default", args)
             })
## Figure 4
print(combineLimits(bw))

###################################################
### Application: useR! 2008 abstracts
###################################################
if (!require("corpus.useR.2008.abstracts", quietly = TRUE)) {
  install.packages("corpus.useR.2008.abstracts",
                   repos = "http://datacube.wu.ac.at/", type = "source")
}
data("useR_2008_abstracts", package = "corpus.useR.2008.abstracts")
library("tm")
## Combine title and abstract into one document per presentation.
abstracts_titles <-
  apply(useR_2008_abstracts[,c("Title", "Abstract")], 1,
        paste, collapse = " ")
useR_2008_abstracts_corpus <- Corpus(VectorSource(abstracts_titles))
useR_2008_abstracts_DTM <-
  DocumentTermMatrix(useR_2008_abstracts_corpus,
                     control = list(
                       tokenize = "MC", stopwords = TRUE, stemming = TRUE,
                       wordLengths = c(3, Inf)))
library("slam")
ColSums <- col_sums(useR_2008_abstracts_DTM > 0)   # document frequencies
sort(ColSums, decreasing = TRUE)[1:10]
## Keep medium-frequency terms only (in >= 5 and <= 90 documents).
useR_2008_abstracts_DTM <-
  useR_2008_abstracts_DTM[, ColSums >= 5 & ColSums <= 90]
useR_2008_abstracts_DTM
useR_2008_abstracts_DTM <- weightTfIdf(useR_2008_abstracts_DTM)
set.seed(2008)
library("movMF")
## 10-fold cross-validated predictive log-likelihoods over a grid of K.
Ks <- c(1:5, 10, 20)
splits <- sample(rep(1:10, length.out = nrow(useR_2008_abstracts_DTM)))
useR_2008_movMF <-
  lapply(Ks, function(k)
    sapply(1:10, function(s) {
      m <- movMF(useR_2008_abstracts_DTM[splits != s,], k = k, nruns = 20)
      logLik(m, useR_2008_abstracts_DTM[splits == s,])
    }))
useR_2008_movMF_common <-
  lapply(Ks, function(k)
    sapply(1:10, function(s) {
      m <- movMF(useR_2008_abstracts_DTM[splits != s,], k = k, nruns = 20,
                 kappa = list(common = TRUE))
      logLik(m, useR_2008_abstracts_DTM[splits == s,])
    }))
logLiks <- data.frame(logLik = c(unlist(useR_2008_movMF),
                                 unlist(useR_2008_movMF_common)),
                      K = c(rep(Ks, sapply(useR_2008_movMF, length)),
                            rep(Ks, sapply(useR_2008_movMF_common, length))),
                      Dataset = seq_len(length(useR_2008_movMF[[1]])),
                      Method = factor(rep(1:2,
                                          each =
                                            length(unlist(useR_2008_movMF))),
                                      1:2, c("free", "common")))
## Center per fold so curves are comparable across folds.
logLiks$logLik <- logLiks$logLik -
  rep(rep(with(logLiks, tapply(logLik, Dataset, mean)), length(Ks)), 2)
print(xyplot(logLik ~ K | Method, data = logLiks, groups = Dataset,
             type = "l", lty = 1,
             xlab = "Number of components",
             ylab = "Predictive log-likelihood",
             strip = strip.custom(factor.levels =
               expression(paste("Free ", kappa), paste("Common ", kappa)))))

## Final model: K = 2 with common kappa; inspect the 10 highest-weighted
## stemmed terms per component.
set.seed(2008)
best_model <- movMF(useR_2008_abstracts_DTM, k = 2, nruns = 20,
                    kappa = list(common = TRUE))
apply(coef(best_model)$theta, 1, function(x)
  colnames(coef(best_model)$theta)[order(x, decreasing = TRUE)[1:10]])
clustering <- predict(best_model)
## Split the comma-separated keyword field; keep only the part before a
## hyphen for each keyword.
keywords <- useR_2008_abstracts[,"Keywords"]
keywords <- sapply(keywords,
                   function(x) sapply(strsplit(x, ", ")[[1]],
                                      function(y) strsplit(y, "-")[[1]][1]))
tab <- table(Keyword = unlist(keywords),
             Component = rep(clustering, sapply(keywords, length)))
tab <- tab[rowSums(tab) > 8,]          # keywords occurring > 8 times
tab
library("vcd")
mosaic(tab, shade = TRUE,
       labeling_args = list(rot_labels = 0,
                            just_labels = c("center", "right"),
                            pos_varnames = c("left", "right"),
                            rot_varnames = 0))