## Analyzing and Visualizing State Sequences in R with TraMineR
## R example code from the paper
###################################
### Section 2: The TraMineR package
###################################
## Load the library, retrieve the mvad data and create a state sequence object from the
## status variables (columns 17 to 86)
library("TraMineR")
data("mvad")
mvad.seq <- seqdef(mvad, 17:86, xtstep=6)
## compute pairwise optimal matching (OM) distances
mvad.om <- seqdist(mvad.seq, method = "OM", indel = 1, sm = "TRATE")
## agglomerative hierarchical clustering
library("cluster")
clusterward <- agnes(mvad.om, diss=TRUE, method="ward")
mvad.cl4 <- cutree(clusterward, k=4)
cl4.lab <- factor(mvad.cl4, labels = paste("Cluster",1:4))
## visualize the cluster patterns by plotting their transversal state distributions
seqdplot(mvad.seq, group=cl4.lab, border=NA)
## compute the longitudinal entropy and regress it on covariates
entropies <- seqient(mvad.seq)
lm.ent <- lm(entropies ~ male + funemp + gcse5eq, mvad)
###################################
## Section 4: State sequence object
###################################
## mvad-extract
mvad[1:2, 17:22]
## create 'mvad.seq' state sequence object
mvad.lab <- c("Employment", "Further education", "Higher education", "Joblessness", "School", "Training")
mvad.scode <- c("EM", "FE", "HE", "JL", "SC", "TR")
mvad.seq <- seqdef(mvad, 17:86, states = mvad.scode,
labels = mvad.lab, xtstep=6)
## print first sequences in SPS format
print(mvad.seq[1:5, ], format="SPS")
## weighted sequence object
mvad.seq <- seqdef(mvad, 17:86, states = mvad.scode,
labels = mvad.lab, weights=mvad$weight, xtstep=6)
####################################################
## Section 5: Visualizing individual state sequences
####################################################
## Weighted sequence index plot (Fig. 2)
seqiplot(mvad.seq, border=NA, withlegend="right")
## compute LCS pairwise distance matrix and MDS coordinates
mvad.lcs <- seqdist(mvad.seq, method="LCS")
mds <- cmdscale(mvad.lcs, k=1)
## compute distances to the most frequent sequence
dref <- seqdist(mvad.seq, ref=0, method="LCS")
## sorted full sequence index plots (Figure 3)
## stored in a '.png' file
png(file=paste(graphdir,"png_mvad_seqIplot-sorted.png",sep=""), unit="px", width=1600, height=700, pointsize=30)
par(mfrow=c(1,3))
seqIplot(mvad.seq, title="unsorted", withlegend=FALSE)
seqIplot(mvad.seq, sortv=dref, title="by distance to most frequent", withlegend=FALSE)
seqIplot(mvad.seq, sortv=mds, title="by 1st MDS factor", withlegend=FALSE)
dev.off()
## most frequent sequences
seqtab(mvad.seq, tlim=1:4)
## sequence frequency plots (Figure 4)
par(mfrow=c(1,2))
seqfplot(mvad.seq, border=NA, withlegend=FALSE,
title="Weighted frequencies")
seqfplot(mvad.seq, weighted=FALSE, border=NA, withlegend=FALSE,
title="Unweighted frequencies")
#######################################################################
## Section 6: Computing and plotting overall and transversal statistics
#######################################################################
## mean time spent in each state (Figure 5)
seqmtplot(mvad.seq, group=mvad$funemp, ylim=c(0,30))
## mean time values
by(mvad.seq, mvad$funemp, seqmeant)
## matrix of transition rates
mvad.trate <- seqtrate(mvad.seq)
round(mvad.trate,2)
## transversal state distribution
seqstatd(mvad.seq[,1:8])
## state distribution plot (Figure 6)
seqdplot(mvad.seq, group=mvad$gcse5eq, border=NA)
## sequence of modal states (Figure 7)
seqmsplot(mvad.seq, group=mvad$gcse5eq, border=NA)
## transversal entropy of state distributions
seqHtplot(mvad.seq, group=mvad$gcse5eq)
#################################################
## Section 7: Individual sequence characteristics
#################################################
## example sequences presented in Figure 9
e1m1 <- c("A","A","A","A","A","A","A","A","A","A","A","A")
## 2 states
e2m1 <- c("A","A","A","A","A","A","B","B","B","B","B","B")
e2m2 <- c("A","B","A","B","A","B","A","B","A","B","A","B")
e2m3 <- c("A","A","A","A","A","A","A","A","A","A","A","B")
e2m4 <- c("A","B","B","B","B","B","B","B","B","B","B","A")
e2m5 <- c("A","A","A","A","B","B","B","B","A","A","A","A")
e2m7 <- c("A","A","B","B","A","A","B","B","A","A","B","B")
e2m8 <- c("A","B","B","A","A","B","B","A","A","B","B","A")
e2m9 <- c("A","B","A","A","A","A","A","A","A","A","B","A")
e2m10 <- c("A","B","A","B","A","B","A","B","A","A","A","A")
## 3 states
e3m1 <- c("A","A","A","A","B","B","B","B","C","C","C","C")
e3m2 <- c("A","B","C","A","B","C","A","B","C","A","B","C")
e3m3 <- c("A","A","A","A","A","A","A","A","A","A","B","C")
e3m4 <- c("A","B","B","B","B","B","C","C","C","C","C","A")
e3m5 <- c("A","A","A","A","A","B","C","A","A","A","A","A")
e3m6 <- c("A","B","C","B","A","C","A","C","B","C","B","A")
e3m7 <- c("A","A","B","B","C","C","A","A","B","B","C","C")
e3m8 <- c("A","B","C","C","B","A","A","B","C","C","B","A")
e3m9 <- c("A","B","C","A","A","A","A","A","A","C","B","A")
e3m10 <- c("A","B","C","A","B","C","A","B","A","A","A","A")
## 4 states
e4m1 <- c("A","A","A","B","B","B","C","C","C","D","D","D")
e4m2 <- c("A","B","C","D","A","B","C","D","A","B","C","D")
e4m3 <- c("A","A","A","A","A","A","A","A","A","B","C","D")
e4m4 <- c("A","B","B","B","C","C","C","C","D","D","D","A")
e4m5 <- c("A","A","A","A","B","C","D","A","A","A","A","A")
e4m6 <- c("A","B","C","D","B","A","C","D","A","C","B","D")
e4m7 <- c("A","A","B","B","C","C","D","D","A","A","B","B")
e4m8 <- c("A","B","C","D","D","C","B","A","A","B","C","D")
e4m9 <- c("A","B","C","D","A","A","A","A","D","C","B","A")
e4m10 <- c("A","B","C","D","A","B","C","D","A","A","A","A")
## binding individual sequences
ex.comp <- rbind(e1m1,
e2m3,e2m1,e2m5,e2m4,e2m9,e2m10,e2m2,
e3m3,e3m1,e3m5,e3m4,e3m9,e3m10,e3m6,e3m2,
e4m3,e4m1,e4m5,e4m4,e4m9,e4m10,e4m6,e4m2)
## we keep sequence names to identify them
seqnames <- rownames(ex.comp)
## select example sequences presented in Figure 9
ex.comp <- ex.comp[c(1,2,3,4,8,17,18,19,24),]
## create state sequence object
ex.comp <- seqdef(ex.comp, id="auto")
## example sequences and normalized values of complexity measures (Figure 9)
par(mfrow=c(1,2))
seqiplot(ex.comp, border=NA, withlegend=FALSE, tlim=0, title="Example sequences")
ex.comp.indic <- cbind(seqtransn(ex.comp, norm=TRUE), seqient(ex.comp), seqST(ex.comp)/max(seqST(ex.comp)), seqici(ex.comp))
barplot(t(ex.comp.indic),
col=c("red","blue","cyan", "yellow"), horiz=TRUE, beside=TRUE,
## xlim=c(0,0.4),
main="Longitudinal characteristics")
legend(x="bottomright", lwd=6,
legend=c("Transitions", "Entropy", "Turbulence", "Complexity"),
col=c("red","blue","cyan", "yellow")
)
## total time spent in each state
seqistatd(mvad.seq[1:4,])
#################################################
## Section 8: Measuring sequence (dis)similarity
#################################################
## compute distances to the most frequent sequence with various metrics (Figure 10)
scost <- seqsubm(mvad.seq, method = "TRATE")
mvad.om.ref <- seqdist(mvad.seq, refseq=0, method="OM", sm=scost)
mvad.lcs.ref <- seqdist(mvad.seq, refseq=0, method="LCS")
mvad.lcp.ref <- seqdist(mvad.seq, refseq=0, method="LCP")
mvad.dhd.ref <- seqdist(mvad.seq, refseq=0, method="DHD")
mvad.ham.ref <- seqdist(mvad.seq, refseq=0, method="HAM")
## substitution cost matrix for the mvad data set
scost <- seqsubm(mvad.seq, method="TRATE")
round(scost,3)
## distances to the most frequent sequence obtained with various metrics (Figure 10)
mvad.metrics <- data.frame(LCP=mvad.lcp.ref, LCS=mvad.lcs.ref, OM=mvad.om.ref,
HAM=mvad.ham.ref, DHD=mvad.dhd.ref)
pairs(mvad.metrics, col="blue", pch=20, ps=0.1)
## pairwise distance matrix for the mvad data
mvad.om <- seqdist(mvad.seq, method="OM", indel=1, sm=scost)
## LCS versus OM distances
LCS.vs.OM <- abs(mvad.lcs.ref-mvad.om.ref)
## maximal theoretical OM distance for the mvad data
## with transition-rate-based substitution cost matrix
D.max <- 70*min(2, max(scost))
###################################################
## Section 9: Dissimilarity based sequence analysis
###################################################
## representative sequences: medoid
medoid <- seqrep(mvad.seq, dist.matrix=mvad.om, criterion="dist", nrep=1)
print(medoid, format="SPS")
## representative sequences: neighborhood density criterion (Figure 11)
seqrplot(mvad.seq, group=mvad$gcse5eq, dist.matrix=mvad.om, border=NA)
## clustering sequences: dendrogram (Figure 12)
plot(clusterward, which.plots=2, labels=FALSE)
## representative sequence sequences by cluster (Figure 13)
seqrplot(mvad.seq, group=cl4.lab, dist.matrix=mvad.om, trep=.35, border=NA)
## logistic regression for cluster membership: cluster 4
mb4 <- (cl4.lab=="Cluster 4")
glm.cl4 <- glm(mb4 ~ male + funemp + gcse5eq, data=mvad)
## logistic regression for cluster membership: cluster 1-3
glm.cl1 <- glm((cl4.lab=="Cluster 1") ~ male + funemp + gcse5eq, data=mvad)
glm.cl2 <- glm((cl4.lab=="Cluster 2") ~ male + funemp + gcse5eq, data=mvad)
glm.cl3 <- glm((cl4.lab=="Cluster 3") ~ male + funemp + gcse5eq, data=mvad)
cl1.coeff <- as.data.frame(summary(glm.cl1)$coefficients)
cl2.coeff <- as.data.frame(summary(glm.cl2)$coefficients)
cl3.coeff <- as.data.frame(summary(glm.cl3)$coefficients)
cl4.coeff <- as.data.frame(summary(glm.cl4)$coefficients)
OR.table <- data.frame(Cluster1 = exp(cl1.coeff$Estimate), sig1 = cl1.coeff[,"Pr(>|t|)"],
Cluster2 = exp(cl2.coeff$Estimate), sig2 = cl2.coeff[,"Pr(>|t|)"],
Cluster3 = exp(cl3.coeff$Estimate), sig3 = cl3.coeff[,"Pr(>|t|)"],
Cluster4 = exp(cl4.coeff$Estimate), sig4 = cl4.coeff[,"Pr(>|t|)"])