## ----preparation-------------------------------------------------------------- #render_sweave() #knitr::opts_chunk$set(fig.width=6, fig.height=6, dev = 'png', dpi = 600) #options(prompt = "R> ", continue = "+ ", width = 70, useFancyQuotes = FALSE) library("rgdal") library("spdep") library("rflexscan") library("RColorBrewer") # method for plotting semilog graph log10Tck <- function(side, type){ lim <- switch(side, x = par('usr')[1:2], y = par('usr')[3:4], stop("side argument must be 'x' or 'y'")) at <- floor(lim[1]) : ceiling(lim[2]) return(switch(type, minor = outer(1:9, 10^(min(at):max(at))), major = 10^at, stop("type argument must be 'major' or 'minor'") )) } # for replication set.seed(12345) ## ----load NYS cancer data----------------------------------------------------- library("rgdal") # Download data temp <- tempfile() download.file("https://www.satscan.org/datasets/nyscancer/NYS_Cancer.zip", temp) unzip(temp, exdir = "NYS_Cancer") nys <- readOGR("NYS_Cancer/NYSCancer_region.shp", stringsAsFactors = FALSE) ## ----figure 1----------------------------------------------------------------- manhattan <- nys[grep("36061", nys$DOHREGION),] SIR <- manhattan$OBREAST / manhattan$EBREAST nsteps <- 7 brks <- c(0, 1.0, 1.2, 1.4, 1.6, 1.8, 2, 10) brks[1] <- 0 cols <- colorRampPalette(c("white","royal blue"))(nsteps) grps <- as.ordered(cut(SIR, brks, include.lowest = TRUE)) par(mar = c(1, 1, 1, 1), oma = c(1, 1, 1, 1)) plot(manhattan, col=cols[unclass(grps)], lwd = 0.1) box() legend("topleft", legend = c("> 2.00", "1.80 - 2.00", "1.60 - 1.80", "1.40 - 1.60", "1.20 - 1.40", "1.00 - 1.20", "< 1.00"), fill=rev(cols), bty="n") ## ----install rflexscan-------------------------------------------------------- # install.packages("rflexscan", dependencies = TRUE) library("rflexscan") ## ----figure 2(a)-------------------------------------------------------------- cols <- rep("white", nrow(manhattan)) cols[manhattan$DOHREGION %in% c("360610056001", "360610056002", "360610058001", "360610066001", "360610068001", "360610068005", "360610068006", "360610070001", "360610070002", "360610072001", "360610072002", "360610072003", "360610072004", "360610072005", "360610072006", "360610072007", "360610074001", "360610080003", "360610080005", "360610082002", "360610082003", "36061DOH0022", "36061DOH0023")] <- "#3333ff" par(mar = c(0.5, 0.5, 0.5, 0.5), oma = c(0.5, 0.5, 0.5, 0.5)) plot(manhattan, border = "black", col = cols, lwd=0.2, ylim = c(40.734989,40.73499)) ## ----figure 2(b)-------------------------------------------------------------- cols <- rep("white", nrow(manhattan)) cols[manhattan$DOHREGION %in% c("360610056001", "360610056002", "360610074001", "360610082002", "360610082003", "36061DOH0022")] <- "#3333ff" par(mar = c(0.5, 0.5, 0.5, 0.5), oma = c(0.5, 0.5, 0.5, 0.5)) plot(manhattan, border = "black", col = cols, lwd=0.2, ylim = c(40.734989,40.73499)) ## ----figure 2(c)-------------------------------------------------------------- cols <- rep("white", nrow(manhattan)) cols[manhattan$DOHREGION %in% c("360610056001", "360610072002", "360610072005", "360610072006", "360610058001", "360610068001", "360610068005", "360610068006", "36061DOH0022", "36061DOH0023")] <- "#3333ff" par(mar = c(0.5, 0.5, 0.5, 0.5), oma = c(0.5, 0.5, 0.5, 0.5)) plot(manhattan, border = "black", col = cols, lwd=0.2, ylim = c(40.734989,40.73499)) ## ----figure 2(d)-------------------------------------------------------------- cols <- rep("white", nrow(manhattan)) cols[manhattan$DOHREGION %in% c("360610056001", "360610072002", "360610072005", "360610068001", "360610068006", "36061DOH0022", "360610072004")] <- "#3333ff" par(mar = c(0.5, 0.5, 0.5, 0.5), oma = c(0.5, 0.5, 0.5, 0.5)) plot(manhattan, border = "black", col = cols, lwd=0.2, ylim = c(40.734989,40.73499)) ## ----print NYS data----------------------------------------------------------- head(nys@data[c("DOHREGION", "LATITUDE", "LONGITUDE", "OBREAST", "EBREAST")]) ## ----extract Manhattan data--------------------------------------------------- manhattan <- nys[startsWith(nys$DOHREGION, "36061"),] ## ----transformation----------------------------------------------------------- coord <- data.frame(x=manhattan$LONGITUDE, y=manhattan$LATITUDE) coordinates(coord) <- c("x", "y") proj4string(coord) <- proj4string(manhattan) coord <- spTransform(coord, CRS("+init=epsg:32618")) ## ----make a neighbors list---------------------------------------------------- library("spdep") nb <- poly2nb(manhattan, queen = T) print(nb) ## ----print head of the list--------------------------------------------------- head(nb) ## ----figure 3----------------------------------------------------------------- par(mar = c(0.5, 0.5, 0.5, 0.5), oma = c(0.5, 0.5, 0.5, 0.5)) plot(manhattan, border="darkgrey", col="white", lwd = 0.2) box() ## ----figure 4----------------------------------------------------------------- par(mar = c(0.5, 0.5, 0.5, 0.5), oma = c(0.5, 0.5, 0.5, 0.5)) plot(manhattan, border = "white", col = "white", lwd=0.2) plot(nb, cbind(manhattan$LONGITUDE, manhattan$LATITUDE), col="darkgrey", add=TRUE, lwd=0.2, cex=0.1) box() ## ----run rflexscan------------------------------------------------------------ fls <- rflexscan(name = manhattan$DOHREGION, x = coord$x, y = coord$y, nb = nb, observed = manhattan$OBREAST, expected = manhattan$EBREAST) ## ----print method for the rflexscan class------------------------------------- print(fls) ## ----print properties of the MLC---------------------------------------------- print(fls$cluster[[1]]) ## ----print properties of a secondary cluster---------------------------------- print(fls$cluster[[2]]) ## ----summary method for the rflexscan class----------------------------------- summary(fls) ## ----figure 5(a)-------------------------------------------------------------- library("RColorBrewer") par(mar = c(0.5, 0.5, 0.5, 0.5), oma = c(0.5, 0.5, 0.5, 0.5), lwd = 0.5, xaxt = 'n', yaxt = 'n') plot(fls, rank = 1:7, edge.width = 0.5, col = brewer.pal(7, "RdYlBu")) box(lwd = 1) legend("topleft", legend = 1:7, fill = brewer.pal(7, "RdYlBu"), bty="n") ## ----figure 5(b)-------------------------------------------------------------- par(mar = c(0.5, 0.5, 0.5, 0.5), oma = c(0.5, 0.5, 0.5, 0.5), xaxt = 'n', yaxt = 'n') choropleth(manhattan, fls, rank = 1:7, col = brewer.pal(7, "RdYlBu")) legend("topleft", legend = 1:7, fill = brewer.pal(7, "RdYlBu"), bty="n") ## ----plot method for the rflexscan class-------------------------------------- plot(fls, rank = 1:7, col = brewer.pal(7, "RdYlBu")) box() legend("topleft", legend = 1:7, fill = brewer.pal(7, "RdYlBu"), bty="n") ## ----draw choropleth map------------------------------------------------------ choropleth(manhattan, fls, rank = 1:7, col = brewer.pal(7, "RdYlBu")) legend("topleft", legend = 1:7, fill = brewer.pal(7, "RdYlBu"), bty="n") ## ----run rflexscan with restricted LR----------------------------------------- fls2 <- rflexscan(name = manhattan$DOHREGION, x = coord$x, y = coord$y, nb = nb, observed = manhattan$OBREAST, expected = manhattan$EBREAST, stattype = "RESTRICTED", ralpha = 0.2) summary(fls2) ## ----run rflexscan with restricted LR (large clusters)------------------------ system.time({ fls3 <- rflexscan(name = manhattan$DOHREGION, x = coord$x, y = coord$y, nb = nb, observed = manhattan$OBREAST, expected = manhattan$EBREAST, stattype = "RESTRICTED", ralpha = 0.2, clustersize = 50) }) ## ----print summary------------------------------------------------------------ summary(fls3) ## ----figure 6(a)-------------------------------------------------------------- par(mar = c(0.5, 0.5, 0.5, 0.5), oma = c(0.5, 0.5, 0.5, 0.5), xaxt = 'n', yaxt = 'n') choropleth(manhattan, fls2, rank = 1:4, col = brewer.pal(4, "RdYlBu")) legend("topleft", legend = 1:4, fill = brewer.pal(4, "RdYlBu"), bty="n") ## ----figure 6(b)-------------------------------------------------------------- par(mar = c(0.5, 0.5, 0.5, 0.5), oma = c(0.5, 0.5, 0.5, 0.5), xaxt = 'n', yaxt = 'n') choropleth(manhattan, fls3, rank = 1:4, col = brewer.pal(4, "RdYlBu")) legend("topleft", legend = 1:4, fill = brewer.pal(4, "RdYlBu"), bty="n") ## ----flexscan with restricted LR (large cluster; 9999 replications)----------- system.time({ fls4<- rflexscan(name = manhattan$DOHREGION, x = coord$x, y = coord$y, nb = nb, observed = manhattan$OBREAST, expected = manhattan$EBREAST, stattype = "RESTRICTED", ralpha = 0.2, clustersize = 50, simcount = 9999) }) ## ----print summary------------------------------------------------------------ summary(fls4) ## ----circular scan------------------------------------------------------------ fls5 <- rflexscan(name = manhattan$DOHREGION, lat = manhattan$LATITUDE, lon = manhattan$LONGITUDE, nb = nb, observed = manhattan$OBREAST, expected = manhattan$EBREAST, scanmethod = "CIRCULAR") summary(fls5) ## ----circular scan (large cluster)-------------------------------------------- fls6 <- rflexscan(name = manhattan$DOHREGION, lat = manhattan$LATITUDE, lon = manhattan$LONGITUDE, nb = nb, observed = manhattan$OBREAST, expected = manhattan$EBREAST, scanmethod = "CIRCULAR", clustersize = 50) summary(fls6) ## ----figure 7(a)-------------------------------------------------------------- par(mar = c(0.5, 0.5, 0.5, 0.5), oma = c(0.5, 0.5, 0.5, 0.5), xaxt = 'n', yaxt = 'n') choropleth(manhattan, fls5, rank = 1:5, col = brewer.pal(5, "RdYlBu")) legend("topleft", legend = 1:5, fill = brewer.pal(5, "RdYlBu"), bty="n") degAxis(1, at=c(-74.1, -74, -73.9)) degAxis(2, at=c(40.7, 40.8)) ## ----figure 7(b)-------------------------------------------------------------- par(mar = c(0.5, 0.5, 0.5, 0.5), oma = c(0.5, 0.5, 0.5, 0.5), xaxt = 'n', yaxt = 'n') choropleth(manhattan, fls6, rank = 1:5, col = brewer.pal(5, "RdYlBu")) legend("topleft", legend = 1:5, fill = brewer.pal(5, "RdYlBu"), bty="n") degAxis(1, at=c(-74.1, -74, -73.9)) degAxis(2, at=c(40.7, 40.8)) ## ----benchmark (varying the maximum cluster size "K")------------------------- library("smerc") #devtools::install_github("benjak/scanstatistics", ref = "develop") library("scanstatistics") # exclude Governors Island manhattan <- manhattan[-8,] # make neighbors list from shape nb <- poly2nb(manhattan, queen = T) # transform longlat to Cartesian coordinates (epsg:32618) coord <- data.frame(x=manhattan$LONGITUDE, y=manhattan$LATITUDE) coordinates(coord) <- c("x", "y") proj4string(coord) <- proj4string(manhattan) coord <- spTransform(coord, CRS("+init=epsg:32618")) # maximum cluster size considered k_max <- 50 k_max_rflexscan <- 20 k_max_smerc <- 12 k_max_scanstatistics <- 9 # ---- rflexscan ---- rflexscan.time <- NULL for (k in 2:k_max) { if (k <= k_max_rflexscan) { rflexscan.time <- c(rflexscan.time, system.time({ fls <- rflexscan(name = manhattan$DOHREGION, x = coord$x, y = coord$y, nb = nb, observed = manhattan$OBREAST, expected = manhattan$EBREAST, clustersize = k) })[3]) } else { rflexscan.time <- c(rflexscan.time, NA) } } # ---- rflexscan with restricted likelihood ratio ---- restricted.time <- NULL for (k in 2:k_max) { restricted.time <- c(restricted.time, system.time({ fls <- rflexscan(name = manhattan$DOHREGION, x = coord$x, y = coord$y, nb = nb, observed = manhattan$OBREAST, expected = manhattan$EBREAST, stattype = "RESTRICTED", clustersize = k) })[3]) } # ---- smerc package ---- # load population data for analysis using smerc nys_ses <- readOGR("NYS_Cancer/NYS_SES_Data_region.shp") manhattan_ses <- nys_ses[startsWith(as.character(nys_ses$DOHREGION), "36061"),] manhattan_ses <- manhattan_ses[-8,] # make adjacency matrix listw <- nb2listw(nb, style = "B", zero.policy = TRUE) w <- as(listw, "symmetricMatrix") # run main analysis smerc.time <- NULL for (k in 2:k_max) { if (k <= k_max_smerc) { smerc.time <- c(smerc.time, system.time({ fls <- flex.test(coord@coords, manhattan$OBREAST, pop=manhattan_ses$POP_TOT, ex=manhattan$EBREAST, w = as.matrix(w), k = k) })[3]) } else { smerc.time <- c(smerc.time, NA) } } # ---- scanstatistics package ---- # make adjacency matrix A <- nb2mat(nb, style = "B", zero.policy = TRUE) A <- matrix(as.logical(A), ncol = ncol(A), nrow = nrow(A)) # run main analysis scanstatistics.time <- NULL for (k in 2:k_max) { if (k <= k_max_scanstatistics) { scanstatistics.time <- c(scanstatistics.time, system.time({ knn <- coords_to_knn(coord@coords, k) zones <- flexible_zones(knn, A) fls <- scan_eb_poisson(manhattan$OBREAST, zones, baselines = manhattan$EBREAST, n_mcsim = 999) })[3]) } else { scanstatistics.time <- c(scanstatistics.time, NA) } } flexscan.bench <- data.frame(k = 2:k_max, rflexscan=rflexscan.time, restricted=restricted.time, smerc=smerc.time, scanstatistics=scanstatistics.time) ## ----figure 8----------------------------------------------------------------- xlim <- c(0,50) ylim <- c(1,1100) library("RColorBrewer") cols <- brewer.pal(4, "Set1") plot(flexscan.bench$k, flexscan.bench$rflexscan, xlim = xlim, ylim = ylim, axes = FALSE, xlab="", ylab="", col = cols[3], pch = 15, log = "y") par(new=T) plot(flexscan.bench$k, flexscan.bench$restricted, xlim = xlim, ylim = ylim, axes = FALSE, xlab="", ylab="", col = cols[4], pch = 15, log = "y") par(new=T) plot(flexscan.bench$k, flexscan.bench$scanstatistics, xlim = xlim, ylim = ylim, axes = FALSE, xlab="", ylab="", col = cols[1], pch = 15, log = "y") par(new=T) plot(flexscan.bench$k, flexscan.bench$smerc, xlim = xlim, ylim = ylim, axes = FALSE, xlab = "Cluster size (K)", ylab = "Time (sec)", col = cols[2], pch = 15, log = "y") legend("topright", pch = 15, legend = c("scanstatistics", "smerc", "rflexscan", "rflexscan (restricted)"), col = cols) axis(1, c(0, 10, 20, 30, 40, 50)) axis(2, c(1,10,100, 1000)) axis(2, at=log10Tck('y','minor')[,-1], tcl= 0.2, labels = NA) box() ## ----benchmark (varying the threshold parameter "alpha1")--------------------- K <- 50 ralpha <- seq(0.01, 0.5, 0.01) ralpha_max_rflexscan <- 0.341 ralpha_max_smerc <- 0.171 rflexscan.time <- NULL for (ra in ralpha) { if (ra <= ralpha_max_rflexscan) { rflexscan.time <- c(rflexscan.time, system.time({ fls <- rflexscan(name = manhattan$DOHREGION, x = coord$x, y = coord$y, nb = nb, observed = manhattan$OBREAST, expected = manhattan$EBREAST, stattype = "RESTRICTED", clustersize = K, ralpha = ra) })[3]) print(paste(ra, rflexscan.time[length(rflexscan.time)])) } else { rflexscan.time <- c(rflexscan.time, NA) } } smerc.time <- NULL for (ra in ralpha) { if (ra <= ralpha_max_smerc) { smerc.time <- c(smerc.time, system.time({ fls <- rflex.test(coord@coords, manhattan$OBREAST, pop=manhattan_ses$POP_TOT, ex=manhattan$EBREAST, w = as.matrix(w), k = K, alpha1 = ra) })[3]) } else { smerc.time <- c(smerc.time, NA) } } restricted.bench <- data.frame(ralpha = ralpha, restricted=rflexscan.time, smerc=smerc.time) ## ----figure 9----------------------------------------------------------------- xlim <- c(0,0.5) ylim <- c(1,1100) cols <- brewer.pal(4, "Set1") plot(restricted.bench$ralpha, restricted.bench$restricted, xlim = xlim, ylim = ylim, axes = FALSE, xlab="", ylab="", col = cols[4], pch = 15, log = "y") par(new=T) plot(restricted.bench$ralpha, restricted.bench$smerc, xlim = xlim, ylim = ylim, axes = FALSE, xlab = expression(paste("Threshold parameter of the middle p-value (", alpha[1], ")")), ylab = "Time (sec)", col = cols[2], pch = 15, log = "y") legend("bottomright", pch = 15, legend = c("smerc (rflex.test)", "rflexscan (restricted)"), col = cols[c(2, 4)]) axis(1, c(0, 0.1, 0.2, 0.3, 0.4, 0.5)) axis(2, c(1,10,100, 1000)) axis(2, at=log10Tck('y','minor')[,-1], tcl= 0.2, labels = NA) box() ## ----preparation for rsatscan------------------------------------------------- library("rsatscan") obreast <- manhattan@data[,c("DOHREGION", "OBREAST", "YEAR")] ebreast <- manhattan@data[,c("DOHREGION", "YEAR", "EBREAST")] geo <- data.frame(DOHREGION = manhattan@data$DOHREGION, coord@coords, stringsAsFactors = F) write.cas(obreast, "NYS_Cancer/", "breast") write.pop(ebreast, "NYS_Cancer/", "breast") write.geo(geo, "NYS_Cancer/", "breast") invisible(ss.options(reset=TRUE)) ss.options(list(CaseFile="breast.cas", PopulationFile="breast.pop")) ss.options(list(StartDate = "2009/1/1", EndDate="2009/12/31")) ss.options(list(CoordinatesFile="breast.geo", CoordinatesType=0, ModelType=0)) ss.options(list(TimeAggregationUnits = 3, NonCompactnessPenalty=0)) ss.options(list(ReportGiniClusters="n", LogRunToHistoryFile="n")) ss.options(list(MaxSpatialSizeInPopulationAtRisk=50, SpatialWindowShapeType=0)) write.ss.prm("NYS_Cancer/", "breast_circular") invisible(ss.options(reset=TRUE)) ss.options(list(CaseFile="breast.cas", PopulationFile="breast.pop")) ss.options(list(StartDate = "2009/1/1", EndDate="2009/12/31")) ss.options(list(CoordinatesFile="breast.geo", CoordinatesType=0, ModelType=0)) ss.options(list(TimeAggregationUnits = 3, NonCompactnessPenalty=0)) ss.options(list(ReportGiniClusters="n", LogRunToHistoryFile="n")) ss.options(list(MaxSpatialSizeInPopulationAtRisk=50, SpatialWindowShapeType=1)) write.ss.prm("NYS_Cancer/", "breast_elliptic") path_satscan <- "/home/lvana/SaTScan" # change to location of SaTScan ## ----run rsatscan (circular)-------------------------------------------------- ## Note that tweaking according the OS may be required ## e.g., setting the location of SaTScan sslocation=/home/user/.. ## or ssbatchfilename ="SaTScanBatch" # system.time({satscan("NYS_Cancer/", "breast_circular", # sslocation = path_satscan, # ssbatchfilename ="SaTScanBatch")}) system.time({satscan("NYS_Cancer/", "breast_circular")}) ## ----run rsatscan (elliptic)-------------------------------------------------- # system.time({satscan("NYS_Cancer/", # "breast_elliptic", # sslocation = path_satscan, # ssbatchfilename ="SaTScanBatch")}) system.time({satscan("NYS_Cancer/", "breast_elliptic")})