########################################################################### ####### ANALYSE PARAMETER OPTIMISATION OF OUTLIER DETECTION METHODS ######## ########################################################################### # This script is for the analysis of the results from the parameter optimisation. # The data is analysed to identify the optimal parameter for each dataset. # The methods can be run for either the hourly or the monthly datasets. # The script should be run method by method to inspect manually each result. # First separate analyses per dataset and methodare given. # Then methods for combined analysis of all datasets per method are provided. #### define working environment #### #setwd("D:/ETC-ACM/Tasks/1022_3/R/finalisedScripts") # load libraries library(lattice) # load estimated parameters #load("parameterOptimisation_month.RData") datatype = "monthly" #load("parameterOptimisation_hour.RData") #datatype = "hourly" ########################################################### ################# Anaylsis separately #################### ########################################################### ################# AUTOREGRESSIVE MODEL ###################### # variables to store optimal parameters ar1_thr <- NULL ar1_jac_max <- NULL for(i in 1:length(I)){ # plot results plot(ar1_jac[[i]]$thr, ar1_jac[[i]]$jac, main=paste(names[i])) # new parameter ar1_jac_max = c(ar1_jac_max, max(as.numeric(ar1_jac[[i]]$jac))) # very small 0.12/0.08 ar1_thr = c(ar1_thr, list(as.numeric(ar1_jac[[i]]$thr)[ar1_jac[[i]]$jac==max(as.numeric(ar1_jac[[i]]$jac))])) } # show results ar1_df = data.frame(dataset=names[I], max_jac=round(ar1_jac_max, digits=2)) names(ar1_thr) = dataset=names[I] print(ar1_df) print(ar1_thr) ################# MOVING WINDOW - WHOLE WINDOW ###################### # variables to store optimal parameters mw_jac_max <- NULL mw_thr <- NULL mw_stats <- NULL mw_thr_meth <- NULL mw_q <- NULL for(i in 1:length(I)){ print(paste("####", names[I[i]], "####")) # general plots boxplot(mw_jac[[i]]$jac~mw_jac[[i]]$thr_method, main=paste(names[I[i]])) boxplot(mw_jac[[i]]$jac~mw_jac[[i]]$stats, main=paste(names[I[i]])) # get maximum jac value mw_jac_max = c(mw_jac_max, max(mw_jac[[i]]$jac)) ### 1) select optimal method (statistic) table_stats = table(mw_jac[[i]]$stats[mw_jac[[i]]$jac==mw_jac_max[i]]) # choose statistic with the highest count in maximum jac values stats = names(table_stats)[table_stats==max(table_stats)] if(length(stats)>1){ cat("More than one optimal statistics method found. \n Only the first method is used:",stats[1], "\n") } mw_stats = c(mw_stats, stats[1]) ### 2) select optimal threshold method table_thr_method = table(mw_jac[[i]]$thr_method[mw_jac[[i]]$jac==max(mw_jac[[i]]$jac)]) # choose method with the highest count in maximum jac values meth = names(table_thr_method)[table_thr_method==max(table_thr_method)] if(length(meth)>1){ cat("More than one optimal threshold method found. \n Only the first method is used:",meth[1], "\n") } mw_thr_meth = c(mw_thr_meth, meth[1]) ### 3) extract data with selected statistics and threshold method mw_jac_sel = subset(mw_jac[[i]], thr_method==mw_thr_meth[i]&stats==mw_stats[i]) # plot results for window size and thresholds boxplot(mw_jac_sel$jac~mw_jac_sel$q, main=paste(names[I[i]], "window size")) boxplot(mw_jac_sel$jac~mw_jac_sel$thr, main=paste(names[I[i]], "thresholds")) ### 4) assess optimal threshold values and window size # estimate average jac value per window size and threshold mean_q = lapply(split(mw_jac_sel$jac, mw_jac_sel$q), mean) mean_thr = lapply(split(mw_jac_sel$jac, mw_jac_sel$thr), mean) # derive optimal parameters (upper 5 percent of the average jac values) mw_q = c(mw_q, list(as.numeric(names(mean_q))[as.numeric(mean_q)>=quantile(as.numeric(mean_q),0.95)])) mw_thr = c(mw_thr, list(as.numeric(names(mean_thr))[as.numeric(mean_thr)>=quantile(as.numeric(mean_thr),0.95)])) } # show results mw_df = data.frame(dataset=names[I], max_jac=round(mw_jac_max, digits=2)) names(mw_stats) = dataset=names[I] names(mw_thr_meth) = dataset=names[I] names(mw_thr) = dataset=names[I] names(mw_q) = dataset=names[I] print(mw_df) print(mw_stats) print(mw_thr_meth) print(mw_thr) print(mw_q) # create levelplots of jac values per window size and threshold # select id of the data set to plot i=1 jac_sel = subset(mw_jac[[i]], thr_method==mw_thr_meth[i]&stats==mw_stats[i]) levelplot(jac~thr+q,data=jac_sel, main=paste(names[I[i]]), par.settings=list(regions=list(col=grey(seq(0.99, 0.2, -0.01))))) ################# MOVING WINDOW - TWO-SIDED WINDOW ###################### # variables to store optimal parameters mw2_jac_max <- NULL mw2_thr <- NULL mw2_stats <- NULL mw2_thr_meth <- NULL mw2_q <- NULL for(i in 1:length(II)){ print(paste("####", names[II[i]], "####")) # general plot boxplot(mw2_jac[[i]]$jac~mw2_jac[[i]]$thr_method, main=paste(names[II[i]])) boxplot(mw2_jac[[i]]$jac~mw2_jac[[i]]$stats, main=paste(names[II[i]])) # get maximum jac value mw2_jac_max = c(mw2_jac_max, max(mw2_jac[[i]]$jac)) ### 1) select optimal method (statistic) table_stats = table(mw2_jac[[i]]$stats[mw2_jac[[i]]$jac==mw2_jac_max[i]]) # choose statistic with the highest count in maximum jac values stats = names(table_stats)[table_stats==max(table_stats)] if(length(stats)>1){ cat("More than one optimal statistics method found. \n Only the first method is used:",stats[1], "\n") } mw2_stats = c(mw2_stats, stats[1]) ### 2) select optimal threshold method if(mw2_stats[i]=="t.test"){ meth = "t.test" }else{ table_thr_method = table(mw2_jac[[i]]$thr_method[mw2_jac[[i]]$jac==max(mw2_jac[[i]]$jac)]) # choose method with the highest count in maximum jac values meth = names(table_thr_method)[table_thr_method==max(table_thr_method)] if(length(meth)>1){ cat("More than one optimal threshold method found. \n Only the first method is used:",meth[1], "\n") } } mw2_thr_meth = c(mw2_thr_meth, meth[1]) ### 3) extract data with selected statistics and threshold method mw2_jac_sel = subset(mw2_jac[[i]], thr_method==mw2_thr_meth[i]&stats==mw2_stats[i]) # plot results boxplot(mw2_jac_sel$jac~mw2_jac_sel$q, main=paste(names[I[i]], "window size")) boxplot(mw2_jac_sel$jac~mw2_jac_sel$thr, main=paste(names[I[i]], "thresholds")) ### 4) assess optimal threshold values and window size mean_q = lapply(split(mw2_jac_sel$jac, mw2_jac_sel$q), mean) mean_thr = lapply(split(mw2_jac_sel$jac, mw2_jac_sel$thr), mean) # derive optimal parameters (upper 5 percent of the average jac values) mw2_q = c(mw2_q, list(as.numeric(names(mean_q))[as.numeric(mean_q)>=quantile(as.numeric(mean_q),0.95)])) mw2_thr = c(mw2_thr, list(as.numeric(names(mean_thr))[as.numeric(mean_thr)>=quantile(as.numeric(mean_thr),0.95)])) } # show results mw2_df = data.frame(dataset=names[II], max_jac=round(mw2_jac_max, digits=2)) names(mw2_stats) = dataset=names[II] names(mw2_thr_meth) = dataset=names[II] names(mw2_thr) = dataset=names[II] names(mw2_q) = dataset=names[II] print(mw2_df) print(mw2_stats) print(mw2_thr_meth) print(mw2_thr) print(mw2_q) # results round(mw2_jac_max, digits=2) mw2_stats mw2_thr_meth mw2_thr mw2_q # create levelplots of jac values per window size and threshold # select id of the data set to plot i=1 jac_sel = subset(mw2_jac[[i]], thr_method==mw2_thr_meth[i]&stats==mw2_stats[i]) levelplot(jac~thr+q,data=jac_sel, main=paste(names[II[i]]), par.settings=list(regions=list(col=grey(seq(0.99, 0.2, -0.01))))) ################# MOVING WINDOW - LAG 1 DIFFS ###################### # variables to store optimal parameters lag1_jac_max <- NULL lag1_thr <- NULL lag1_stats <- NULL lag1_thr_meth <- NULL lag1_q <- NULL for(i in 1:length(names)){ print(paste("####", names[i], "####")) # general plot boxplot(lag1_jac[[i]]$jac~lag1_jac[[i]]$thr_method, main=paste(names[i])) # get maximum jac value lag1_jac_max = c(lag1_jac_max, max(lag1_jac[[i]]$jac)) ### 1) select optimal method (statistic) table_stats = table(lag1_jac[[i]]$stats[lag1_jac[[i]]$jac==lag1_jac_max[i]]) stats = names(table_stats)[table_stats==max(table_stats)] if(length(stats)>1){ cat("More than one optimal statistics method found. \n Only the first method is used:",stats[1], "\n") } lag1_stats = c(lag1_stats, stats[1]) ### 2) select optimal threshold method table_thr_method = table(lag1_jac[[i]]$thr_method[lag1_jac[[i]]$jac==max(lag1_jac[[i]]$jac)]) meth = names(table_thr_method)[table_thr_method==max(table_thr_method)] if(length(meth)>1){ cat("More than one optimal threshold method found. \n Only the first method is used:",meth[1], "\n") } lag1_thr_meth = c(lag1_thr_meth, meth[1]) ### 3) extract data with selected statistics and threshold method lag1_jac_sel = subset(lag1_jac[[i]], thr_method==lag1_thr_meth[i]&stats==lag1_stats[i]) # plot results boxplot(lag1_jac_sel$jac~lag1_jac_sel$q, main=paste(names[i], "window size")) boxplot(lag1_jac_sel$jac~lag1_jac_sel$thr, main=paste(names[i], "thresholds")) ### 4) assess optimal threshold values and window size # estimate average jac value per window size and threshold mean_q = lapply(split(lag1_jac_sel$jac, lag1_jac_sel$q), mean) mean_thr = lapply(split(lag1_jac_sel$jac, lag1_jac_sel$thr), mean) # derive optimal parameters (upper 5 percent of the average jac values) lag1_q = c(lag1_q, list(as.numeric(names(mean_q))[as.numeric(mean_q)>=quantile(as.numeric(mean_q),0.95)])) lag1_thr = c(lag1_thr, list(as.numeric(names(mean_thr))[as.numeric(mean_thr)>=quantile(as.numeric(mean_thr),0.95)])) } # show results lag1_df = data.frame(dataset=names, max_jac=round(lag1_jac_max, digits=2)) names(lag1_stats) = dataset=names names(lag1_thr_meth) = dataset=names names(lag1_thr) = dataset=names names(lag1_q) = dataset=names print(lag1_df) print(lag1_stats) print(lag1_thr_meth) print(lag1_thr) print(lag1_q) # results round(lag1_jac_max, digits=2) lag1_stats lag1_thr_meth lag1_thr lag1_q # create levelplots of jac values per window size and threshold # select id of the data set to plot i=1 jac_sel = subset(lag1_jac[[i]], thr_method==lag1_thr_meth[i]&stats==lag1_stats[i]) levelplot(jac~thr+q,data=jac_sel, main=paste(names[i]), par.settings=list(regions=list(col=grey(seq(0.99, 0.2, -0.01))))) ################# MA FILTER ###################### # variables to store optimal parameters kz_jac_max <- NULL kz_thr <- NULL kz_it <- NULL kz_thr_meth <- NULL kz_q <- NULL for(i in 1:length(II)){ print(paste("####", names[II[i]], "####")) # general plot boxplot(kz_jac[[i]]$jac~kz_jac[[i]]$thr_method, main=paste(names[II[i]])) # get maximum jac value kz_jac_max = c(kz_jac_max, max(kz_jac[[i]]$jac)) ### 1) select optimal number of iterations table_it = table(kz_jac[[i]]$k[kz_jac[[i]]$jac==kz_jac_max[i]]) opt_it = names(table_it)[table_it==max(table_it)] if(length(opt_it>1)){ cat("More than one optimal iteration number found. \n Only the first one is used:",opt_it[1], "\n") # opt_it=min(opt_it) } kz_it = c(kz_it, opt_it[1]) ### 3) extract data with selected statistics and threshold method kz_jac_sel = subset(kz_jac[[i]], k==kz_it[i]) # plot results boxplot(kz_jac_sel$jac~kz_jac_sel$q, main=paste(names[II[i]], "window size")) boxplot(kz_jac_sel$jac~kz_jac_sel$thr, main=paste(names[II[i]], "thresholds")) ### 4) assess optimal threshold values and window size mean_q = lapply(split(kz_jac_sel$jac, kz_jac_sel$q), mean) mean_thr = lapply(split(kz_jac_sel$jac, kz_jac_sel$thr), mean) # derive optimal parameters (upper 5 percent of the average jac values) if(kz_jac_max[[i]]==1){ # if maximum is 1 use only those values with jac=1 as these are more than 5 % kz_q = c(kz_q, list(as.numeric(names(mean_q))[as.numeric(mean_q)==max(as.numeric(mean_q))])) kz_thr = c(kz_thr, list(as.numeric(names(mean_thr))[as.numeric(mean_thr)==max(as.numeric(mean_thr))])) }else{ kz_q = c(kz_q, list(as.numeric(names(mean_q))[as.numeric(mean_q)>=quantile(as.numeric(mean_q),0.95)])) kz_thr = c(kz_thr, list(as.numeric(names(mean_thr))[as.numeric(mean_thr)>=quantile(as.numeric(mean_thr),0.95)])) } } # show results kz_df = data.frame(dataset=names[II], max_jac=round(kz_jac_max, digits=2)) names(kz_it) = dataset=names[II] names(kz_thr) = dataset=names[II] names(kz_q) = dataset=names[II] print(kz_df) print(kz_it) print(kz_thr) print(kz_q) # levelplots i=1 kz_jac_sel = subset(kz_jac[[i]], k==kz_it[i]) levelplot(jac~thr+q, data=kz_jac_sel, main=paste(names[II[i]]), par.settings=list(regions=list(col=grey(seq(0.99, 0.2, -0.01))))) ######################################################### ################# Analysis combined #################### ######################################################### #### AR(2) #### ar1_all <- NULL for(i in 1:length(ar1_jac)){ ar1_all = rbind(ar1_all, ar1_jac[[i]]) } mean_ar1 = lapply(split(ar1_all$jac, ar1_all$thr), mean) plot(names(mean_ar1), mean_ar1) # optimal threshold with the jac averages print(mean_ar1[as.numeric(mean_ar1)==max(as.numeric(mean_ar1))]) #### MW #### # select optimal version mw_all <- NULL for(i in 1:length(I)){ # for all #for(i in 1:2){ # for I data sets only mw_all = rbind(mw_all, mw_jac[[i]]) } # get data for optimal method mw_sel = subset(mw_all, thr_method=="sd_global"&stats=="mean") boxplot(mw_sel$jac~mw_sel$q, main="window size") boxplot(mw_sel$jac~mw_sel$thr, main="thresholds") # optimal window size mean_q = lapply(split(mw_sel$jac, mw_sel$q), mean) as.numeric(names(mean_q))[as.numeric(mean_q)>=quantile(as.numeric(mean_q), 0.95)] # optimal threshold mean_thr = lapply(split(mw_sel$jac, mw_sel$thr), mean) as.numeric(names(mean_thr))[as.numeric(mean_thr)>=quantile(as.numeric(mean_thr), 0.95)] # get averages over all data sets q.vector = as.numeric(names(table(mw_sel$q))) thr.vector = as.numeric(names(table(mw_sel$thr))) jac.mean = NULL for(t in thr.vector){ jac.mean = c(jac.mean, as.numeric(lapply(split(mw_sel$jac[mw_sel$thr==t], mw_sel$q[mw_sel$thr==t]), mean))) } mw_sel.mean = data.frame(jac=jac.mean, q=rep(q.vector, length(thr.vector)), thr=rep(thr.vector, each=length(q.vector))) # levelplot for combined analysis (change title between I/all") levelplot(jac~thr+q,data=mw_sel.mean, main=paste("MW all", datatype), par.settings=list(regions=list(col=grey(seq(0.99, 0.2, -0.01))))) # maximum average jac value max(mw_sel.mean$jac) #### MW2 #### # select optimal version mw2_all <- NULL #for(i in 1:length(II)){ # for all #for(i in c(1,2,3,7)){ # for IIa1/IIb data sets only for(i in c(4,5,6)){ # for IIa2 data sets only mw2_all = rbind(mw2_all, mw2_jac[[i]]) } # find optimal method table_stats = table(mw2_all$stats[mw2_all$jac==1]) opt_meth = names(table_stats)[table_stats==max(table_stats)] print(opt_meth) # get data for optimal method mw2_sel = subset(mw2_all, thr_method=="sd"&stats==opt_meth) boxplot(mw2_sel$jac~mw2_sel$q, main="window size") boxplot(mw2_sel$jac~mw2_sel$thr, main="thresholds") # get averages over all data sets q.vector = as.numeric(names(table(mw2_sel$q))) thr.vector = names(table(mw2_sel$thr)) jac.mean = NULL for(t in thr.vector){ temp = as.numeric(lapply(split(mw2_sel$jac[mw2_sel$thr==t], mw2_sel$q[mw2_sel$thr==t]), mean)) jac.mean = c(jac.mean, temp) } mw2_sel.mean = data.frame(jac=jac.mean, q=rep(q.vector, length(thr.vector)), thr=rep(as.numeric(thr.vector), each=length(q.vector))) # optimal window size mean_q = lapply(split(mw2_sel$jac, mw2_sel$q), mean) as.numeric(names(mean_q))[as.numeric(mean_q)>=quantile(as.numeric(mean_q), 0.90)] # optimal threshold mean_thr = lapply(split(mw2_sel$jac, mw2_sel$thr), mean) as.numeric(names(mean_thr))[as.numeric(mean_thr)>=quantile(as.numeric(mean_thr), 0.90)] # levelplot for combined analysis (change title between II/all") levelplot(jac~thr+q,data=mw2_sel.mean, main=paste("MW2 all", datatype), par.settings=list(regions=list(col=grey(seq(0.99, 0.2, -0.01))))) # maximum average jac value max(mw2_sel.mean$jac) #### MA filter #### # select optimal version kz_all <- NULL for(i in 1:length(II)){ # for all #for(i in 1:7){ # for II data sets only kz_all = rbind(kz_all, kz_jac[[i]]) } # find optimal iteration number table_it = table(kz_all$k[kz_all$jac==1]) opt_it = names(table_it)[table_it==max(table_it)] print(opt_it) # get data for optimal method kz_sel = subset(kz_all, k==opt_it) boxplot(kz_sel$jac~kz_sel$q, main="window size") boxplot(kz_sel$jac~kz_sel$thr, main="thresholds") # optimal window size mean_q = lapply(split(kz_sel$jac, kz_sel$q), mean) as.numeric(names(mean_q))[as.numeric(mean_q)>=quantile(as.numeric(mean_q), 0.90)] # optimal threshold mean_thr = lapply(split(kz_sel$jac, kz_sel$thr), mean) as.numeric(names(mean_thr))[as.numeric(mean_thr)>=quantile(as.numeric(mean_thr), 0.90)] # get averages over all data sets q.vector = as.numeric(names(table(kz_sel$q))) thr.vector = names(table(kz_sel$thr)) jac.mean = NULL for(t in thr.vector){ jac.mean = c(jac.mean, as.numeric(lapply(split(kz_sel$jac[kz_sel$thr==t], kz_sel$q[kz_sel$thr==t]), mean))) } kz_sel.mean = data.frame(jac=jac.mean, q=rep(q.vector, length(thr.vector)), thr=rep(thr.vector, each=length(q.vector))) # levelplot for combined analysis (change title between II/all") levelplot(jac~thr+q,data=kz_sel.mean, main=paste("MA filter all", datatype), par.settings=list(regions=list(col=grey(seq(0.99, 0.2, -0.01))))) # maximum average jac value max(kz_sel.mean$jac)