###########################################################################
####### ANALYSE PARAMETER OPTIMISATION OF OUTLIER DETECTION METHODS ########
###########################################################################
# This script is for the analysis of the results from the parameter optimisation.
# The data is analysed to identify the optimal parameter for each dataset.
# The methods can be run for either the hourly or the monthly datasets.
# The script should be run method by method to inspect manually each result.
# First separate analyses per dataset and methodare given.
# Then methods for combined analysis of all datasets per method are provided.


#### define working environment ####
#setwd("D:/ETC-ACM/Tasks/1022_3/R/finalisedScripts")

# load libraries
library(lattice)

# load estimated parameters
#load("parameterOptimisation_month.RData")
datatype = "monthly"
#load("parameterOptimisation_hour.RData")
#datatype = "hourly"

###########################################################
################# Anaylsis separately ####################
###########################################################

################# AUTOREGRESSIVE MODEL ######################
# variables to store optimal parameters
ar1_thr <- NULL
ar1_jac_max <- NULL

for(i in 1:length(I)){
 # plot results
  plot(ar1_jac[[i]]$thr, ar1_jac[[i]]$jac, main=paste(names[i]))
  
  # new parameter
  ar1_jac_max = c(ar1_jac_max, max(as.numeric(ar1_jac[[i]]$jac)))  # very small 0.12/0.08
  ar1_thr = c(ar1_thr, list(as.numeric(ar1_jac[[i]]$thr)[ar1_jac[[i]]$jac==max(as.numeric(ar1_jac[[i]]$jac))]))
}  


# show results
ar1_df = data.frame(dataset=names[I], max_jac=round(ar1_jac_max, digits=2))
names(ar1_thr) = dataset=names[I]

print(ar1_df)
print(ar1_thr)


################# MOVING WINDOW - WHOLE WINDOW ######################
# variables to store optimal parameters
mw_jac_max <- NULL
mw_thr <- NULL
mw_stats <- NULL
mw_thr_meth <- NULL
mw_q <- NULL

for(i in 1:length(I)){
  print(paste("####", names[I[i]], "####"))
  
  # general plots
  boxplot(mw_jac[[i]]$jac~mw_jac[[i]]$thr_method, main=paste(names[I[i]]))
  boxplot(mw_jac[[i]]$jac~mw_jac[[i]]$stats, main=paste(names[I[i]]))
  
  # get maximum jac value
  mw_jac_max = c(mw_jac_max, max(mw_jac[[i]]$jac))
   
  ### 1) select optimal method (statistic)
  table_stats = table(mw_jac[[i]]$stats[mw_jac[[i]]$jac==mw_jac_max[i]])
  # choose statistic with the highest count in maximum jac values
  stats = names(table_stats)[table_stats==max(table_stats)]
  if(length(stats)>1){
    cat("More than one optimal statistics method found. \n Only the first method is used:",stats[1], "\n")
  }
  mw_stats = c(mw_stats, stats[1])
  
  ### 2) select optimal threshold method
  table_thr_method = table(mw_jac[[i]]$thr_method[mw_jac[[i]]$jac==max(mw_jac[[i]]$jac)])
  # choose method with the highest count in maximum jac values
  meth = names(table_thr_method)[table_thr_method==max(table_thr_method)]
  if(length(meth)>1){
    cat("More than one optimal threshold method found. \n Only the first method is used:",meth[1], "\n")
  }
  mw_thr_meth = c(mw_thr_meth, meth[1])
  
  ### 3) extract data with selected statistics and threshold method
  mw_jac_sel = subset(mw_jac[[i]], thr_method==mw_thr_meth[i]&stats==mw_stats[i])
  # plot results for window size and thresholds
  boxplot(mw_jac_sel$jac~mw_jac_sel$q, main=paste(names[I[i]], "window size"))
  boxplot(mw_jac_sel$jac~mw_jac_sel$thr, main=paste(names[I[i]], "thresholds"))
  
  ### 4) assess optimal threshold values and window size
  # estimate average jac value per window size and threshold
  mean_q = lapply(split(mw_jac_sel$jac, mw_jac_sel$q), mean)
  mean_thr = lapply(split(mw_jac_sel$jac, mw_jac_sel$thr), mean)
  
  # derive optimal parameters (upper 5 percent of the average jac values)
  mw_q = c(mw_q, list(as.numeric(names(mean_q))[as.numeric(mean_q)>=quantile(as.numeric(mean_q),0.95)]))
  mw_thr = c(mw_thr, list(as.numeric(names(mean_thr))[as.numeric(mean_thr)>=quantile(as.numeric(mean_thr),0.95)]))
}

# show results
mw_df = data.frame(dataset=names[I], max_jac=round(mw_jac_max, digits=2))
names(mw_stats) = dataset=names[I]
names(mw_thr_meth) = dataset=names[I]
names(mw_thr) = dataset=names[I]
names(mw_q) = dataset=names[I]

print(mw_df)
print(mw_stats)
print(mw_thr_meth)
print(mw_thr)
print(mw_q)

# create levelplots of jac values per window size and threshold
# select id of the data set to plot
i=1
jac_sel = subset(mw_jac[[i]], thr_method==mw_thr_meth[i]&stats==mw_stats[i])
levelplot(jac~thr+q,data=jac_sel, main=paste(names[I[i]]),
        par.settings=list(regions=list(col=grey(seq(0.99, 0.2, -0.01)))))


################# MOVING WINDOW - TWO-SIDED WINDOW ######################
# variables to store optimal parameters
mw2_jac_max <- NULL
mw2_thr <- NULL
mw2_stats <- NULL
mw2_thr_meth <- NULL
mw2_q <- NULL

for(i in 1:length(II)){
  print(paste("####", names[II[i]], "####"))
  
  # general plot
  boxplot(mw2_jac[[i]]$jac~mw2_jac[[i]]$thr_method, main=paste(names[II[i]]))
  boxplot(mw2_jac[[i]]$jac~mw2_jac[[i]]$stats, main=paste(names[II[i]]))
  
  # get maximum jac value
  mw2_jac_max = c(mw2_jac_max, max(mw2_jac[[i]]$jac))
   
  ### 1) select optimal method (statistic)
  table_stats = table(mw2_jac[[i]]$stats[mw2_jac[[i]]$jac==mw2_jac_max[i]])
  # choose statistic with the highest count in maximum jac values
  stats = names(table_stats)[table_stats==max(table_stats)]
  if(length(stats)>1){
    cat("More than one optimal statistics method found. \n Only the first method is used:",stats[1], "\n")
  }
  mw2_stats = c(mw2_stats, stats[1])
   
  ### 2) select optimal threshold method
  if(mw2_stats[i]=="t.test"){
    meth = "t.test"
  }else{
     table_thr_method = table(mw2_jac[[i]]$thr_method[mw2_jac[[i]]$jac==max(mw2_jac[[i]]$jac)])
    # choose method with the highest count in maximum jac values
    meth = names(table_thr_method)[table_thr_method==max(table_thr_method)]
    if(length(meth)>1){
      cat("More than one optimal threshold method found. \n Only the first method is used:",meth[1], "\n")
    }
  }

  mw2_thr_meth = c(mw2_thr_meth, meth[1])
  
  ### 3) extract data with selected statistics and threshold method
  mw2_jac_sel = subset(mw2_jac[[i]], thr_method==mw2_thr_meth[i]&stats==mw2_stats[i])
  # plot results
  boxplot(mw2_jac_sel$jac~mw2_jac_sel$q, main=paste(names[I[i]], "window size"))
  boxplot(mw2_jac_sel$jac~mw2_jac_sel$thr, main=paste(names[I[i]], "thresholds"))
  
  ### 4) assess optimal threshold values and window size
  mean_q = lapply(split(mw2_jac_sel$jac, mw2_jac_sel$q), mean)
  mean_thr = lapply(split(mw2_jac_sel$jac, mw2_jac_sel$thr), mean)
  
  # derive optimal parameters (upper 5 percent of the average jac values)
  mw2_q = c(mw2_q, list(as.numeric(names(mean_q))[as.numeric(mean_q)>=quantile(as.numeric(mean_q),0.95)]))
  mw2_thr = c(mw2_thr, list(as.numeric(names(mean_thr))[as.numeric(mean_thr)>=quantile(as.numeric(mean_thr),0.95)]))
}

# show results
mw2_df = data.frame(dataset=names[II], max_jac=round(mw2_jac_max, digits=2))
names(mw2_stats) = dataset=names[II]
names(mw2_thr_meth) = dataset=names[II]
names(mw2_thr) = dataset=names[II]
names(mw2_q) = dataset=names[II]

print(mw2_df)
print(mw2_stats)
print(mw2_thr_meth)
print(mw2_thr)
print(mw2_q)

# results
round(mw2_jac_max, digits=2)
mw2_stats
mw2_thr_meth
mw2_thr
mw2_q

# create levelplots of jac values per window size and threshold
# select id of the data set to plot
i=1
jac_sel = subset(mw2_jac[[i]], thr_method==mw2_thr_meth[i]&stats==mw2_stats[i])
levelplot(jac~thr+q,data=jac_sel, main=paste(names[II[i]]),
        par.settings=list(regions=list(col=grey(seq(0.99, 0.2, -0.01)))))


################# MOVING WINDOW - LAG 1 DIFFS ######################
# variables to store optimal parameters
lag1_jac_max <- NULL
lag1_thr <- NULL
lag1_stats <- NULL
lag1_thr_meth <- NULL
lag1_q <- NULL

for(i in 1:length(names)){
  print(paste("####", names[i], "####"))
  
  # general plot
  boxplot(lag1_jac[[i]]$jac~lag1_jac[[i]]$thr_method, main=paste(names[i]))
  
  # get maximum jac value
  lag1_jac_max = c(lag1_jac_max, max(lag1_jac[[i]]$jac))
  
  ### 1) select optimal method (statistic)
  table_stats = table(lag1_jac[[i]]$stats[lag1_jac[[i]]$jac==lag1_jac_max[i]])
  stats = names(table_stats)[table_stats==max(table_stats)]
  if(length(stats)>1){
    cat("More than one optimal statistics method found. \n Only the first method is used:",stats[1], "\n")
  }
  lag1_stats = c(lag1_stats, stats[1])
  
  ### 2) select optimal threshold method
  table_thr_method = table(lag1_jac[[i]]$thr_method[lag1_jac[[i]]$jac==max(lag1_jac[[i]]$jac)])
  meth = names(table_thr_method)[table_thr_method==max(table_thr_method)]
  if(length(meth)>1){
    cat("More than one optimal threshold method found. \n Only the first method is used:",meth[1], "\n")
  }
  lag1_thr_meth = c(lag1_thr_meth, meth[1])
  
  ### 3) extract data with selected statistics and threshold method
  lag1_jac_sel = subset(lag1_jac[[i]], thr_method==lag1_thr_meth[i]&stats==lag1_stats[i])  
  # plot results
  boxplot(lag1_jac_sel$jac~lag1_jac_sel$q, main=paste(names[i], "window size"))
  boxplot(lag1_jac_sel$jac~lag1_jac_sel$thr, main=paste(names[i], "thresholds"))
  
  ### 4) assess optimal threshold values and window size
  # estimate average jac value per window size and threshold
  mean_q = lapply(split(lag1_jac_sel$jac, lag1_jac_sel$q), mean)
  mean_thr = lapply(split(lag1_jac_sel$jac, lag1_jac_sel$thr), mean)
 
  # derive optimal parameters (upper 5 percent of the average jac values)
  lag1_q = c(lag1_q, list(as.numeric(names(mean_q))[as.numeric(mean_q)>=quantile(as.numeric(mean_q),0.95)]))
  lag1_thr = c(lag1_thr, list(as.numeric(names(mean_thr))[as.numeric(mean_thr)>=quantile(as.numeric(mean_thr),0.95)]))
}

# show results
lag1_df = data.frame(dataset=names, max_jac=round(lag1_jac_max, digits=2))
names(lag1_stats) = dataset=names
names(lag1_thr_meth) = dataset=names
names(lag1_thr) = dataset=names
names(lag1_q) = dataset=names

print(lag1_df)
print(lag1_stats)
print(lag1_thr_meth)
print(lag1_thr)
print(lag1_q)

# results
round(lag1_jac_max, digits=2)
lag1_stats
lag1_thr_meth
lag1_thr
lag1_q

# create levelplots of jac values per window size and threshold
# select id of the data set to plot
i=1
jac_sel = subset(lag1_jac[[i]], thr_method==lag1_thr_meth[i]&stats==lag1_stats[i])
levelplot(jac~thr+q,data=jac_sel, main=paste(names[i]),
        par.settings=list(regions=list(col=grey(seq(0.99, 0.2, -0.01)))))


################# MA FILTER ######################
# variables to store optimal parameters
kz_jac_max <- NULL
kz_thr <- NULL
kz_it <- NULL
kz_thr_meth <- NULL
kz_q <- NULL

for(i in 1:length(II)){
  print(paste("####", names[II[i]], "####"))
  
  # general plot
  boxplot(kz_jac[[i]]$jac~kz_jac[[i]]$thr_method, main=paste(names[II[i]]))
 
  # get maximum jac value
  kz_jac_max = c(kz_jac_max, max(kz_jac[[i]]$jac))
    
  ### 1) select optimal number of iterations
  table_it = table(kz_jac[[i]]$k[kz_jac[[i]]$jac==kz_jac_max[i]]) 
  opt_it =  names(table_it)[table_it==max(table_it)]
  if(length(opt_it>1)){
    cat("More than one optimal iteration number found. \n Only the first one is used:",opt_it[1], "\n")
   # opt_it=min(opt_it)
  }
  kz_it = c(kz_it, opt_it[1])  
  
  ### 3) extract data with selected statistics and threshold method
  kz_jac_sel = subset(kz_jac[[i]], k==kz_it[i]) 
  # plot results
  boxplot(kz_jac_sel$jac~kz_jac_sel$q, main=paste(names[II[i]], "window size"))
  boxplot(kz_jac_sel$jac~kz_jac_sel$thr, main=paste(names[II[i]], "thresholds"))
   
  ### 4) assess optimal threshold values and window size
  mean_q = lapply(split(kz_jac_sel$jac, kz_jac_sel$q), mean)
  mean_thr = lapply(split(kz_jac_sel$jac, kz_jac_sel$thr), mean)
  
  # derive optimal parameters (upper 5 percent of the average jac values)
  if(kz_jac_max[[i]]==1){ # if maximum is 1 use only those values with jac=1 as these are more than 5 %
    kz_q = c(kz_q, list(as.numeric(names(mean_q))[as.numeric(mean_q)==max(as.numeric(mean_q))]))
    kz_thr = c(kz_thr, list(as.numeric(names(mean_thr))[as.numeric(mean_thr)==max(as.numeric(mean_thr))])) 
  }else{
    kz_q = c(kz_q, list(as.numeric(names(mean_q))[as.numeric(mean_q)>=quantile(as.numeric(mean_q),0.95)]))
    kz_thr = c(kz_thr, list(as.numeric(names(mean_thr))[as.numeric(mean_thr)>=quantile(as.numeric(mean_thr),0.95)])) 
  }
}

# show results
kz_df = data.frame(dataset=names[II], max_jac=round(kz_jac_max, digits=2))
names(kz_it) = dataset=names[II]
names(kz_thr) = dataset=names[II]
names(kz_q) = dataset=names[II]

print(kz_df)
print(kz_it)
print(kz_thr)
print(kz_q)


# levelplots
i=1
kz_jac_sel = subset(kz_jac[[i]], k==kz_it[i])
levelplot(jac~thr+q, data=kz_jac_sel, main=paste(names[II[i]]),
        par.settings=list(regions=list(col=grey(seq(0.99, 0.2, -0.01)))))


#########################################################
################# Analysis combined ####################
#########################################################            
#### AR(2) ####
ar1_all <- NULL
for(i in 1:length(ar1_jac)){
  ar1_all = rbind(ar1_all, ar1_jac[[i]])
}
mean_ar1 = lapply(split(ar1_all$jac, ar1_all$thr), mean)
plot(names(mean_ar1), mean_ar1)

# optimal threshold with the jac averages
print(mean_ar1[as.numeric(mean_ar1)==max(as.numeric(mean_ar1))])
  
               
#### MW ####
# select optimal version
mw_all <- NULL
for(i in 1:length(I)){ # for all
#for(i in 1:2){ # for I data sets only
  mw_all = rbind(mw_all, mw_jac[[i]])
}

# get data for optimal method
mw_sel = subset(mw_all, thr_method=="sd_global"&stats=="mean")
boxplot(mw_sel$jac~mw_sel$q, main="window size")
boxplot(mw_sel$jac~mw_sel$thr, main="thresholds")
  
# optimal window size
mean_q = lapply(split(mw_sel$jac, mw_sel$q), mean)
as.numeric(names(mean_q))[as.numeric(mean_q)>=quantile(as.numeric(mean_q), 0.95)]

# optimal threshold 
mean_thr = lapply(split(mw_sel$jac, mw_sel$thr), mean)
as.numeric(names(mean_thr))[as.numeric(mean_thr)>=quantile(as.numeric(mean_thr), 0.95)]

# get averages over all data sets
q.vector = as.numeric(names(table(mw_sel$q)))
thr.vector = as.numeric(names(table(mw_sel$thr)))
jac.mean = NULL
for(t in thr.vector){
  jac.mean = c(jac.mean, as.numeric(lapply(split(mw_sel$jac[mw_sel$thr==t], mw_sel$q[mw_sel$thr==t]), mean)))
}
mw_sel.mean = data.frame(jac=jac.mean, q=rep(q.vector, length(thr.vector)), thr=rep(thr.vector, each=length(q.vector)))

# levelplot for combined analysis (change title between I/all")
levelplot(jac~thr+q,data=mw_sel.mean, main=paste("MW all", datatype),
    par.settings=list(regions=list(col=grey(seq(0.99, 0.2, -0.01)))))

# maximum average jac value
max(mw_sel.mean$jac)

#### MW2 ####
# select optimal version
mw2_all <- NULL
#for(i in 1:length(II)){ # for all
#for(i in c(1,2,3,7)){ # for IIa1/IIb data sets only
for(i in c(4,5,6)){ # for IIa2 data sets only
  mw2_all = rbind(mw2_all, mw2_jac[[i]])
}

# find optimal method
table_stats = table(mw2_all$stats[mw2_all$jac==1])
opt_meth = names(table_stats)[table_stats==max(table_stats)]
print(opt_meth)

# get data for optimal method
mw2_sel = subset(mw2_all, thr_method=="sd"&stats==opt_meth)    
boxplot(mw2_sel$jac~mw2_sel$q, main="window size")
boxplot(mw2_sel$jac~mw2_sel$thr, main="thresholds")

# get averages over all data sets
q.vector = as.numeric(names(table(mw2_sel$q)))
thr.vector = names(table(mw2_sel$thr))
jac.mean = NULL
for(t in thr.vector){
  temp  = as.numeric(lapply(split(mw2_sel$jac[mw2_sel$thr==t], mw2_sel$q[mw2_sel$thr==t]), mean))
  jac.mean = c(jac.mean, temp)
}
mw2_sel.mean = data.frame(jac=jac.mean, q=rep(q.vector, length(thr.vector)), thr=rep(as.numeric(thr.vector), each=length(q.vector)))

# optimal window size
mean_q = lapply(split(mw2_sel$jac, mw2_sel$q), mean)
as.numeric(names(mean_q))[as.numeric(mean_q)>=quantile(as.numeric(mean_q), 0.90)]
  
# optimal threshold 
mean_thr = lapply(split(mw2_sel$jac, mw2_sel$thr), mean)
as.numeric(names(mean_thr))[as.numeric(mean_thr)>=quantile(as.numeric(mean_thr), 0.90)]

# levelplot for combined analysis (change title between II/all")
levelplot(jac~thr+q,data=mw2_sel.mean, main=paste("MW2 all", datatype),
    par.settings=list(regions=list(col=grey(seq(0.99, 0.2, -0.01)))))
 
# maximum average jac value
max(mw2_sel.mean$jac)

#### MA filter ####
# select optimal version
kz_all <- NULL
for(i in 1:length(II)){ # for all
#for(i in 1:7){ # for II data sets only
  kz_all = rbind(kz_all, kz_jac[[i]])
}

# find optimal iteration number
table_it = table(kz_all$k[kz_all$jac==1]) 
opt_it =  names(table_it)[table_it==max(table_it)]
print(opt_it)

# get data for optimal method
kz_sel = subset(kz_all, k==opt_it)
boxplot(kz_sel$jac~kz_sel$q, main="window size")
boxplot(kz_sel$jac~kz_sel$thr, main="thresholds")

# optimal window size
mean_q = lapply(split(kz_sel$jac, kz_sel$q), mean)
as.numeric(names(mean_q))[as.numeric(mean_q)>=quantile(as.numeric(mean_q), 0.90)]

# optimal threshold   
mean_thr = lapply(split(kz_sel$jac, kz_sel$thr), mean)
as.numeric(names(mean_thr))[as.numeric(mean_thr)>=quantile(as.numeric(mean_thr), 0.90)]

# get averages over all data sets
q.vector = as.numeric(names(table(kz_sel$q)))
thr.vector = names(table(kz_sel$thr))
jac.mean = NULL
for(t in thr.vector){
  jac.mean = c(jac.mean, as.numeric(lapply(split(kz_sel$jac[kz_sel$thr==t], kz_sel$q[kz_sel$thr==t]), mean)))
}
kz_sel.mean = data.frame(jac=jac.mean, q=rep(q.vector, length(thr.vector)), thr=rep(thr.vector, each=length(q.vector)))

# levelplot for combined analysis (change title between II/all")
levelplot(jac~thr+q,data=kz_sel.mean, main=paste("MA filter all", datatype),
    par.settings=list(regions=list(col=grey(seq(0.99, 0.2, -0.01)))))

# maximum average jac value
max(kz_sel.mean$jac)