############################################################################ ################## Analysis of air quality time series ################## ########################################################################### # This script loads data from AirBase which can be analysed on outliers. # The outlier locations have to be identified by plots and added manually. # The data used here is monthly and hourly data which is provided separately. #### define working environment #### #setwd("D:/ETC-ACM/Tasks/1022_3/R/finalisedScripts") ################# MONTHLY DATA #################### ################################################### ################# Data import #################### ################################################### # load data for CO co <- read.csv("CO_month.csv") co.t <- as.data.frame(t(co[,2:ncol(co)])) names(co.t) <- co[,1] co.t$date <- names(co)[2:ncol(co)] co.t$month <- rep(1:12,nrow(co.t)/12) co.t$year <- rep(1997:2009, each=12) # load data for NO2 no2 <- read.csv("NO2_month.csv") no2.t <- as.data.frame(t(no2[,2:ncol(no2)])) names(no2.t) <- no2[,1] no2.t$date <- names(no2)[2:ncol(no2)] no2.t$month <- rep(1:12,nrow(no2.t)/12) no2.t$year <- rep(1990:2009, each=12) # load data for O3 o3 <- read.csv("O3_month.csv") o3.t <- as.data.frame(t(o3[,2:ncol(o3)])) names(o3.t) <- o3[,1] o3.t$date <- names(o3)[2:ncol(o3)] o3.t$month <- rep(1:12,nrow(o3.t)/12) o3.t$year <- rep(1990:2009, each=12) # load data for SO2 so2 <- read.csv("SO2_month.csv") so2.t <- as.data.frame(t(so2[,2:ncol(so2)])) names(so2.t) <- so2[,1] so2.t$date <- names(so2)[2:ncol(so2)] so2.t$month <- rep(1:12,nrow(so2.t)/12) so2.t$year <- rep(1990:2009, each=12) ######################################################## ######## Find outliers in time series manually ######## ######################################################## ################ Plots ################# # plot timeseries for different stations # change the indices to look at different stations for(i in 700:730) { ts.plot(no2.t[,i], ylab="NO2", main=paste(i,": ",names(no2.t)[i],sep="")) } ##################################################### ######## Save test and evaluation data sets ######## ##################################################### #### I Outliers #### # selected data sets # 188 ts.plot(co.t[,188]) I.co = co.t[,188] I.co_out = c(78, 91, 94, 127) # 115 ts.plot(so2.t[,115]) I.so2 = so2.t[,115] I.so2_out = c(14, 73) # printable plots for I plot(as.numeric(so2.t[,115]), type="l", xaxt="n", xlab="", ylab=expression(paste(SO[2]," (",mu * g~m^-3,")")), main="Monthly averages") axis(1, at=seq(1, nrow(so2.t), 12), cex.axis=0.8, las=2, labels=no2.t$date[seq(1, nrow(so2.t), 12)]) plot(as.numeric(co.t[,188]), type="l", xaxt="n", xlab="", ylab=expression(paste(CO," (",mu * g~m^-3,")")), main="Monthly averages") axis(1, at=seq(1, nrow(co.t), 12), cex.axis=0.8, las=2, labels=co.t$date[seq(1, nrow(co.t), 12)]) points(I.co_out, co.t[I.co_out,188], pch=19, col="red") #### IIa1 Permanent change - break in mean #### # selected data sets # 1847 ts.plot(so2.t[,1847]) IIa1.so2.1 = so2.t[,1847] IIa1.so2.1_out = c(140, 186) # 24 ts.plot(so2.t[,24]) IIa1.so2.2 = so2.t[,24] IIa1.so2.2_out = c(96, 146) # 610 ts.plot(so2.t[,610]) IIa1.so2.3 = so2.t[,610] IIa1.so2.3_out = 224 # printable plots for IIa1 plot(so2.t[,24], type="l", xaxt="n", xlab="", ylab=expression(paste(SO[2]," (",mu * g~m^-3,")")), main="Monthly averages") axis(1, at=seq(1, nrow(so2.t), 12), cex.axis=0.8, las=2, labels=so2.t$date[seq(1, nrow(so2.t), 12)]) points(IIa1.so2.2_out-1, so2.t[IIa1.so2.2_out-1,24],pch=19, col="red") plot(so2.t[,1847], type="l", xaxt="n", xlab="", xlim=c(73, nrow(so2.t)), ylab=expression(paste(SO[2]," (",mu * g~m^-3,")")), main="Monthly averages") axis(1, at=seq(73, nrow(so2.t), 12), cex.axis=0.8, las=2, labels=so2.t$date[seq(73, nrow(so2.t), 12)]) points(IIa1.so2.1_out-1, so2.t[IIa1.so2.1_out-1,1847], pch=19, col="red") #### IIa2 Permanent change - break in minimum #### # selected data sets # 375 ts.plot(o3.t[,375]) IIa2.o3.1 = as.numeric(o3.t[,375]) IIa2.o3.1_out = 86 # 1328 ts.plot(no2.t[,1328]) IIa2.no2 = no2.t[,1382] IIa2.no2_out = 210 # 2117 ts.plot(o3.t[,2117]) IIa2.o3.2 = as.numeric(o3.t[,2117]) IIa2.o3.2_out = 219 # printable plots for IIa2 plot(as.numeric(o3.t[,375]), type="l", xaxt="n", xlab="", ylab=expression(paste(O[3]," (",mu * g~m^-3,")")), main="Monthly averages") axis(1, at=seq(1, nrow(o3.t), 12), cex.axis=0.8, las=2, labels=o3.t$date[seq(1, nrow(o3.t), 12)]) points(IIa2.o3.1_out, o3.t[IIa2.o3.1_out,375], pch=19, col="red") plot(as.numeric(no2.t[,1328]), type="l", xaxt="n", xlab="", xlim=c(120,nrow(no2.t)), ylab=expression(paste(NO[2]," (",mu * g~m^-3,")")), main="Monthly averages") axis(1, at=seq(121, nrow(no2.t), 12), cex.axis=0.8, las=2, labels=no2.t$date[seq(121, nrow(no2.t), 12)]) points(IIa2.no2_out, no2.t[IIa2.no2_out,1328], pch=19, col="red") #### IIb Transient change - break #### # selected data sets # 66 ts.plot(co.t[,66]) IIb.co = co.t[,66] IIb.co_out = c(109, 131) # printable plots for IIb plot(as.numeric(co.t[,66]), type="l", xaxt="n", xlab="", ylab=expression(paste(CO," (",mu * g~m^-3,")")), main="Monthly averages") axis(1, at=seq(1, nrow(co.t), 12), cex.axis=0.8, las=2, labels=o3.t$date[seq(1, nrow(co.t), 12)]) points(IIb.co_out, co.t[IIb.co_out,66], pch=19, col="red") #### III No outliers #### # selected data sets # 6 ts.plot(co.t[20:nrow(co.t),6]) III.co = co.t[20:nrow(co.t),6] # 393 ts.plot(o3.t[,393]) III.o3 = as.numeric(o3.t[,393]) # 129 ts.plot(no2.t[,129]) III.no2 = no2.t[,129] III_out = integer(0) #### Evaluation data sets #### # 115 ts.plot(so2.t[,115]) I.eval = so2.t[,115] I_out = c(14,73) # 118 ts.plot(no2.t[,118]) II.eval = no2.t[,118] II_out = 54 # 621 ts.plot(o3.t[,621]) III.eval = as.numeric(o3.t[,621]) ################# HOURLY DATA #################### ################################################### ################# Data import #################### ################################################### # load raw data for Germany pm10.raw <- read.csv("PM10_hour.csv") # reorder data.frame # time days <- as.POSIXct(strptime(as.character(pm10.raw$start_date[1:365]), "%d-%m-%Y")) time <- rep(days, each=24) + rep(seq(3600, (24*3600), 3600), length(days)) # data per station pm10 <- NULL pm10.names <- NULL for(st in names(table(pm10.raw$id))){ sub <- subset(pm10.raw, id==st, select=c(5:28)) if(nrow(sub)==365){ # only proceed for complete years pm10 = cbind(pm10, as.vector(as.matrix(t(sub)))) pm10.names = c(pm10.names, st) } } # data.frame pm10.r <- data.frame(pm10) names(pm10.r) =pm10.names pm10.r$time = time ######################################################## ######## Find outliers in time series manually ######## ######################################################## ################ Plots ################# # plot timeseries for different stations # change the indices to look at different stations for(i in 270:300) { ts.plot(pm10.r[,i], ylab="PM10", main=paste(i,": ",names(pm10.r)[i],sep="")) } ##################################################### ######## Save test and evaluation data sets ######## ##################################################### #### I Outliers #### # selected data sets # 28 ts.plot(pm10.r[,28]) I.pm10.1 = pm10.r[,28] I.pm10.1_out = c(1980, 5683) # 13 ts.plot(pm10.r[,13]) I.pm10.2 = pm10.r[,13] I.pm10.2_out = c(1, 1981, 5399, 5422) # printable plots for I Sys.getlocale("LC_TIME") Sys.setlocale("LC_TIME", "English") plot(as.numeric(pm10.r[,28]), type="l", xaxt="n", xlab="", ylab=expression(paste(PM[10]," (",mu * g~m^-3,")")), main="Raw hourly data") axis(1, at=which(as.character(pm10.r$time, format="%d.%H")=="01.01"), cex.axis=0.8, las=2, labels=as.character(pm10.r$time[as.character(pm10.r$time, format="%d.%H")=="01.01"], format="%b.%y")) points(I.pm10.1_out, pm10.r[I.pm10.1_out,28], pch=19, col="red") plot(as.numeric(pm10.r[,13]), type="l", xaxt="n", xlab="", ylab=expression(paste(PM[10]," (",mu * g~m^-3,")")), main="Raw hourly data") axis(1, at=which(as.character(pm10.r$time, format="%d.%H")=="01.01"), cex.axis=0.8, las=2, labels=as.character(pm10.r$time[as.character(pm10.r$time, format="%d.%H")=="01.01"], format="%b.%y")) points(I.pm10.2_out, pm10.r[I.pm10.2_out,13], pch=19, col="red") #### IIb: transient change #### # selected data sets # 297 ts.plot(pm10.r[,297], ylim=c(0,50)) IIb.pm10 = pm10.r[,297] IIb.pm10_out = c(1975, 2223) # printable plots for IIb plot(as.numeric(pm10.r[,297]), type="l", xlim=c(1,24*30*5), ylim=c(0,50), xaxt="n", xlab="", ylab=expression(paste(PM[10]," (",mu * g~m^-3,")")), main="Raw hourly data") axis(1, at=which(as.character(pm10.r$time, format="%d.%H")=="01.01"), cex.axis=0.8, las=2, labels=as.character(pm10.r$time[as.character(pm10.r$time, format="%d.%H")=="01.01"], format="%b.%y")) points(IIb.pm10_out+c(-1,1), pm10.r[IIb.pm10_out+c(-1,1),297], pch=19, col="red") #### III: clean data sets #### # selected data sets # 310 ts.plot(pm10.r[,310]) III.pm10.1 = pm10.r[,310] # 250 ts.plot(pm10.r[,250]) III.pm10.2 = pm10.r[,250] #### Evaluation data #### # 140 ts.plot(pm10.r[,140]) I.r.eval = pm10.r[,140] I.r_out = c(1983, 2611, 2721, 3735, 4727) # 163 ts.plot(pm10.r[,163]) II.r.eval = pm10.r[,163] II.r.eval[!is.na(II.r.eval)&II.r.eval>100] = 50 II.r_out = c(1978, 2208) # 230 ts.plot(pm10.r[,230]) III.r.eval = pm10.r[,230] # Save data save.image("airbase.RData")