-
Notifications
You must be signed in to change notification settings - Fork 0
/
clear-invalid-env-Penaflor.Rmd
105 lines (79 loc) · 3.4 KB
/
clear-invalid-env-Penaflor.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
In this notebook we look for the periods when the sensors were moved and, therefore, the logged data isn't valid.
```{r}
# import libraries
source('lib-dendro.R')
library(plotly)
library(datacleanr) # to find invalid periods (interactively)
library(readr) # for write_csv() function
library(tidyverse) # for %>%
library(glue)
# global variables
# Directory of this notebook. In an interactive RStudio session it is taken
# from the source editor; when the RStudio API is not available (e.g. the
# notebook is run with Rscript) fall back to the current working directory
# instead of failing.
# NOTE(review): the fallback assumes the process was started from the
# notebook's directory -- confirm for your rendering workflow.
PATH <- if (rstudioapi::isAvailable()) {
  dirname(rstudioapi::getSourceEditorContext()$path)
} else {
  getwd()
}
# Make the notebook directory the working directory for the rest of the run.
setwd(PATH)
# Site name, interpolated into the directory names below.
PLACE <- "Penaflor"
# Directory the environmental CSV files are read from.
BUFFER_ENV_DIR <- glue("processed/{PLACE}-env-buffer-toclear")
# Directory the cleaned results are written to.
OUTPUT_ENV_DIR <- glue("processed/{PLACE}-env-processed")
```
```{r}
# importing processed environmental data #
# `pattern` is a regular expression, not a shell glob: escape the dot and
# anchor at the end so that only file names ending in ".csv" are listed.
list_files <- list.files(
  file.path(PATH, BUFFER_ENV_DIR),
  pattern = "\\.csv$",
  full.names = TRUE
)
# read.all.env.processed() is not defined in this notebook; presumably it comes
# from lib-dendro.R (sourced above) and combines the listed files -- verify.
db.env <- read.all.env.processed(list_files)
# Print the structure of the loaded data for a quick sanity check.
str(db.env)
```
VWC makes it easier to see when a sensor has been moved (VWC equals 0 for a long period):
```{r}
# Interactive line plot of VWC against time, coloured by series.
f <- plot_ly(
  db.env,
  x = ~ts,
  y = ~vwc,
  color = ~series,
  type = "scatter",
  mode = "lines"
)
f
```
Defining and clearing the periods of invalid data. There is quite a lot of invalid data (timeframes where the sensor has been removed by an animal), and we need to remove it:
First, we remove all the data prior to installation:
```{r}
# Installation timestamp: rows logged before it are discarded.
ts_start <- "2022-03-15 12:30:00"
head(db.env)
# Build the row mask first. Rows whose comparison result is NA are dropped as
# well, which is what indexing through which() does.
keep_rows <- db.env$ts >= ts_start
db.env <- db.env[!is.na(keep_rows) & keep_rows, ]
head(db.env)
```
Then, we set the readings to NA wherever there's no valid data:
```{r}
# Blank out (set to NA) the four measurement columns for every invalid period,
# one series at a time. Each mask is a logical vector over the rows of db.env.

# Series "94231949": three invalid intervals (the last one is open-ended).
series1 <- db.env$series == "94231949"
interval1.1 <- db.env$ts >= "2022-04-08 20:30:00" &
  db.env$ts <= "2022-05-04 14:00:00"
interval1.2 <- db.env$ts >= "2022-09-11 09:45:00" &
  db.env$ts < "2022-11-24 14:45:00"
interval1.3 <- db.env$ts > "2023-09-30 20:45:00"
db.env[
  series1 & (interval1.1 | interval1.2 | interval1.3),
  c("vwc", "soil.temp", "surface.temp", "air.temp")
] <- NA

# Series "94231942": everything prior to its installation (interval2.0) plus
# one later interval.
series2 <- (db.env$series == "94231942")
interval2.0 <- (db.env$ts < "2022-03-29 10:15:00")
interval2.1 <- (db.env$ts >= "2022-06-29 20:45:00") &
  (db.env$ts <= "2022-09-16 11:00:00")
db.env[
  series2 & (interval2.0 | interval2.1),
  c("vwc", "soil.temp", "surface.temp", "air.temp")
] <- NA

# Series "94231947": a single invalid interval.
series3 <- (db.env$series == "94231947")
interval3.1 <- (db.env$ts >= "2023-07-04 09:30:00") &
  (db.env$ts < "2023-09-27 15:00:00")
db.env[
  series3 & interval3.1,
  c("vwc", "soil.temp", "surface.temp", "air.temp")
] <- NA
```
This is the result after setting the invalid periods to NA:
```{r}
# Same VWC-over-time plot as before, drawn from the cleaned data.
f <- plot_ly(
  db.env,
  x = ~ts,
  y = ~vwc,
  color = ~series,
  type = "scatter",
  mode = "lines"
)
f
```
Save the result:
```{r}
# Write the cleaned data to disk, then a per-timestamp mean over all series.
OUTPUT_PATH <- file.path(PATH, OUTPUT_ENV_DIR)
# recursive = TRUE also creates any missing parent directory.
if (!dir.exists(OUTPUT_PATH)) {
  dir.create(OUTPUT_PATH, recursive = TRUE)
}
write_csv(
  db.env,
  file.path(OUTPUT_PATH, "proc-env.csv"),
  append = FALSE,
  col_names = TRUE
)

# Do mean of all sensors
# na.rm = TRUE skips the readings that were set to NA; a timestamp at which
# every reading is NA yields NaN for that column.
db.agg <- db.env %>%
  # filter(ts < ymd_hms("2022-12-31 00:00:00")) %>%
  group_by(ts) %>%
  dplyr::summarise(
    soil.temp = mean(soil.temp, na.rm = TRUE),
    surface.temp = mean(surface.temp, na.rm = TRUE),
    air.temp = mean(air.temp, na.rm = TRUE),
    vwc = mean(vwc, na.rm = TRUE)
  )
summary(db.agg)

# write aggregated data to file.
agg_dir <- file.path(OUTPUT_PATH, "aggregated")
if (!dir.exists(agg_dir)) {
  dir.create(agg_dir, recursive = TRUE)
}
write_csv(
  db.agg,
  file.path(agg_dir, "proc-agg-env.csv"),
  append = FALSE,
  col_names = TRUE
)
```