diff --git a/docs/make.jl b/docs/make.jl index 8a401aa..19114a3 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -6,12 +6,14 @@ Literate.markdown( "docs/src/tutorial1.jl","docs/src", execute = true, documenter = true, - # page already credits julia and Documenter; having an additional credit - # does not look nice + # We add the credit to Literate.jl the footer credit = false, ) -# remove datafile -rm("docs/src/sst.day.mean.2023.nc") + +if get(ENV, "CI", "false") == "true" + # remove datafile on CI + rm("docs/src/sst.day.mean.2023.nc") +end makedocs(; modules=[CommonDataModel], diff --git a/docs/src/index.md b/docs/src/index.md index 11bf7a5..639e625 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -39,7 +39,7 @@ Every struct deriving from `AbstractVariable` has the properties `dim`, and `att Current functionalities of CommonDataModel include: * virtually concatenating files along a given dimension * create a virtual subset (([`view`](@ref Base.view))) by indices or by values of coordinate variables ([`select`](@ref CommonDataModel.select), [`@select`](@ref CommonDataModel.@select)) -* group, map and reduce a variable ([`groupby`](@ref CommonDataModel.groupby), [`@groupby`](@ref CommonDataModel.@groupby)) +* group, map and reduce a variable ([`groupby`](@ref CommonDataModel.groupby), [`@groupby`](@ref CommonDataModel.@groupby), [`rolling`](@ref CommonDataModel.rolling)) diff --git a/src/CommonDataModel.jl b/src/CommonDataModel.jl index 7343d33..4b15f91 100644 --- a/src/CommonDataModel.jl +++ b/src/CommonDataModel.jl @@ -12,6 +12,7 @@ import Base: display, filter, getindex, + in, isopen, iterate, ndims, @@ -46,6 +47,7 @@ include("subvariable.jl") include("select.jl") include("aggregation.jl") include("groupby.jl") +include("rolling.jl") end # module CommonDataModel diff --git a/src/groupby.jl b/src/groupby.jl index 9567518..0478a7d 100644 --- a/src/groupby.jl +++ b/src/groupby.jl @@ -2,23 +2,31 @@ # types # -struct GroupedDataset{TDS,TF,TClass,TM,TRF} <: AbstractDataset +# mappings between groups and source data + +abstract type AbstractGroupMapping; end + +# non-overlapping groups +struct GroupMapping{TClass,TUC} <: AbstractGroupMapping where {TClass <: AbstractVector{T}, TUC <: Union{T,AbstractVector{T}}} where T + class::TClass + unique_class::TUC +end + +struct GroupedDataset{TDS,TF,TGM,TM,TRF} <: AbstractDataset ds::TDS # dataset coordname::Symbol group_fun::TF # mapping function - class::Vector{TClass} - unique_class::Vector{TClass} + groupmap::TGM map_fun::TM reduce_fun::TRF end -struct GroupedVariable{TV,TF,TClass,TM,TG} <: AbstractVector{TG} where TV <: AbstractArray{T,N} where {T,N} +struct GroupedVariable{TV,TF,TGM,TM,TG} <: AbstractVector{TG} where TV <: AbstractArray{T,N} where {T,N} v::TV # dataset coordname::Symbol # mapping function to create groups when applied to coordinate group_fun::TF - class::Vector{TClass} - unique_class::Vector{TClass} + groupmap::TGM dim::Int map_fun::TM end @@ -30,6 +38,55 @@ struct ReducedGroupedVariable{T,N,TGV,TF} <: AbstractVariable{T,N} reduce_fun::TF end + +function dataindices(gmap::GroupMapping,ku) + class_ku = gmap.unique_class[ku] + return findall(==(class_ku),gmap.class) +end + +function groupindices(gmap::GroupMapping,k) + # Types like Dates.Month behave different than normal scalars, e.g. + # julia> length(Dates.Month(1)) + # ERROR: MethodError: no method matching length(::Month) + + if gmap.unique_class isa DatePeriod + if gmap.unique_class == gmap.class[k] + return (1,) + else + return () + end + end + + for ku = 1:length(gmap.unique_class) + if gmap.unique_class[ku] == gmap.class[k] + return (ku,) + end + end + return () +end + +grouplabel(gmap::GroupMapping,ku) = gmap.unique_class[ku] + + +# Types like Dates.Month behave different than normal scalars, e.g. +# julia> length(Dates.Month(1)) +# ERROR: MethodError: no method matching length(::Month) + +_length(x) = length(x) +_length(x::DatePeriod) = 1 + +ngroups(gmap::GroupMapping) = _length(gmap.unique_class) +ndata(gmap::GroupMapping) = length(gmap.class) + + +function groupsubset(gmap::GroupMapping,kus) + return GroupMapping(gmap.class,gmap.unique_class[kus]) +end + +groupsubset(gmap::GroupMapping,kus::Colon) = gmap + +#-------------- + # helper function _indices_helper(j,ku,i,val) = () @@ -69,8 +126,7 @@ function variable(gds::GroupedDataset,varname::SymbolOrString) v, gds.coordname, gds.group_fun, - gds.class, - gds.unique_class, + gds.groupmap, dim, gds.map_fun) return ReducedGroupedVariable(gv,gds.reduce_fun) @@ -94,36 +150,30 @@ end Base.show(io::IO,gv::GroupedVariable) = Base.show(io,MIME"text/plain",gv) Base.ndims(gv::GroupedVariable) = 1 -Base.size(gv::GroupedVariable) = (length(gv.unique_class),) -Base.eltype(gv::GroupedVariable{TV,TF,TClass,TM,TG}) where {TV,TF,TClass,TM,TG} = TG - -function group(gv::GroupedVariable,k::Integer) - class_k = gv.unique_class[k] - indices = findall(==(class_k),gv.class) - return class_k, indices -end +Base.size(gv::GroupedVariable) = (ngroups(gv.groupmap),) +Base.eltype(gv::GroupedVariable{TV,TF,TGM,TM,TG}) where {TV,TF,TGM,TM,TG} = TG function Base.getindex(gv::GroupedVariable,k::Integer) class_k,indices = group(gv,k) return gv.map_fun(Array(selectdim(gv.v,gv.dim,indices))) end -# Types like Dates.Month behave different than normal scalars, e.g. -# julia> length(Dates.Month(1)) -# ERROR: MethodError: no method matching length(::Month) - -_val(x) = x -_val(x::DatePeriod) = Dates.value(x) +function group(gv::GroupedVariable,ku::Integer) + # make generic + class_ku = grouplabel(gv.groupmap,ku) + k = dataindices(gv.groupmap,ku) + return class_ku, k +end function _mapreduce(map_fun,reduce_op,gv::GroupedVariable{TV},indices; init = reduce(reduce_op,T[])) where TV <: AbstractArray{T,N} where {T,N} data = gv.v dim = findfirst(==(Symbol(gv.coordname)),Symbol.(dimnames(data))) - class = _val.(gv.class) - unique_class = _val.(gv.unique_class[indices[dim]]) group_fun = gv.group_fun - nclass = length(unique_class) + groupmap = groupsubset(gv.groupmap,indices[dim]) + + nclass = ngroups(groupmap) sz_all = ntuple(i -> (i == dim ? nclass : size(data,i) ),ndims(data)) sz = size_getindex(sz_all,indices...) @@ -132,13 +182,10 @@ function _mapreduce(map_fun,reduce_op,gv::GroupedVariable{TV},indices; count = zeros(Int,nclass) for k = 1:size(data,dim) - ku = findfirst(==(class[k]),unique_class) - if !isnothing(ku) + for ku in groupindices(groupmap,k) dest_ind = _dest_indices(dim,ku,indices) src_ind = ntuple(i -> (i == dim ? k : indices[i] ),ndims(data)) - #@show size(data_by_class),dest_ind, indices - #@show src_ind data_by_class_ind = view(data_by_class,dest_ind...) data_by_class_ind .= reduce_op.( @@ -156,11 +203,10 @@ end function _mapreduce_aggregation(map_fun,ag,gv::GroupedVariable{TV},indices) where TV <: AbstractArray{T,N} where {T,N} data = gv.v dim = findfirst(==(Symbol(gv.coordname)),Symbol.(dimnames(data))) - class = _val.(gv.class) - unique_class = _val.(gv.unique_class[indices[dim]]) group_fun = gv.group_fun + groupmap = groupsubset(gv.groupmap,indices[dim]) - nclass = length(unique_class) + nclass = ngroups(groupmap) sz_all = ntuple(i -> (i == dim ? nclass : size(data,i) ),ndims(data)) sz = size_getindex(sz_all,indices...) @@ -168,16 +214,11 @@ function _mapreduce_aggregation(map_fun,ag,gv::GroupedVariable{TV},indices) wher count = zeros(Int,nclass) for k = 1:size(data,dim) - ku = findfirst(==(class[k]),unique_class) - - if !isnothing(ku) + for ku in groupindices(groupmap,k) dest_ind = _dest_indices(dim,ku,indices) src_ind = ntuple(i -> (i == dim ? k : indices[i] ),ndims(data)) - #@show size(data_by_class),dest_ind, indices - #@show src_ind data_by_class_ind = view(data_by_class,dest_ind...) std_data_ind = map_fun(data[src_ind...]) - data_by_class_ind .= update.(data_by_class_ind,std_data_ind) end end @@ -209,7 +250,7 @@ function Base.similar(bc::Broadcasted{GroupedVariableStyle}, ::Type{ElType}) wh return A end -function Base.broadcasted(f,A::GroupedVariable{TV,TF,TClass,TM,TG}) where {TV,TF,TClass,TM,TG} +function Base.broadcasted(f,A::GroupedVariable{TV,TF,TGM,TM,TG}) where {TV,TF,TGM,TM,TG} # TODO change output TG map_fun = ∘(f,A.map_fun) @@ -220,19 +261,21 @@ function Base.broadcasted(f,A::GroupedVariable{TV,TF,TClass,TM,TG}) where {TV,TF #TG = Base.return_types(selectdim,(TV,Int,Int,))[1] TG2 = Base.return_types(ff,(TV,Int,Int,))[1] - GroupedVariable{TV,TF,TClass,TM2,TG2}( - A.v,A.coordname,A.group_fun,A.class,A.unique_class,A.dim,map_fun) + GroupedVariable{TV,TF,TGM,TM2,TG2}( + A.v,A.coordname,A.group_fun,A.groupmap,A.dim,map_fun) end -function GroupedVariable(v::TV,coordname,group_fun::TF,class,unique_class,dim,map_fun::TM) where TV <: AbstractVariable where {TF,TM} - TClass = eltype(class) +function GroupedVariable(v::TV,coordname,group_fun::TF,groupmap,dim,map_fun::TM) where TV <: AbstractVariable where {TF,TM} + TGM = typeof(groupmap) #TG = Base.return_types(selectdim,(TV,Int,Int,))[1] TG = Base.return_types(_array_selectdim,(TV,Int,Vector{Int}))[1] - @debug "inferred types" TV TF TClass TM TG - GroupedVariable{TV,TF,TClass,TM,TG}( - v,Symbol(coordname),group_fun,class,unique_class,dim,map_fun) + @debug "inferred types" TV TF TGM TM TG +# groupmap = GroupMapping(class,unique_class) + + GroupedVariable{TV,TF,TGM,TM,TG}( + v,Symbol(coordname),group_fun,groupmap,dim,map_fun) end @@ -303,7 +346,6 @@ monthly_anomalies = data .- mean(gv); close(ds) ``` - """ function groupby(v::AbstractVariable,(coordname,group_fun)::Pair{<:SymbolOrString,TF}) where TF # for NCDatasets 0.12 @@ -312,7 +354,8 @@ function groupby(v::AbstractVariable,(coordname,group_fun)::Pair{<:SymbolOrStrin unique_class = sort(unique(class)) dim = findfirst(==(Symbol(coordname)),Symbol.(dimnames(v))) map_fun = identity - return GroupedVariable(v,coordname,group_fun,class,unique_class,dim,map_fun) + groupmap = GroupMapping(class,unique_class) + return GroupedVariable(v,coordname,group_fun,groupmap,dim,map_fun) end """ @@ -365,7 +408,7 @@ end Base.ndims(gr::ReducedGroupedVariable) = ndims(gr.gv.v) Base.size(gr::ReducedGroupedVariable) = ntuple(ndims(gr)) do i if i == gr.gv.dim - length(gr.gv.unique_class) + length(gr.gv) else size(gr.gv.v,i) end @@ -408,10 +451,7 @@ function broadcasted_gvr!(C,f,A,B) gv = gr.gv dim = gr.gv.dim - unique_class = gv.unique_class - class = gv.class - - for k = 1:length(unique_class) + for k = 1:length(gv) class_k, indices = group(gv,k) selectdim(C,dim,indices) .= broadcast( @@ -493,7 +533,7 @@ function dataset(gr::ReducedGroupedVariable) return GroupedDataset( ds,gv.coordname,gv.group_fun, - gv.class,gv.unique_class, + gv.groupmap, gv.map_fun, gr.reduce_fun, ) diff --git a/src/rolling.jl b/src/rolling.jl new file mode 100644 index 0000000..f3764d2 --- /dev/null +++ b/src/rolling.jl @@ -0,0 +1,167 @@ +# rolling means (also called running means) and other reductions + + +struct OverlappingGroupMapping{T,TRC} <: AbstractGroupMapping + coord::Vector{T} + rolling_classes::TRC + groupindex_to_dataindex::Vector{Vector{Int}} + dataindex_to_groupindex::Vector{Vector{Int}} +end + + + +# multiple classes for overlapping groups +struct MClass{N,TC} + class::NTuple{N,TC} +end + +Base.length(mc::MClass) = 1 +Base.getindex(mc::MClass,i) = mc +Base.in(item,collection::MClass) = item in collection.class + +function OverlappingGroupMapping(coord::AbstractVector{T},rolling_classes) where T + # k is data index + # ku is the group index + + coord_to_dataindex = Dict{T,Int}() + + for k = 1:length(coord) + coord_to_dataindex[coord[k]] = k + end + + groupindex_to_dataindex = [Int[] for i in 1:length(rolling_classes)] + dataindex_to_groupindex = [Int[] for i in 1:length(coord)] + + for ku = 1:length(rolling_classes) + for class in rolling_classes[ku].class + k = get(coord_to_dataindex,class,nothing) + + if !isnothing(k) + push!(groupindex_to_dataindex[ku],k) + push!(dataindex_to_groupindex[k],ku) + end + end + end + + return OverlappingGroupMapping(coord,rolling_classes,groupindex_to_dataindex,dataindex_to_groupindex) +end + +dataindices(gmap::OverlappingGroupMapping,ku) = gmap.groupindex_to_dataindex[ku] +groupindices(gmap::OverlappingGroupMapping,k) = gmap.dataindex_to_groupindex[k] + +ngroups(gmap::OverlappingGroupMapping) = length(gmap.groupindex_to_dataindex) +ndata(gmap::OverlappingGroupMapping) = length(gmap.dataindex_to_groupindex) + +function groupsubset(gmap::OverlappingGroupMapping,kus) + OverlappingGroupMapping(gmap.coord,gmap.rolling_classes[kus]) +end + +groupsubset(gmap::OverlappingGroupMapping,kus::Colon) = gmap + +grouplabel(gmap::OverlappingGroupMapping,ku) = gmap.rolling_classes[ku].class + +#-------------- +""" + gv = CommonDataModel.rolling(v::AbstractVariable,:coordname => n) + gv = CommonDataModel.rolling(v::AbstractVariable,:coordname => rolling_classes) + +Create a grouped variable `gv` whose elements composed by all elements in `v` +grouped by a rolling window of length `n` along the coordinate variable `coordname`. +One can also specify a vector classes (`rolling_classes`) with as many elements +as they are groups and the elements coorespond to the values of the coordinate. +Unlike `CommonDataModel.groupby`, the groups defined by `rolling` can overlap. + +The grouped variable `gv` and be reduced using the functions `sum` `mean`, +`median`, `var` or `std`, for example `gr = mean(gv)`. +The result `gr` is a lazy structure representing the +outcome of these operations performed over the grouped dimension. Only when the +result `gr` is indexed the actually values are computed. + +Broadcasting for `gv` is overloaded. Broadcasting over all elements of +`gv` means that a mapping function is to be applied to all elements of `gv` +before a possible the reduction. + +This operations is also called "running mean" when using `mean` as reduction +function. + +Example: + +```julia +using NCDatasets, Dates +using CommonDataModel: rolling + +# create same test data + +time = DateTime(2000,1,1):Day(1):DateTime(2009,12,31); # 10 years +data = rand(Float32.(-9:99),360,180,length(time)); +fname = "test_file.nc" +ds = NCDataset(fname,"c"); +defVar(ds,"time",time,("time",)); +defVar(ds,"data",data,("lon","lat","time")); + +# running weekly mean + +gv = rolling(ds["data"],:time => 7) +length(gv) +# output 3653 as a mean will be compute for all time instance (including a +# partial mean and the beginning and the end) +size(gv[1]) +# output 360 x 180 x 4: the first time slice will only be a mean of 4 values +size(gv[4]) +# output 360 x 180 x 7: data from the first 7 days + +# compute basic statistics + +using Statistics +weekly_running_mean = mean(gv); +size(weekly_running_mean) +# 360 x 180 x 3653 array with the running weekly mean + +# get a regular julia array +weekly_running_mean_array = weekly_running_mean[:,:,:]; +typeof(weekly_running_mean_array) +# Array{Float32, 3} + +# computing a centred 3 monthly mean taking into account that month do not have the +# same length + +rolling_classes = [(t - Dates.Month(1)):Day(1):(t + Dates.Month(1)) for t in time] +extrema(length.(rolling_classes)) +# output (60, 63) +data_3monthly = mean(rolling(ds["data"],:time => rolling_classes))[:,:,:]; + +close(ds) +``` + +""" +function rolling(v,(coordname,rolling_classes)::Pair) + coord = v[coordname][:] + _rolling(v,coord,coordname,rolling_classes) +end + +function _rolling(v,coord,coordname::SymbolOrString,rolling_classes::AbstractVector) + rolling_classes_ = [MClass(tuple(r...)) for r in rolling_classes] + + groupmap = OverlappingGroupMapping(coord,rolling_classes_) + + group_fun = identity # still necessary? + dim = findfirst(==(Symbol(coordname)),Symbol.(dimnames(v))) + map_fun = identity + return GroupedVariable(v,coordname,group_fun,groupmap,dim,map_fun) +end + +function _rolling(v,coord,coordname::SymbolOrString,nn::Integer) + + # if nn = 7, n0:n1 = -3:3 + # if nn = 8, n0:n1 = -4:3 + # for even nn where are biased towards lower indices as in xarray + # for odd nn where are perfectly centred + + n0 = nn ÷ 2 + n1 = nn - n0 - 1 + + rolling_classes = [coord[max(n-n0,1):min(n+n1,length(coord))] + for n = 1:length(coord)] + + return _rolling(v,coord,coordname,rolling_classes) +end diff --git a/test/runtests.jl b/test/runtests.jl index f72ecec..1bce997 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -33,4 +33,5 @@ end @testset "groupby" begin include("test_groupby.jl") + include("test_rolling.jl") end diff --git a/test/test_groupby.jl b/test/test_groupby.jl index 7eb419e..0dd2580 100644 --- a/test/test_groupby.jl +++ b/test/test_groupby.jl @@ -84,6 +84,7 @@ for reduce_fun in (sum,mean,var,std,median,maximum) (:,:,2), ] + #@show reduce_fun,indices local data_by_class data_by_class = reduce_fun(groupby(ds[varname],:time => Dates.Month))[indices...] @test data_by_class ≈ data_by_class_ref[indices...] diff --git a/test/test_rolling.jl b/test/test_rolling.jl new file mode 100644 index 0000000..560f8ec --- /dev/null +++ b/test/test_rolling.jl @@ -0,0 +1,144 @@ +using Test +using Dates + +using CommonDataModel: + GroupMapping, + MClass, + OverlappingGroupMapping, + dataindices, + rolling, + groupindices, + ndata, + ngroups + +using Statistics + + +#include("memory_dataset.jl"); + +function test_mapping(gmap,coord,rolling_classes) + for k = 1:ndata(gmap) + for ku in groupindices(gmap,k) + @test time[k] in rolling_classes[ku] + end + end + + + for ku = 1:ngroups(gmap) + for k in dataindices(gmap,ku) + @test time[k] in rolling_classes[ku] + end + end +end + + +time = collect(DateTime(1980,1,1):Day(1):DateTime(1980,1,20)) +coordinate = time + +nn = 7 + +rolling_classes_ = [time[max(n-3,1):min(n+3,length(time))] for n = 1:length(time)] + +#rolling_classes_ = [time[max(n-3,1):min(n+3,length(time))] for n = [4,8]] +#rolling_classes_ = [time[2] .+ Day.(-3:3), time[6] .+ Day.(-3:3)] + + +rolling_classes = [MClass(tuple(r...)) for r in rolling_classes_] + +gmap = OverlappingGroupMapping(coordinate,rolling_classes) +test_mapping(gmap,coordinate,rolling_classes) + + + +time = collect(DateTime(1980,1,1):Day(1):DateTime(1982,1,20)) +class = Dates.Month.(time) +unique_class = sort(unique(class)) +gmap = GroupMapping(class,unique_class) + +for ku = 1:12 + @test all(Dates.Month.(time[dataindices(gmap,ku)]) .== Dates.Month(ku)) +end + +for k = 1:length(time) + @test all(unique_class[collect(groupindices(gmap,k))] .== Dates.Month(time[k])) +end + +# ---- +TDS = MemoryDataset + +time = DateTime(2000,1,1):Day(1):DateTime(2000,12,31); +data = rand(Float32.(-9:99),10,11,length(time)) +varname = "data" + +fname = tempname() + +data3 = Array{Union{Missing,Float32},3}(undef,size(data)) +data3 .= data +data3[1,1,:] .= missing +data3[1,2,1] = missing + +TDS(fname,"c") do ds + defVar(ds,"lon",1:size(data,1),("lon",)) + defVar(ds,"lat",1:size(data,2),("lat",)) + defVar(ds,"time",time,("time",)) + defVar(ds,"data",data,("lon","lat","time")) + defVar(ds,"data3",data3,("lon","lat","time")) +end + +ds = TDS(fname) + +# weekly averages +nn = 7 + +v = ds["data"] +gv = rolling(v,:time => 7) + +data_weekly_ref = similar(data); + +for reduce_fun in (sum,mean,var,std,median,maximum) + local gr + gr = reduce_fun(gv) + + for n = 1:length(time) + data_weekly_ref[:,:,n] = reduce_fun(data[:,:,max(n-3,1):min(n+3,length(time))],dims=3) + end + + for indices = [ + (:,:,:), + (1:3,2:5,:), + (1:3,2,:), + (2,1:3,:), + (1:3,2:6,2:6), + (:,:,2), + ] + + @test gr[indices...] ≈ data_weekly_ref[indices...] + end +end + + +# test data with missing values + +data3 = ds["data3"][:,:,:] +data_weekly_ref = similar(data3) +for n = 1:length(time) + data_weekly_ref[:,:,n] = mean(data3[:,:,max(n-3,1):min(n+3,length(time))],dims=3) +end + +gr = mean(rolling(ds["data3"],:time => 7))[:,:,:] + +@test ismissing.(gr) == ismissing.(data_weekly_ref) +@test collect(skipmissing(gr[:])) ≈ collect(skipmissing(data_weekly_ref[:])) + +rolling_classes = [(t - Dates.Month(1)):Day(1):(t + Dates.Month(1)) for t in time] +extrema(length.(rolling_classes)) +data_3monthly = mean(rolling(ds["data"],:time => rolling_classes))[:,:,:]; + + +data_3monthly_ref = similar(data) +for n = 1:length(time) + k = findall(in(rolling_classes[n]),time) + data_3monthly_ref[:,:,n] = mean(data[:,:,k],dims=3) +end + +@test data_3monthly_ref ≈ data_3monthly