From 039b0544a7dfc3e41fb3ac944e6a10cd1486ae63 Mon Sep 17 00:00:00 2001 From: "Viral B. Shah" Date: Wed, 7 Aug 2024 17:37:18 -0400 Subject: [PATCH 01/17] Create dependabot.yml (#3450) --- .github/dependabot.yml | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 .github/dependabot.yml diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 000000000..7b617025f --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,9 @@ +version: 2 +updates: + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "monthly" + labels: + - "dependencies" + - "no changelog" From 52d5a62ce6c4742b50fcc66478f0883142acf295 Mon Sep 17 00:00:00 2001 From: "Viral B. Shah" Date: Wed, 7 Aug 2024 17:38:13 -0400 Subject: [PATCH 02/17] Update ci.yml (#3449) Add mac aarch64 CI Update some action versions --- .github/workflows/ci.yml | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5bbb7f077..7e4f34bb7 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -22,13 +22,16 @@ jobs: - os: windows-latest version: '1' arch: x86 + - os: macos-latest + version: '1' + arch: aarch64 - os: ubuntu-latest version: 'nightly' arch: x64 allow_failure: true steps: - - uses: actions/checkout@v2 - - uses: julia-actions/setup-julia@v1 + - uses: actions/checkout@v4 + - uses: julia-actions/setup-julia@v2 with: version: ${{ matrix.version }} arch: ${{ matrix.arch }} @@ -38,14 +41,15 @@ jobs: env: JULIA_NUM_THREADS: 4,1 - uses: julia-actions/julia-processcoverage@v1 - - uses: codecov/codecov-action@v1 + - uses: codecov/codecov-action@v4 with: file: lcov.info + token: ${{ secrets.CODECOV_TOKEN }} docs: name: Documentation runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - uses: julia-actions/cache@v1 - uses: julia-actions/julia-buildpkg@latest - uses: julia-actions/julia-docdeploy@latest From 
bb633619d0e393e392386e8adc905c40d2124568 Mon Sep 17 00:00:00 2001 From: Cody Tapscott <84105208+topolarity@users.noreply.github.com> Date: Sat, 7 Sep 2024 06:35:00 -0400 Subject: [PATCH 03/17] Remove REPL dependency (#3459) --- Project.toml | 1 - src/DataFrames.jl | 2 +- src/other/index.jl | 22 +++++++++++++++++++++- 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/Project.toml b/Project.toml index f76100bdf..5ba3ac209 100644 --- a/Project.toml +++ b/Project.toml @@ -18,7 +18,6 @@ PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a" PrettyTables = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" -REPL = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" SortingAlgorithms = "a2af1166-a08f-5f64-846c-94a0d3cef48c" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" diff --git a/src/DataFrames.jl b/src/DataFrames.jl index debd309f5..c85f16a70 100644 --- a/src/DataFrames.jl +++ b/src/DataFrames.jl @@ -1,6 +1,6 @@ module DataFrames -using Statistics, Printf, REPL +using Statistics, Printf using Reexport, SortingAlgorithms, Compat, Unicode, PooledArrays @reexport using Missings, InvertedIndices using Base.Sort, Base.Order, Base.Iterators, Base.Threads diff --git a/src/other/index.jl b/src/other/index.jl index ae9358d38..61341b0c2 100644 --- a/src/other/index.jl +++ b/src/other/index.jl @@ -294,6 +294,26 @@ end @inline Base.getindex(x::AbstractIndex, rx::Regex) = getindex(x, filter(name -> occursin(rx, String(name)), _names(x))) +# Levenshtein Distance +# taken from https://github.com/JuliaLang/julia/blob/b5af119a6c608de43d6591a6c4129e9369239898/stdlib/REPL/src/docview.jl#L760-L776 +function _levenshtein(s1, s2) + a, b = collect(s1), collect(s2) + m = length(a) + n = length(b) + d = Matrix{Int}(undef, m+1, n+1) + + d[1:m+1, 1] = 0:m + d[1, 1:n+1] = 0:n + + for i = 1:m, j = 1:n + d[i+1,j+1] = min(d[i , j+1] + 1, + d[i+1, j ] + 1, + 
d[i , j ] + (a[i] != b[j])) + end + + return d[m+1, n+1] +end + # Fuzzy matching rules: # 1. ignore case # 2. maximum Levenshtein distance is 2 @@ -302,7 +322,7 @@ end # Returns candidates ordered by (distance, name) pair function fuzzymatch(l::Dict{Symbol, Int}, idx::Symbol) idxs = uppercase(string(idx)) - dist = [(REPL.levenshtein(uppercase(string(x)), idxs), x) for x in keys(l)] + dist = [(_levenshtein(uppercase(string(x)), idxs), x) for x in keys(l)] sort!(dist) c = [count(x -> x[1] <= i, dist) for i in 0:2] maxd = max(0, searchsortedlast(c, 8) - 1) From 97bbb40a9aa74a1517a420c0f906cb5b98d8cff3 Mon Sep 17 00:00:00 2001 From: sprig Date: Sat, 7 Sep 2024 03:38:25 -0700 Subject: [PATCH 04/17] Update filter docs, Fixes #3460 (#3461) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Update filter docs * Apply suggestions from code review --------- Co-authored-by: Bogumił Kamiński --- src/abstractdataframe/abstractdataframe.jl | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index e8f4e32ed..f549a07e8 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -1151,6 +1151,11 @@ data frames. function instead as it is consistent with other DataFrames.jl functions (as opposed to `filter`). +!!! note + + Due to type stability the `filter(cols => fun, df::AbstractDataFrame; view::Bool=false)` + call is preferred in performance critical applications. + $METADATA_FIXED See also: [`filter!`](@ref) @@ -1281,6 +1286,11 @@ data frames. function instead as it is consistent with other DataFrames.jl functions (as opposed to `filter!`). +!!! note + + Due to type stability the `filter!(cols => fun, df::AbstractDataFrame)` + call is preferred in performance critical applications. 
+ $METADATA_FIXED See also: [`filter`](@ref) From 1761261e432ec923f8750e929d986d398bb60d31 Mon Sep 17 00:00:00 2001 From: Daniel Rizk <124117406+drizk1@users.noreply.github.com> Date: Sat, 7 Sep 2024 06:48:34 -0400 Subject: [PATCH 05/17] Add TidierData to frameworks docs page (#3447) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add tidierdata to frameworks * adds TidierData to docs toml * change from begin end block * add @kdpsingh edits * Apply suggestions from code review --------- Co-authored-by: Bogumił Kamiński --- docs/Project.toml | 1 + docs/src/man/querying_frameworks.md | 139 ++++++++++++++++++++++++++++ 2 files changed, 140 insertions(+) diff --git a/docs/Project.toml b/docs/Project.toml index f6a9f940e..d821a4f08 100755 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -9,6 +9,7 @@ Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" Query = "1a8c2f83-1ff3-5112-b086-8aa67b057ba1" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" +TidierData = "fe2206b3-d496-4ee9-a338-6a095c4ece80" [compat] Documenter = "1" diff --git a/docs/src/man/querying_frameworks.md b/docs/src/man/querying_frameworks.md index abda7ec6f..dad7471b2 100644 --- a/docs/src/man/querying_frameworks.md +++ b/docs/src/man/querying_frameworks.md @@ -8,6 +8,145 @@ DataFramesMeta.jl, DataFrameMacros.jl and Query.jl. They implement a functionali These frameworks are designed both to make it easier for new users to start working with data frames in Julia and to allow advanced users to write more compact code. +## TidierData.jl +[TidierData.jl](https://tidierorg.github.io/TidierData.jl/latest/), part of +the [Tidier](https://tidierorg.github.io/Tidier.jl/dev/) ecosystem, is a macro-based +data analysis interface that wraps DataFrames.jl. The instructions below are for version +0.16.0 of TidierData.jl. 
+ +First, install the TidierData.jl package: + +```julia +using Pkg +Pkg.add("TidierData") +``` + +TidierData.jl enables clean, readable, and fast code for all major data transformation +functions including +[aggregating](https://tidierorg.github.io/TidierData.jl/latest/examples/generated/UserGuide/summarize/), +[pivoting](https://tidierorg.github.io/TidierData.jl/latest/examples/generated/UserGuide/pivots/), +[nesting](https://tidierorg.github.io/TidierData.jl/latest/examples/generated/UserGuide/nesting/), +and [joining](https://tidierorg.github.io/TidierData.jl/latest/examples/generated/UserGuide/joins/) +data frames. TidierData re-exports `DataFrame` from DataFrames.jl, `@chain` from Chain.jl, and +Statistics.jl to streamline data operations. + +TidierData.jl is heavily inspired by the `dplyr` and `tidyr` R packages (part of the R +`tidyverse`), which it aims to implement using pure Julia by wrapping DataFrames.jl. While +TidierData.jl borrows conventions from the `tidyverse`, it is important to note that the +`tidyverse` itself is often not considered idiomatic R code. TidierData.jl brings +data analysis conventions from `tidyverse` into Julia to have the best of both worlds: +tidy syntax and the speed and flexibility of the Julia language. + +TidierData.jl has two major differences from other macro-based packages. First, TidierData.jl +uses tidy expressions. An example of a tidy expression is `a = mean(b)`, where `b` refers +to an existing column in the data frame, and `a` refers to either a new or existing column. +Referring to variables outside of the data frame requires prefixing variables with `!!`. +For example, `a = mean(!!b)` refers to a variable `b` outside the data frame. Second, +TidierData.jl aims to make broadcasting mostly invisible through +[auto-vectorization](https://tidierorg.github.io/TidierData.jl/latest/examples/generated/UserGuide/autovec/). 
TidierData.jl currently uses a lookup table to decide which functions not to +vectorize; all other functions are automatically vectorized. This allows for +writing of concise expressions: `@mutate(df, a = a - mean(a))` transforms the `a` column +by subtracting the mean of the column from each value. Behind the scenes, the right-hand +expression is converted to `a .- mean(a)` because `mean()` is in the lookup table as a +function that should not be vectorized. Take a look at the +[auto-vectorization](https://tidierorg.github.io/TidierData.jl/latest/examples/generated/UserGuide/autovec/) documentation for details. + +One major benefit of combining tidy expressions with auto-vectorization is that +TidierData.jl code (which uses DataFrames.jl as its backend) can work directly on +databases using [TidierDB.jl](https://github.com/TidierOrg/TidierDB.jl), +which converts tidy expressions into SQL, supporting DuckDB and several other backends. + +```jldoctest tidierdata +julia> using TidierData + +julia> df = DataFrame( + name = ["John", "Sally", "Roger"], + age = [54.0, 34.0, 79.0], + children = [0, 2, 4] + ) +3×3 DataFrame + Row │ name age children + │ String Float64 Int64 +─────┼─────────────────────────── + 1 │ John 54.0 0 + 2 │ Sally 34.0 2 + 3 │ Roger 79.0 4 + +julia> @chain df begin + @filter(children != 2) + @select(name, num_children = children) + end +2×2 DataFrame + Row │ name num_children + │ String Int64 +─────┼────────────────────── + 1 │ John 0 + 2 │ Roger 4 +``` + +Below are examples showcasing `@group_by` with `@summarize` or `@mutate` - analogous to the split, apply, combine pattern. 
+ +```jldoctest tidierdata +julia> df = DataFrame( + groups = repeat('a':'e', inner = 2), + b_col = 1:10, + c_col = 11:20, + d_col = 111:120 + ) +10×4 DataFrame + Row │ groups b_col c_col d_col + │ Char Int64 Int64 Int64 +─────┼───────────────────────────── + 1 │ a 1 11 111 + 2 │ a 2 12 112 + 3 │ b 3 13 113 + 4 │ b 4 14 114 + 5 │ c 5 15 115 + 6 │ c 6 16 116 + 7 │ d 7 17 117 + 8 │ d 8 18 118 + 9 │ e 9 19 119 + 10 │ e 10 20 120 + +julia> @chain df begin + @filter(b_col > 2) + @group_by(groups) + @summarise(median_b = median(b_col), + across((b_col:d_col), mean)) + end +4×5 DataFrame + Row │ groups median_b b_col_mean c_col_mean d_col_mean + │ Char Float64 Float64 Float64 Float64 +─────┼────────────────────────────────────────────────────── + 1 │ b 3.5 3.5 13.5 113.5 + 2 │ c 5.5 5.5 15.5 115.5 + 3 │ d 7.5 7.5 17.5 117.5 + 4 │ e 9.5 9.5 19.5 119.5 + +julia> @chain df begin + @filter(b_col > 4 && c_col <= 18) + @group_by(groups) + @mutate( + new_col = b_col + maximum(d_col), + new_col2 = c_col - maximum(d_col), + new_col3 = case_when(c_col >= 18 => "high", + c_col > 15 => "medium", + true => "low")) + @select(starts_with("new")) + @ungroup # required because `@mutate` does not ungroup + end +4×4 DataFrame + Row │ groups new_col new_col2 new_col3 + │ Char Int64 Int64 String +─────┼───────────────────────────────────── + 1 │ c 121 -101 low + 2 │ c 122 -100 medium + 3 │ d 125 -101 medium + 4 │ d 126 -100 high +``` + +For more examples, please visit the [TidierData.jl](https://tidierorg.github.io/TidierData.jl/latest/) documentation. 
+ ## DataFramesMeta.jl The [DataFramesMeta.jl](https://github.com/JuliaStats/DataFramesMeta.jl) package From 96839313f523e98b894459f4bba959c21febd7f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 8 Sep 2024 10:50:16 +0200 Subject: [PATCH 06/17] fix tests on nightly and 32-bit (#3463) --- src/groupeddataframe/complextransforms.jl | 8 ++++++-- test/io.jl | 6 +++--- test/select.jl | 2 +- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/src/groupeddataframe/complextransforms.jl b/src/groupeddataframe/complextransforms.jl index 0f7fea661..1a907de6f 100644 --- a/src/groupeddataframe/complextransforms.jl +++ b/src/groupeddataframe/complextransforms.jl @@ -269,8 +269,12 @@ function _combine_rows_with_first!((firstrow,)::Ref{Any}, # Create up to one task per thread # This has lower overhead than creating one task per group, # but is optimal only if operations take roughly the same time for all groups - basesize = max(1, cld(len - 1, Threads.nthreads())) - partitions = Iterators.partition(2:len, basesize) + if isthreadsafe(outcols, incols) + basesize = max(1, cld(len - 1, Threads.nthreads())) + partitions = Iterators.partition(2:len, basesize) + else + partitions = (2:len,) + end widen_type_lock = ReentrantLock() outcolsref = Ref{NTuple{<:Any, AbstractVector}}(outcols) type_widened = fill(false, length(partitions)) diff --git a/test/io.jl b/test/io.jl index dee15ea7c..b566c655e 100644 --- a/test/io.jl +++ b/test/io.jl @@ -760,7 +760,7 @@ end df = DataFrame( A=Int64[1,4,9,16,25,36,49,64], B = [ - md"[DataFrames.jl](http://juliadata.github.io/DataFrames.jl)", + md"ABC", md"``\frac{x^2}{x^2+y^2}``", md"`Header`", md"This is *very*, **very**, very, very, very, very, very, very, very long line" , @@ -781,7 +781,7 @@ end Row │ A B │ Int64 MD ─────┼────────────────────────────────────────── - 1 │ 1 DataFrames.jl (http://juliadat… + 1 │ 1 ABC 2 │ 4 \\frac{x^2}{x^2+y^2} 3 │ 9 Header 4 │ 16 This is very, very, very, very… @@ -793,7 
+793,7 @@ end @test sprint(show, "text/csv", df) == """ \"A\",\"B\" - 1,\"[DataFrames.jl](http://juliadata.github.io/DataFrames.jl)\" + 1,\"ABC\" 4,\"\$\\\\frac{x^2}{x^2+y^2}\$\" 9,\"`Header`\" 16,\"This is *very*, **very**, very, very, very, very, very, very, very long line\" diff --git a/test/select.jl b/test/select.jl index 3a8ad3b23..02d84d4b9 100644 --- a/test/select.jl +++ b/test/select.jl @@ -3039,7 +3039,7 @@ end @test size(combine(df, :a => (x -> Any[]) => AsTable)) == (0, 0) df2 = combine(df, :a => (x -> NamedTuple{(:x,),Tuple{Int64}}[]) => AsTable) @test size(df2) == (0, 1) - @test eltype(df2.x) === Int + @test eltype(df2.x) === Int64 end end # module From 5cd46161227b13ec74be05d149d4b4428531af73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Wed, 11 Sep 2024 13:41:28 +0200 Subject: [PATCH 07/17] move code from abstractdataframe.jl to iteration.jl --- src/abstractdataframe/abstractdataframe.jl | 282 --------------------- src/abstractdataframe/iteration.jl | 282 +++++++++++++++++++++ 2 files changed, 282 insertions(+), 282 deletions(-) diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index f549a07e8..01f3ba304 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -1814,288 +1814,6 @@ Base.vcat(dfs::AbstractDataFrame...; Pair{<:SymbolOrString, <:AbstractVector}}=nothing) = reduce(vcat, dfs; cols=cols, source=source) -""" - reduce(::typeof(vcat), - dfs::Union{AbstractVector{<:AbstractDataFrame}, - Tuple{AbstractDataFrame, Vararg{AbstractDataFrame}}}; - cols::Union{Symbol, AbstractVector{Symbol}, - AbstractVector{<:AbstractString}}=:setequal, - source::Union{Nothing, Symbol, AbstractString, - Pair{<:Union{Symbol, AbstractString}, <:AbstractVector}}=nothing, - init::AbstractDataFrame=DataFrame()) - -Efficiently reduce the given vector or tuple of `AbstractDataFrame`s with -`vcat`. 
- -See the [`vcat`](@ref) docstring for a description of keyword arguments `cols` -and `source`. - -The keyword argument `init` is the initial value to use in the reductions. -It must be a data frame that has zero rows. It is not taken into account when -computing the value of the `source` column nor when determining metadata -of the produced data frame. - -The column order, names, and types of the resulting `DataFrame`, and the -behavior of `cols` and `source` keyword arguments follow the rules specified for -[`vcat`](@ref) of `AbstractDataFrame`s. - -Metadata: `vcat` propagates table-level `:note`-style metadata for keys that are -present in all passed data frames and have the same value. `vcat` propagates -column-level `:note`-style metadata for keys that are present in all passed data -frames that contain this column and have the same value. - -# Example -```jldoctest -julia> df1 = DataFrame(A=1:3, B=1:3) -3×2 DataFrame - Row │ A B - │ Int64 Int64 -─────┼────────────── - 1 │ 1 1 - 2 │ 2 2 - 3 │ 3 3 - -julia> df2 = DataFrame(A=4:6, B=4:6) -3×2 DataFrame - Row │ A B - │ Int64 Int64 -─────┼────────────── - 1 │ 4 4 - 2 │ 5 5 - 3 │ 6 6 - -julia> df3 = DataFrame(A=7:9, C=7:9) -3×2 DataFrame - Row │ A C - │ Int64 Int64 -─────┼────────────── - 1 │ 7 7 - 2 │ 8 8 - 3 │ 9 9 - -julia> reduce(vcat, (df1, df2)) -6×2 DataFrame - Row │ A B - │ Int64 Int64 -─────┼────────────── - 1 │ 1 1 - 2 │ 2 2 - 3 │ 3 3 - 4 │ 4 4 - 5 │ 5 5 - 6 │ 6 6 - -julia> reduce(vcat, [df1, df2, df3], cols=:union, source=:source) -9×4 DataFrame - Row │ A B C source - │ Int64 Int64? Int64? 
Int64 -─────┼───────────────────────────────── - 1 │ 1 1 missing 1 - 2 │ 2 2 missing 1 - 3 │ 3 3 missing 1 - 4 │ 4 4 missing 2 - 5 │ 5 5 missing 2 - 6 │ 6 6 missing 2 - 7 │ 7 missing 7 3 - 8 │ 8 missing 8 3 - 9 │ 9 missing 9 3 -``` -""" -function Base.reduce(::typeof(vcat), - dfs::Union{AbstractVector{<:AbstractDataFrame}, - Tuple{AbstractDataFrame, Vararg{AbstractDataFrame}}}; - cols::Union{Symbol, AbstractVector{Symbol}, - AbstractVector{<:AbstractString}}=:setequal, - source::Union{Nothing, SymbolOrString, - Pair{<:SymbolOrString, <:AbstractVector}}=nothing, - init::AbstractDataFrame=DataFrame()) - if nrow(init) > 0 - throw(ArgumentError("init data frame must have zero rows")) - end - dfs_init = AbstractDataFrame[emptycolmetadata!(copy(init))] - append!(dfs_init, dfs) - res = _vcat(AbstractDataFrame[df for df in dfs_init if ncol(df) != 0]; cols=cols) - # only handle table-level metadata, as column-level metadata was done in _vcat - _merge_matching_table_note_metadata!(res, dfs) - - if source !== nothing - len = length(dfs) - if source isa SymbolOrString - col, vals = source, 1:len - else - @assert source isa Pair{<:SymbolOrString, <:AbstractVector} - col, vals = source - end - - if columnindex(res, col) > 0 - idx = findfirst(df -> columnindex(df, col) > 0, dfs) - @assert idx !== nothing - throw(ArgumentError("source column name :$col already exists in data frame " * - " passed in position $idx")) - end - - if len != length(vals) - throw(ArgumentError("number of passed source identifiers ($(length(vals)))" * - "does not match the number of data frames ($len)")) - end - - source_vec = Tables.allocatecolumn(eltype(vals), nrow(res)) - @assert firstindex(source_vec) == 1 && lastindex(source_vec) == nrow(res) - start = 1 - for (v, df) in zip(vals, dfs) - stop = start + nrow(df) - 1 - source_vec[start:stop] .= Ref(v) - start = stop + 1 - end - - @assert start == nrow(res) + 1 - insertcols!(res, col => source_vec) - end - - return res -end - -# definition needed to 
avoid dispatch ambiguity -Base.reduce(::typeof(vcat), - dfs::SentinelArrays.ChainedVector{T, A} where {T<:AbstractDataFrame, - A<:AbstractVector{T}}; - cols::Union{Symbol, AbstractVector{Symbol}, - AbstractVector{<:AbstractString}}=:setequal, - source::Union{Nothing, SymbolOrString, - Pair{<:SymbolOrString, <:AbstractVector}}=nothing, - init::AbstractDataFrame=DataFrame()) = - reduce(vcat, collect(AbstractDataFrame, dfs), cols=cols, source=source, init=init) - -function _vcat(dfs::AbstractVector{AbstractDataFrame}; - cols::Union{Symbol, AbstractVector{Symbol}, - AbstractVector{<:AbstractString}}=:setequal) - # note that empty DataFrame() objects are dropped from dfs before we call _vcat - if isempty(dfs) - cols isa Symbol && return DataFrame() - return DataFrame([col => Missing[] for col in cols]) - end - # Array of all headers - allheaders = map(names, dfs) - # Array of unique headers across all data frames - uniqueheaders = unique(allheaders) - # All symbols present across all headers - unionunique = union(uniqueheaders...) - # List of symbols present in all dataframes - intersectunique = intersect(uniqueheaders...) - - if cols === :orderequal - header = unionunique - if length(uniqueheaders) > 1 - throw(ArgumentError("when `cols=:orderequal` all data frames need to " * - "have the same column names and be in the same order")) - end - elseif cols === :setequal || cols === :equal - # an explicit error is thrown as :equal was supported in the past - if cols === :equal - throw(ArgumentError("`cols=:equal` is not supported. 
" * - "Use `:setequal` instead.")) - end - - header = unionunique - coldiff = setdiff(unionunique, intersectunique) - - if !isempty(coldiff) - # if any DataFrames are a full superset of names, skip them - let header=header # julia #15276 - filter!(u -> !issetequal(u, header), uniqueheaders) - end - estrings = map(enumerate(uniqueheaders)) do (i, head) - matching = findall(h -> head == h, allheaders) - headerdiff = setdiff(coldiff, head) - badcols = join(headerdiff, ", ", " and ") - args = join(matching, ", ", " and ") - return "column(s) $badcols are missing from argument(s) $args" - end - throw(ArgumentError(join(estrings, ", ", ", and "))) - end - elseif cols === :intersect - header = intersectunique - elseif cols === :union - header = unionunique - elseif cols isa Symbol - throw(ArgumentError("Invalid `cols` value :$cols. " * - "Only `:orderequal`, `:setequal`, `:intersect`, " * - "`:union`, or a vector of column names is allowed.")) - elseif cols isa AbstractVector{Symbol} - header = cols - else - @assert cols isa AbstractVector{<:AbstractString} - header = Symbol.(cols) - end - - if isempty(header) - out_df = DataFrame() - else - all_cols = Vector{AbstractVector}(undef, length(header)) - for (i, name) in enumerate(header) - newcols = map(dfs) do df - if hasproperty(df, name) - return df[!, name] - else - Iterators.repeated(missing, nrow(df)) - end - end - - lens = map(length, newcols) - T = mapreduce(eltype, promote_type, newcols) - all_cols[i] = Tables.allocatecolumn(T, sum(lens)) - offset = 1 - for j in 1:length(newcols) - copyto!(all_cols[i], offset, newcols[j]) - offset += lens[j] - end - end - - out_df = DataFrame(all_cols, header, copycols=false) - end - - # here we process column-level metadata, table-level metadata is processed in reduce - - # first check if all data frames do not have column-level metadata - # in which case we do not have to do anything - all(df -> getfield(parent(df), :colmetadata) === nothing, dfs) && return out_df - - for colname 
in _names(out_df) - if length(dfs) == 1 - df1 = dfs[1] - hasproperty(df1, colname) && _copy_col_note_metadata!(out_df, colname, df1, colname) - else - start = findfirst(x -> hasproperty(x, colname), dfs) - start === nothing && continue - df_start = dfs[start] - for key_start in colmetadatakeys(df_start, colname) - meta_val_start, meta_style_start = colmetadata(df_start, colname, key_start, style=true) - if meta_style_start === :note - good_key = true - for i in start+1:length(dfs) - dfi = dfs[i] - if hasproperty(dfi, colname) - if key_start in colmetadatakeys(dfi, colname) - meta_vali, meta_stylei = colmetadata(dfi, colname, key_start, style=true) - if !(meta_stylei === :note && isequal(meta_val_start, meta_vali)) - good_key = false - break - end - else - good_key = false - break - end - end - end - good_key && colmetadata!(out_df, colname, key_start, meta_val_start, style=:note) - end - end - end - end - - return out_df -end - """ repeat(df::AbstractDataFrame; inner::Integer = 1, outer::Integer = 1) diff --git a/src/abstractdataframe/iteration.jl b/src/abstractdataframe/iteration.jl index 22589d3c8..07c6c75fb 100644 --- a/src/abstractdataframe/iteration.jl +++ b/src/abstractdataframe/iteration.jl @@ -597,3 +597,285 @@ function mapcols!(f::Union{Function,Type}, df::DataFrame; cols=All()) _drop_all_nonnote_metadata!(df) return df end + +""" + reduce(::typeof(vcat), + dfs::Union{AbstractVector{<:AbstractDataFrame}, + Tuple{AbstractDataFrame, Vararg{AbstractDataFrame}}}; + cols::Union{Symbol, AbstractVector{Symbol}, + AbstractVector{<:AbstractString}}=:setequal, + source::Union{Nothing, Symbol, AbstractString, + Pair{<:Union{Symbol, AbstractString}, <:AbstractVector}}=nothing, + init::AbstractDataFrame=DataFrame()) + +Efficiently reduce the given vector or tuple of `AbstractDataFrame`s with +`vcat`. + +See the [`vcat`](@ref) docstring for a description of keyword arguments `cols` +and `source`. 
+ +The keyword argument `init` is the initial value to use in the reductions. +It must be a data frame that has zero rows. It is not taken into account when +computing the value of the `source` column nor when determining metadata +of the produced data frame. + +The column order, names, and types of the resulting `DataFrame`, and the +behavior of `cols` and `source` keyword arguments follow the rules specified for +[`vcat`](@ref) of `AbstractDataFrame`s. + +Metadata: `vcat` propagates table-level `:note`-style metadata for keys that are +present in all passed data frames and have the same value. `vcat` propagates +column-level `:note`-style metadata for keys that are present in all passed data +frames that contain this column and have the same value. + +# Example +```jldoctest +julia> df1 = DataFrame(A=1:3, B=1:3) +3×2 DataFrame + Row │ A B + │ Int64 Int64 +─────┼────────────── + 1 │ 1 1 + 2 │ 2 2 + 3 │ 3 3 + +julia> df2 = DataFrame(A=4:6, B=4:6) +3×2 DataFrame + Row │ A B + │ Int64 Int64 +─────┼────────────── + 1 │ 4 4 + 2 │ 5 5 + 3 │ 6 6 + +julia> df3 = DataFrame(A=7:9, C=7:9) +3×2 DataFrame + Row │ A C + │ Int64 Int64 +─────┼────────────── + 1 │ 7 7 + 2 │ 8 8 + 3 │ 9 9 + +julia> reduce(vcat, (df1, df2)) +6×2 DataFrame + Row │ A B + │ Int64 Int64 +─────┼────────────── + 1 │ 1 1 + 2 │ 2 2 + 3 │ 3 3 + 4 │ 4 4 + 5 │ 5 5 + 6 │ 6 6 + +julia> reduce(vcat, [df1, df2, df3], cols=:union, source=:source) +9×4 DataFrame + Row │ A B C source + │ Int64 Int64? Int64? 
Int64 +─────┼───────────────────────────────── + 1 │ 1 1 missing 1 + 2 │ 2 2 missing 1 + 3 │ 3 3 missing 1 + 4 │ 4 4 missing 2 + 5 │ 5 5 missing 2 + 6 │ 6 6 missing 2 + 7 │ 7 missing 7 3 + 8 │ 8 missing 8 3 + 9 │ 9 missing 9 3 +``` +""" +function Base.reduce(::typeof(vcat), + dfs::Union{AbstractVector{<:AbstractDataFrame}, + Tuple{AbstractDataFrame,Vararg{AbstractDataFrame}}}; + cols::Union{Symbol,AbstractVector{Symbol}, + AbstractVector{<:AbstractString}}=:setequal, + source::Union{Nothing,SymbolOrString, + Pair{<:SymbolOrString,<:AbstractVector}}=nothing, + init::AbstractDataFrame=DataFrame()) + if nrow(init) > 0 + throw(ArgumentError("init data frame must have zero rows")) + end + dfs_init = AbstractDataFrame[emptycolmetadata!(copy(init))] + append!(dfs_init, dfs) + res = _vcat(AbstractDataFrame[df for df in dfs_init if ncol(df) != 0]; cols=cols) + # only handle table-level metadata, as column-level metadata was done in _vcat + _merge_matching_table_note_metadata!(res, dfs) + + if source !== nothing + len = length(dfs) + if source isa SymbolOrString + col, vals = source, 1:len + else + @assert source isa Pair{<:SymbolOrString,<:AbstractVector} + col, vals = source + end + + if columnindex(res, col) > 0 + idx = findfirst(df -> columnindex(df, col) > 0, dfs) + @assert idx !== nothing + throw(ArgumentError("source column name :$col already exists in data frame " * + " passed in position $idx")) + end + + if len != length(vals) + throw(ArgumentError("number of passed source identifiers ($(length(vals)))" * + "does not match the number of data frames ($len)")) + end + + source_vec = Tables.allocatecolumn(eltype(vals), nrow(res)) + @assert firstindex(source_vec) == 1 && lastindex(source_vec) == nrow(res) + start = 1 + for (v, df) in zip(vals, dfs) + stop = start + nrow(df) - 1 + source_vec[start:stop] .= Ref(v) + start = stop + 1 + end + + @assert start == nrow(res) + 1 + insertcols!(res, col => source_vec) + end + + return res +end + +# definition needed to avoid 
dispatch ambiguity +Base.reduce(::typeof(vcat), + dfs::SentinelArrays.ChainedVector{T,A} where {T<:AbstractDataFrame, + A<:AbstractVector{T}}; + cols::Union{Symbol,AbstractVector{Symbol}, + AbstractVector{<:AbstractString}}=:setequal, + source::Union{Nothing,SymbolOrString, + Pair{<:SymbolOrString,<:AbstractVector}}=nothing, + init::AbstractDataFrame=DataFrame()) = + reduce(vcat, collect(AbstractDataFrame, dfs), cols=cols, source=source, init=init) + +function _vcat(dfs::AbstractVector{AbstractDataFrame}; + cols::Union{Symbol,AbstractVector{Symbol}, + AbstractVector{<:AbstractString}}=:setequal) + # note that empty DataFrame() objects are dropped from dfs before we call _vcat + if isempty(dfs) + cols isa Symbol && return DataFrame() + return DataFrame([col => Missing[] for col in cols]) + end + # Array of all headers + allheaders = map(names, dfs) + # Array of unique headers across all data frames + uniqueheaders = unique(allheaders) + # All symbols present across all headers + unionunique = union(uniqueheaders...) + # List of symbols present in all dataframes + intersectunique = intersect(uniqueheaders...) + + if cols === :orderequal + header = unionunique + if length(uniqueheaders) > 1 + throw(ArgumentError("when `cols=:orderequal` all data frames need to " * + "have the same column names and be in the same order")) + end + elseif cols === :setequal || cols === :equal + # an explicit error is thrown as :equal was supported in the past + if cols === :equal + throw(ArgumentError("`cols=:equal` is not supported. 
" * + "Use `:setequal` instead.")) + end + + header = unionunique + coldiff = setdiff(unionunique, intersectunique) + + if !isempty(coldiff) + # if any DataFrames are a full superset of names, skip them + let header = header # julia #15276 + filter!(u -> !issetequal(u, header), uniqueheaders) + end + estrings = map(enumerate(uniqueheaders)) do (i, head) + matching = findall(h -> head == h, allheaders) + headerdiff = setdiff(coldiff, head) + badcols = join(headerdiff, ", ", " and ") + args = join(matching, ", ", " and ") + return "column(s) $badcols are missing from argument(s) $args" + end + throw(ArgumentError(join(estrings, ", ", ", and "))) + end + elseif cols === :intersect + header = intersectunique + elseif cols === :union + header = unionunique + elseif cols isa Symbol + throw(ArgumentError("Invalid `cols` value :$cols. " * + "Only `:orderequal`, `:setequal`, `:intersect`, " * + "`:union`, or a vector of column names is allowed.")) + elseif cols isa AbstractVector{Symbol} + header = cols + else + @assert cols isa AbstractVector{<:AbstractString} + header = Symbol.(cols) + end + + if isempty(header) + out_df = DataFrame() + else + all_cols = Vector{AbstractVector}(undef, length(header)) + for (i, name) in enumerate(header) + newcols = map(dfs) do df + if hasproperty(df, name) + return df[!, name] + else + Iterators.repeated(missing, nrow(df)) + end + end + + lens = map(length, newcols) + T = mapreduce(eltype, promote_type, newcols) + all_cols[i] = Tables.allocatecolumn(T, sum(lens)) + offset = 1 + for j in 1:length(newcols) + copyto!(all_cols[i], offset, newcols[j]) + offset += lens[j] + end + end + + out_df = DataFrame(all_cols, header, copycols=false) + end + + # here we process column-level metadata, table-level metadata is processed in reduce + + # first check if all data frames do not have column-level metadata + # in which case we do not have to do anything + all(df -> getfield(parent(df), :colmetadata) === nothing, dfs) && return out_df + + for colname 
in _names(out_df) + if length(dfs) == 1 + df1 = dfs[1] + hasproperty(df1, colname) && _copy_col_note_metadata!(out_df, colname, df1, colname) + else + start = findfirst(x -> hasproperty(x, colname), dfs) + start === nothing && continue + df_start = dfs[start] + for key_start in colmetadatakeys(df_start, colname) + meta_val_start, meta_style_start = colmetadata(df_start, colname, key_start, style=true) + if meta_style_start === :note + good_key = true + for i in start+1:length(dfs) + dfi = dfs[i] + if hasproperty(dfi, colname) + if key_start in colmetadatakeys(dfi, colname) + meta_vali, meta_stylei = colmetadata(dfi, colname, key_start, style=true) + if !(meta_stylei === :note && isequal(meta_val_start, meta_vali)) + good_key = false + break + end + else + good_key = false + break + end + end + end + good_key && colmetadata!(out_df, colname, key_start, meta_val_start, style=:note) + end + end + end + end + + return out_df +end From 6fb2a3f2eb511f46cc92936cd0472aa67fdc9f6c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Wed, 11 Sep 2024 13:46:20 +0200 Subject: [PATCH 08/17] avoid type piracy in reduce with vcat --- src/abstractdataframe/iteration.jl | 17 ++++++++++++++--- test/cat.jl | 15 +++++++++++++++ 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/src/abstractdataframe/iteration.jl b/src/abstractdataframe/iteration.jl index 07c6c75fb..7954d011e 100644 --- a/src/abstractdataframe/iteration.jl +++ b/src/abstractdataframe/iteration.jl @@ -598,6 +598,12 @@ function mapcols!(f::Union{Function,Type}, df::DataFrame; cols=All()) return df end +############################################################################## +## +## Reduction +## +############################################################################## + """ reduce(::typeof(vcat), dfs::Union{AbstractVector{<:AbstractDataFrame}, @@ -686,7 +692,10 @@ julia> reduce(vcat, [df1, df2, df3], cols=:union, source=:source) ``` """ function Base.reduce(::typeof(vcat), 
- dfs::Union{AbstractVector{<:AbstractDataFrame}, + dfs::Union{AbstractVector{AbstractDataFrame}, + AbstractVector{DataFrame}, + AbstractVector{SubDataFrame}, + AbstractVector{Union{DataFrame,SubDataFrame}}, Tuple{AbstractDataFrame,Vararg{AbstractDataFrame}}}; cols::Union{Symbol,AbstractVector{Symbol}, AbstractVector{<:AbstractString}}=:setequal, @@ -741,8 +750,10 @@ end # definition needed to avoid dispatch ambiguity Base.reduce(::typeof(vcat), - dfs::SentinelArrays.ChainedVector{T,A} where {T<:AbstractDataFrame, - A<:AbstractVector{T}}; + dfs::Union{SentinelArrays.ChainedVector{AbstractDataFrame,<:AbstractVector{AbstractDataFrame}}, + SentinelArrays.ChainedVector{DataFrame,<:AbstractVector{DataFrame}}, + SentinelArrays.ChainedVector{SubDataFrame,<:AbstractVector{SubDataFrame}}, + SentinelArrays.ChainedVector{Union{DataFrame,SubDataFrame},<:AbstractVector{Union{DataFrame,SubDataFrame}}}}; cols::Union{Symbol,AbstractVector{Symbol}, AbstractVector{<:AbstractString}}=:setequal, source::Union{Nothing,SymbolOrString, diff --git a/test/cat.jl b/test/cat.jl index b5aa1cfd9..a63dcc2cc 100644 --- a/test/cat.jl +++ b/test/cat.jl @@ -477,4 +477,19 @@ end @test reduce(vcat, (df1, df2)) == DataFrame(a=[1, 1], b=[2, 2]) end +@testset "vcat type piracy" begin + x = Int[] + @test reduce(vcat, Union{}[], init=x) === x + + @test reduce(vcat, AbstractDataFrame[DataFrame(a=1), DataFrame(a=2)]) == + DataFrame(a=[1, 2]) + @test reduce(vcat, Union{DataFrame, SubDataFrame}[DataFrame(a=1), DataFrame(a=2)]) == + DataFrame(a=[1, 2]) + @test reduce(vcat, AbstractDataFrame[DataFrame(a=1), DataFrame(a=2)]; source=:source) == + DataFrame(a=[1, 2], source=[1, 2]) + @test reduce(vcat, Union{DataFrame,SubDataFrame}[DataFrame(a=1), DataFrame(a=2)]; source=:source) == + DataFrame(a=[1, 2], source=[1, 2]) +end + + end # module From 78197154dfcbcd728aab23ca7893011e0c19783d Mon Sep 17 00:00:00 2001 From: Andy Dienes Date: Mon, 5 Aug 2024 22:12:11 -0400 Subject: [PATCH 09/17] add `?` suffix to show on 
all return paths --- src/abstractdataframe/show.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/abstractdataframe/show.jl b/src/abstractdataframe/show.jl index 7f9593d4a..eec666e99 100644 --- a/src/abstractdataframe/show.jl +++ b/src/abstractdataframe/show.jl @@ -129,7 +129,7 @@ function compacttype(T::Type, maxwidth::Int) # handle the case when the type printed is not parametric but string(T) # prefixed it with the module name which caused it to be overlong - textwidth(sT) ≤ maxwidth + 1 && endswith(sTfull, sT) && return sT + textwidth(sT) ≤ maxwidth + 1 && endswith(sTfull, sT) && return sT * suffix cumwidth = 0 stop = 0 From 58db5c3c0263cbd653cb90753636a107f78f7ef4 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 15 Sep 2024 09:33:55 -0400 Subject: [PATCH 10/17] Bump julia-actions/cache from 1 to 2 (#3453) Bumps [julia-actions/cache](https://github.com/julia-actions/cache) from 1 to 2. - [Release notes](https://github.com/julia-actions/cache/releases) - [Changelog](https://github.com/julia-actions/cache/blob/main/devdocs/making_a_new_release.md) - [Commits](https://github.com/julia-actions/cache/compare/v1...v2) --- updated-dependencies: - dependency-name: julia-actions/cache dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Viral B. Shah Co-authored-by: Viral B. 
Shah --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7e4f34bb7..2142167fc 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -35,7 +35,7 @@ jobs: with: version: ${{ matrix.version }} arch: ${{ matrix.arch }} - - uses: julia-actions/cache@v1 + - uses: julia-actions/cache@v2 - uses: julia-actions/julia-buildpkg@v1 - uses: julia-actions/julia-runtest@v1 env: @@ -50,7 +50,7 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - - uses: julia-actions/cache@v1 + - uses: julia-actions/cache@v2 - uses: julia-actions/julia-buildpkg@latest - uses: julia-actions/julia-docdeploy@latest env: From 392608a5cf55b02447a0aa08b67f783abb944466 Mon Sep 17 00:00:00 2001 From: "Viral B. Shah" Date: Tue, 17 Sep 2024 14:14:36 -0400 Subject: [PATCH 11/17] Fix codecov badge in README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 1a660f02c..89d3597c4 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ DataFrames.jl ============= -[![Coverage Status](http://codecov.io/github/JuliaData/DataFrames.jl/coverage.svg?branch=main)](http://codecov.io/github/JuliaData/DataFrames.jl?branch=main) +[![codecov](https://codecov.io/gh/JuliaData/DataFrames.jl/graph/badge.svg?token=DHYzeKcumV)](https://codecov.io/gh/JuliaData/DataFrames.jl) [![CI Testing](https://github.com/JuliaData/DataFrames.jl/workflows/CI/badge.svg)](https://github.com/JuliaData/DataFrames.jl/actions?query=workflow%3ACI+branch%3Amain) [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.7632427.svg)](https://doi.org/10.5281/zenodo.7632427) From ef370960efc36925e9df3413c207f00183800042 Mon Sep 17 00:00:00 2001 From: CompatHelper Julia Date: Mon, 16 Sep 2024 00:04:33 +0000 Subject: [PATCH 12/17] CompatHelper: add new compat entry for Statistics at version 1, (keep existing compat) --- Project.toml | 9 ++++----- 1 file changed, 4 
insertions(+), 5 deletions(-) diff --git a/Project.toml b/Project.toml index 5ba3ac209..b3bb15779 100644 --- a/Project.toml +++ b/Project.toml @@ -19,11 +19,11 @@ PrettyTables = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" +SentinelArrays = "91c51154-3ec4-41a3-a24f-3f23e20d615c" SortingAlgorithms = "a2af1166-a08f-5f64-846c-94a0d3cef48c" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" TableTraits = "3783bdb8-4a98-5b6b-af9a-565f29a5fe9c" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" -SentinelArrays = "91c51154-3ec4-41a3-a24f-3f23e20d615c" Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" [compat] @@ -45,6 +45,7 @@ Reexport = "1" SentinelArrays = "1.2" ShiftedArrays = "1, 2" SortingAlgorithms = "0.3, 1" +Statistics = "1" TableTraits = "0.4, 1" Tables = "1.9.0" Unitful = "1" @@ -57,12 +58,10 @@ DataValues = "e7dc6d0d-1eca-5fa6-8ad6-5aecde8b7ea5" Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881" +ShiftedArrays = "1277b4bf-5013-50f5-be3d-901d8477a67a" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" Unitful = "1986cc42-f94f-5a68-af5c-568840ba703d" -ShiftedArrays = "1277b4bf-5013-50f5-be3d-901d8477a67a" [targets] -test = ["CategoricalArrays", "Combinatorics", "DataValues", - "Dates", "Logging", "OffsetArrays", "Test", - "Unitful", "ShiftedArrays", "SparseArrays"] +test = ["CategoricalArrays", "Combinatorics", "DataValues", "Dates", "Logging", "OffsetArrays", "Test", "Unitful", "ShiftedArrays", "SparseArrays"] From d1d0b54d58fb24eeadb573fb65acd06b827e7261 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 14 Sep 2024 23:06:40 +0200 Subject: [PATCH 13/17] improve names docs --- src/abstractdataframe/abstractdataframe.jl | 3 ++- 1 file changed, 2 
insertions(+), 1 deletion(-) diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl index 01f3ba304..bcf70a9f4 100644 --- a/src/abstractdataframe/abstractdataframe.jl +++ b/src/abstractdataframe/abstractdataframe.jl @@ -47,7 +47,8 @@ selector (this is useful in particular with regular expressions, `Cols`, `Not`, * a `Function` predicate taking the column name as a string and returning `true` for columns that should be kept -See also [`propertynames`](@ref) which returns a `Vector{Symbol}`. +See also [`propertynames`](@ref) which returns a `Vector{Symbol}` +(except for `GroupedDataFrame` in which case use `Symbol.(names(df))`). # Examples From 8ab697119b3c45693fa7e563c34ea3e22c73ab3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Wed, 27 Mar 2024 20:34:46 +0100 Subject: [PATCH 14/17] advanced transformation examples --- docs/src/man/working_with_dataframes.md | 66 +++++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/docs/src/man/working_with_dataframes.md b/docs/src/man/working_with_dataframes.md index e65d0ab03..9f3f037bd 100755 --- a/docs/src/man/working_with_dataframes.md +++ b/docs/src/man/working_with_dataframes.md @@ -830,6 +830,72 @@ julia> combine(df, names(df) .=> sum, names(df) .=> prod) If you would prefer the result to have the same number of rows as the source data frame, use `select` instead of `combine`. +Note that a `DataFrame` can store values of any type as its columns, for example +below we show how one can store a `Tuple`: + +``` +julia> df2 = combine(df, All() .=> extrema) +1×2 DataFrame + Row │ A_extrema B_extrema + │ Tuple… Tuple… +─────┼─────────────────────── + 1 │ (1, 4) (1.0, 4.0) +``` + +Later you might want to expand the tuples into separate columns storing the computed +minima and maxima. This can be achieved by passing multiple columns for the output. 
+In the example below we show how this can be done in combination with a function +so that we can generate target column names conditional on source column names: + +``` +julia> combine(df2, All() .=> identity .=> [c -> first(c) .* ["_min", "_max"]]) +1×4 DataFrame + Row │ A_min A_max B_min B_max + │ Int64 Int64 Float64 Float64 +─────┼──────────────────────────────── + 1 │ 1 4 1.0 4.0 +``` + +Note that in this example we needed to pass `identity` explicitly as otherwise the +functions generated with `c -> first(c) .* ["_min", "_max"]` would be treated as transformations +and not as rules for target column names generation. + +You might want to perform the transformation of the source data frame into the result +we have just shown in one step. This can be achieved with the following expression: + +``` +julia> combine(df, All() .=> Ref∘extrema .=> [c -> c .* ["_min", "_max"]]) +1×4 DataFrame + Row │ A_min A_max B_min B_max + │ Int64 Int64 Float64 Float64 +─────┼──────────────────────────────── + 1 │ 1 4 1.0 4.0 +``` + +Note that in this case we needed to add a `Ref` call in the `Ref∘extrema` operation specification. +The reason why this is needed is that instead `combine` iterates the contents of the value returned +by the operation specification function and tries to expand it, which in our case is a tuple of numbers, +so one gets an error: + +``` +julia> combine(df, names(df) .=> extrema .=> [c -> c .* ["_min", "_max"]]) +ERROR: ArgumentError: 'Tuple{Int64, Int64}' iterates 'Int64' values, +which doesn't satisfy the Tables.jl `AbstractRow` interface +``` + +Note that we used `Ref` as it is a container that is typically used in DataFrames.jl when one +wants to store one value, however, in general it could be another iterator. 
Here is an example +when the tuple returned by `extrema` is wrapped in a `Tuple`, producing the same result: + +``` +julia> combine(df, names(df) .=> tuple∘extrema .=> [c -> c .* ["_min", "_max"]]) +1×4 DataFrame + Row │ A_min A_max B_min B_max + │ Int64 Int64 Float64 Float64 +─────┼──────────────────────────────── + 1 │ 1 4 1.0 4.0 +``` + ## Handling of Columns Stored in a `DataFrame` Functions that transform a `DataFrame` to produce a From cf62a9495fb62705fe709952f411d68c739da136 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 13 Apr 2024 20:41:40 +0200 Subject: [PATCH 15/17] apply review suggestions --- docs/src/man/working_with_dataframes.md | 73 +++++++++++++++++-------- 1 file changed, 49 insertions(+), 24 deletions(-) diff --git a/docs/src/man/working_with_dataframes.md b/docs/src/man/working_with_dataframes.md index 9f3f037bd..ac13eeb6d 100755 --- a/docs/src/man/working_with_dataframes.md +++ b/docs/src/man/working_with_dataframes.md @@ -812,14 +812,21 @@ julia> df = DataFrame(A=1:4, B=4.0:-1.0:1.0) 3 │ 3 2.0 4 │ 4 1.0 -julia> combine(df, names(df) .=> sum) +julia> combine(df, All() .=> sum) 1×2 DataFrame Row │ A_sum B_sum │ Int64 Float64 ─────┼──────────────── 1 │ 10 10.0 -julia> combine(df, names(df) .=> sum, names(df) .=> prod) +julia> combine(df, All() .=> sum, All() .=> prod) +1×4 DataFrame + Row │ A_sum B_sum A_prod B_prod + │ Int64 Float64 Int64 Float64 +─────┼───────────────────────────────── + 1 │ 10 10.0 24 24.0 + +julia> combine(df, All() .=> [sum prod]) # the same using 2-dimensional broadcasting 1×4 DataFrame Row │ A_sum B_sum A_prod B_prod │ Int64 Float64 Int64 Float64 @@ -830,7 +837,11 @@ julia> combine(df, names(df) .=> sum, names(df) .=> prod) If you would prefer the result to have the same number of rows as the source data frame, use `select` instead of `combine`. 
-Note that a `DataFrame` can store values of any type as its columns, for example +In the remainder of this section we will discuss some of the more advanced topis +related to operation specification syntax, so you may decide to skip them if you +want to focus on the most common usage patterns. + +A `DataFrame` can store values of any type as its columns, for example below we show how one can store a `Tuple`: ``` @@ -844,11 +855,22 @@ julia> df2 = combine(df, All() .=> extrema) Later you might want to expand the tuples into separate columns storing the computed minima and maxima. This can be achieved by passing multiple columns for the output. -In the example below we show how this can be done in combination with a function -so that we can generate target column names conditional on source column names: +Here is an example how this can be done by writing the column names by-hand for a single +input column: + +``` +julia> combine(df2, "A_extrema" => identity => ["A_min", "A_max"]) +1×2 DataFrame + Row │ A_min A_max + │ Int64 Int64 +─────┼────────────── + 1 │ 1 4 +``` + +You can extend it to handling all columns in `df2` using broadcasting: ``` -julia> combine(df2, All() .=> identity .=> [c -> first(c) .* ["_min", "_max"]]) +julia> combine(df2, All() .=> identity .=> [["A_min", "A_max"], ["B_min", "B_max"]]) 1×4 DataFrame Row │ A_min A_max B_min B_max │ Int64 Int64 Float64 Float64 @@ -856,15 +878,28 @@ julia> combine(df2, All() .=> identity .=> [c -> first(c) .* ["_min", "_max"]]) 1 │ 1 4 1.0 4.0 ``` -Note that in this example we needed to pass `identity` explicitly as otherwise the -functions generated with `c -> first(c) .* ["_min", "_max"]` would be treated as transformations -and not as rules for target column names generation. +This approach works, but can be improved. 
Instead of writing all the column names +manually we can instead use a function as a way to specify target column names +conditional on source column names: + +``` +julia> combine(df2, All() .=> identity .=> c -> first(c) .* ["_min", "_max"]) +1×4 DataFrame + Row │ A_min A_max B_min B_max + │ Int64 Int64 Float64 Float64 +─────┼──────────────────────────────── + 1 │ 1 4 1.0 4.0 +``` + +Note that in this example we needed to pass `identity` explicitly as with +`All() => (c -> first(c) .* ["_min", "_max"])` the right-hand side part would be +treated as a transformation and not as a rule for target column names generation. You might want to perform the transformation of the source data frame into the result we have just shown in one step. This can be achieved with the following expression: ``` -julia> combine(df, All() .=> Ref∘extrema .=> [c -> c .* ["_min", "_max"]]) +julia> combine(df, All() .=> Ref∘extrema .=> c -> c .* ["_min", "_max"]) 1×4 DataFrame Row │ A_min A_max B_min B_max │ Int64 Int64 Float64 Float64 @@ -873,28 +908,18 @@ julia> combine(df, All() .=> Ref∘extrema .=> [c -> c .* ["_min", "_max"]]) ``` Note that in this case we needed to add a `Ref` call in the `Ref∘extrema` operation specification. 
-The reason why this is needed is that instead `combine` iterates the contents of the value returned -by the operation specification function and tries to expand it, which in our case is a tuple of numbers, +Without `Ref`, `combine` iterates the contents of the value returned by the operation specification function, +which in our case is a tuple of numbers, and tries to expand it assuming that each produced value specifies one row, so one gets an error: ``` -julia> combine(df, names(df) .=> extrema .=> [c -> c .* ["_min", "_max"]]) +julia> combine(df, All() .=> extrema .=> [c -> c .* ["_min", "_max"]]) ERROR: ArgumentError: 'Tuple{Int64, Int64}' iterates 'Int64' values, which doesn't satisfy the Tables.jl `AbstractRow` interface ``` Note that we used `Ref` as it is a container that is typically used in DataFrames.jl when one -wants to store one value, however, in general it could be another iterator. Here is an example -when the tuple returned by `extrema` is wrapped in a `Tuple`, producing the same result: - -``` -julia> combine(df, names(df) .=> tuple∘extrema .=> [c -> c .* ["_min", "_max"]]) -1×4 DataFrame - Row │ A_min A_max B_min B_max - │ Int64 Int64 Float64 Float64 -─────┼──────────────────────────────── - 1 │ 1 4 1.0 4.0 -``` +wants to store one row; however, in general it could be another iterator (e.g. a tuple).
## Handling of Columns Stored in a `DataFrame` From 1145ad731f017eed04a8a638ec6fbeddfd63db12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Fri, 19 Apr 2024 22:51:47 +0200 Subject: [PATCH 16/17] Apply suggestions from code review Co-authored-by: Milan Bouchet-Valat --- docs/src/man/working_with_dataframes.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/src/man/working_with_dataframes.md b/docs/src/man/working_with_dataframes.md index ac13eeb6d..fdd2b694e 100755 --- a/docs/src/man/working_with_dataframes.md +++ b/docs/src/man/working_with_dataframes.md @@ -837,8 +837,8 @@ julia> combine(df, All() .=> [sum prod]) # the same using 2-dimensional broadcas If you would prefer the result to have the same number of rows as the source data frame, use `select` instead of `combine`. -In the remainder of this section we will discuss some of the more advanced topis -related to operation specification syntax, so you may decide to skip them if you +In the remainder of this section we will discuss more advanced topics related +to the operation specification syntax, so you may decide to skip them if you want to focus on the most common usage patterns. A `DataFrame` can store values of any type as its columns, for example @@ -855,7 +855,7 @@ julia> df2 = combine(df, All() .=> extrema) Later you might want to expand the tuples into separate columns storing the computed minima and maxima. This can be achieved by passing multiple columns for the output. -Here is an example how this can be done by writing the column names by-hand for a single +Here is an example of how this can be done by writing the column names by hand for a single input column: ``` @@ -880,7 +880,7 @@ julia> combine(df2, All() .=> identity .=> [["A_min", "A_max"], ["B_min", "B_max This approach works, but can be improved.
Instead of writing all the column names manually we can instead use a function as a way to specify target column names -conditional on source column names: +based on source column names: ``` julia> combine(df2, All() .=> identity .=> c -> first(c) .* ["_min", "_max"]) @@ -909,7 +909,7 @@ julia> combine(df, All() .=> Ref∘extrema .=> c -> c .* ["_min", "_max"]) Note that in this case we needed to add a `Ref` call in the `Ref∘extrema` operation specification. Without `Ref`, `combine` iterates the contents of the value returned by the operation specification function, -which in our case is a tuple of numbers, and tries to expand it assuming that each produced value specifies one row, +which in our case is a tuple of numbers, and tries to expand it assuming that each produced value represents one row, so one gets an error: ``` From 85815e45c559aae14a194f913e25df0e484248bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sun, 22 Sep 2024 22:10:53 +0200 Subject: [PATCH 17/17] Apply suggestions from code review Co-authored-by: Milan Bouchet-Valat --- docs/src/man/working_with_dataframes.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/src/man/working_with_dataframes.md b/docs/src/man/working_with_dataframes.md index fdd2b694e..a74298c18 100755 --- a/docs/src/man/working_with_dataframes.md +++ b/docs/src/man/working_with_dataframes.md @@ -891,8 +891,8 @@ julia> combine(df2, All() .=> identity .=> c -> first(c) .* ["_min", "_max"]) 1 │ 1 4 1.0 4.0 ``` -Note that in this example we needed to pass `identity` explicitly as with -`All() => (c -> first(c) .* ["_min", "_max"])` the right-hand side part would be +Note that in this example we needed to pass `identity` explicitly since with +`All() => (c -> first(c) .* ["_min", "_max"])` the right-hand side part would be treated as a transformation and not as a rule for target column names generation. 
You might want to perform the transformation of the source data frame into the result