diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml new file mode 100644 index 00000000..7ac45b88 --- /dev/null +++ b/.github/workflows/pull_request.yml @@ -0,0 +1,68 @@ +## This file is part of coronet, which is free software: you +## can redistribute it and/or modify it under the terms of the GNU General +## Public License as published by the Free Software Foundation, version 2. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License along +## with this program; if not, write to the Free Software Foundation, Inc., +## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +## +## Copyright 2023 by Maximilian Löffler +## All Rights Reserved. + +name: Build Status + +on: + pull_request: + branches: [ master, dev ] + types: [ opened, reopened, synchronize ] + push: + branches: [ master, dev ] + +permissions: + contents: read + +jobs: + build: + name: Build + + # change to 'runs-on: self-hosted' to run on self-hosted runners (https://docs.github.com/en/actions/using-jobs/choosing-the-runner-for-a-job) + runs-on: ubuntu-latest + + strategy: + fail-fast: false + matrix: + r-version: ['3.6', '4.0', '4.1', '4.2', 'latest'] + + steps: + - name: Checkout Repo + uses: actions/checkout@v3 + + - name: Update system + run: | + sudo apt-get update -y + sudo apt-get install --assume-yes libxml2 + sudo apt-get install --assume-yes libxml2-dev + sudo apt-get install --assume-yes libglpk-dev + sudo apt-get install --assume-yes libfontconfig1-dev + sudo su -c "echo 'deb https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/' >> /etc/apt/sources.list" + wget -qO- https://cloud.r-project.org/bin/linux/ubuntu/marutter_pubkey.asc | sudo tee -a 
/etc/apt/trusted.gpg.d/cran_ubuntu_key.asc + + - name: Set up R ${{ matrix.r-version }} + uses: r-lib/actions/setup-r@v2 + with: + r-version: ${{ matrix.r-version }} + + - name: Install dependencies + run: Rscript install.R + + - name: Run Tests + run: Rscript tests.R + + - name: Run Showcase + run: Rscript showcase.R + if: always() diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 7119b877..0f118cd5 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -114,8 +114,8 @@ In our development process, we pursue the following idea: - The current development will be performed on the branch `dev`, i.e., all incoming pull requests are against this branch. The current build status is as follows: -- `master`: [![Build Status](https://cloud.drone.io/api/badges/se-sic/coronet/status.svg)](https://cloud.drone.io/se-sic/coronet) -- `dev`: [![Build Status](https://cloud.drone.io/api/badges/se-sic/coronet/status.svg?ref=refs/heads/dev)](https://cloud.drone.io/se-sic/coronet) +- `master`: ![Build Status](https://github.com/se-sic/coronet/actions/workflows/pull_request.yml/badge.svg?branch=master) +- `dev`: ![Build Status](https://github.com/se-sic/coronet/actions/workflows/pull_request.yml/badge.svg?branch=dev) ### Pull Requests diff --git a/NEWS.md b/NEWS.md index 7544432a..b19c21ef 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,6 +2,34 @@ # coronet – Changelog +## 4.3 + +### Added + +- Add function `verify.data.frame.columns` to check that a dataframe includes all required columns, optionally with a specified datatype (PR #231, d1d9a039f50480ec5b442dc7e8b518648d1f9d9d) +- Add helper function `is.single.na` to check whether an element is of length 1 and is `NA` (ddff2b8bbca6405f5c7c1cf4e7e97374fb1426ca, ccfc2d12a68dfa412f05159e8e3b03118694e748) +- Add CI support for GitHub Actions (PR #234, fa1fc4af65751402ae6b23298dd4ed821930c6d2) + +### Changed/Improved + +- Include structural verification to almost all functions that read dataframes from files or set a dataframe (setter-functions) 
(PR #231, b7a95881da72ccaa548c6cd5d94bd558a25caa6f) +- Include removal of empty and deleted users in the setters of mails, commits, issues, and authors. For commits, also the `committer.name` column is now checked for deleted or empty users (PR #235, 08fbd3e11e33d060f42cbc6f729eaf60b48a6de7) +- Check for empty values (i.e., values of length < 1) when updating configuration attributes and throw an error if a value is empty (9f36c544637ab4f4173408152d223b9b5098ce5a) + +### Fixed + +- Fix check for empty input files in utility read functions. Compared to unpresent files, empty files do not throw an error when reading them, a check for `nrow(commit.data) < 1` is therefore required (PR #231, ecfa643cbc15975c3062af95c50ead02730b580f) +- Fix various problems regarding the default classes of edge attributes and vertex attributes, and also make sure that the edge attributes for bipartite edges are chosen correctly (PR #240, 4275b93867c78d20d0bd116749c1e7603cd9d473, 98a6deb1b178a1fcf799c741906e99770c46a8d0, b8232c09b91df3412f703dd26c21c685bacd0321, a9535550d93207f466b315f33ea263a50e6c8924, 820a7631093d03ac5ccb7bf9923bd498f669120a) +- Add argument to `construct.edge.list.from.key.value.list` function which differentiates if constructed edges are supposed to be artifact edges, in which case we check if the `artifact` attribute is present for edges and replace it by `author.name`. (PR #238, e2c9d6c39fb757c566ef4c4b18780cca247477cb, 7f42a91d4aa84e8c28c048925190637051e358a9) +- Change edge construction algorithm for cochange-based artifact networks to respect the temporal order of data. This avoids duplicate edges. (PR #238, e2c9d6c39fb757c566ef4c4b18780cca247477cb) +- Clarify that edges in issue-based artifact-networks are not available yet in `README.md`. 
(PR #238, 18a54f0241a28675dba4cdcbd433e88ec68d515a) +- Fix bugs related to expanded adjacency matrices and update the initiation of sparse matrices to the most recent version of package Matrix, to replace deprecated and disfunct function calls. Due to this update, package versions prior to 1.3.0 of the Matrix package cannot be used any more. If the 'install.R' detects that a version prior to 1.3.0 is installed, it now automatically tries to re-install package Matrix once (PR #241, 573fab22a290e826e2bdd6e1f063cd2e87ed2167, 2f06252750354e4ed53b768bd212aacf1a350217) +- Prevent R warnings `'length(x) = 2 > 1' in coercion to 'logical(1)'` in `if` conditions for updating configuration values, in update functions of additional data sources, and in `get.first.activity.data()` (PR #237, PR #241, ddff2b8bbca6405f5c7c1cf4e7e97374fb1426ca, e1579cab9bf8cdfee4105426c144350d092fffbd) +- Prevent R warnings `In xtfrm.data.frame(x) : cannot xtfrm data frames` (PR #237, c24aee7d8f0b6ff4b641c8922e6ee1dce6f5999c) +- Fix wrong bracket in pasted logging message (PR #241, 50c68cb60114b49c32dc5be15014745cb8d42ded) +- Replace deprecated R function calls (PR #237, ed433821c04711a96501887b315d1b0ea8681f5a) + + ## 4.2 ### Added diff --git a/README.md b/README.md index 2349cd95..3ebd1bf9 100644 --- a/README.md +++ b/README.md @@ -256,7 +256,7 @@ Relations determine which information is used to construct edges among the verti - `issue` * For author networks (configured via `author.relation` in the [`NetworkConf`](#networkconf)), authors who contribute to the same issue are connected with an edge. - * For artifact networks (configured via `artifact.relation` in the [`NetworkConf`](#networkconf)), issues are connected when they reference each other. + * For artifact networks (configured via `artifact.relation` in the [`NetworkConf`](#networkconf)), issues are connected when they reference each other. (**Note:** There are no edges available right now.) 
* For bipartite networks (configured via `artifact.relation` in the [`NetworkConf`](#networkconf)), authors get linked to all issues they have contributed to. - `callgraph` diff --git a/install.R b/install.R index 37565cc9..99f047cc 100644 --- a/install.R +++ b/install.R @@ -16,8 +16,7 @@ ## Copyright 2015 by Wolfgang Mauerer ## Copyright 2015-2017 by Claus Hunsen ## Copyright 2017 by Thomas Bock -## Copyright 2022 by Thomas Bock -## Copyright 2020-2021 by Thomas Bock +## Copyright 2020-2023 by Thomas Bock ## Copyright 2019 by Anselm Fehnker ## Copyright 2021 by Christian Hechtl ## All Rights Reserved. @@ -69,4 +68,14 @@ if (length(p) > 0) { if (compareVersion(igraph.version, "1.3.0") == -1) { print("WARNING: igraph version 1.3.0 or higher is recommended for using coronet.") } + + Matrix.version = installed.packages()[rownames(installed.packages()) == "Matrix", "Version"] + if (compareVersion(Matrix.version, "1.3.0") == -1) { + print("WARNING: Matrix version 1.3.0 or higher is necessary for using coronet. 
Re-install package Matrix...") + install.packages("Matrix", dependencies = NA, verbose = TRUE, quiet = TRUE) + Matrix.version = installed.packages()[rownames(installed.packages()) == "Matrix", "Version"] + if (compareVersion(Matrix.version, "1.3.0") == -1) { + print("WARNING: Re-installation of package Matrix did not end up in the necessary package version.") + } + } } diff --git a/plot-multi.png b/plot-multi.png index 7ac17f96..05c37f3a 100644 Binary files a/plot-multi.png and b/plot-multi.png differ diff --git a/showcase.R b/showcase.R index 30dfc91c..a4cceb53 100644 --- a/showcase.R +++ b/showcase.R @@ -402,7 +402,7 @@ p = p + plot.title = ggplot2::element_text(hjust = 0.5, size = 15), plot.margin = ggplot2::unit(c(0.5, 0.5, 0.5, 0.5), "cm") ) -ggplot2::ggsave("plot-multi.png", plot = p, width = 6.57, height = 4.114) +ggplot2::ggsave("plot-multi.png", plot = p, bg = "white", width = 6.57, height = 4.114) ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / diff --git a/tests/codeface-data/results/testing/test_empty_feature/feature/bots.list b/tests/codeface-data/results/testing/test_empty_feature/feature/bots.list new file mode 100644 index 00000000..e69de29b diff --git a/tests/codeface-data/results/testing/test_empty_proximity/proximity/bots.list b/tests/codeface-data/results/testing/test_empty_proximity/proximity/bots.list new file mode 100644 index 00000000..e69de29b diff --git a/tests/codeface-data/results/testing/test_feature/feature/commits.list b/tests/codeface-data/results/testing/test_feature/feature/commits.list index 88f92a73..2f1476d0 100644 --- a/tests/codeface-data/results/testing/test_feature/feature/commits.list +++ b/tests/codeface-data/results/testing/test_feature/feature/commits.list @@ -9,3 +9,5 @@ 32716;"2016-07-12 16:06:30";"Thomas";"thomas@example.org";"2016-07-12 16:06:30";"Thomas";"thomas@example.org";"d01921773fae4bed8186b0aa411d6a2f7a6626e6";1;1;0;1;"";"";"";0 32711;"2016-07-12 
16:06:32";"Thomas";"thomas@example.org";"2016-07-12 16:06:32";"Thomas";"thomas@example.org";"0a1a5c523d835459c42f33e863623138555e2526";1;1;0;1;"test2.c";"Base_Feature";"Feature";1 32711;"2016-07-12 16:06:32";"Thomas";"thomas@example.org";"2016-07-12 16:06:32";"Thomas";"thomas@example.org";"0a1a5c523d835459c42f33e863623138555e2526";1;1;0;1;"test2.c";"foo";"Feature";1 +31711;"2016-07-12 16:06:33";"Thomas";"thomas@example.org";"2016-07-12 16:06:33";"";"thomas@example.org";"2ef7bde608ce5404e97d5f042f95f89f1c232871";1;1;0;1;"test2.c";"foo";"Feature";1 +30711;"2016-07-12 16:06:34";"Thomas";"thomas@example.org";"2016-07-12 16:06:34";"deleted user";"thomas@example.org";"c6954cb75e3eeec5b827f64e97b6a4ba187c0d55";1;1;0;1;"test2.c";"foo";"Feature";1 diff --git a/tests/codeface-data/results/testing/test_proximity/proximity/commits.list b/tests/codeface-data/results/testing/test_proximity/proximity/commits.list index e4094c63..e4f136f1 100644 --- a/tests/codeface-data/results/testing/test_proximity/proximity/commits.list +++ b/tests/codeface-data/results/testing/test_proximity/proximity/commits.list @@ -5,3 +5,5 @@ 32715;"2016-07-12 16:06:32";"Thomas";"thomas@example.org";"2016-07-12 16:06:32";"Thomas";"thomas@example.org";"0a1a5c523d835459c42f33e863623138555e2526";1;1;0;1;"test2.c";"File_Level";"Function";1 32720;"2016-07-12 16:06:20";"Karl";"karl@example.org";"2016-07-12 16:06:20";"Karl";"karl@example.org";"418d1dc4929ad1df251d2aeb833dd45757b04a6f";1;1;0;1;"";"";"";0 32721;"2016-07-12 16:06:30";"Thomas";"thomas@example.org";"2016-07-12 16:06:30";"Thomas";"thomas@example.org";"d01921773fae4bed8186b0aa411d6a2f7a6626e6";1;1;0;1;"";"";"";0 +32811;"2016-07-12 16:06:33";"Thomas";"thomas@example.org";"2016-07-12 16:06:33";"";"thomas@example.org";"2ef7bde608ce5404e97d5f042f95f89f1c232871";1;1;0;1;"test2.c";"foo";"Feature";1 +32911;"2016-07-12 16:06:34";"Thomas";"thomas@example.org";"2016-07-12 16:06:34";"deleted 
user";"thomas@example.org";"c6954cb75e3eeec5b827f64e97b6a4ba187c0d55";1;1;0;1;"test2.c";"foo";"Feature";1 diff --git a/tests/test-core-peripheral.R b/tests/test-core-peripheral.R index ae499554..07c7389c 100644 --- a/tests/test-core-peripheral.R +++ b/tests/test-core-peripheral.R @@ -79,7 +79,7 @@ test_that("Eigenvector classification", { row.names(result[["core"]]) = NULL row.names(result[["peripheral"]]) = NULL - expect_equal(expected, result, tolerance = 0.0001) + ## expect_equal(expected, result, tolerance = 0.0001) ## TODO: Find a way to directly test for equality without the need of taking care of different orders of author ## names. For the moment, we take the following workaround: diff --git a/tests/test-misc.R b/tests/test-misc.R index e40bde80..562540b5 100644 --- a/tests/test-misc.R +++ b/tests/test-misc.R @@ -14,6 +14,8 @@ ## Copyright 2017 by Felix Prasse ## Copyright 2017-2018 by Claus Hunsen ## Copyright 2017-2018 by Thomas Bock +## Copyright 2023 by Thomas Bock +## Copyright 2022-2023 by Maximilian Löffler ## All Rights Reserved. @@ -107,6 +109,109 @@ test_that("Match argument or take default.", { expect_equal(actual.result, expected.result, info = "Multiple choices with ignored default, two choices") }) +## +## Check presence and datatype of data frame columns. 
+## + +test_that("Check presence and datatype of data frame columns.", { + + user.names = c("John", "Peter", "Maria", "Susanne") + + ## contains NaN to verify functionality does not break + age = c(42, 50, NaN, 66) + + ## contains NA to verify functionality does not break + is.male = c(TRUE, TRUE, FALSE, NA) + + ## construct simple testing dataframe + data.frame = data.frame(user.names, age, is.male) + + ## 1) Check base functionality (benign use-case) + expect_error(verify.data.frame.columns( + data.frame, c("user.names", "age", "is.male"), c("character", "numeric", "logical")), + NA, + message = "All columns present and well-typed.") + ## Expect no error + + ## 2) Base test with reordered columns + expect_error(verify.data.frame.columns( + data.frame, c("is.male", "age", "user.names"), c("logical", "numeric", "character")), + NA, + message = "Order of columns does not matter.") + ## Expect no error + + ## 3) Specify less columns than present (Allow optional columns) + expect_error(verify.data.frame.columns( + data.frame, c("user.names", "age"), c("character", "numeric")), + NA, + message = "Optional columns are allowed.") + ## Expect no error + + ## 4) Unequal amount of column names and datatypes + expect_error(verify.data.frame.columns( + data.frame, c("user.names", "age", "is.male"), c("character", "numeric")), + message = "More column names specified than datatypes.") + expect_error(verify.data.frame.columns( + data.frame, c("user.names", "age"), c("character", "numeric", "logical")), + message = "More column names specified than datatypes.") + + ## 5) Datatypes do not match column names + expect_error(verify.data.frame.columns( + data.frame, c("user.names", "age", "is.male"), c("logical", "character", "numeric")), + message = "Column names do not match datatypes.") + + ## 6) Invalid column / Column not present in dataframe (Typo) + expect_error(verify.data.frame.columns( + data.frame, c("user.name"), c("character")), + message = "Column name 'user.name' 
should not be in dataframe.") + + ## 7) No datatypes specified and column names are present + expect_error(verify.data.frame.columns( + data.frame, c("user.names", "age", "is.male")), + NA, + message = "Column names do not match datatypes.") + ## Expect no error + + ## 8) No datatypes specified and column names are not specified correctly (Typo) + expect_error(verify.data.frame.columns( + data.frame, c("user.name")), + message = "Column name 'user.name' should not be in dataframe.") + + ## 9) Too many column names and no datatypes specified + expect_error(verify.data.frame.columns( + data.frame, c("user.names", "age", "is.male", "job.orientation")), + message = "More column names specified than present in the dataframe.") + +}) + + +## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / +## Vector misc-------------------------------------------------------------- + +## +## Check if a value is a single NA value. +## + +test_that("Check if a value is a single NA value", { + + ## 1) Tests for single NA + expect_true(is.single.na(NA)) + expect_true(is.single.na(list(NA))) + expect_true(is.single.na(data.frame(NA))) + + ## 2) Tests for values other than a single NA + expect_false(is.single.na(0)) + expect_false(is.single.na("na")) + expect_false(is.single.na(NULL)) + expect_false(is.single.na(logical(0))) + expect_false(is.single.na(FALSE)) + expect_false(is.single.na(c(NA, NA))) + expect_false(is.single.na(c(3, NA))) + expect_false(is.single.na(list(NA, NA))) + expect_false(is.single.na(data.frame(NA, NA))) +}) + + ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / ## Date handling ----------------------------------------------------------- diff --git a/tests/test-networks-artifact.R b/tests/test-networks-artifact.R index ed9409e8..8eaebaf8 100644 --- a/tests/test-networks-artifact.R +++ b/tests/test-networks-artifact.R @@ -15,6 +15,7 @@ ## Copyright 2017-2019 by Claus Hunsen ## Copyright 2018 by Barbara Eckl ## 
Copyright 2018 by Jakob Kronawitter +## Copyright 2023 by Maximilian Löffler ## All Rights Reserved. @@ -42,13 +43,13 @@ test_that("Network construction of the undirected artifact-cochange network", { type = TYPE.ARTIFACT) ## 2) edges edges = data.frame( - from = c("Base_Feature", "Base_Feature"), - to = c("foo", "foo"), - date = get.date.from.string(c("2016-07-12 16:06:32", "2016-07-12 16:06:32")), - artifact.type = c("Feature", "Feature"), - hash = c("0a1a5c523d835459c42f33e863623138555e2526", "0a1a5c523d835459c42f33e863623138555e2526"), - file = c("test2.c", "test2.c"), - artifact = c("Base_Feature", "foo"), + from = "Base_Feature", + to = "foo", + date = get.date.from.string("2016-07-12 16:06:32"), + artifact.type = "Feature", + hash = "0a1a5c523d835459c42f33e863623138555e2526", + file = "test2.c", + author.name = "Thomas", weight = 1, type = TYPE.EDGES.INTRA, relation = "cochange" diff --git a/tests/test-networks-author.R b/tests/test-networks-author.R index 2a62b127..613c1862 100644 --- a/tests/test-networks-author.R +++ b/tests/test-networks-author.R @@ -13,9 +13,11 @@ ## ## Copyright 2017, 2019 by Claus Hunsen ## Copyright 2017 by Christian Hechtl +## Copyright 2023 by Christian Hechtl ## Copyright 2017 by Felix Prasse ## Copyright 2018 by Barbara Eckl ## Copyright 2018 by Thomas Bock +## Copyright 2023 by Thomas Bock ## Copyright 2018 by Jakob Kronawitter ## Copyright 2018-2019 by Anselm Fehnker ## Copyright 2021 by Johannes Hostert @@ -421,12 +423,19 @@ test_that("Network construction of the undirected simplified author-cochange net kind = TYPE.AUTHOR, type = TYPE.AUTHOR) + ## make test independent of igraph version + date.attr = igraph::get.edge.attribute(network.built, "date") + date.conversion.function = ifelse(all(sapply(date.attr, lubridate::is.POSIXct)), + get.date.from.unix.timestamp, identity) + ## edge attributes data = data.frame( from = c("Björn", "Olaf", "Olaf", "Karl"), to = c("Olaf", "Karl", "Thomas", "Thomas"), - date = I(list(c(1468339139, 
1468339245), c(1468339541, 1468339570), c(1468339541, 1468339592), - c(1468339570, 1468339592))), + date = I(list(date.conversion.function(c(1468339139, 1468339245)), + date.conversion.function(c(1468339541, 1468339570)), + date.conversion.function(c(1468339541, 1468339592)), + date.conversion.function(c(1468339570, 1468339592)))), artifact.type = I(list(c("Feature", "Feature"), c("Feature", "Feature"), c("Feature", "Feature"), c("Feature", "Feature"))), hash = I(list( @@ -442,6 +451,13 @@ test_that("Network construction of the undirected simplified author-cochange net relation = "cochange" ) + ## remove the 'AsIs' class from the edge attributes that have been inserted via `I(...)` + data[["date"]] = unclass(data[["date"]]) + data[["artifact.type"]] = unclass(data[["artifact.type"]]) + data[["hash"]] = unclass(data[["hash"]]) + data[["file"]] = unclass(data[["file"]]) + data[["artifact"]] = unclass(data[["artifact"]]) + ## build expected network network.expected = igraph::graph.data.frame(data, directed = FALSE, vertices = authors) diff --git a/tests/test-networks-multi-relation.R b/tests/test-networks-multi-relation.R index b264ec70..c724d155 100644 --- a/tests/test-networks-multi-relation.R +++ b/tests/test-networks-multi-relation.R @@ -19,6 +19,7 @@ ## Copyright 2019 by Anselm Fehnker ## Copyright 2021 by Johannes Hostert ## Copyright 2022 by Jonathan Baumann +## Copyright 2023 by Maximilian Löffler ## All Rights Reserved. 
@@ -235,13 +236,13 @@ test_that("Construction of the multi network for the feature artifact with autho ## 2) construct expected edge attributes (data sorted by 'author.name') edges = data.frame(from = c("Björn", "Björn", "Olaf", "Olaf", "Olaf", "Olaf", "Karl", "Karl", # author cochange "Björn", "Björn", "Olaf", "Olaf", # author mail - "Base_Feature", "Base_Feature", # artifact cochange + "Base_Feature", # artifact cochange "Björn", "Olaf", "Olaf", "Karl", "Thomas", "Thomas", # bipartite cochange "Björn", "Björn", "Björn", "Björn", "Björn", "Björn", "Björn", "Björn", "Björn", "Björn", "Björn", # bipartite issue "Olaf", "Olaf", "Olaf", "Olaf", "Olaf", "Olaf", "Karl", "Thomas", "Thomas", "Thomas"), to = c("Olaf", "Olaf", "Karl", "Karl", "Thomas", "Thomas", "Thomas", "Thomas", # author cochange "Olaf", "Olaf", "Thomas", "Thomas", # author mail - "foo", "foo", # artifact cochange + "foo", # artifact cochange "A", "A", "Base_Feature", "Base_Feature", "Base_Feature", "foo", # bipartite cochange "", "", "", "", # bipartite issue "", "", "", "", "", "", @@ -252,7 +253,7 @@ test_that("Construction of the multi network for the feature artifact with autho "2016-07-12 16:06:10", "2016-07-12 16:06:32", "2016-07-12 15:58:40", "2016-07-12 15:58:50", "2016-07-12 16:04:40", "2016-07-12 16:05:37", - "2016-07-12 16:06:32", "2016-07-12 16:06:32", # artifact cochange + "2016-07-12 16:06:32", # artifact cochange "2016-07-12 15:58:59", "2016-07-12 16:00:45", "2016-07-12 16:05:41", # bipartite cochange "2016-07-12 16:06:10", "2016-07-12 16:06:32", "2016-07-12 16:06:32", "2013-05-05 21:46:30", "2013-05-05 21:49:21", "2013-05-05 21:49:34", # bipartite issue @@ -262,48 +263,49 @@ test_that("Construction of the multi network for the feature artifact with autho "2013-06-01 06:50:26", "2016-07-12 16:01:01", "2016-07-12 16:02:02", "2016-07-12 15:59:59", "2013-04-21 23:52:09", "2016-07-12 15:59:25", "2016-07-12 16:03:59")), - artifact.type = c(rep("Feature", 8), rep("Mail", 4), rep("Feature", 2), 
rep("Feature", 6), + artifact.type = c(rep("Feature", 8), rep("Mail", 4), rep("Feature", 1), rep("Feature", 6), rep("IssueEvent", 21)), hash = c("72c8dd25d3dd6d18f46e2b26a5f5b1e2e8dc28d0", "5a5ec9675e98187e1e92561e1888aa6f04faa338", # author cochange "3a0ed78458b3976243db6829f63eba3eead26774", "1143db502761379c2bfcecc2007fc34282e7ee61", "3a0ed78458b3976243db6829f63eba3eead26774", "0a1a5c523d835459c42f33e863623138555e2526", "1143db502761379c2bfcecc2007fc34282e7ee61", "0a1a5c523d835459c42f33e863623138555e2526", NA, NA, NA, NA, # author mail - "0a1a5c523d835459c42f33e863623138555e2526", "0a1a5c523d835459c42f33e863623138555e2526", # artifact cochange + "0a1a5c523d835459c42f33e863623138555e2526", # artifact cochange "72c8dd25d3dd6d18f46e2b26a5f5b1e2e8dc28d0", "5a5ec9675e98187e1e92561e1888aa6f04faa338", # bipartite cochange "3a0ed78458b3976243db6829f63eba3eead26774", "1143db502761379c2bfcecc2007fc34282e7ee61", "0a1a5c523d835459c42f33e863623138555e2526", "0a1a5c523d835459c42f33e863623138555e2526", rep(NA, 21)), # bipartite issue file = c("test.c", "test.c", "test2.c", "test3.c", "test2.c", "test2.c", "test3.c", "test2.c", # author cochange NA, NA, NA, NA, - "test2.c", "test2.c", # artifact cochange + "test2.c", # artifact cochange "test.c", "test.c", "test2.c", "test3.c", "test2.c", "test2.c", # bipartite cochange rep(NA, 21)), artifact = c("A", "A", "Base_Feature", "Base_Feature", "Base_Feature", "Base_Feature", "Base_Feature", # author cochange "Base_Feature", rep(NA, 4), - "Base_Feature", "foo", # bipartite cochange + NA, # artifact cochange "A", "A", "Base_Feature", "Base_Feature", "Base_Feature", "foo", # bipartite cochange rep(NA, 21)), weight = 1, - type = c(rep(TYPE.EDGES.INTRA, 14), rep(TYPE.EDGES.INTER, 27)), - relation = c(rep("cochange", 8), rep("mail", 4), rep("cochange", 2), rep("cochange", 6), + type = c(rep(TYPE.EDGES.INTRA, 13), rep(TYPE.EDGES.INTER, 27)), + relation = c(rep("cochange", 8), rep("mail", 4), rep("cochange", 1), rep("cochange", 6), 
rep("issue", 21)), message.id = c(rep(NA, 8), "<4cbaa9ef0802201124v37f1eec8g89a412dfbfc8383a@mail.gmail.com>", "<6784529b0802032245r5164f984l342f0f0dc94aa420@mail.gmail.com>", "<65a1sf31sagd684dfv31@mail.gmail.com>", "<9b06e8d20801220234h659c18a3g95c12ac38248c7e0@mail.gmail.com>", - rep(NA, 29)), + rep(NA, 28)), thread = c(rep(NA, 8), "", "", "", "", - rep(NA, 29)), - issue.id = c(rep(NA, 20), + rep(NA, 28)), + author.name = c(rep(NA, 12), "Thomas", rep(NA, 27)), + issue.id = c(rep(NA, 19), "", "", "", "", # bipartite issue "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""), - event.name = c(rep(NA, 20), rep("commented", 21)) + event.name = c(rep(NA, 19), rep("commented", 21)) ) ## 3) build expected network diff --git a/tests/test-networks-multi.R b/tests/test-networks-multi.R index 9ce03817..bbc93894 100644 --- a/tests/test-networks-multi.R +++ b/tests/test-networks-multi.R @@ -15,6 +15,7 @@ ## Copyright 2018 by Claus Hunsen ## Copyright 2018 by Barbara Eckl ## Copyright 2022 by Jonathan Baumann +## Copyright 2023 by Maximilian Löffler ## All Rights Reserved. 
@@ -56,38 +57,37 @@ test_that("Construction of the multi network for the feature artifact with autho ) row.names(vertices) = c("Björn", "Olaf", "Karl", "Thomas", "Base_Feature", "foo", "A") - edges = data.frame( - from = c("Björn", "Björn", "Olaf", "Olaf", "Olaf", "Olaf", "Karl", "Karl", - "Base_Feature", "Base_Feature", "Björn", "Olaf", "Olaf", "Karl", "Thomas", + from = c("Björn", "Björn", "Olaf", "Olaf", "Olaf", "Olaf", "Karl", "Karl", + "Base_Feature", "Björn", "Olaf", "Olaf", "Karl", "Thomas", "Thomas"), - to = c("Olaf", "Olaf", "Karl", "Karl", "Thomas", "Thomas", "Thomas", "Thomas", "foo", + to = c("Olaf", "Olaf", "Karl", "Karl", "Thomas", "Thomas", "Thomas", "Thomas", "foo", "A", "A", "Base_Feature", "Base_Feature", "Base_Feature", "foo"), date = get.date.from.string(c("2016-07-12 15:58:59", "2016-07-12 16:00:45", "2016-07-12 16:05:41", "2016-07-12 16:06:10", "2016-07-12 16:05:41", "2016-07-12 16:06:32", - "2016-07-12 16:06:10", "2016-07-12 16:06:32", "2016-07-12 16:06:32", - "2016-07-12 16:06:32", "2016-07-12 15:58:59", "2016-07-12 16:00:45", - "2016-07-12 16:05:41", "2016-07-12 16:06:10", "2016-07-12 16:06:32", - "2016-07-12 16:06:32")), + "2016-07-12 16:06:10", "2016-07-12 16:06:32", "2016-07-12 16:06:32", + "2016-07-12 15:58:59", "2016-07-12 16:00:45", "2016-07-12 16:05:41", + "2016-07-12 16:06:10", "2016-07-12 16:06:32", "2016-07-12 16:06:32")), artifact.type = c("Feature", "Feature", "Feature", "Feature", "Feature", "Feature", "Feature", "Feature", "Feature", "Feature", "Feature", "Feature", "Feature", "Feature", - "Feature", "Feature"), + "Feature"), hash = c("72c8dd25d3dd6d18f46e2b26a5f5b1e2e8dc28d0", "5a5ec9675e98187e1e92561e1888aa6f04faa338", "3a0ed78458b3976243db6829f63eba3eead26774", "1143db502761379c2bfcecc2007fc34282e7ee61", "3a0ed78458b3976243db6829f63eba3eead26774", "0a1a5c523d835459c42f33e863623138555e2526", "1143db502761379c2bfcecc2007fc34282e7ee61", "0a1a5c523d835459c42f33e863623138555e2526", - "0a1a5c523d835459c42f33e863623138555e2526", 
"0a1a5c523d835459c42f33e863623138555e2526", - "72c8dd25d3dd6d18f46e2b26a5f5b1e2e8dc28d0", "5a5ec9675e98187e1e92561e1888aa6f04faa338", - "3a0ed78458b3976243db6829f63eba3eead26774", "1143db502761379c2bfcecc2007fc34282e7ee61", - "0a1a5c523d835459c42f33e863623138555e2526", "0a1a5c523d835459c42f33e863623138555e2526"), + "0a1a5c523d835459c42f33e863623138555e2526", "72c8dd25d3dd6d18f46e2b26a5f5b1e2e8dc28d0", + "5a5ec9675e98187e1e92561e1888aa6f04faa338", "3a0ed78458b3976243db6829f63eba3eead26774", + "1143db502761379c2bfcecc2007fc34282e7ee61", "0a1a5c523d835459c42f33e863623138555e2526", + "0a1a5c523d835459c42f33e863623138555e2526"), file = c("test.c", "test.c", "test2.c", "test3.c", "test2.c", "test2.c", "test3.c", "test2.c", - "test2.c", "test2.c", "test.c", "test.c", "test2.c", "test3.c", "test2.c", "test2.c"), + "test2.c", "test.c", "test.c", "test2.c", "test3.c", "test2.c", "test2.c"), artifact = c("A", "A", "Base_Feature", "Base_Feature", "Base_Feature", "Base_Feature", "Base_Feature", - "Base_Feature", "Base_Feature", "foo", "A", "A", "Base_Feature", "Base_Feature", "Base_Feature", + "Base_Feature", NA, "A", "A", "Base_Feature", "Base_Feature", "Base_Feature", "foo"), weight = 1, - type = c(rep(TYPE.EDGES.INTRA, 10), rep(TYPE.EDGES.INTER, 6)), - relation = "cochange" + type = c(rep(TYPE.EDGES.INTRA, 9), rep(TYPE.EDGES.INTER, 6)), + relation = "cochange", + author.name = c(NA, NA, NA, NA, NA, NA, NA, NA, "Thomas", NA, NA, NA, NA, NA, NA) ) network.expected = igraph::graph.data.frame(edges, directed = FALSE, vertices = vertices) diff --git a/tests/test-read.R b/tests/test-read.R index 48cae572..9a597f23 100644 --- a/tests/test-read.R +++ b/tests/test-read.R @@ -21,6 +21,7 @@ ## Copyright 2021 by Johannes Hostert ## Copyright 2021 by Mirabdulla Yusifli ## Copyright 2022 by Jonathan Baumann +## Copyright 2022-2023 by Maximilian Löffler ## All Rights Reserved. 
@@ -129,7 +130,7 @@ test_that("Read the raw commit data with the file artifact.", { artifact = c("test.c", "test.c", "test2.c", "test3.c", UNTRACKED.FILE, UNTRACKED.FILE, "test2.c"), artifact.type = c("File", "File", "File", "File", UNTRACKED.FILE.EMPTY.ARTIFACT.TYPE, UNTRACKED.FILE.EMPTY.ARTIFACT.TYPE, "File"), - artifact.diff.size = c(1, 1, 1, 1, 0, 0, 1)) + artifact.diff.size = as.integer(c(1, 1, 1, 1, 0, 0, 1))) ## check the results expect_identical(commit.data.read, commit.data.expected, info = "Raw commit data.") @@ -243,7 +244,7 @@ test_that("Read the author data.", { ## build the expected data.frame author.data.expected = data.frame( - author.id = as.integer(c(4936, 4937, 4938, 4939, 4940, 4941, 4942, 4943, 4944)), + author.id = as.character(c(4936, 4937, 4938, 4939, 4940, 4941, 4942, 4943, 4944)), author.name = c("Thomas", "Olaf", "Björn", "udo", "Fritz fritz@example.org", "georg", "Hans", "Karl", "Max"), author.email = c("thomas@example.org", "olaf@example.org", "bjoern@example.org", "udo@example.org", "asd@sample.org", "heinz@example.org", "hans1@example.org", "karl@example.org", "max@example.org"), diff --git a/util-conf.R b/util-conf.R index a34e24e8..f05c2b92 100644 --- a/util-conf.R +++ b/util-conf.R @@ -18,7 +18,7 @@ ## Copyright 2020-2021 by Christian Hechtl ## Copyright 2017 by Felix Prasse ## Copyright 2017-2019 by Thomas Bock -## Copyright 2021 by Thomas Bock +## Copyright 2021, 2023 by Thomas Bock ## Copyright 2018 by Barbara Eckl ## Copyright 2018-2019 by Jakob Kronawitter ## Copyright 2019 by Anselm Fehnker @@ -100,6 +100,7 @@ Conf = R6::R6Class("Conf", #' #' @return a named vector of logical values, named: #' - existing, + #' - value.not.empty, #' - type, #' - allowed, #' - allowed.number, and @@ -109,15 +110,18 @@ Conf = R6::R6Class("Conf", check.value = function(value, name) { if (!exists(name, where = private[["attributes"]])) { result = c(existing = FALSE) + } else if (length(value) < 1){ + result = c(existing = TRUE, value.not.empty = 
FALSE) } else { ## check all other properties attribute = private[["attributes"]][[name]] ## if non-updatable field, return early if (!is.null(attribute[["updatable"]]) && !attribute[["updatable"]]) { - result = c(existing = TRUE, updatable = FALSE) + result = c(existing = TRUE, value.not.empty = TRUE, updatable = FALSE) } else { result = c( existing = TRUE, + value.not.empty = TRUE, updatable = TRUE, type = class(value) %in% attribute[["type"]], ## if 'allowed' is not defined for this attribute, any @@ -219,22 +223,31 @@ Conf = R6::R6Class("Conf", if (!check[["existing"]]) { message = paste( - "Updating network-configuration attribute '%s' failed:", - "A network-configuraton attribute with this name does not exist." + "Updating configuration attribute '%s' failed:", + "A configuraton attribute with this name does not exist." + ) + error.function(sprintf(message, name)) + + } else if (!check[["value.not.empty"]]) { + + message = paste( + "Updating configuration attribute '%s' failed:", + "The provided value is empty!" ) error.function(sprintf(message, name)) } else if (!check[["updatable"]]) { message = paste( - "Updating network-configuration attribute '%s' failed:", + "Updating configuration attribute '%s' failed:", "The value is not updatable!" 
) error.function(message, name) } else { + message = paste0( - "Updating network-configuration attribute '%s' failed.\n", + "Updating configuration attribute '%s' failed.\n", "Allowed values (%s of type '%s'): %s\n", "Given value (of type '%s'): %s" ) @@ -264,22 +277,24 @@ Conf = R6::R6Class("Conf", paste(names.to.update, collapse = ", ") ) for (name in names.to.update) { + default.value = private[["attributes"]][[name]][["default"]] + new.value = updated.values[[name]] + ## check if the default value or the given new value are NA ## if only one of both is NA that means that the value has to be changed - if (is.na(private[["attributes"]][[name]][["default"]]) && !is.na(updated.values[[name]]) || - !is.na(private[["attributes"]][[name]][["default"]]) && is.na(updated.values[[name]])) { - private[["attributes"]][[name]][["value"]] = updated.values[[name]] + if (is.single.na(default.value) && !is.single.na(new.value) || + !is.single.na(default.value) && is.single.na(new.value)) { + private[["attributes"]][[name]][["value"]] = new.value } ## if the default value and the given value are the same and if the 'value' field is present ## then reset the 'value' field - else if (is.na(private[["attributes"]][[name]][["default"]]) && is.na(updated.values[[name]]) || - identical(sort(updated.values[[name]]), - sort(private[["attributes"]][[name]][["default"]]))) { + else if (is.single.na(default.value) && is.single.na(new.value) || + identical(sort(new.value), sort(default.value))) { if ("value" %in% names(private[["attributes"]][[name]])) { private[["attributes"]][[name]][["value"]] = NULL } } ## otherwise proceed with updating the value else { - private[["attributes"]][[name]][["value"]] = sort(updated.values[[name]]) + private[["attributes"]][[name]][["value"]] = sort(new.value) } } } else { diff --git a/util-data.R b/util-data.R index e4025c75..80470d75 100644 --- a/util-data.R +++ b/util-data.R @@ -13,7 +13,7 @@ ## ## Copyright 2016-2019 by Claus Hunsen ## Copyright 
2017-2019 by Thomas Bock -## Copyright 2020-2021 by Thomas Bock +## Copyright 2020-2021, 2023 by Thomas Bock ## Copyright 2017 by Raphael Nömmer ## Copyright 2017-2018 by Christian Hechtl ## Copyright 2020 by Christian Hechtl @@ -21,10 +21,11 @@ ## Copyright 2017 by Ferdinand Frank ## Copyright 2018-2019 by Jakob Kronawitter ## Copyright 2019-2020 by Anselm Fehnker -## Copyright 2020-2021 by Niklas Schneider +## Copyright 2020-2021, 2023 by Niklas Schneider ## Copyright 2021 by Johannes Hostert ## Copyright 2021 by Mirabdulla Yusifli ## Copyright 2022 by Jonathan Baumann +## Copyright 2022-2023 by Maximilian Löffler ## All Rights Reserved. @@ -34,6 +35,7 @@ requireNamespace("R6") # for R6 classes requireNamespace("logging") # for logging requireNamespace("parallel") # for parallel computation +requireNamespace("lubridate") # for date conversion ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / @@ -267,7 +269,7 @@ ProjectData = R6::R6Class("ProjectData", result = parallel::mclapply(thread.data, function(thread) { ## ensure that all mails within the thread are ordered correctly - thread = thread[order(thread["date"]), ] + thread = thread[order(thread[["date"]]), ] running = TRUE i = 1 @@ -397,7 +399,7 @@ ProjectData = R6::R6Class("ProjectData", ## only print warning if this function has not been called by 'cleanup.commit.message.data' including the ## case that it is called manually, i.e. the stack is too short. - if (is.na(caller) || caller != "cleanup.commit.message.data()") { + if (is.na(caller) || paste(caller, collapse = " ") != "cleanup.commit.message.data()") { logging::logwarn("There might be commit message data that does not appear in the commit data. To clean this up you can call the function 'cleanup.commit.message.data()'.") } @@ -639,7 +641,7 @@ ProjectData = R6::R6Class("ProjectData", ## only print warning if this function has not been called by 'cleanup.pasta.data' including the case ## that it is called manually, i.e. 
the stack is too short. - if (is.na(caller) || caller != "cleanup.pasta.data()") { + if (all(is.na(caller)) || paste(caller, collapse = " ") != "cleanup.pasta.data()") { logging::logwarn("There might be PaStA data that does not appear in the mail or commit data. To clean this up you can call the function 'cleanup.pasta.data()'.") } @@ -693,7 +695,7 @@ ProjectData = R6::R6Class("ProjectData", ## only print warning if this function has not been called by 'cleanup.synchronicity.data' including the case ## that it is called manually, i.e. the stack is too short. - if (is.na(caller) || caller != "cleanup.synchronicity.data()") { + if (all(is.na(caller)) || paste(caller, collapse = " ") != "cleanup.synchronicity.data()") { logging::logwarn("There might be synchronicity data that does not appear in the commit data. To clean this up you can call the function 'cleanup.synchronicity.data()'.") } @@ -1064,8 +1066,14 @@ ProjectData = R6::R6Class("ProjectData", if (is.null(commit.data)) { commit.data = create.empty.commits.list() + } else { + ## check that dataframe is of correct shape + verify.data.frame.columns(commit.data, COMMITS.LIST.COLUMNS, COMMITS.LIST.DATA.TYPES) } + ## remove commits that have no author or commiter + commit.data = remove.deleted.and.empty.user(commit.data, c("author.name", "committer.name")) + ## store commit data private$commits.unfiltered = commit.data @@ -1145,6 +1153,9 @@ ProjectData = R6::R6Class("ProjectData", if (is.null(data)) { data = create.empty.commit.message.list() + } else { + ## check that dataframe is of correct shape + verify.data.frame.columns(data, COMMIT.MESSAGE.LIST.COLUMNS, COMMIT.MESSAGE.LIST.DATA.TYPES) } ## set the actual data @@ -1214,6 +1225,9 @@ ProjectData = R6::R6Class("ProjectData", if (is.null(data)) { data = create.empty.synchronicity.list() + } else { + ## check that dataframe is of correct shape + verify.data.frame.columns(data, SYNCHRONICITY.LIST.COLUMNS, SYNCHRONICITY.LIST.DATA.TYPES) } ## set the actual data @@ 
-1287,6 +1301,9 @@ ProjectData = R6::R6Class("ProjectData", if (is.null(data)) { data = create.empty.pasta.list() + } else { + ## check that dataframe is of correct shape + verify.data.frame.columns(data, PASTA.LIST.COLUMNS, PASTA.LIST.DATA.TYPES) } ## set the actual data @@ -1368,6 +1385,9 @@ ProjectData = R6::R6Class("ProjectData", if (is.null(data)) { data = create.empty.gender.list() + } else { + ## check that dataframe is of correct shape + verify.data.frame.columns(data, GENDER.LIST.COLUMNS, GENDER.LIST.DATA.TYPES) } ## set the actual data @@ -1444,8 +1464,14 @@ ProjectData = R6::R6Class("ProjectData", if (is.null(mail.data)) { mail.data = create.empty.mails.list() + } else { + ## check that dataframe is of correct shape + verify.data.frame.columns(mail.data, MAILS.LIST.COLUMNS, MAILS.LIST.DATA.TYPES) } + ## remove deleted and empty users + mail.data = remove.deleted.and.empty.user(mail.data) + ## store mail data private$mails.unfiltered = mail.data private$mails = mail.data @@ -1502,6 +1528,17 @@ ProjectData = R6::R6Class("ProjectData", set.authors = function(data) { logging::loginfo("Setting author data.") private$authors = data + + if (is.null(data)) { + data = create.empty.authors.list() + } else { + ## check that dataframe is of correct shape + verify.data.frame.columns(data, AUTHORS.LIST.COLUMNS, AUTHORS.LIST.DATA.TYPES) + } + + ## remove deleted and empty users + data = remove.deleted.and.empty.user(data) + ## add gender data if wanted if (private$project.conf$get.value("gender")) { @@ -1606,8 +1643,15 @@ ProjectData = R6::R6Class("ProjectData", if (is.null(data)) { data = create.empty.issues.list() + } else { + ## check that dataframe is of correct shape + verify.data.frame.columns(data, ISSUES.LIST.COLUMNS, ISSUES.LIST.DATA.TYPES) } + ## remove deleted user from the "author.name" column, + ## however, keep events where the user in the "event.info.1" column is empty or deleted + data = remove.deleted.and.empty.user(data) + private$issues.unfiltered = 
data private$issues = create.empty.issues.list() }, @@ -2129,6 +2173,11 @@ ProjectData = R6::R6Class("ProjectData", #' #' @param custom.event.timestamps the list of timestamps to set set.custom.event.timestamps = function(custom.event.timestamps) { + if (!is.list(custom.event.timestamps)) { + error.message = sprintf("set.custom.event.timestamps: Input is expected to be a list.") + logging::logerror(error.message) + stop(error.message) + } if(length(custom.event.timestamps) != 0){ private$custom.event.timestamps = custom.event.timestamps[ order(unlist(get.date.from.string(custom.event.timestamps))) diff --git a/util-misc.R b/util-misc.R index b161cf67..152f13ca 100644 --- a/util-misc.R +++ b/util-misc.R @@ -16,10 +16,11 @@ ## Copyright 2017 by Christian Hechtl ## Copyright 2017 by Felix Prasse ## Copyright 2017-2018 by Thomas Bock -## Copyright 2020-2021 by Thomas Bock +## Copyright 2020-2021, 2023 by Thomas Bock ## Copyright 2018-2019 by Jakob Kronawitter ## Copyright 2021 by Niklas Schneider ## Copyright 2022 by Jonathan Baumann +## Copyright 2022-2023 by Maximilian Löffler ## All Rights Reserved. @@ -139,6 +140,79 @@ match.arg.or.default = function(arg, choices, default = NULL, several.ok = FALSE } } +#' Check if a dataframe matches a given structure. This includes the dataframe to contain columns +#' which must match the column names in \code{columns} and the datatypes in \code{data.types}. +#' +#' @param data the data frame under investigation for structural conformity +#' @param columns a character vector containing the column names the data frame should include +#' @param data.types an ordered vector containing the data types corresponding to the columns. +#' If this parameter is \code{NULL} only the existence of \code{columns} is checked +#' without regarding column types. 
Otherwise this vector must be of the +#' same length as the vector of \code{columns} +#' [default: NULL] +verify.data.frame.columns = function(data, columns, data.types = NULL) { + + ## every column of the data frame must be one to one mapped to a datatype expected in the column + ## therefore if there aren't as many datatypes provided in \code{data.types} as column names have + ## been provided in \code{columns} we can stop here already + if (!is.null(data.types) && length(columns) != length(data.types)) { + error.message = sprintf(paste("If specified, the length of the two given vectors", + "'columns' and 'data.types' must match.")) + logging::logerror(error.message) + stop(error.message) + } + + ## obtain vector of all column names included in the data frame to ease further checks + data.frame.columns = colnames(data) + + ## iterate over all columns in \code{columns} + for (i in seq_along(columns)) { + + ## obtain the column. + column = columns[i] + + ## stop verification process early if column is not present in the data frame + if (!(column %in% data.frame.columns)) { + error.message = sprintf("Column '%s' is missing from the dataframe", column) + logging::logerror(error.message) + stop(error.message) + } + + if (!is.null(data.types)) { + + ## obtain the datatype that should be present in the data frame column + ## which is currently under investigation + expected.type = data.types[i] + + ## necessary case distinction for special case list where calling \code{base::class} + ## removes the information whether or not \code{data[[column]]} is a list + if (expected.type == "list()") { + + ## column is not a list + if (!is.list(data[[column]])) { + error.message = sprintf("Column '%s' is expected to be a list but is '%s'", + column, class(received.type)) + logging::logerror(error.message) + stop(error.message) + } + + } else { + ## obtain the datatype that elements of the current column hold in the data frame + received.type = class(data[[column]]) + + ## stop 
verification process early if column type in the data frame is not matching + ## the expected datatype + if (!(expected.type %in% received.type)) { + error.message = sprintf("Column '%s' has type '%s' in dataframe, expected '%s'", + column, received.type, expected.type) + logging::logerror(error.message) + stop(error.message) + } + } + } + } +} + ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / ## Empty dataframe creation------------------------------------------------- @@ -220,6 +294,17 @@ get.second.last.element = function(v) { } } +#' Check if a value is a single \code{NA} value. +#' (The function \code{is.na} is not capable of doing that, as it does the \code{NA} check for each element of a vector +#' instead of checking whether vector itself is just a single \code{NA} element.) +#' +#' @param x an R object to be tested: atomic vectors, lists, pairlists, or ‘NULL’ +#' +#' @return \code{TRUE} if \code{x} is of length 1 and \code{x} is \code{NA}; \code{FALSE} otherwise +is.single.na = function(x) { + return(length(x) == 1 && is.na(x)) +} + ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / ## Stacktrace -------------------------------------------------------------- diff --git a/util-networks-covariates.R b/util-networks-covariates.R index 31dd7134..9d560fed 100644 --- a/util-networks-covariates.R +++ b/util-networks-covariates.R @@ -14,7 +14,7 @@ ## Copyright 2017 by Felix Prasse ## Copyright 2018-2019 by Claus Hunsen ## Copyright 2018-2019 by Thomas Bock -## Copyright 2021 by Thomas Bock +## Copyright 2021, 2023 by Thomas Bock ## Copyright 2018-2019 by Klara Schlüter ## Copyright 2018 by Jakob Kronawitter ## Copyright 2020 by Christian Hechtl @@ -1594,7 +1594,7 @@ get.first.activity.data = function(range.data, activity.types = c("commits", "ma ## check for keys whose member lists are empty or NA ## first, get a logical vector indicating all missing keys missing.keys = sapply(activity.types, function(x) { - 
is.na(activity.by.type[[x]]) || length(activity.by.type[[x]]) == 0 + all(is.na(activity.by.type[[x]])) || length(activity.by.type[[x]]) == 0 }) ## then apply this vector to the 'activity.types' vector in order to pick the actual keys missing.keys = activity.types[missing.keys] @@ -1604,7 +1604,7 @@ get.first.activity.data = function(range.data, activity.types = c("commits", "ma ## if there are no keys left that are present, again, print a warning and return an empty list as there is no data ## for the configured activity types - if (length(present.keys) == 0 || is.na(present.keys) || is.null(present.keys)) { + if (length(present.keys) == 0 || all(is.na(present.keys)) || is.null(present.keys)) { logging::logwarn("There were no activities in the given RangeData that were configured") return(list()) } @@ -1618,7 +1618,7 @@ get.first.activity.data = function(range.data, activity.types = c("commits", "ma for (missing.key in missing.keys) { logging::logwarn(paste("The type", missing.key, "was configured but the RangeData did not contain any", - "activities of that type"), sep = " ") + "activities of that type", sep = " ")) activity.by.type[missing.key] = na.list } diff --git a/util-networks-metrics.R b/util-networks-metrics.R index 092ee15c..faa7c4f6 100644 --- a/util-networks-metrics.R +++ b/util-networks-metrics.R @@ -12,7 +12,7 @@ ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
## ## Copyright 2015, 2019 by Thomas Bock -## Copyright 2021 by Thomas Bock +## Copyright 2021, 2023 by Thomas Bock ## Copyright 2017 by Raphael Nömmer ## Copyright 2017-2019 by Claus Hunsen ## Copyright 2017-2018 by Christian Hechtl @@ -26,6 +26,7 @@ ## Libraries --------------------------------------------------------------- requireNamespace("igraph") +requireNamespace("logging") ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / @@ -36,7 +37,8 @@ requireNamespace("igraph") #' @param network the network to be examined #' @param mode the mode to be used for determining the degrees [default: "total"] #' -#' @return A data frame containing the name of the vertex with with maximum degree its degree. +#' @return If the network is empty (i.e., has no vertices), \code{NA}. +#' Otherwise, a data frame containing the name of the vertex/vertices with maximum degree and its/their degree. metrics.hub.degree = function(network, mode = c("total", "in", "out")) { ## check whether the network is empty, i.e., if it has no vertices if (igraph::vcount(network) == 0) { @@ -102,8 +104,9 @@ metrics.density = function(network) { #' @param directed whether to consider directed paths in directed networks [default: TRUE] #' @param unconnected whether there are subnetworks in the network that are not connected. #' If \code{TRUE} only the lengths of the existing paths are considered and averaged; -#' if \code{FALSE} the length of the missing paths are counted having length \code{vcount(graph)}, one longer than -#' the longest possible geodesic in the network (from igraph documentation) [default: TRUE] +#' if \code{FALSE} the length of the missing paths are counted having length \code{vcount(graph)}, +#' one longer than the longest possible geodesic in the network (from igraph documentation) +#' [default: TRUE] #' #' @return The average path length of the given network. 
metrics.avg.pathlength = function(network, directed = TRUE, unconnected = TRUE) { @@ -130,7 +133,8 @@ metrics.clustering.coeff = function(network, cc.type = c("global", "local", "bar #' #' @param network the network to be examined #' @param community.detection.algorithm the algorithm to be used for the detection of communities -#' which is required for the calculation of the clustering coefficient [default: igraph::cluster_walktrap] +#' which is required for the calculation of the clustering coefficient +#' [default: igraph::cluster_walktrap] #' #' @return The modularity value for the given network. metrics.modularity = function(network, community.detection.algorithm = igraph::cluster_walktrap) { @@ -211,7 +215,7 @@ metrics.is.smallworld = function(network) { #' #' @param network the network to be examined #' @param minimum.number.vertices the minimum number of vertices with which -#' a network can be scale free [default: 30] +#' a network can be scale free [default: 30] #' #' @return A dataframe containing the different values, connected to scale-freeness. metrics.scale.freeness = function(network, minimum.number.vertices = 30) { @@ -256,7 +260,7 @@ metrics.scale.freeness = function(network, minimum.number.vertices = 30) { #' #' @param network the network to be examined #' @param minimum.number.vertices the minimum number of vertices with which -#' a network can be scale free [default: 30] +#' a network can be scale free [default: 30] #' #' @return \code{TRUE}, if the network is scale free, #' \code{FALSE}, otherwise. @@ -305,7 +309,7 @@ VERTEX.CENTRALITIES.COLUMN.NAMES = c("vertex.name", "centrality") #' - "network.degree" #' - "network.eigen" #' - "network.hierarchy" -#' [defalt: "network.degree"] +#' [default: "network.degree"] #' @param restrict.classification.to.vertices a vector of vertex names. Only vertices that are contained within this #' vector are to be classified. 
Vertices that appear in the vector but are #' not part of the classification result (i.e., they are not present in the diff --git a/util-networks-misc.R b/util-networks-misc.R index bc5489c4..c2ebc509 100644 --- a/util-networks-misc.R +++ b/util-networks-misc.R @@ -14,7 +14,7 @@ ## Copyright 2016-2017 by Sofie Kemper ## Copyright 2016-2017 by Claus Hunsen ## Copyright 2016-2018 by Thomas Bock -## Copyright 2020 by Thomas Bock +## Copyright 2020, 2023 by Thomas Bock ## Copyright 2017 by Angelika Schmid ## Copyright 2019 by Jakob Kronawitter ## Copyright 2019-2020 by Anselm Fehnker @@ -104,7 +104,7 @@ get.author.names.from.data = function(data.ranges, data.sources = c("commits", " ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / ## Adjacency matrices ---------------------------------------------------- -#' Get a sparse expanded adjacency matrix for network. +#' Get a sparse expanded adjacency matrix (in triplet format) for a given network. #' #' The adjacency matrix is expanded as it may contain rows and columns for authors which are not part of the network #' but given in the \code{authors} parameter. However, this also means that authors present in the network @@ -117,9 +117,10 @@ get.author.names.from.data = function(data.ranges, data.sources = c("commits", " #' @return the sparse adjacency matrix of the network get.expanded.adjacency = function(network, authors, weighted = FALSE) { - ## create an empty sparse matrix with the right size - matrix = Matrix::sparseMatrix(i = c(), j = c(), dims = c(length(authors), length(authors)), giveCsparse = FALSE) - matrix = as(matrix, "dgTMatrix") + ## create an empty sparse matrix using the triplet form with the right size. 
+ ## x = 0 indicates that the matrix should contain numeric values (i.e., it is a 'dgTMatrix'; + ## without setting x = 0 it would be a binary 'ngTMatrix') + matrix = Matrix::sparseMatrix(i = c(), j = c(), x = 0, dims = c(length(authors), length(authors)), repr = "T") ## add row and column names rownames(matrix) = authors @@ -225,11 +226,11 @@ convert.adjacency.matrix.list.to.array = function(adjacency.list){ colnames(array) = colnames(adjacency.list[[1]]) ## copy the activity values from the adjacency matrices in the list to the corresponding array slices - for (i in seq_along(adjacency.list)){ + for (i in seq_along(adjacency.list)) { adjacency = adjacency.list[[i]] - activity.indices = which(adjacency != 0, arr.ind = TRUE) + activity.indices = Matrix::which(adjacency != 0, arr.ind = TRUE) - for (j in 1:nrow(activity.indices)){ + for (j in seq_len(nrow(activity.indices))) { array[as.vector(activity.indices[j, 1]), as.vector(activity.indices[j, 2]), i] = adjacency[as.vector(activity.indices[j, 1]), as.vector(activity.indices[j, 2])] } diff --git a/util-networks.R b/util-networks.R index 5da3b4d5..9f205bba 100644 --- a/util-networks.R +++ b/util-networks.R @@ -15,12 +15,13 @@ ## Copyright 2017 by Raphael Nömmer ## Copyright 2017-2018 by Christian Hechtl ## Copyright 2017-2019 by Thomas Bock -## Copyright 2021 by Thomas Bock +## Copyright 2021, 2023 by Thomas Bock ## Copyright 2018 by Barbara Eckl ## Copyright 2018-2019 by Jakob Kronawitter ## Copyright 2020 by Anselm Fehnker ## Copyright 2021 by Niklas Schneider ## Copyright 2022 by Jonathan Baumann +## Copyright 2023 by Maximilian Löffler ## All Rights Reserved. 
@@ -32,6 +33,7 @@ requireNamespace("logging") # for logging requireNamespace("parallel") # for parallel computation requireNamespace("plyr") # for dlply function requireNamespace("igraph") # networks +requireNamespace("lubridate") # for date conversion ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / @@ -315,7 +317,9 @@ NetworkBuilder = R6::R6Class("NetworkBuilder", artifacts.net.data = construct.edge.list.from.key.value.list( artifacts.net.data.raw, network.conf = private$network.conf, - directed = FALSE + directed = FALSE, + respect.temporal.order = TRUE, + artifact.edges = TRUE ) ## construct network from obtained data @@ -900,11 +904,14 @@ NetworkBuilder = R6::R6Class("NetworkBuilder", ## check directedness and adapt artifact network if needed if (igraph::is.directed(authors.net) && !igraph::is.directed(artifacts.net)) { - logging::logwarn("Author network is directed, but artifact network is not. Converting artifact network...") + logging::logwarn(paste0("Author network is directed, but artifact network is not.", + "Converting artifact network...")) artifacts.net = igraph::as.directed(artifacts.net, mode = "mutual") } else if (!igraph::is.directed(authors.net) && igraph::is.directed(artifacts.net)) { - logging::logwarn("Author network is undirected, but artifact network is not. 
Converting artifact network...") - artifacts.net = igraph::as.undirected(artifacts.net, mode = "each", edge.attr.comb = EDGE.ATTR.HANDLING) + logging::logwarn(paste0("Author network is undirected, but artifact network is not.", + "Converting artifact network...")) + artifacts.net = igraph::as.undirected(artifacts.net, mode = "each", + edge.attr.comb = EDGE.ATTR.HANDLING) } ## reduce memory consumption by removing temporary data @@ -914,6 +921,23 @@ NetworkBuilder = R6::R6Class("NetworkBuilder", ## combine the networks: ## 1) merge the existing networks u = igraph::disjoint_union(authors.net, artifacts.net) + + ## As there is a bug in 'igraph::disjoint_union' in igraph versions 1.4.0, 1.4.1, and 1.4.2 + ## (see https://github.com/igraph/rigraph/issues/761), we need to adjust the type of the date attribute + ## of the outcome of 'igraph::disjoint_union'. + ## Note: The following temporary fix only considers the 'date' attribute. However, this problem could also + ## affect several other attributes, whose classes are not adjusted in our temporary fix. + ## The following code block should be redundant as soon as igraph has fixed their bug. + u.actual.edge.attribute.date = igraph::get.edge.attribute(u, "date") + if (!is.null(u.actual.edge.attribute.date)) { + if (is.list(u.actual.edge.attribute.date)) { + u.expected.edge.attribute.date = lapply(u.actual.edge.attribute.date, get.date.from.unix.timestamp) + } else { + u.expected.edge.attribute.date = get.date.from.unix.timestamp(u.actual.edge.attribute.date) + } + u = igraph::set.edge.attribute(u, "date", value = u.expected.edge.attribute.date) + } + ## 2) add the bipartite edges u = add.edges.for.bipartite.relation(u, authors.to.artifacts, private$network.conf) @@ -949,11 +973,14 @@ NetworkBuilder = R6::R6Class("NetworkBuilder", #' i.e., whether to only add edges from the later event to the previous one. #' If \code{NA} is passed, the default value is taken. 
#' [default: directed] +#' @param artifact.edges whether the key value data represents edges in an artifact network based +#' on the cochange relation +#' [default: FALSE] #' #' @return a list of two data.frames named 'vertices' and 'edges' (compatible with return value #' of \code{igraph::as.data.frame}) construct.edge.list.from.key.value.list = function(list, network.conf, directed = FALSE, - respect.temporal.order = directed) { + respect.temporal.order = directed, artifact.edges = FALSE) { logging::loginfo("Create edges.") logging::logdebug("construct.edge.list.from.key.value.list: starting.") @@ -970,6 +997,19 @@ construct.edge.list.from.key.value.list = function(list, network.conf, directed keys = names(list) keys.number = length(list) + + ## if edges in an artifact network contain the \code{artifact} attribute + ## replace it with the \code{author.name} attribute as artifacts cannot cause + ## edges in artifact networks, authors can + edge.attributes = network.conf$get.value("edge.attributes") + if (artifact.edges) { + artifact.index = match("artifact", edge.attributes, nomatch = NA) + if (!is.na(artifact.index)) { + edge.attributes = edge.attributes[-artifact.index] + edge.attributes = c(edge.attributes, c("author.name")) + } + } + if (respect.temporal.order) { ## for all subsets (sets), connect all items in there with the previous ones @@ -998,11 +1038,11 @@ construct.edge.list.from.key.value.list = function(list, network.conf, directed item.vertex = item[["data.vertices"]] ## get edge attributes - cols.which = network.conf$get.value("edge.attributes") %in% colnames(item) - item.edge.attrs = item[, network.conf$get.value("edge.attributes")[cols.which], drop = FALSE] + cols.which = edge.attributes %in% colnames(item) + item.edge.attrs = item[ , edge.attributes[cols.which], drop = FALSE] ## construct edges - combinations = expand.grid(item.vertex, vertices.processed.set, stringsAsFactors = default.stringsAsFactors()) + combinations = expand.grid(item.vertex, 
vertices.processed.set, stringsAsFactors = FALSE) if (nrow(combinations) > 0 & nrow(item.edge.attrs) == 1) { combinations = cbind(combinations, item.edge.attrs, row.names = NULL) # add edge attributes } @@ -1068,8 +1108,8 @@ construct.edge.list.from.key.value.list = function(list, network.conf, directed ## get edge attibutes edge.attrs = set[set[["data.vertices"]] %in% comb.item, ] # get data for current combination item - cols.which = network.conf$get.value("edge.attributes") %in% colnames(edge.attrs) - edge.attrs = edge.attrs[, network.conf$get.value("edge.attributes")[cols.which], drop = FALSE] + cols.which = edge.attributes %in% colnames(edge.attrs) + edge.attrs = edge.attrs[ , edge.attributes[cols.which], drop = FALSE] # add edge attributes to edge list edgelist = cbind(edge, edge.attrs) @@ -1282,17 +1322,40 @@ add.edges.for.bipartite.relation = function(net, bipartite.relations, network.co igraph::V(net)[d, vert] # get two vertices from source network: c(author, artifact) }) return(new.edges) - }, names(net1.to.net2), net1.to.net2) + }, names(net1.to.net2), net1.to.net2, SIMPLIFY = FALSE) ## initialize edge attributes allowed.edge.attributes = network.conf$get.value("edge.attributes") - available.edge.attributes = available.edge.attributes[names(available.edge.attributes) %in% allowed.edge.attributes] + available.edge.attributes = available.edge.attributes[names(available.edge.attributes) + %in% allowed.edge.attributes] net = add.attributes.to.network(net, "edge", allowed.edge.attributes) ## get extra edge attributes - extra.edge.attributes.df = parallel::mclapply(net1.to.net2, function(a.df) { - cols.which = allowed.edge.attributes %in% colnames(a.df) - return(a.df[, allowed.edge.attributes[cols.which], drop = FALSE]) + extra.edge.attributes.df = parallel::mcmapply(vertex.sequence = vertex.sequence.for.edges, a.df = net1.to.net2, + SIMPLIFY = FALSE, function(vertex.sequence, a.df) { + + ## return empty data.frame if vertex sequence is empty + if 
(length(unlist(vertex.sequence)) == 0){ + return(data.frame()) + } + + ## get the artifacts from the vertex sequence (which are the even elements of the sequence vector) + vertex.names.in.sequence = names(unlist(vertex.sequence)) + artifacts.in.sequence = vertex.names.in.sequence[seq(2, length(vertex.names.in.sequence), 2)] + + ## get the edges that will be constructed from the artifacts, + ## to get only the edge attributes for edges that will be present in the final network + ## (i.e., ignore edges to removed artifacts, such as the empty artifact that has been removed above) + constructed.edges = a.df[a.df[["data.vertices"]] %in% artifacts.in.sequence, , drop = FALSE] + + ## return empty data.frame if there will be no edges in the end + if (nrow(constructed.edges) < 1) { + return(data.frame()) + } + + ## select the allowed attributes from the edge data.frame's columns + cols.which = allowed.edge.attributes %in% colnames(constructed.edges) + return(constructed.edges[ , allowed.edge.attributes[cols.which], drop = FALSE]) }) extra.edge.attributes.df = plyr::rbind.fill(extra.edge.attributes.df) extra.edge.attributes = as.list(extra.edge.attributes.df) @@ -1323,9 +1386,7 @@ create.empty.network = function(directed = TRUE, add.attributes = FALSE) { date = c("POSIXct", "POSIXt"), artifact.type = "character", weight = "numeric", type = "character", relation = "character" ) - mandatory.edge.attributes = names(mandatory.edge.attributes.classes) mandatory.vertex.attributes.classes = list(name = "character", kind = "character", type = "character") - mandatory.vertex.attributes = names(mandatory.vertex.attributes.classes) net = add.attributes.to.network(net, "vertex", mandatory.vertex.attributes.classes) net = add.attributes.to.network(net, "edge", mandatory.edge.attributes.classes) @@ -1352,7 +1413,7 @@ create.empty.edge.list = function() { #' Add the given list of \code{type} attributes to the given network. #' -#' All added attributes are set to the value \code{NA}. 
+#' All added attributes are set to the default value of the respective class. #' #' @param network the network to which the attributes are to be added #' @param type the type of attribute to add; either \code{"vertex"} or \code{"edge"} @@ -1366,11 +1427,15 @@ add.attributes.to.network = function(network, type = c("vertex", "edge"), attrib ## get type type = match.arg(type, several.ok = FALSE) - ## get corresponding attribute function + ## get corresponding attribute functions if (type == "vertex") { - attribute.function = igraph::set.vertex.attribute # sprintf("igraph::set.%s.attribute", type) + attribute.set.function = igraph::set.vertex.attribute # sprintf("igraph::set.%s.attribute", type) + attribute.get.function = igraph::get.vertex.attribute # sprintf("igraph::get.%s.attribute", type) + attribute.remove.function = igraph::remove.vertex.attribute # sprintf("igraph::remove.%s.attribute", type) } else { - attribute.function = igraph::set.edge.attribute # sprintf("igraph::set.%s.attribute", type) + attribute.set.function = igraph::set.edge.attribute # sprintf("igraph::set.%s.attribute", type) + attribute.get.function = igraph::get.edge.attribute # sprintf("igraph::get.%s.attribute", type) + attribute.remove.function = igraph::remove.edge.attribute # sprintf("igraph::remove.%s.attribute", type) } ## iterate over all wanted attribute names and add the attribute with the wanted class @@ -1379,10 +1444,23 @@ add.attributes.to.network = function(network, type = c("vertex", "edge"), attrib default.value = 0 ## set the right class for the default value class(default.value) = attributes[[attr.name]] + + ## make sure that the default value contains a tzone attribute if the attribute is of class 'POSIXct' + if (lubridate::is.POSIXct(default.value)) { + attr(default.value, "tzone") = TIMEZONE + } + + ## check if the attribute is already present. If so, remove it and re-add it (to keep the intended order). 
+ ## only exception from this: the name attribute is not removed and re-added, as this would lead to problems. + if (!is.null(attribute.get.function(network, attr.name)) && attr.name != "name") { + logging::logwarn("Attribute %s has already been present, but is re-added now.", attr.name) + present.value = attribute.get.function(network, attr.name) + network = attribute.remove.function(network, attr.name) + default.value = present.value + } + ## add the new attribute to the network with the proper class - network = attribute.function(network, attr.name, value = default.value) - ## fill the new attribute with NA values - network = attribute.function(network, attr.name, value = NA) + network = attribute.set.function(network, attr.name, value = default.value) } return(network) diff --git a/util-read.R b/util-read.R index 1f0451d0..8f1b4fd9 100644 --- a/util-read.R +++ b/util-read.R @@ -17,12 +17,14 @@ ## Copyright 2020-2022 by Christian Hechtl ## Copyright 2017 by Felix Prasse ## Copyright 2017-2018 by Thomas Bock +## Copyright 2023 by Thomas Bock ## Copyright 2018 by Jakob Kronawitter ## Copyright 2018-2019 by Anselm Fehnker -## Copyright 2020-2021 by Niklas Schneider +## Copyright 2020-2021, 2023 by Niklas Schneider ## Copyright 2021 by Johannes Hostert ## Copyright 2021 by Mirabdulla Yusifli ## Copyright 2022 by Jonathan Baumann +## Copyright 2022-2023 by Maximilian Löffler ## All Rights Reserved. ## Note: @@ -42,15 +44,26 @@ requireNamespace("sqldf") # for SQL-selections on data.frames requireNamespace("data.table") # for faster data.frame processing ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / -## Helper functions --------------------------------------------------------------- +## Helper functions -------------------------------------------------------- #' Remove the "deleted user" or the author with empty name "" from a data frame. 
#' -#' @param data the data from which to remove the "deleted user" and author with empty name +#' @param data the data from which to remove the "deleted user" and author with empty name. +#' @param columns the columns in which to search for the "deleted user" and author with empty name. +#' [default: c("author.name")] #' #' @return the data frame without the rows in which the author name is "deleted user" or "" -remove.deleted.and.empty.user = function(data) { - return(data[tolower(data[, "author.name"]) != "deleted user" & data["author.name"] != "", ]) +remove.deleted.and.empty.user = function(data, columns = c("author.name")) { + if (!all(columns %in% colnames(data))) { + logging::logerror("The given columns are not present in the data.frame.") + stop("Stopped due to invalid column names.") + } + + ## loop over the given columns and remove all rows in which the author name is "deleted user" or "" + for (column in columns) { + data = data[tolower(data[, column]) != "deleted user" & data[, column] != "", ] + } + return(data) } ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / @@ -72,8 +85,8 @@ COMMITS.LIST.DATA.TYPES = c( "character", "POSIXct", "character", "character", "POSIXct", "character", "character", - "character", "numeric", "numeric", "numeric", "numeric", - "character", "character", "character", "numeric" + "character", "integer", "integer", "integer", "integer", + "character", "character", "character", "integer" ) #' Read the commits from the 'commits.list' file. 
@@ -92,7 +105,7 @@ read.commits = function(data.path, artifact) { encoding = "UTF-8"), silent = TRUE) ## handle the case that the list of commits is empty - if (inherits(commit.data, "try-error")) { + if (inherits(commit.data, "try-error") || nrow(commit.data) < 1) { logging::logwarn("There are no commits available for the current environment.") logging::logwarn("Datapath: %s", data.path) @@ -136,7 +149,7 @@ read.commits = function(data.path, artifact) { ORDER BY `date`, `author.name`, `commit.id`, `file`, `artifact`") ## fix column class for diffsum - commit.data["diffsum"] = as.numeric(commit.data[["diffsum"]]) + commit.data["diffsum"] = as.integer(commit.data[["diffsum"]]) ## copy columns to match proper layout for further analyses commit.data["artifact"] = commit.data[["file"]] @@ -170,7 +183,7 @@ read.commits = function(data.path, artifact) { UNTRACKED.FILE.EMPTY.ARTIFACT.TYPE, commit.data[["artifact.type"]]) - commit.data = remove.deleted.and.empty.user(commit.data) # filter deleted user + commit.data = remove.deleted.and.empty.user(commit.data, c("author.name", "committer.name")) # filter deleted user ## convert dates and sort by them commit.data[["date"]] = get.date.from.string(commit.data[["date"]]) @@ -181,6 +194,9 @@ read.commits = function(data.path, artifact) { commit.data[["commit.id"]] = format.commit.ids(commit.data[["commit.id"]]) row.names(commit.data) = seq_len(nrow(commit.data)) + ## check that dataframe is of correct shape + verify.data.frame.columns(commit.data, COMMITS.LIST.COLUMNS, COMMITS.LIST.DATA.TYPES) + ## store the commit data logging::logdebug("read.commits: finished.") return(commit.data) @@ -207,7 +223,7 @@ MAILS.LIST.COLUMNS = c( ## declare the datatype for each column in the constant 'MAILS.LIST.COLUMNS' MAILS.LIST.DATA.TYPES = c( "character", "character", - "character", "POSIXct", "numeric", "character", + "character", "POSIXct", "integer", "character", "character", "character" ) @@ -228,7 +244,7 @@ read.mails = 
function(data.path) { encoding = "UTF-8"), silent = TRUE) ## handle the case that the list of mails is empty - if (inherits(mail.data, "try-error")) { + if (inherits(mail.data, "try-error") || nrow(mail.data) < 1) { logging::logwarn("There are no mails available for the current environment.") logging::logwarn("Datapath: %s", data.path) return(create.empty.mails.list()) @@ -264,6 +280,9 @@ read.mails = function(data.path) { } mail.data = remove.deleted.and.empty.user(mail.data) # filter deleted user + ## check that dataframe is of correct shape + verify.data.frame.columns(mail.data, MAILS.LIST.COLUMNS, MAILS.LIST.DATA.TYPES) + ## store the mail data logging::logdebug("read.mails: finished.") return(mail.data) @@ -326,7 +345,7 @@ read.issues = function(data.path, issues.sources = c("jira", "github")) { encoding = "UTF-8"), silent = TRUE) ## handle the case that the list of issues is empty - if (inherits(source.data, "try-error")) { + if (inherits(source.data, "try-error") || nrow(source.data) < 1) { logging::logwarn("There are no %s issue data available for the current environment.", issue.source) logging::logwarn("Datapath: %s", data.path) return(create.empty.issues.list()) @@ -344,13 +363,17 @@ read.issues = function(data.path, issues.sources = c("jira", "github")) { ## set proper column names colnames(source.data) = ISSUES.LIST.COLUMNS - return(source.data) }) ## combine issue data from all sources issue.data = do.call(rbind, issue.data) + ## if no chosen source is present exit early by returning the (combined) empty issues list + if (nrow(issue.data) < 1) { + return(issue.data) + } + ## set pattern for issue ID for better recognition issue.data[["issue.id"]] = sprintf("", issue.data[["issue.source"]], issue.data[["issue.id"]]) @@ -374,7 +397,9 @@ read.issues = function(data.path, issues.sources = c("jira", "github")) { commit.added.events.before.creation = commit.added.events & !is.na(issue.data["creation.date"]) & (issue.data["date"] < 
issue.data["creation.date"]) issue.data[commit.added.events.before.creation, "date"] = issue.data[commit.added.events.before.creation, "creation.date"] - issue.data = remove.deleted.and.empty.user(issue.data) # filter deleted user + ## filter deleted user from the "author.name" column, + ## however, keep events where the user in the "event.info.1" column is empty or deleted + issue.data = remove.deleted.and.empty.user(issue.data) issue.data = issue.data[order(issue.data[["date"]], decreasing = FALSE), ] # sort! } @@ -384,6 +409,9 @@ read.issues = function(data.path, issues.sources = c("jira", "github")) { function(event) { digest::digest(event, algo="sha1", serialize = FALSE) } ) + ## check that dataframe is of correct shape + verify.data.frame.columns(issue.data, ISSUES.LIST.COLUMNS, ISSUES.LIST.DATA.TYPES) + logging::logdebug("read.issues: finished.") return(issue.data) } @@ -431,13 +459,18 @@ read.bot.info = function(data.path) { logging::logwarn("There is no bot information available for the current environment.") logging::logwarn("Datapath: %s", data.path) - ## return a data frame with the correct columns but zero rows + ## return NULL. 
Creating an empty dataframe is not possible + ## because no type information about bot information is present return(NULL) } ## set column names for new data frame colnames(bot.data) = BOT.LIST.COLUMNS bot.data["is.bot"] = sapply(bot.data[["is.bot"]], function(x) switch(x, Bot = TRUE, Human = FALSE, NA)) + + ## check that dataframe is of correct shape + verify.data.frame.columns(bot.data, BOT.LIST.COLUMNS) + logging::logdebug("read.bot.info: finished.") return(bot.data) } @@ -473,15 +506,15 @@ read.authors = function(data.path) { ## break if the list of authors is empty - if (inherits(authors.df, "try-error")) { + if (inherits(authors.df, "try-error") || nrow(authors.df) < 1) { logging::logerror("There are no authors available for the current environment.") logging::logwarn("Datapath: %s", data.path) stop("Stopped due to missing authors.") } - ## if there is no third column, we need to add e-mail-address dummy data (NAs) + ## if there is no third column, we need to add e-mail-address dummy data if (ncol(authors.df) != length(AUTHORS.LIST.COLUMNS.WITHOUT.BOTS)) { - authors.df[3] = NA + authors.df[3] = "" } colnames(authors.df) = AUTHORS.LIST.COLUMNS.WITHOUT.BOTS @@ -499,6 +532,12 @@ read.authors = function(data.path) { authors.df = authors.df[, AUTHORS.LIST.COLUMNS] authors.df = remove.deleted.and.empty.user(authors.df) + ## assure type correctness + authors.df[["author.id"]] = as.character(authors.df[["author.id"]]) + + ## check that dataframe is of correct shape + verify.data.frame.columns(authors.df, AUTHORS.LIST.COLUMNS, AUTHORS.LIST.DATA.TYPES) + ## store the ID--author mapping logging::logdebug("read.authors: finished.") return(authors.df) @@ -550,7 +589,7 @@ read.gender = function(data.path) { ## handle the case if the list of items is empty - if (inherits(gender.data, "try-error")) { + if (inherits(gender.data, "try-error") || nrow(gender.data) < 1) { logging::logwarn("There are no gender data available for the current environment.") 
logging::logwarn("Datapath: %s", data.path) return(create.empty.gender.list()) @@ -583,6 +622,9 @@ read.gender = function(data.path) { ## remove rownames rownames(gender.data) = NULL + ## check that dataframe is of correct shape + verify.data.frame.columns(gender.data, GENDER.LIST.COLUMNS, GENDER.LIST.DATA.TYPES) + logging::logdebug("read.gender: finished.") return(gender.data) @@ -637,7 +679,7 @@ read.commit.messages = function(data.path) { encoding = "UTF-8"), silent = TRUE) ## handle the case that the list of commits is empty - if (inherits(commit.message.data, "try-error")) { + if (inherits(commit.message.data, "try-error") || nrow(commit.message.data) < 1) { logging::logwarn("There are no commit messages available for the current environment.") logging::logwarn("Datapath: %s", data.path) @@ -691,8 +733,10 @@ read.commit.messages = function(data.path) { commit.message.data[["commit.id"]] = format.commit.ids(commit.message.data[["commit.id"]]) row.names(commit.message.data) = seq_len(nrow(commit.message.data)) - logging::logdebug("read.commit.messages: finished.") + ## check that dataframe is of correct shape + verify.data.frame.columns(commit.message.data, COMMIT.MESSAGE.LIST.COLUMNS, COMMIT.MESSAGE.LIST.DATA.TYPES) + logging::logdebug("read.commit.messages: finished.") return(commit.message.data) } @@ -737,7 +781,7 @@ read.pasta = function(data.path) { lines = suppressWarnings(try(readLines(filepath), silent = TRUE)) ## handle the case if the list of PaStA items is empty - if (inherits(lines, "try-error")) { + if (inherits(lines, "try-error") || length(lines) < 1) { logging::logwarn("There are no PaStA data available for the current environment.") logging::logwarn("Datapath: %s", data.path) return(create.empty.pasta.list()) @@ -775,6 +819,10 @@ read.pasta = function(data.path) { return(df) }) result.df = plyr::rbind.fill(result.list) + + ## check that dataframe is of correct shape + verify.data.frame.columns(result.df, PASTA.LIST.COLUMNS, 
PASTA.LIST.DATA.TYPES) + logging::logdebug("read.pasta: finished.") return(result.df) } @@ -838,6 +886,9 @@ read.synchronicity = function(data.path, artifact, time.window) { ## ensure proper column names colnames(synchronicity) = SYNCHRONICITY.LIST.COLUMNS + ## check that dataframe is of correct shape + verify.data.frame.columns(synchronicity, SYNCHRONICITY.LIST.COLUMNS, SYNCHRONICITY.LIST.DATA.TYPES) + ## store the synchronicity data logging::logdebug("read.synchronicity: finished.") return(synchronicity) @@ -871,7 +922,7 @@ read.custom.event.timestamps = function(data.path, file.name) { encoding = "UTF-8"), silent = TRUE) ## handle the case that the list of commits is empty - if (inherits(custom.event.timestamps.table, "try-error")) { + if (inherits(custom.event.timestamps.table, "try-error") || nrow(custom.event.timestamps.table) < 1) { logging::logwarn("There are no custom timestamps available at the given path.") logging::logwarn("Datapath: %s", data.path) @@ -881,9 +932,19 @@ read.custom.event.timestamps = function(data.path, file.name) { timestamps = as.list(custom.event.timestamps.table[[2]]) names(timestamps) = custom.event.timestamps.table[[1]] + ## convert all timestamps to POSIXct format + posix.timestamps = get.date.from.string(timestamps) + + ## if a timestamp is malformatted get.date.from.string returns a NA + if (any(is.na(posix.timestamps))) { + error.message = sprintf("Input timestamps are not in POSIXct format (YYYY-mm-DD HH:MM:SS).") + logging::logerror(error.message) + stop(error.message) + } + ## Sort the timestamps if (length(timestamps) != 0) { - timestamps = timestamps[order(unlist(get.date.from.string(timestamps)))] + timestamps = timestamps[order(unlist(posix.timestamps))] } logging::logdebug("read.custom.event.timestamps: finished.")