diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index 7ac45b88b..4e1b7a6c6 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -11,7 +11,8 @@ ## with this program; if not, write to the Free Software Foundation, Inc., ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. ## -## Copyright 2023 by Maximilian Löffler +## Copyright 2023-2024 by Maximilian Löffler +## Copyright 2024 by Thomas Bock ## All Rights Reserved. name: Build Status @@ -29,18 +30,18 @@ permissions: jobs: build: name: Build - + # change to 'runs-on: self-hosted' to run on self-hosted runners (https://docs.github.com/en/actions/using-jobs/choosing-the-runner-for-a-job) runs-on: ubuntu-latest - + strategy: fail-fast: false matrix: - r-version: ['3.6', '4.0', '4.1', '4.2', 'latest'] + r-version: ['3.6', '4.0', '4.1', '4.2', '4.3', 'latest'] steps: - - name: Checkout Repo - uses: actions/checkout@v3 + - name: Checkout Repo + uses: actions/checkout@v4 - name: Update system run: | @@ -56,7 +57,7 @@ jobs: uses: r-lib/actions/setup-r@v2 with: r-version: ${{ matrix.r-version }} - + - name: Install dependencies run: Rscript install.R diff --git a/NEWS.md b/NEWS.md index b19c21ef8..e58c86118 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,6 +2,51 @@ # coronet – Changelog +## 4.4 + +### Announcement + +- Due to a bug in package `igraph` (https://github.com/igraph/rigraph/issues/1158), which is present in their versions 2.0.0 to 2.0.3, the functions `metrics.scale.freeness` and `metrics.is.scale.free` can currently not be used with these `igraph` versions. If you need to call any of these two functions, you either need to install `igraph` version 1.6.0 or wait until the bug in `igraph` is fixed in a future version of `igraph`. + +### Added + +- Add issue-based artifact-networks, in which issues form vertices connected by edges that represent issue references. 
If possible, disambiguate duplicate JIRA issue references that originate from [codeface-extraction](https://github.com/se-sic/codeface-extraction) (PR #244, PR #249, 98a93ee721a293410623aafe46890cfba9d81e72, 771bcc8d961d419b53a1e891e9dc536371f1143b, 368e79264adf5a5358c04518c94ad2e1c13e212b, fa3167c289c9785f3a5db03d9724848f1441a63d, 4646d581d5e1f63260692b396a8bd8f51b0da48fda, ed77bd726bf92e06c2fc9145a5847787a8d0588b) +- Add a new `split.data.by.bins` function (not to be confused with a previously existing function that had the same name and was renamed in this context), which splits data based on given activity-based bins (PR #244, ece569ceaf557bb38cd0cfad437b69b30fe8a698, ed5feb214a123b605c9513262f187cfd72b9e1f4) +- Add `get.bin.dates.from.ranges` function to convert date ranges into bins format (PR #249, a1842e9be46596321ee86860fd87d17a3c88f50f, 858b1812ebfc3194cc6a03c99f3ee7d161d1ca15) +- Add the possibility to simplify edges of multiple-relation networks into a single edge at all instead of a single edge per relation (PR #250, PR #255, 2105ea89b5227e7c9fa78fea9de1977f2d9e8faa, a34b5bd50351b9ccf3cc45fc323cfa2e84d65ea0, 34516415fed599eba0cc7d3cc4a9acd6b26db252, 78f43514962d7651e6b7a1e80ee22ce012f32535, d310fdc38690f0d701cd32c92112c33f7fdde0ff, 58d77b01ecc6a237104a4e72ee5fb9025efeaaf2) +- Add network simplification to showcase file (PR #255, dc32d44f9db7dfc3cc795ef5d6b86609d6c1936f) +- Add tests for network simplification (PR #255, 338b06941eec1c9cfdb121e78ce0d9db6b75da19, 8a6f47bc115c10fbbe4eee21985d97aee5c9dc91, e01908c94eccc4dda5f2b3c0746b0eab0172dc07, 7b6848fb86f69db088ce6ef2bea8315ac94d48f9, 666d78444ffcb3bc8b36f2121284e4840176618e) +- Add an `assert.sparse.matrices.equal` function to compare two sparse matrices for equality for testing purposes (PR #248, 9784cdf12d1497ee122e2ae73b768b8c334210d4, d9f1a8d90e00a634d7caeb5e7f8f262776496838) +- Add tests for file `util-networks-misc.R` (#242, PR #248, PR #258, f3202a6f96723d11c170346556d036cf087521c8, 
030574b9d0f3435db4032d0e195a3d407fb7244b, 380b02234275127297fcd508772c69db21c216de, 8b803c50d60fc593e4e527a08fd4c2068d801a48, 7335c3dd4d0302b024a66d18701d9800ed3fe806, 6b600df04bec1fe70c272604f274ec5309840e65, a53fab85358b223af43749a088ad02e9fbcb0a30, faf19fc369beb901b556ecb8c4fa0bf6f1bd6304) + +### Changed/Improved + +- Add input validation for the `bins` parameter in `split.data.time.based` and `split.data.by.bins` (PR #244, ed0a5302ea8c8934d7200b95be7ac1446305af07, 5e5ecbac44d07927b953ae9d4330a616f8224ba7) +- Test for the presence and validity of the `bins` attribute on network-, and data-splits (PR #249, c064affcfff2eb170d8bdcb39d837a7ff62b2cbd, 93051ab848ec94de138b0513dac22f6da0d20885) +- Simplify call chain-, and branching-routes in network-splitting functions and consequently set the `bins` attribute on every output network-split (while minimizing recalculations) (PR #249, #256, PR #257, a1842e9be46596321ee86860fd87d17a3c88f50f, 8695fbe7f21ccaa3ccd6d1016e754017d387b1fa) +- Rename `split.data.by.bins` into `split.dataframe.by.bins` as this is what it does (PR #244, ed5feb214a123b605c9513262f187cfd72b9e1f4) +- Throw an error in `split.data.time.based.by.timestamps` if no custom event timestamps are available in the ProjectData object (6305adcee7f18747141994b00bdd94641f95e86f) +- Enhance testing data by adding `add_link` and `referenced_by` issue events, which connect issues to form edges in issue-based artifact-networks. 
This includes duplicate edge information in JIRA data as produced by [codeface-extraction](https://github.com/se-sic/codeface-extraction) (PR #244, 9f840c040d552e8639aa82c3dd537c189679b348, ea4fe8d3c84f948af6147cf0137e80181ebb7a1e, 6eb731102301b1af08f4affb40d1f8df94500e34) +- Add a check for empty networks in the functions `metrics.scale.freeness` and `metrics.is.scale.free` and return `NA` if the network is empty (29418f2da38de8c39ec2a1fb3d445b63f320be40) +- Enhance `get.author.names.from.network` and `get.author.names.from.data` to always have the same output format. Now it doesn't depend on the `global` flag anymore (PR #248, d87d32564156f13c83ebe3361c2b68e5d0ac16ac, ddbfe68d3e628e82f34e09b36fffe886646986c5) +- Change `util-tensor.R` to correctly use the new output format of `get.author.names.from.network` (PR #248, 72b663ebf7169c0da5c687fe215529f3be0c08c5) +- Throw an error in `convert.adjacency.matrix.list.to.array` if the function is called with wrong parameters (PR #248, ece2d38b4972745af3a83e06f32317a06465a345, 1a3e510df15f5fa4e920e9fce3e0e162c27cd6d1) +- Rename `compare.networks` to `assert.networks.equal` to better match the purpose of the function (PR #248, d9f1a8d90e00a634d7caeb5e7f8f262776496838) +- Explicitly add R version 4.3 to the CI test pipeline (9f346d5bc3cfc553f01e5e80f0bbe51e1dc2b53e) + +### Fixed + +- Reformat `event.info.1` column of issue data according to the format, if the content of the `event.info.1` field references another issue (PR #244, 62ff9d0f31adbefb3381936237dc4ab984e33acb) +- Rename vertex attribute `IssueEvent` to `Issue` in multi-networks, to be consistent with bipartite-networks (PR #244, 26d7b7e9fd6d33d1c0a8a08f19c5c2e30346a3d9) +- Fix an issue in activity-based splitting where elements close to the border of bins might be assigned to the wrong bin. 
The issue was caused by the usage of `split.data.time.based` inside `split.data.activity.based` to split data into the previously derived bins, when elements close to bin borders share the same timestamps. It is fixed by replacing `split.data.time.based` by `split.data.by.bins` (PR #244, ece569ceaf557bb38cd0cfad437b69b30fe8a698) +- Remove the last range when using a sliding-window approach and the last range's elements are fully contained in the second last range (PR #244, 48ef4fa685adf6e5d85281e5b90a8ed8f6aeb197, 943228fbc91eed6854dacafa7075441e58b22675) +- Fix broken error logging in `metrics.smallworldness` (03e06881f06abf30d44b69d7988873f20b95232d) +- Fix `get.expanded.adjacency` to work if the provided author list does not contain all authors from the network and add a warning when that happens since it causes some authors from the network to be lost in the resulting matrix (PR #248, ff59017e114b10812dcfb1704a19e01fc1586a13) +- Fix `get.expanded.adjacency.matrices` to have correct names for the columns and rows (PR #248, PR #258, e72eff864a1cb1a4aecd430e450d4a6a5044fdf2, a53fab85358b223af43749a088ad02e9fbcb0a30) +- Fix `get.expanded.adjacency.cumulated` so that it works if `weighted` parameter is set to `FALSE` (PR #248, 2fb9a5d446653f6aee808cbfc87c2dafeb9a749a) +- Fix multi-network construction to work with `igraph` version 2.0.1.1, which does not allow to add an empty list of vertices (PR #250, 5547896faa279f6adaae4b2b77c7ab9623ddf256) + + ## 4.3 ### Added diff --git a/README.md b/README.md index 3ebd1bf92..62c029b33 100644 --- a/README.md +++ b/README.md @@ -632,7 +632,7 @@ Updates to the parameters can be done by calling `NetworkConf$update.variables(. * possible values: [*`"cochange"`*, `"callgraph"`, `"mail"`, `"issue"`] - `artifact.directed` * The directedness of edges in an artifact network - * **Note**: This parameter does not take effect for now, as the `cochange` relation is always undirected, while the `callgraph` relation is always directed. 
For the other relations (`mail` and `issue`), we currently do not have data available to exhibit edge information. + * **Note**: This parameter only affects the `issue` relation, as the `cochange` relation is always undirected, while the `callgraph` relation is always directed. For the `mail` relation, we currently do not have data available to exhibit edge information. + * [`TRUE`, *`FALSE`*] - `edge.attributes` * The list of edge-attribute names and information @@ -654,6 +654,10 @@ Updates to the parameters can be done by calling `NetworkConf$update.variables(. - `simplify` * Perform edge contraction to retrieve a simplified network * [`TRUE`, *`FALSE`*] +- `simplify.multiple.relations` + * Whether the simplified network should contract edges of multiple relations into a single edge or not (if not, there will be one edge for each relation, resulting in possibly more than one edge between a pair of vertices) + * **Note**: This parameter does not take effect if ``simplify = FALSE``! + * [`TRUE`, *`FALSE`*] - `skip.threshold` * The upper bound for total amount of edges to build for a subset of the data, i.e., not building any edges for the subset exceeding the limit * any positive integer diff --git a/showcase.R b/showcase.R index a4cceb535..4a2c9a72e 100644 --- a/showcase.R +++ b/showcase.R @@ -16,13 +16,14 @@ ## Copyright 2017 by Christian Hechtl ## Copyright 2017 by Felix Prasse ## Copyright 2017-2018 by Thomas Bock -## Copyright 2020-2021 by Thomas Bock +## Copyright 2020-2021, 2024 by Thomas Bock ## Copyright 2018 by Jakob Kronawitter ## Copyright 2019 by Klara Schlueter ## Copyright 2020 by Anselm Fehnker ## Copyright 2021 by Johannes Hostert ## Copyright 2021 by Niklas Schneider ## Copyright 2022 by Jonathan Baumann +## Copyright 2024 by Maximilian Löffler ## All Rights Reserved. 
@@ -218,7 +219,7 @@ cf.data = split.data.time.based(x.data, bins = mybins) ## construct (author) networks from range data my.networks = lapply(cf.data, function(range.data) { y = NetworkBuilder$new(project.data = range.data, network.conf = net.conf) - return (y$get.author.network()) + return(y$get.author.network()) }) ## add commit-count vertex attributes sample = add.vertex.attribute.author.commit.count(my.networks, x.data, aggregation.level = "range") @@ -361,6 +362,24 @@ g.motifs = motifs.count(network = g, remove.duplicates = TRUE, raw.data = FALSE) +## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / +## Network simplification -------------------------------------------------- + +## construct sample network +g = y$get.multi.network() +g = igraph::delete_edges(g, c(5, 6)) +g = igraph::delete_vertices(g, c(2, 4, 5, 6, 7, 8)) +g = g + igraph::edges(c("Björn", "Olaf", "Björn", "Olaf"), type = TYPE.EDGES.INTRA, weight = 1, + relation = "cochange", artifact.type = "Feature") + +## merge edges between vertex pairs that stem from the same data source +g.simplified = simplify.network(g) +plot.network(g.simplified) + +## merge all edges between vertex pairs +g.simplified = simplify.network(g, simplify.multiple.relations = TRUE) +plot.network(g.simplified) + ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / ## Plots ------------------------------------------------------------------- diff --git a/tests/codeface-data/results/testing/test_feature/feature/issues-github.list b/tests/codeface-data/results/testing/test_feature/feature/issues-github.list index 5bb9f2155..12d2c0a0b 100644 --- a/tests/codeface-data/results/testing/test_feature/feature/issues-github.list +++ b/tests/codeface-data/results/testing/test_feature/feature/issues-github.list @@ -5,6 +5,8 @@ 3;"Error in construct.networks.from.list for openssl function networks";"[""issue"", ""bug""]";"closed";"[]";"2016-07-12 15:59:25";"2016-07-12 
16:06:30";"[]";"add_link";"Karl";"karl@example.org";"2016-08-07 15:37:02";"930af63a030fb92e48eddff01f53284c3eeba80e";"""commit""" 3;"Error in construct.networks.from.list for openssl function networks";"[""issue"", ""bug""]";"closed";"[]";"2016-07-12 15:59:25";"2016-07-12 16:06:30";"[]";"referenced";"Karl";"karl@example.org";"2016-08-31 16:45:09";"";"""""" 3;"Error in construct.networks.from.list for openssl function networks";"[""issue"", ""bug""]";"closed";"[]";"2016-07-12 15:59:25";"2016-07-12 16:06:30";"[]";"referenced";"Thomas";"thomas@example.org";"2016-10-05 16:45:09";"";"""""" +3;"Error in construct.networks.from.list for openssl function networks";"[""issue"", ""bug""]";"closed";"[]";"2016-07-12 15:59:25";"2016-07-12 16:06:30";"[]";"add_link";"Karl";"karl@example.org";"2016-08-07 15:37:02";"6";"""issue""" +3;"Error in construct.networks.from.list for openssl function networks";"[""issue"", ""bug""]";"closed";"[]";"2016-07-12 15:59:25";"2016-07-12 16:06:30";"[]";"add_link";"Thomas";"thomas@example.org";"2016-08-07 15:30:00";"2";"""issue""" 6;"Distinguish directedness of networks and edge-construction algorithm";"[""issue"", ""bug"", ""enhancement""]";"open";"[]";"2016-07-12 14:30:13";"";"[]";"mentioned";"udo";"udo@example.org";"2016-07-12 15:30:02";"Thomas";"""thomas@example.org""" 6;"Distinguish directedness of networks and edge-construction algorithm";"[""issue"", ""bug"", ""enhancement""]";"open";"[]";"2016-07-12 14:30:13";"";"[]";"subscribed";"udo";"udo@example.org";"2016-07-12 15:30:02";"Thomas";"""thomas@example.org""" 6;"Distinguish directedness of networks and edge-construction algorithm";"[""issue"", ""bug"", ""enhancement""]";"open";"[]";"2016-07-12 14:30:13";"";"[]";"commented";"Thomas";"thomas@example.org";"2016-07-12 16:03:59";"open";"[]" @@ -15,6 +17,7 @@ 6;"Distinguish directedness of networks and edge-construction algorithm";"[""issue"", ""bug"", ""enhancement""]";"open";"[]";"2016-07-12 
14:30:13";"";"[]";"subscribed";"Björn";"bjoern@example.org";"2016-12-07 15:30:02";"udo";"""udo@example.org""" 6;"Distinguish directedness of networks and edge-construction algorithm";"[""issue"", ""bug"", ""enhancement""]";"open";"[]";"2016-07-12 14:30:13";"";"[]";"labeled";"Olaf";"olaf@example.org";"2017-05-23 12:31:34";"decided";"""""" 6;"Distinguish directedness of networks and edge-construction algorithm";"[""issue"", ""bug"", ""enhancement""]";"open";"[]";"2016-07-12 14:30:13";"";"[]";"commented";"Björn";"bjoern@example.org";"2017-05-23 12:32:39";"open";"[]" +6;"Distinguish directedness of networks and edge-construction algorithm";"[""issue"", ""bug"", ""enhancement""]";"open";"[]";"2016-07-12 14:30:13";"";"[]";"referenced_by";"Karl";"karl@example.org";"2016-08-07 15:37:02";"3";"""issue""" "1";"Example pull request 1";"[""pull request""]";"reopened";"[]";"2016-07-14 13:37:00";"";"[]";"created";"Thomas";"thomas@example.org";"2016-07-12 15:59:25";"open";"[]" "1";"Example pull request 1";"[""pull request""]";"reopened";"[]";"2016-07-14 13:37:00";"";"[]";"commented";"Thomas";"thomas@example.org";"2016-07-12 15:59:25";"open";"[]" "1";"Example pull request 1";"[""pull request""]";"reopened";"[]";"2016-07-14 13:37:00";"";"[]";"state_updated";"Thomas";"thomas@example.org";"2016-07-12 15:59:59";"closed";"""open""" @@ -25,6 +28,7 @@ "2";"Example pull request 2";"[""pull request""]";"closed";"[]";"2016-07-12 14:59:25";"2016-07-12 16:04:59";"[]";"commented";"Björn";"bjoern@example.org";"2016-07-12 14:59:25";"open";"[]" "2";"Example pull request 2";"[""pull request""]";"closed";"[]";"2016-07-12 14:59:25";"2016-07-12 16:04:59";"[]";"merged";"Olaf";"olaf@example.org";"2016-07-12 16:04:59";"";"""""" "2";"Example pull request 2";"[""pull request""]";"closed";"[]";"2016-07-12 14:59:25";"2016-07-12 16:04:59";"[]";"state_updated";"Olaf";"olaf@example.org";"2016-07-12 16:04:59";"closed";"""open""" +"2";"Example pull request 2";"[""pull request""]";"closed";"[]";"2016-07-12 
14:59:25";"2016-07-12 16:04:59";"[]";"referenced_by";"Thomas";"thomas@example.org";"2016-08-07 15:30:00";"3";"""issue""" "4";"Example pull request 4";"[""pull request"", ""enhancement""]";"open";"[]";"2016-07-12 16:02:02";"";"[]";"commit_added";"Björn";"bjoern@example.org";"2016-07-12 15:58:59";"72c8dd25d3dd6d18f46e2b26a5f5b1e2e8dc28d0";"""""" "4";"Example pull request 4";"[""pull request"", ""enhancement""]";"open";"[]";"2016-07-12 16:02:02";"";"[]";"created";"Olaf";"olaf@example.org";"2016-07-12 16:02:02";"open";"[]" "4";"Example pull request 4";"[""pull request"", ""enhancement""]";"open";"[]";"2016-07-12 16:02:02";"";"[]";"commented";"Olaf";"olaf@example.org";"2016-07-12 16:02:02";"open";"[]" diff --git a/tests/codeface-data/results/testing/test_feature/feature/issues-jira.list b/tests/codeface-data/results/testing/test_feature/feature/issues-jira.list index 3740aa58f..9d443053e 100644 --- a/tests/codeface-data/results/testing/test_feature/feature/issues-jira.list +++ b/tests/codeface-data/results/testing/test_feature/feature/issues-jira.list @@ -11,6 +11,10 @@ "ZEPPELIN-328";"[ZEPPELIN-328] Interpreter page should clarify the % magic syntax for interpreter group.name";"[""issue"", ""bug""]";"closed";"[""fixed""]";"2013-04-21 23:52:09";"2013-05-25 20:02:08";"[""GUI"", ""Interpreters""]";"commented";"Olaf";"olaf@example.org";"2013-05-25 06:22:23";"open";"[""unresolved""]" "ZEPPELIN-328";"[ZEPPELIN-328] Interpreter page should clarify the % magic syntax for interpreter group.name";"[""issue"", ""bug""]";"closed";"[""fixed""]";"2013-04-21 23:52:09";"2013-05-25 20:02:08";"[""GUI"", ""Interpreters""]";"commented";"Olaf";"olaf@example.org";"2013-06-01 06:50:26";"open";"[""unresolved""]" "ZEPPELIN-328";"[ZEPPELIN-328] Interpreter page should clarify the % magic syntax for interpreter group.name";"[""issue"", ""bug""]";"closed";"[""fixed""]";"2013-04-21 23:52:09";"2013-05-25 20:02:08";"[""GUI"", 
""Interpreters""]";"resolution_updated";"Björn";"bjoern@example.org";"2013-06-01 06:53:06";"fixed";"""unresolved""" +"ZEPPELIN-328";"[ZEPPELIN-328] Interpreter page should clarify the % magic syntax for interpreter group.name";"[""issue"", ""bug""]";"closed";"[""fixed""]";"2013-04-21 23:52:09";"2013-05-25 20:02:08";"[""GUI"", ""Interpreters""]";"referenced_by";"Thomas";"thomas@example.org";"2017-05-21 12:00:00";"ZEPPELIN-332";"""issue""" +"ZEPPELIN-328";"[ZEPPELIN-328] Interpreter page should clarify the % magic syntax for interpreter group.name";"[""issue"", ""bug""]";"closed";"[""fixed""]";"2013-04-21 23:52:09";"2013-05-25 20:02:08";"[""GUI"", ""Interpreters""]";"add_link";"Thomas";"thomas@example.org";"2017-05-21 12:00:00";"ZEPPELIN-332";"""issue""" +"ZEPPELIN-332";"[ZEPPELIN-332] CNFE when running SQL query against Cassandra temp table";"[""issue"", ""bug""]";"open";"[""unresolved""]";"2016-07-12 16:01:30";"";"[""Interpreters""]";"referenced_by";"Thomas";"thomas@example.org";"2017-05-21 12:00:00";"ZEPPELIN-328";"""issue""" +"ZEPPELIN-332";"[ZEPPELIN-332] CNFE when running SQL query against Cassandra temp table";"[""issue"", ""bug""]";"open";"[""unresolved""]";"2016-07-12 16:01:30";"";"[""Interpreters""]";"add_link";"Thomas";"thomas@example.org";"2017-05-21 12:00:00";"ZEPPELIN-328";"""issue""" "ZEPPELIN-332";"[ZEPPELIN-332] CNFE when running SQL query against Cassandra temp table";"[""issue"", ""bug""]";"open";"[""unresolved""]";"2016-07-12 16:01:30";"";"[""Interpreters""]";"created";"Björn";"bjoern@example.org";"2016-07-12 16:01:30";"open";"[""unresolved""]" "ZEPPELIN-332";"[ZEPPELIN-332] CNFE when running SQL query against Cassandra temp table";"[""issue"", ""bug""]";"open";"[""unresolved""]";"2016-07-12 16:01:30";"";"[""Interpreters""]";"commented";"Björn";"bjoern@example.org";"2016-07-12 16:02:30";"open";"[""unresolved""]" "ZEPPELIN-332";"[ZEPPELIN-332] CNFE when running SQL query against Cassandra temp table";"[""issue"", 
""bug""]";"open";"[""unresolved""]";"2016-07-12 16:01:30";"";"[""Interpreters""]";"commented";"Björn";"bjoern@example.org";"2016-07-15 19:55:39";"open";"[""unresolved""]" diff --git a/tests/codeface-data/results/testing/test_proximity/proximity/issues-github.list b/tests/codeface-data/results/testing/test_proximity/proximity/issues-github.list index 5bb9f2155..12d2c0a0b 100644 --- a/tests/codeface-data/results/testing/test_proximity/proximity/issues-github.list +++ b/tests/codeface-data/results/testing/test_proximity/proximity/issues-github.list @@ -5,6 +5,8 @@ 3;"Error in construct.networks.from.list for openssl function networks";"[""issue"", ""bug""]";"closed";"[]";"2016-07-12 15:59:25";"2016-07-12 16:06:30";"[]";"add_link";"Karl";"karl@example.org";"2016-08-07 15:37:02";"930af63a030fb92e48eddff01f53284c3eeba80e";"""commit""" 3;"Error in construct.networks.from.list for openssl function networks";"[""issue"", ""bug""]";"closed";"[]";"2016-07-12 15:59:25";"2016-07-12 16:06:30";"[]";"referenced";"Karl";"karl@example.org";"2016-08-31 16:45:09";"";"""""" 3;"Error in construct.networks.from.list for openssl function networks";"[""issue"", ""bug""]";"closed";"[]";"2016-07-12 15:59:25";"2016-07-12 16:06:30";"[]";"referenced";"Thomas";"thomas@example.org";"2016-10-05 16:45:09";"";"""""" +3;"Error in construct.networks.from.list for openssl function networks";"[""issue"", ""bug""]";"closed";"[]";"2016-07-12 15:59:25";"2016-07-12 16:06:30";"[]";"add_link";"Karl";"karl@example.org";"2016-08-07 15:37:02";"6";"""issue""" +3;"Error in construct.networks.from.list for openssl function networks";"[""issue"", ""bug""]";"closed";"[]";"2016-07-12 15:59:25";"2016-07-12 16:06:30";"[]";"add_link";"Thomas";"thomas@example.org";"2016-08-07 15:30:00";"2";"""issue""" 6;"Distinguish directedness of networks and edge-construction algorithm";"[""issue"", ""bug"", ""enhancement""]";"open";"[]";"2016-07-12 14:30:13";"";"[]";"mentioned";"udo";"udo@example.org";"2016-07-12 
15:30:02";"Thomas";"""thomas@example.org""" 6;"Distinguish directedness of networks and edge-construction algorithm";"[""issue"", ""bug"", ""enhancement""]";"open";"[]";"2016-07-12 14:30:13";"";"[]";"subscribed";"udo";"udo@example.org";"2016-07-12 15:30:02";"Thomas";"""thomas@example.org""" 6;"Distinguish directedness of networks and edge-construction algorithm";"[""issue"", ""bug"", ""enhancement""]";"open";"[]";"2016-07-12 14:30:13";"";"[]";"commented";"Thomas";"thomas@example.org";"2016-07-12 16:03:59";"open";"[]" @@ -15,6 +17,7 @@ 6;"Distinguish directedness of networks and edge-construction algorithm";"[""issue"", ""bug"", ""enhancement""]";"open";"[]";"2016-07-12 14:30:13";"";"[]";"subscribed";"Björn";"bjoern@example.org";"2016-12-07 15:30:02";"udo";"""udo@example.org""" 6;"Distinguish directedness of networks and edge-construction algorithm";"[""issue"", ""bug"", ""enhancement""]";"open";"[]";"2016-07-12 14:30:13";"";"[]";"labeled";"Olaf";"olaf@example.org";"2017-05-23 12:31:34";"decided";"""""" 6;"Distinguish directedness of networks and edge-construction algorithm";"[""issue"", ""bug"", ""enhancement""]";"open";"[]";"2016-07-12 14:30:13";"";"[]";"commented";"Björn";"bjoern@example.org";"2017-05-23 12:32:39";"open";"[]" +6;"Distinguish directedness of networks and edge-construction algorithm";"[""issue"", ""bug"", ""enhancement""]";"open";"[]";"2016-07-12 14:30:13";"";"[]";"referenced_by";"Karl";"karl@example.org";"2016-08-07 15:37:02";"3";"""issue""" "1";"Example pull request 1";"[""pull request""]";"reopened";"[]";"2016-07-14 13:37:00";"";"[]";"created";"Thomas";"thomas@example.org";"2016-07-12 15:59:25";"open";"[]" "1";"Example pull request 1";"[""pull request""]";"reopened";"[]";"2016-07-14 13:37:00";"";"[]";"commented";"Thomas";"thomas@example.org";"2016-07-12 15:59:25";"open";"[]" "1";"Example pull request 1";"[""pull request""]";"reopened";"[]";"2016-07-14 13:37:00";"";"[]";"state_updated";"Thomas";"thomas@example.org";"2016-07-12 
15:59:59";"closed";"""open""" @@ -25,6 +28,7 @@ "2";"Example pull request 2";"[""pull request""]";"closed";"[]";"2016-07-12 14:59:25";"2016-07-12 16:04:59";"[]";"commented";"Björn";"bjoern@example.org";"2016-07-12 14:59:25";"open";"[]" "2";"Example pull request 2";"[""pull request""]";"closed";"[]";"2016-07-12 14:59:25";"2016-07-12 16:04:59";"[]";"merged";"Olaf";"olaf@example.org";"2016-07-12 16:04:59";"";"""""" "2";"Example pull request 2";"[""pull request""]";"closed";"[]";"2016-07-12 14:59:25";"2016-07-12 16:04:59";"[]";"state_updated";"Olaf";"olaf@example.org";"2016-07-12 16:04:59";"closed";"""open""" +"2";"Example pull request 2";"[""pull request""]";"closed";"[]";"2016-07-12 14:59:25";"2016-07-12 16:04:59";"[]";"referenced_by";"Thomas";"thomas@example.org";"2016-08-07 15:30:00";"3";"""issue""" "4";"Example pull request 4";"[""pull request"", ""enhancement""]";"open";"[]";"2016-07-12 16:02:02";"";"[]";"commit_added";"Björn";"bjoern@example.org";"2016-07-12 15:58:59";"72c8dd25d3dd6d18f46e2b26a5f5b1e2e8dc28d0";"""""" "4";"Example pull request 4";"[""pull request"", ""enhancement""]";"open";"[]";"2016-07-12 16:02:02";"";"[]";"created";"Olaf";"olaf@example.org";"2016-07-12 16:02:02";"open";"[]" "4";"Example pull request 4";"[""pull request"", ""enhancement""]";"open";"[]";"2016-07-12 16:02:02";"";"[]";"commented";"Olaf";"olaf@example.org";"2016-07-12 16:02:02";"open";"[]" diff --git a/tests/codeface-data/results/testing/test_proximity/proximity/issues-jira.list b/tests/codeface-data/results/testing/test_proximity/proximity/issues-jira.list index 3740aa58f..9d443053e 100644 --- a/tests/codeface-data/results/testing/test_proximity/proximity/issues-jira.list +++ b/tests/codeface-data/results/testing/test_proximity/proximity/issues-jira.list @@ -11,6 +11,10 @@ "ZEPPELIN-328";"[ZEPPELIN-328] Interpreter page should clarify the % magic syntax for interpreter group.name";"[""issue"", ""bug""]";"closed";"[""fixed""]";"2013-04-21 23:52:09";"2013-05-25 20:02:08";"[""GUI"", 
""Interpreters""]";"commented";"Olaf";"olaf@example.org";"2013-05-25 06:22:23";"open";"[""unresolved""]" "ZEPPELIN-328";"[ZEPPELIN-328] Interpreter page should clarify the % magic syntax for interpreter group.name";"[""issue"", ""bug""]";"closed";"[""fixed""]";"2013-04-21 23:52:09";"2013-05-25 20:02:08";"[""GUI"", ""Interpreters""]";"commented";"Olaf";"olaf@example.org";"2013-06-01 06:50:26";"open";"[""unresolved""]" "ZEPPELIN-328";"[ZEPPELIN-328] Interpreter page should clarify the % magic syntax for interpreter group.name";"[""issue"", ""bug""]";"closed";"[""fixed""]";"2013-04-21 23:52:09";"2013-05-25 20:02:08";"[""GUI"", ""Interpreters""]";"resolution_updated";"Björn";"bjoern@example.org";"2013-06-01 06:53:06";"fixed";"""unresolved""" +"ZEPPELIN-328";"[ZEPPELIN-328] Interpreter page should clarify the % magic syntax for interpreter group.name";"[""issue"", ""bug""]";"closed";"[""fixed""]";"2013-04-21 23:52:09";"2013-05-25 20:02:08";"[""GUI"", ""Interpreters""]";"referenced_by";"Thomas";"thomas@example.org";"2017-05-21 12:00:00";"ZEPPELIN-332";"""issue""" +"ZEPPELIN-328";"[ZEPPELIN-328] Interpreter page should clarify the % magic syntax for interpreter group.name";"[""issue"", ""bug""]";"closed";"[""fixed""]";"2013-04-21 23:52:09";"2013-05-25 20:02:08";"[""GUI"", ""Interpreters""]";"add_link";"Thomas";"thomas@example.org";"2017-05-21 12:00:00";"ZEPPELIN-332";"""issue""" +"ZEPPELIN-332";"[ZEPPELIN-332] CNFE when running SQL query against Cassandra temp table";"[""issue"", ""bug""]";"open";"[""unresolved""]";"2016-07-12 16:01:30";"";"[""Interpreters""]";"referenced_by";"Thomas";"thomas@example.org";"2017-05-21 12:00:00";"ZEPPELIN-328";"""issue""" +"ZEPPELIN-332";"[ZEPPELIN-332] CNFE when running SQL query against Cassandra temp table";"[""issue"", ""bug""]";"open";"[""unresolved""]";"2016-07-12 16:01:30";"";"[""Interpreters""]";"add_link";"Thomas";"thomas@example.org";"2017-05-21 12:00:00";"ZEPPELIN-328";"""issue""" "ZEPPELIN-332";"[ZEPPELIN-332] CNFE when running 
SQL query against Cassandra temp table";"[""issue"", ""bug""]";"open";"[""unresolved""]";"2016-07-12 16:01:30";"";"[""Interpreters""]";"created";"Björn";"bjoern@example.org";"2016-07-12 16:01:30";"open";"[""unresolved""]" "ZEPPELIN-332";"[ZEPPELIN-332] CNFE when running SQL query against Cassandra temp table";"[""issue"", ""bug""]";"open";"[""unresolved""]";"2016-07-12 16:01:30";"";"[""Interpreters""]";"commented";"Björn";"bjoern@example.org";"2016-07-12 16:02:30";"open";"[""unresolved""]" "ZEPPELIN-332";"[ZEPPELIN-332] CNFE when running SQL query against Cassandra temp table";"[""issue"", ""bug""]";"open";"[""unresolved""]";"2016-07-12 16:01:30";"";"[""Interpreters""]";"commented";"Björn";"bjoern@example.org";"2016-07-15 19:55:39";"open";"[""unresolved""]" diff --git a/tests/test-core-peripheral.R b/tests/test-core-peripheral.R index 07c7389ce..c9397d6f6 100644 --- a/tests/test-core-peripheral.R +++ b/tests/test-core-peripheral.R @@ -17,6 +17,7 @@ ## Copyright 2022 by Thomas Bock ## Copyright 2019 by Christian Hechtl ## Copyright 2021 by Christian Hechtl +## Copyright 2023 by Maximilian Löffler ## All Rights Reserved. 
@@ -171,8 +172,8 @@ test_that("Issue-count classification" , { result = get.author.class.issue.count(proj.data, issue.type = "all") ## Assert - expected.core = data.frame(author.name = c("Björn", "Olaf", "Thomas"), issue.count = c(6, 6, 4)) - expected.peripheral = data.frame(author.name = c("Karl", "Max", "udo"), issue.count = c(1, 1, 1)) + expected.core = data.frame(author.name = c("Björn", "Olaf", "Thomas"), issue.count = c(6, 6, 6)) + expected.peripheral = data.frame(author.name = c("Karl", "Max", "udo"), issue.count = c(2, 1, 1)) expected = list(core = expected.core, peripheral = expected.peripheral) row.names(result[["core"]]) = NULL diff --git a/tests/test-data.R b/tests/test-data.R index 1d06a34ca..9c6f4f8cb 100644 --- a/tests/test-data.R +++ b/tests/test-data.R @@ -19,6 +19,7 @@ ## Copyright 2021 by Johannes Hostert ## Copyright 2021 by Mirabdulla Yusifli ## Copyright 2022 by Jonathan Baumann +## Copyright 2023 by Maximilian Löffler ## All Rights Reserved. @@ -402,8 +403,8 @@ test_that("Filter bots from issue data", { filtered.issues = proj.data$get.issues() expect_true(all(filtered.issues[["author.name"]] != "Thomas")) - ## there are now 41 issue events remaining, since 6 issue events have been removed during filtering - expect_equal(nrow(filtered.issues), 41) + ## there are now 43 issue events remaining, since 10 issue events have been removed during filtering + expect_equal(nrow(filtered.issues), 43) }) test_that("Filter bots from mail data", { diff --git a/tests/test-networks-artifact.R b/tests/test-networks-artifact.R index 8eaebaf8a..253e08ba5 100644 --- a/tests/test-networks-artifact.R +++ b/tests/test-networks-artifact.R @@ -15,7 +15,8 @@ ## Copyright 2017-2019 by Claus Hunsen ## Copyright 2018 by Barbara Eckl ## Copyright 2018 by Jakob Kronawitter -## Copyright 2023 by Maximilian Löffler +## Copyright 2023-2024 by Maximilian Löffler +## Copyright 2024 by Leo Sendelbach ## All Rights Reserved. 
@@ -101,3 +102,113 @@ test_that("Network construction of the undirected artifact-cochange network", { ## test expect_true(igraph::identical_graphs(network.built, network.expected)) }) + +patrick::with_parameters_test_that("Network construction of an issue-based artifact-network", { + ## build expected network: + ## 1) vertices + vertices = data.frame(name = c("", + "", + "", + "", + "", + "" , + ""), + kind = "Issue", + type = TYPE.ARTIFACT) + ## 2) edges + edges = data.frame( + from = c("", "", ""), + to = c("", "", ""), + date = get.date.from.string(c("2016-08-07 15:30:00", "2016-08-07 15:37:02", "2017-05-21 12:00:00")), + artifact.type = c("IssueEvent", "IssueEvent", "IssueEvent"), + issue.id = c("", "", ""), + event.name = c("add_link", "add_link", "add_link"), + author.name = c("Thomas", "Karl", "Thomas"), + weight = c(1, 1, 1), + type = TYPE.EDGES.INTRA, + relation = "issue" + ) + + ## 3) when constructing directed networks, we cannot deduplicate jira edges + if (test.directed) { + edges = rbind(edges, data.frame( + from = "", + to = "", + date = get.date.from.string("2017-05-21 12:00:00"), + artifact.type = "IssueEvent", + issue.id = "", + event.name = "add_link", + author.name = "Thomas", + weight = 1, + type = TYPE.EDGES.INTRA, + relation = "issue" + )) + } + + ## configurations + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.conf$update.value("issues.only.comments", FALSE) + net.conf = NetworkConf$new() + net.conf$update.values(updated.values = list(artifact.relation = "issue", artifact.directed = test.directed)) + + ## construct objects + proj.data = ProjectData$new(project.conf = proj.conf) + network.builder = NetworkBuilder$new(project.data = proj.data, network.conf = net.conf) + + ## build expected network + network.expected = igraph::graph.data.frame(edges, directed = test.directed, vertices = vertices) + + ## build network + network.built = network.builder$get.artifact.network() + + ## test + 
expect_true(igraph::identical_graphs(network.built, network.expected)) +}, patrick::cases( + "directed: FALSE" = list(test.directed = FALSE), + "directed: TRUE" = list(test.directed = TRUE) +)) + +patrick::with_parameters_test_that("Network construction of an empty 'comments-only' issue-based artifact-network", { + + ## + ## 'issues.only.comments' (by default), this should not create any edges + ## + + ## configurations + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + net.conf = NetworkConf$new() + net.conf$update.values(updated.values = list(artifact.relation = "issue", artifact.directed = test.directed)) + + ## construct objects + proj.data = ProjectData$new(project.conf = proj.conf) + network.builder = NetworkBuilder$new(project.data = proj.data, network.conf = net.conf) + + ## build network + network.built = network.builder$get.artifact.network() + + ## 1) vertices + vertices = data.frame(name = c("", + "", + "", + "", + "", + "", + ""), + kind = "Issue", + type = TYPE.ARTIFACT) + ## 2) edges + edges = data.frame( + from = character(), to = character(), date = get.date.from.string(character(0)), artifact.type = character(), + issue.id = character(), event.name = character(), weight = numeric(), type = character(), + relation = character() + ) + + ## build expected network + network.expected = igraph::graph.data.frame(edges, directed = test.directed, vertices = vertices) + + ## test + assert.networks.equal(network.built, network.expected) +}, patrick::cases( + "directed: FALSE" = list(test.directed = FALSE), + "directed: TRUE" = list(test.directed = TRUE) +)) diff --git a/tests/test-networks-author.R b/tests/test-networks-author.R index 613c18628..d4d0e9faa 100644 --- a/tests/test-networks-author.R +++ b/tests/test-networks-author.R @@ -21,6 +21,7 @@ ## Copyright 2018 by Jakob Kronawitter ## Copyright 2018-2019 by Anselm Fehnker ## Copyright 2021 by Johannes Hostert +## Copyright 2023-2024 by Maximilian Löffler ## All Rights 
Reserved. @@ -488,75 +489,94 @@ test_that("Network construction of the undirected author-issue network with all ## edge attributes edges = data.frame(from = c(rep("Thomas", 5), rep("Thomas", 4), rep("Olaf", 3), # - rep("Olaf", 4), # - rep("Karl", 6), rep("Karl", 5), rep("Olaf", 3), # + rep("Olaf", 4), rep("Thomas", 3), rep("Thomas", 3), # + rep("Olaf", 7), rep("Thomas", 7), rep("Thomas", 4), # rep("Olaf", 3), # - rep("udo", 4), rep("udo", 7), rep("udo", 3), rep("Thomas", 7), rep("Thomas", 3), rep("Björn", 6), # - rep("Thomas", 9), rep("Thomas", 6), rep("Björn", 11), # - rep("Björn", 6) # + rep("Thomas", 4), rep("Karl", 3), rep("Björn", 7), rep("Olaf", 3), rep("Thomas", 3), rep("Thomas", 7), + rep("Thomas", 3), rep("Björn", 6), rep("Olaf", 2), rep("Olaf", 6), # + rep("Thomas", 11), rep("Thomas", 8), rep("Olaf", 11), # + rep("Björn", 6), rep("Thomas", 5), rep("Thomas", 5) # ), to = c(rep("Olaf", 5), rep("Björn", 4), rep("Björn", 3), # - rep("Björn", 4), # - rep("Olaf", 6), rep("Thomas", 5), rep("Thomas", 3), # + rep("Björn", 4), rep("Björn", 3), rep("Olaf", 3), # + rep("Karl", 7), rep("Karl", 7), rep("Olaf", 4), # rep("Björn", 3), # - rep("Thomas", 4), rep("Björn", 7), rep("Olaf", 3), rep("Björn", 7), rep("Olaf", 3), rep("Olaf", 6), # - rep("Björn", 9), rep("Olaf", 6), rep("Olaf", 11), # - rep("Max", 6) # + rep("udo", 4), rep("udo", 3), rep("udo", 7), rep("udo", 3), rep("Karl", 3), rep("Björn", 7), + rep("Olaf", 3), rep("Karl", 6), rep("Karl", 2), rep("Björn", 6), # + rep("Björn", 11), rep("Olaf", 8), rep("Björn", 11), # + rep("Max", 6), rep("Björn", 5), rep("Max", 5) # ), date = get.date.from.string(c( "2016-07-12 15:59:25", "2016-07-12 15:59:25", "2016-07-12 15:59:59", # "2016-07-12 16:01:01", "2016-07-14 13:37:00", "2016-07-12 15:59:25", "2016-07-12 15:59:25", "2016-07-12 15:59:59", "2016-07-12 16:06:01", "2016-07-12 16:01:01", "2016-07-14 13:37:00", "2016-07-12 16:06:01", "2016-07-12 14:59:25", "2016-07-12 14:59:25", "2016-07-12 16:04:59", # - "2016-07-12 
16:04:59", + "2016-07-12 16:04:59", "2016-07-12 14:59:25", "2016-07-12 14:59:25", + "2016-08-07 15:30:00", "2016-07-12 16:04:59", "2016-07-12 16:04:59", + "2016-08-07 15:30:00", "2016-07-12 15:59:25", "2016-07-12 15:59:59", "2016-08-07 15:37:02", # - "2016-08-31 16:45:09", "2016-07-12 15:59:25", "2016-07-12 16:06:30", - "2016-07-12 15:59:25", "2016-07-12 15:59:59", "2016-08-07 15:37:02", - "2016-08-31 16:45:09", "2016-10-05 16:45:09", "2016-07-12 15:59:25", - "2016-07-12 16:06:30", "2016-10-05 16:45:09", + "2016-08-07 15:37:02", "2016-08-31 16:45:09", "2016-07-12 15:59:25", + "2016-07-12 16:06:30", "2016-07-12 15:59:25", "2016-07-12 15:59:59", + "2016-08-07 15:37:02", "2016-08-07 15:37:02", "2016-08-31 16:45:09", + "2016-08-07 15:30:00", "2016-10-05 16:45:09", "2016-07-12 15:59:25", + "2016-07-12 16:06:30", "2016-08-07 15:30:00", "2016-10-05 16:45:09", "2016-07-12 16:02:02", "2016-07-12 16:02:02", "2016-07-12 16:02:02", # "2016-07-12 15:30:02", "2016-07-12 15:30:02", "2016-07-12 16:03:59", # "2016-10-13 15:30:02", "2016-07-12 15:30:02", "2016-07-12 15:30:02", + "2016-08-07 15:37:02", "2016-07-12 15:30:02", "2016-07-12 15:30:02", "2016-08-31 15:30:02", "2016-10-05 15:30:02", "2016-12-07 15:30:02", "2016-12-07 15:30:02", "2017-05-23 12:32:39", "2016-07-12 15:30:02", "2016-07-12 15:30:02", "2017-05-23 12:31:34", "2016-07-12 16:03:59", + "2016-10-13 15:30:02", "2016-08-07 15:37:02", "2016-07-12 16:03:59", "2016-10-13 15:30:02", "2016-08-31 15:30:02", "2016-10-05 15:30:02", "2016-12-07 15:30:02", "2016-12-07 15:30:02", "2017-05-23 12:32:39", "2016-07-12 16:03:59", "2016-10-13 15:30:02", "2017-05-23 12:31:34", - "2016-08-31 15:30:02", "2016-10-05 15:30:02", "2016-12-07 15:30:02", - "2016-12-07 15:30:02", "2017-05-23 12:32:39", "2017-05-23 12:31:34", - "2013-04-21 23:52:09", "2013-04-21 23:52:09", "2013-05-05 21:46:30", # - "2013-05-05 21:49:21", "2013-05-05 21:49:34", "2013-05-06 01:04:34", - "2013-05-25 03:48:41", "2013-05-25 04:08:07", "2013-06-01 06:53:06", - 
"2013-04-21 23:52:09", "2013-04-21 23:52:09", "2013-05-25 03:25:06", + "2016-08-07 15:37:02", "2016-08-31 15:30:02", "2016-10-05 15:30:02", + "2016-12-07 15:30:02", "2016-12-07 15:30:02", "2017-05-23 12:32:39", + "2016-08-07 15:37:02", "2017-05-23 12:31:34", "2016-08-31 15:30:02", + "2016-10-05 15:30:02", "2016-12-07 15:30:02", "2016-12-07 15:30:02", + "2017-05-23 12:32:39", "2017-05-23 12:31:34", + "2013-04-21 23:52:09", "2013-04-21 23:52:09", "2017-05-21 12:00:00", # + "2017-05-21 12:00:00", "2013-05-05 21:46:30", "2013-05-05 21:49:21", + "2013-05-05 21:49:34", "2013-05-06 01:04:34", "2013-05-25 03:48:41", + "2013-05-25 04:08:07", "2013-06-01 06:53:06", "2013-04-21 23:52:09", + "2013-04-21 23:52:09", "2017-05-21 12:00:00", "2017-05-21 12:00:00", + "2013-05-25 03:25:06", "2013-05-25 06:06:53", "2013-05-25 06:22:23", + "2013-06-01 06:50:26", "2013-05-05 21:46:30", "2013-05-05 21:49:21", + "2013-05-05 21:49:34", "2013-05-06 01:04:34", "2013-05-25 03:48:41", + "2013-05-25 04:08:07", "2013-06-01 06:53:06", "2013-05-25 03:25:06", "2013-05-25 06:06:53", "2013-05-25 06:22:23", "2013-06-01 06:50:26", - "2013-05-05 21:46:30", "2013-05-05 21:49:21", "2013-05-05 21:49:34", - "2013-05-06 01:04:34", "2013-05-25 03:48:41", "2013-05-25 04:08:07", - "2013-06-01 06:53:06", "2013-05-25 03:25:06", "2013-05-25 06:06:53", - "2013-05-25 06:22:23", "2013-06-01 06:50:26", "2016-07-12 16:01:30", "2016-07-12 16:02:30", "2016-07-15 19:55:39", # - "2016-07-15 20:07:47", "2016-07-27 20:12:08", "2016-07-28 06:27:52" + "2016-07-15 20:07:47", "2016-07-27 20:12:08", "2016-07-28 06:27:52", + "2016-07-12 16:01:30", "2016-07-12 16:02:30", "2016-07-15 19:55:39", + "2017-05-21 12:00:00", "2017-05-21 12:00:00", "2016-07-15 20:07:47", + "2016-07-27 20:12:08", "2016-07-28 06:27:52", "2017-05-21 12:00:00", + "2017-05-21 12:00:00" )), artifact.type = "IssueEvent", - issue.id = c( rep("", 12), rep("", 4), rep("", 14), - rep("", 3), rep("", 30), rep("", 26), - rep("", 6)), + issue.id = c(rep("", 12), rep("", 
10), rep("", 18), + rep("", 3), rep("", 44), rep("", 30), + rep("", 16)), event.name = c("created", "commented", "state_updated", "commented", "state_updated", "created", # "commented", "state_updated", "commented", "commented", "state_updated", "commented", - "created", "commented", "merged", "state_updated", # - "created", "commented", "add_link", "referenced", "assigned", "state_updated", "created", # - "commented", "add_link", "referenced", "referenced", "assigned", "state_updated", "referenced", + "created", "commented", "merged", "state_updated", "created", "commented", "referenced_by", # + "merged", "state_updated", "referenced_by", + "created", "commented", "add_link", "add_link", "referenced", "assigned", "state_updated", "created", # + "commented", "add_link", "add_link", "referenced", "add_link", "referenced", "assigned", "state_updated", "add_link", + "referenced", "commit_added", "created", "commented", # - "mentioned", "subscribed", "commented", "add_link", "mentioned", "subscribed", "mentioned", # - "subscribed", "mentioned", "subscribed", "commented", "mentioned", "subscribed", "labeled", - "commented", "add_link", "mentioned", "subscribed", "mentioned", "subscribed", "commented", - "commented", "add_link", "labeled", "mentioned", "subscribed", "mentioned", "subscribed", - "commented", "labeled", - "created", "commented", "commented", "commented", "commented", "commented", "commented", # - "commented", "resolution_updated", "created", "commented", "commented", "commented", - "commented", "commented", "commented", "commented", "commented", "commented", "commented", + "mentioned", "subscribed", "commented", "add_link", "mentioned", "subscribed", "referenced_by", # + "mentioned", "subscribed", "mentioned", "subscribed", "mentioned", "subscribed", "commented", + "mentioned", "subscribed", "labeled", "commented", "add_link", "referenced_by", "commented", "add_link", "mentioned", + "subscribed", "mentioned", "subscribed", "commented", "commented", 
"add_link", "labeled", "referenced_by", + "mentioned", "subscribed", "mentioned", "subscribed", "commented", "referenced_by", "labeled", + "mentioned", "subscribed", "mentioned", "subscribed", "commented", "labeled", + "created", "commented", "referenced_by", "add_link", "commented", "commented", "commented", "commented", # + "commented", "commented", "resolution_updated", "created", "commented", "referenced_by", "add_link", "commented", + "commented", "commented", "commented", "commented", "commented", "commented", "commented", "commented", "commented", "resolution_updated", "commented", "commented", "commented", "commented", - "created", "commented", "commented", "commented", "commented", "commented" # + "created", "commented", "commented", "commented", "commented", "commented", "created", # + "commented", "commented", "referenced_by", "add_link", "commented", "commented", "commented", "referenced_by", "add_link" ), weight = 1, type = TYPE.EDGES.INTRA, diff --git a/tests/test-networks-covariates.R b/tests/test-networks-covariates.R index 5ebaf02e4..d3c068638 100644 --- a/tests/test-networks-covariates.R +++ b/tests/test-networks-covariates.R @@ -21,6 +21,7 @@ ## Copyright 2021 by Johannes Hostert ## Copyright 2021-2022 by Niklas Schneider ## Copyright 2022 by Jonathan Baumann +## Copyright 2023 by Maximilian Löffler ## All Rights Reserved. 
@@ -597,21 +598,21 @@ test_that("Test add.vertex.attribute.author.issue.count", { networks.and.data = get.network.covariates.test.networks(issues=TRUE, author.relation = "issue") expected.attributes.issues.only = list( - range = network.covariates.test.build.expected(c(0L, 1L, 1L, 1L), c(0L, 1L, 1L), c(2L, 1L, 1L, 1L)), - cumulative = network.covariates.test.build.expected(c(0L, 1L, 1L, 1L), c(1L, 1L, 1L), c(2L, 1L, 1L, 1L)), - all.ranges = network.covariates.test.build.expected(c(1L, 1L, 1L, 1L), c(1L, 2L, 1L), c(2L, 1L, 1L, 1L)), - project.cumulative = network.covariates.test.build.expected(c(1L, 1L, 2L, 1L), c(2L, 2L, 2L), c(3L, 2L, 1L, 1L)), - project.all.ranges = network.covariates.test.build.expected(c(2L, 1L, 2L, 1L), c(2L, 3L, 2L), c(3L, 2L, 1L, 1L)), - complete = network.covariates.test.build.expected(c(3L, 1L, 3L, 1L), c(3L, 3L, 3L), c(3L, 3L, 1L, 1L)) + range = network.covariates.test.build.expected(c(0L, 1L, 1L, 1L), c(0L, 1L, 1L), c(2L, 1L, 1L, 2L, 1L)), + cumulative = network.covariates.test.build.expected(c(0L, 1L, 1L, 1L), c(1L, 1L, 1L), c(2L, 1L, 2L, 2L, 1L)), + all.ranges = network.covariates.test.build.expected(c(2L, 2L, 1L, 1L), c(1L, 2L, 2L), c(2L, 1L, 2L, 2L, 1L)), + project.cumulative = network.covariates.test.build.expected(c(1L, 1L, 2L, 1L), c(2L, 2L, 2L), c(3L, 2L, 3L, 2L, 1L)), + project.all.ranges = network.covariates.test.build.expected(c(3L, 2L, 2L, 1L), c(2L, 3L, 3L), c(3L, 2L, 3L, 2L, 1L)), + complete = network.covariates.test.build.expected(c(4L, 2L, 3L, 1L), c(3L, 3L, 4L), c(3L, 3L, 4L, 2L, 1L)) ) expected.attributes.prs.only = list( - range = network.covariates.test.build.expected(c(1L, 0L, 0L, 0L), c(3L, 1L, 0L), c(1L, 1L, 0L, 0L)), - cumulative = network.covariates.test.build.expected(c(1L, 0L, 0L, 0L), c(3L, 1L, 1L), c(2L, 3L, 0L, 0L)), - all.ranges = network.covariates.test.build.expected(c(1L, 0L, 3L, 0L), c(3L, 2L, 1L), c(2L, 3L, 0L, 0L)), - project.cumulative = network.covariates.test.build.expected(c(1L, 0L, 0L, 0L), c(3L, 
2L, 1L), c(3L, 3L, 0L, 0L)), - project.all.ranges = network.covariates.test.build.expected(c(1L, 0L, 3L, 0L), c(3L, 3L, 1L), c(3L, 3L, 0L, 0L)), - complete = network.covariates.test.build.expected(c(1L, 0L, 3L, 0L), c(3L, 3L, 1L), c(3L, 3L, 0L, 0L)) + range = network.covariates.test.build.expected(c(1L, 0L, 0L, 0L), c(3L, 1L, 0L), c(1L, 1L, 1L, 0L, 0L)), + cumulative = network.covariates.test.build.expected(c(1L, 0L, 0L, 0L), c(3L, 1L, 1L), c(2L, 3L, 2L, 0L, 0L)), + all.ranges = network.covariates.test.build.expected(c(2L, 0L, 3L, 0L), c(3L, 2L, 2L), c(2L, 3L, 2L, 0L, 0L)), + project.cumulative = network.covariates.test.build.expected(c(1L, 0L, 0L, 0L), c(3L, 2L, 1L), c(3L, 3L, 2L, 0L, 0L)), + project.all.ranges = network.covariates.test.build.expected(c(2L, 0L, 3L, 0L), c(3L, 3L, 2L), c(3L, 3L, 2L, 0L, 0L)), + complete = network.covariates.test.build.expected(c(2L, 0L, 3L, 0L), c(3L, 3L, 2L), c(3L, 3L, 2L, 0L, 0L)) ) expected.attributes.both = sum.expected.attributes(expected.attributes.issues.only, expected.attributes.prs.only) @@ -661,21 +662,21 @@ test_that("Test add.vertex.attribute.author.issues.commented.count", { networks.and.data = get.network.covariates.test.networks(issues = TRUE, author.relation = "issue") expected.attributes.issues.only = list( - range = network.covariates.test.build.expected(c(0L, 1L, 0L, 0L), c(0L, 1L, 1L), c(1L, 0L, 0L, 1L)), - cumulative = network.covariates.test.build.expected(c(0L, 1L, 0L, 0L), c(0L, 1L, 1L), c(1L, 0L, 1L, 1L)), - all.ranges = network.covariates.test.build.expected(c(1L, 1L, 0L, 0L), c(0L, 1L, 1L), c(1L, 0L, 1L, 1L)), - project.cumulative = network.covariates.test.build.expected(c(1L, 1L, 1L, 0L), c(1L, 2L, 2L), c(2L, 1L, 1L, 1L)), - project.all.ranges = network.covariates.test.build.expected(c(2L, 1L, 1L, 0L), c(1L, 2L, 2L), c(2L, 1L, 1L, 1L)), - complete = network.covariates.test.build.expected(c(2L, 1L, 1L, 0L), c(1L, 3L, 2L), c(3L, 1L, 1L, 1L)) + range = network.covariates.test.build.expected(c(0L, 1L, 0L, 
0L), c(0L, 1L, 1L), c(1L, 0L, 0L, 0L, 1L)), + cumulative = network.covariates.test.build.expected(c(0L, 1L, 0L, 0L), c(0L, 1L, 1L), c(1L, 0L, 1L, 1L, 1L)), + all.ranges = network.covariates.test.build.expected(c(1L, 1L, 0L, 0L), c(0L, 1L, 1L), c(1L, 0L, 1L, 1L, 1L)), + project.cumulative = network.covariates.test.build.expected(c(1L, 1L, 1L, 0L), c(1L, 2L, 2L), c(2L, 1L, 2L, 1L, 1L)), + project.all.ranges = network.covariates.test.build.expected(c(2L, 1L, 1L, 0L), c(1L, 2L, 2L), c(2L, 1L, 2L, 1L, 1L)), + complete = network.covariates.test.build.expected(c(2L, 1L, 1L, 0L), c(1L, 3L, 2L), c(3L, 1L, 2L, 1L, 1L)) ) expected.attributes.prs.only = list( - range = network.covariates.test.build.expected(c(1L, 0L, 0L, 0L), c(2L, 0L, 0L), c(1L, 0L, 0L, 0L)), - cumulative = network.covariates.test.build.expected(c(1L, 0L, 0L, 0L), c(2L, 0L, 1L), c(1L, 2L, 0L, 0L)), - all.ranges = network.covariates.test.build.expected(c(1L, 0L, 2L, 0L), c(2L, 1L, 1L), c(1L, 2L, 0L, 0L)), - project.cumulative = network.covariates.test.build.expected(c(1L, 0L, 0L, 0L), c(2L, 1L, 1L), c(2L, 2L, 0L, 0L)), - project.all.ranges = network.covariates.test.build.expected(c(1L, 0L, 2L, 0L), c(2L, 2L, 1L), c(2L, 2L, 0L, 0L)), - complete = network.covariates.test.build.expected(c(1L, 0L, 2L, 0L), c(2L, 2L, 1L), c(2L, 2L, 0L, 0L)) + range = network.covariates.test.build.expected(c(1L, 0L, 0L, 0L), c(2L, 0L, 0L), c(1L, 0L, 0L, 0L, 0L)), + cumulative = network.covariates.test.build.expected(c(1L, 0L, 0L, 0L), c(2L, 0L, 1L), c(1L, 2L, 1L, 0L, 0L)), + all.ranges = network.covariates.test.build.expected(c(1L, 0L, 2L, 0L), c(2L, 1L, 1L), c(1L, 2L, 1L, 0L, 0L)), + project.cumulative = network.covariates.test.build.expected(c(1L, 0L, 0L, 0L), c(2L, 1L, 1L), c(2L, 2L, 1L, 0L, 0L)), + project.all.ranges = network.covariates.test.build.expected(c(1L, 0L, 2L, 0L), c(2L, 2L, 1L), c(2L, 2L, 1L, 0L, 0L)), + complete = network.covariates.test.build.expected(c(1L, 0L, 2L, 0L), c(2L, 2L, 1L), c(2L, 2L, 1L, 0L, 0L)) ) 
expected.attributes.both = sum.expected.attributes(expected.attributes.issues.only, expected.attributes.prs.only) @@ -724,21 +725,21 @@ test_that("Test add.vertex.attribute.author.issue.creation.count", { networks.and.data = get.network.covariates.test.networks(issues = TRUE, author.relation = "issue") expected.attributes.issues.only = list( - range = network.covariates.test.build.expected(c(0L, 1L, 0L, 0L), c(0L, 1L, 0L), c(0L, 0L, 0L, 0L)), - cumulative = network.covariates.test.build.expected(c(0L, 1L, 0L, 0L), c(0L, 1L, 0L), c(1L, 0L, 1L, 0L)), - all.ranges = network.covariates.test.build.expected(c(0L, 1L, 0L, 0L), c(0L, 1L, 0L), c(1L, 0L, 1L, 0L)), - project.cumulative = network.covariates.test.build.expected(c(1L, 1L, 0L, 0L), c(0L, 1L, 1L), c(1L, 0L, 1L, 0L)), - project.all.ranges = network.covariates.test.build.expected(c(1L, 1L, 0L, 0L), c(0L, 1L, 1L), c(1L, 0L, 1L, 0L)), - complete = network.covariates.test.build.expected(c(1L, 1L, 0L, 0L), c(0L, 1L, 1L), c(1L, 0L, 1L, 0L)) + range = network.covariates.test.build.expected(c(0L, 1L, 0L, 0L), c(0L, 1L, 0L), c(0L, 0L, 0L, 0L, 0L)), + cumulative = network.covariates.test.build.expected(c(0L, 1L, 0L, 0L), c(0L, 1L, 0L), c(1L, 0L, 0L, 1L, 0L)), + all.ranges = network.covariates.test.build.expected(c(0L, 1L, 0L, 0L), c(0L, 1L, 0L), c(1L, 0L, 0L, 1L, 0L)), + project.cumulative = network.covariates.test.build.expected(c(1L, 1L, 0L, 0L), c(0L, 1L, 1L), c(1L, 0L, 1L, 1L, 0L)), + project.all.ranges = network.covariates.test.build.expected(c(1L, 1L, 0L, 0L), c(0L, 1L, 1L), c(1L, 0L, 1L, 1L, 0L)), + complete = network.covariates.test.build.expected(c(1L, 1L, 0L, 0L), c(0L, 1L, 1L), c(1L, 0L, 1L, 1L, 0L)) ) expected.attributes.prs.only = list( - range = network.covariates.test.build.expected(c(1L, 0L, 0L, 0L), c(1L, 0L, 0L), c(0L, 0L, 0L, 0L)), - cumulative = network.covariates.test.build.expected(c(1L, 0L, 0L, 0L), c(1L, 0L, 1L), c(0L, 1L, 0L, 0L)), - all.ranges = network.covariates.test.build.expected(c(1L, 0L, 1L, 
0L), c(1L, 0L, 1L), c(0L, 1L, 0L, 0L)), - project.cumulative = network.covariates.test.build.expected(c(1L, 0L, 0L, 0L), c(1L, 1L, 1L), c(1L, 1L, 0L, 0L)), - project.all.ranges = network.covariates.test.build.expected(c(1L, 0L, 1L, 0L), c(1L, 1L, 1L), c(1L, 1L, 0L, 0L)), - complete = network.covariates.test.build.expected(c(1L, 0L, 1L, 0L), c(1L, 1L, 1L), c(1L, 1L, 0L, 0L)) + range = network.covariates.test.build.expected(c(1L, 0L, 0L, 0L), c(1L, 0L, 0L), c(0L, 0L, 0L, 0L, 0L)), + cumulative = network.covariates.test.build.expected(c(1L, 0L, 0L, 0L), c(1L, 0L, 1L), c(0L, 1L, 1L, 0L, 0L)), + all.ranges = network.covariates.test.build.expected(c(1L, 0L, 1L, 0L), c(1L, 0L, 1L), c(0L, 1L, 1L, 0L, 0L)), + project.cumulative = network.covariates.test.build.expected(c(1L, 0L, 0L, 0L), c(1L, 1L, 1L), c(1L, 1L, 1L, 0L, 0L)), + project.all.ranges = network.covariates.test.build.expected(c(1L, 0L, 1L, 0L), c(1L, 1L, 1L), c(1L, 1L, 1L, 0L, 0L)), + complete = network.covariates.test.build.expected(c(1L, 0L, 1L, 0L), c(1L, 1L, 1L), c(1L, 1L, 1L, 0L, 0L)) ) expected.attributes.both = sum.expected.attributes(expected.attributes.issues.only, expected.attributes.prs.only) @@ -787,21 +788,21 @@ test_that("Test add.vertex.attribute.author.issue.comment.count", { networks.and.data = get.network.covariates.test.networks(issues = TRUE, author.relation = "issue") expected.attributes.issues.only = list( - range = network.covariates.test.build.expected(c(0L, 1L, 0L, 0L), c(0L, 1L, 1L), c(1L, 0L, 0L, 3L)), - cumulative = network.covariates.test.build.expected(c(0L, 1L, 0L, 0L), c(0L, 1L, 1L), c(2L, 0L, 1L, 3L)), - all.ranges = network.covariates.test.build.expected(c(1L, 1L, 0L, 0L), c(0L, 2L, 1L), c(2L, 0L, 1L, 3L)), - project.cumulative = network.covariates.test.build.expected(c(1L, 1L, 4L, 0L), c(4L, 7L, 2L), c(8L, 4L, 1L, 3L)), - project.all.ranges = network.covariates.test.build.expected(c(2L, 1L, 4L, 0L), c(4L, 8L, 2L), c(8L, 4L, 1L, 3L)), - complete = 
network.covariates.test.build.expected(c(2L, 1L, 4L, 0L), c(4L, 9L, 2L), c(9L, 4L, 1L, 3L)) + range = network.covariates.test.build.expected(c(0L, 1L, 0L, 0L), c(0L, 1L, 1L), c(1L, 0L, 0L, 0L, 3L)), + cumulative = network.covariates.test.build.expected(c(0L, 1L, 0L, 0L), c(0L, 1L, 1L), c(2L, 0L, 1L, 1L, 3L)), + all.ranges = network.covariates.test.build.expected(c(1L, 1L, 0L, 0L), c(0L, 2L, 1L), c(2L, 0L, 1L, 1L, 3L)), + project.cumulative = network.covariates.test.build.expected(c(1L, 1L, 4L, 0L), c(4L, 7L, 2L), c(8L, 4L, 2L, 1L, 3L)), + project.all.ranges = network.covariates.test.build.expected(c(2L, 1L, 4L, 0L), c(4L, 8L, 2L), c(8L, 4L, 2L, 1L, 3L)), + complete = network.covariates.test.build.expected(c(2L, 1L, 4L, 0L), c(4L, 9L, 2L), c(9L, 4L, 2L, 1L, 3L)) ) expected.attributes.prs.only = list( - range = network.covariates.test.build.expected(c(1L, 0L, 0L, 0L), c(2L, 0L, 0L), c(1L, 0L, 0L, 0L)), - cumulative = network.covariates.test.build.expected(c(1L, 0L, 0L, 0L), c(2L, 0L, 1L), c(1L, 2L, 0L, 0L)), - all.ranges = network.covariates.test.build.expected(c(1L, 0L, 2L, 0L), c(2L, 1L, 1L), c(1L, 2L, 0L, 0L)), - project.cumulative = network.covariates.test.build.expected(c(1L, 0L, 0L, 0L), c(2L, 1L, 1L), c(2L, 2L, 0L, 0L)), - project.all.ranges = network.covariates.test.build.expected(c(1L, 0L, 2L, 0L), c(2L, 2L, 1L), c(2L, 2L, 0L, 0L)), - complete = network.covariates.test.build.expected(c(1L, 0L, 2L, 0L), c(2L, 2L, 1L), c(2L, 2L, 0L, 0L)) + range = network.covariates.test.build.expected(c(1L, 0L, 0L, 0L), c(2L, 0L, 0L), c(1L, 0L, 0L, 0L, 0L)), + cumulative = network.covariates.test.build.expected(c(1L, 0L, 0L, 0L), c(2L, 0L, 1L), c(1L, 2L, 1L, 0L, 0L)), + all.ranges = network.covariates.test.build.expected(c(1L, 0L, 2L, 0L), c(2L, 1L, 1L), c(1L, 2L, 1L, 0L, 0L)), + project.cumulative = network.covariates.test.build.expected(c(1L, 0L, 0L, 0L), c(2L, 1L, 1L), c(2L, 2L, 1L, 0L, 0L)), + project.all.ranges = network.covariates.test.build.expected(c(1L, 0L, 2L, 0L), 
c(2L, 2L, 1L), c(2L, 2L, 1L, 0L, 0L)), + complete = network.covariates.test.build.expected(c(1L, 0L, 2L, 0L), c(2L, 2L, 1L), c(2L, 2L, 1L, 0L, 0L)) ) expected.attributes.both = sum.expected.attributes(expected.attributes.issues.only, expected.attributes.prs.only) @@ -1668,21 +1669,21 @@ test_that("Test add.vertex.attribute.issue.contributor.count", { networks.and.data = get.network.covariates.test.networks("artifact", issues = TRUE, artifact.relation = "issue") expected.attributes.issues.only = list( - range = network.covariates.test.build.expected(c(1L, 2L, NA), c(NA, 1L, NA, 1L, NA), c(NA, 2L, 2L, 1L)), - cumulative = network.covariates.test.build.expected(c(1L, 2L, NA), c(NA, 1L, NA, 2L, NA), c(NA, 2L, 2L, 3L)), - all.ranges = network.covariates.test.build.expected(c(3L, 2L, NA), c(NA, 2L, NA, 3L, NA), c(NA, 2L, 2L, 3L)), - project.cumulative = network.covariates.test.build.expected(c(1L, 2L, NA), c(NA, 1L, NA, 2L, NA), c(NA, 2L, 2L, 3L)), - project.all.ranges = network.covariates.test.build.expected(c(3L, 2L, NA), c(NA, 2L, NA, 3L, NA), c(NA, 2L, 2L, 3L)), - complete = network.covariates.test.build.expected(c(4L, 3L, NA), c(NA, 2L, NA, 4L, NA), c(NA, 3L, 2L, 4L)) + range = network.covariates.test.build.expected(c(1L, 2L, NA), c(NA, 1L, NA, 1L, NA), c(NA, 3L, 2L, NA, 2L)), + cumulative = network.covariates.test.build.expected(c(1L, 2L, NA), c(NA, 1L, NA, 2L, NA), c(NA, 3L, 2L, NA, 4L)), + all.ranges = network.covariates.test.build.expected(c(4L, 3L, NA), c(NA, 2L, NA, 4L, NA), c(NA, 3L, 2L, NA, 4L)), + project.cumulative = network.covariates.test.build.expected(c(1L, 2L, NA), c(NA, 1L, NA, 2L, NA), c(NA, 3L, 2L, NA, 4L)), + project.all.ranges = network.covariates.test.build.expected(c(4L, 3L, NA), c(NA, 2L, NA, 4L, NA), c(NA, 3L, 2L, NA, 4L)), + complete = network.covariates.test.build.expected(c(5L, 3L, NA), c(NA, 3L, NA, 5L, NA), c(NA, 3L, 3L, NA, 5L)) ) expected.attributes.prs.only = list( - range = network.covariates.test.build.expected(c(NA, NA, 1L), c(1L, 
NA, 2L, NA, 1L), c(2L, NA, NA, NA)), - cumulative = network.covariates.test.build.expected(c(NA, NA, 1L), c(2L, NA, 2L, NA, 1L), c(3L, NA, NA, NA)), - all.ranges = network.covariates.test.build.expected(c(NA, NA, 3L), c(3L, NA, 2L, NA, 1L), c(3L, NA, NA, NA)), - project.cumulative = network.covariates.test.build.expected(c(NA, NA, 1L), c(2L, NA, 2L, NA, 2L), c(3L, NA, NA, NA)), - project.all.ranges = network.covariates.test.build.expected(c(NA, NA, 3L), c(3L, NA, 2L, NA, 2L), c(3L, NA, NA, NA)), - complete = network.covariates.test.build.expected(c(NA, NA, 3L), c(3L, NA, 2L, NA, 2L), c(3L, NA, NA, NA)) + range = network.covariates.test.build.expected(c(NA, NA, 1L), c(1L, NA, 2L, NA, 1L), c(2L, NA, NA, 1L, NA)), + cumulative = network.covariates.test.build.expected(c(NA, NA, 1L), c(2L, NA, 2L, NA, 1L), c(3L, NA, NA, 2L, NA)), + all.ranges = network.covariates.test.build.expected(c(NA, NA, 3L), c(3L, NA, 2L, NA, 2L), c(3L, NA, NA, 2L, NA)), + project.cumulative = network.covariates.test.build.expected(c(NA, NA, 1L), c(2L, NA, 2L, NA, 2L), c(3L, NA, NA, 3L, NA)), + project.all.ranges = network.covariates.test.build.expected(c(NA, NA, 3L), c(3L, NA, 2L, NA, 3L), c(3L, NA, NA, 3L, NA)), + complete = network.covariates.test.build.expected(c(NA, NA, 3L), c(3L, NA, 2L, NA, 3L), c(3L, NA, NA, 3L, NA)) ) expected.attributes.both = merge.expected.attributes(expected.attributes.issues.only, expected.attributes.prs.only) @@ -1794,10 +1795,10 @@ test_that("Test add.vertex.attribute.issue.contributor.count with issues.only.co expected.attributes.issues.only = list( range = network.covariates.test.build.expected(c(NA, 2L), c(NA, NA, 1L, 1L), c(NA, 2L)), cumulative = network.covariates.test.build.expected(c(NA, 2L), c(NA, NA, 1L, 2L), c(NA, 2L)), - all.ranges = network.covariates.test.build.expected(c(NA, 2L), c(NA, NA, 2L, 3L), c(NA, 2L)), + all.ranges = network.covariates.test.build.expected(c(NA, 3L), c(NA, NA, 2L, 4L), c(NA, 2L)), project.cumulative = 
network.covariates.test.build.expected(c(NA, 2L), c(NA, NA, 1L, 2L), c(NA, 2L)), - project.all.ranges = network.covariates.test.build.expected(c(NA, 2L), c(NA, NA, 2L, 3L), c(NA, 2L)), - complete = network.covariates.test.build.expected(c(NA, 3L), c(NA, NA, 2L, 4L), c(NA, 2L)) + project.all.ranges = network.covariates.test.build.expected(c(NA, 3L), c(NA, NA, 2L, 4L), c(NA, 2L)), + complete = network.covariates.test.build.expected(c(NA, 3L), c(NA, NA, 3L, 5L), c(NA, 3L)) ) expected.attributes.prs.only = list( @@ -1856,21 +1857,21 @@ test_that("Test add.vertex.attribute.issue.event.count", { networks.and.data = get.network.covariates.test.networks("artifact", issues = TRUE, artifact.relation = "issue") expected.attributes.issues.only = list( - range = network.covariates.test.build.expected(c(1L, 3L, NA), c(NA, 2L, NA, 1L, NA), c(NA, 3L, 4L, 1L)), - cumulative = network.covariates.test.build.expected(c(1L, 3L, NA), c(NA, 2L, NA, 2L, NA), c(NA, 6L, 6L, 3L)), - all.ranges = network.covariates.test.build.expected(c(3L, 6L, NA), c(NA, 6L, NA, 3L, NA), c(NA, 6L, 6L, 3L)), - project.cumulative = network.covariates.test.build.expected(c(1L, 3L, NA), c(NA, 2L, NA, 2L, NA), c(NA, 6L, 6L, 3L)), - project.all.ranges = network.covariates.test.build.expected(c(3L, 6L, NA), c(NA, 6L, NA, 3L, NA), c(NA, 6L, 6L, 3L)), - complete = network.covariates.test.build.expected(c(8L, 7L, NA), c(NA, 6L, NA, 8L, NA), c(NA, 7L, 6L, 8L)) + range = network.covariates.test.build.expected(c(1L, 3L, NA), c(NA, 2L, NA, 1L, NA), c(NA, 4L, 4L, NA, 2L)), + cumulative = network.covariates.test.build.expected(c(1L, 3L, NA), c(NA, 2L, NA, 2L, NA), c(NA, 7L, 6L, NA, 4L)), + all.ranges = network.covariates.test.build.expected(c(4L, 7L, NA), c(NA, 6L, NA, 4L, NA), c(NA, 7L, 6L, NA, 4L)), + project.cumulative = network.covariates.test.build.expected(c(1L, 3L, NA), c(NA, 2L, NA, 2L, NA), c(NA, 7L, 6L, NA, 4L)), + project.all.ranges = network.covariates.test.build.expected(c(4L, 7L, NA), c(NA, 6L, NA, 4L, NA), 
c(NA, 7L, 6L, NA, 4L)), + complete = network.covariates.test.build.expected(c(9L, 8L, NA), c(NA, 7L, NA, 9L, NA), c(NA, 8L, 7L, NA, 9L)) ) expected.attributes.prs.only = list( - range = network.covariates.test.build.expected(c(NA, NA, 2L), c(1L, NA, 2L, NA, 1L), c(2L, NA, NA, NA)), - cumulative = network.covariates.test.build.expected(c(NA, NA, 2L), c(3L, NA, 2L, NA, 1L), c(5L, NA, NA, NA)), - all.ranges = network.covariates.test.build.expected(c(NA, NA, 5L), c(5L, NA, 2L, NA, 1L), c(5L, NA, NA, NA)), - project.cumulative = network.covariates.test.build.expected(c(NA, NA, 2L), c(3L, NA, 2L, NA, 2L), c(5L, NA, NA, NA)), - project.all.ranges = network.covariates.test.build.expected(c(NA, NA, 5L), c(5L, NA, 2L, NA, 2L), c(5L, NA, NA, NA)), - complete = network.covariates.test.build.expected(c(NA, NA, 5L), c(5L, NA, 2L, NA, 2L), c(5L, NA, NA, NA)) + range = network.covariates.test.build.expected(c(NA, NA, 2L), c(1L, NA, 2L, NA, 1L), c(2L, NA, NA, 1L, NA)), + cumulative = network.covariates.test.build.expected(c(NA, NA, 2L), c(3L, NA, 2L, NA, 1L), c(5L, NA, NA, 2L, NA)), + all.ranges = network.covariates.test.build.expected(c(NA, NA, 5L), c(5L, NA, 2L, NA, 2L), c(5L, NA, NA, 2L, NA)), + project.cumulative = network.covariates.test.build.expected(c(NA, NA, 2L), c(3L, NA, 2L, NA, 2L), c(5L, NA, NA, 3L, NA)), + project.all.ranges = network.covariates.test.build.expected(c(NA, NA, 5L), c(5L, NA, 2L, NA, 3L), c(5L, NA, NA, 3L, NA)), + complete = network.covariates.test.build.expected(c(NA, NA, 5L), c(5L, NA, 2L, NA, 3L), c(5L, NA, NA, 3L, NA)) ) expected.attributes.both = merge.expected.attributes(expected.attributes.issues.only, expected.attributes.prs.only) @@ -1918,21 +1919,21 @@ test_that("Test add.vertex.attribute.issue.comment.count", { networks.and.data = get.network.covariates.test.networks("artifact", issues = TRUE, artifact.relation = "issue") expected.attributes.issues.only = list( - range = network.covariates.test.build.expected(c(0L, 1L, NA), c(NA, 1L, NA, 1L, 
NA), c(NA, 0L, 4L, 0L)), - cumulative = network.covariates.test.build.expected(c(0L, 1L, NA), c(NA, 1L, NA, 1L, NA), c(NA, 1L, 5L, 1L)), - all.ranges = network.covariates.test.build.expected(c(1L, 1L, NA), c(NA, 5L, NA, 1L, NA), c(NA, 1L, 5L, 1L)), - project.cumulative = network.covariates.test.build.expected(c(0L, 1L, NA), c(NA, 1L, NA, 1L, NA), c(NA, 1L, 5L, 1L)), - project.all.ranges = network.covariates.test.build.expected(c(1L, 1L, NA), c(NA, 5L, NA, 1L, NA), c(NA, 1L, 5L, 1L)), - complete = network.covariates.test.build.expected(c(2L, 1L, NA), c(NA, 5L, NA, 2L, NA), c(NA, 1L, 5L, 2L)) + range = network.covariates.test.build.expected(c(0L, 1L, NA), c(NA, 1L, NA, 1L, NA), c(NA, 0L, 4L, NA, 0L)), + cumulative = network.covariates.test.build.expected(c(0L, 1L, NA), c(NA, 1L, NA, 1L, NA), c(NA, 1L, 5L, NA, 1L)), + all.ranges = network.covariates.test.build.expected(c(1L, 1L, NA), c(NA, 5L, NA, 1L, NA), c(NA, 1L, 5L, NA, 1L)), + project.cumulative = network.covariates.test.build.expected(c(0L, 1L, NA), c(NA, 1L, NA, 1L, NA), c(NA, 1L, 5L, NA, 1L)), + project.all.ranges = network.covariates.test.build.expected(c(1L, 1L, NA), c(NA, 5L, NA, 1L, NA), c(NA, 1L, 5L, NA, 1L)), + complete = network.covariates.test.build.expected(c(2L, 1L, NA), c(NA, 5L, NA, 2L, NA), c(NA, 1L, 5L, NA, 2L)) ) expected.attributes.prs.only = list( - range = network.covariates.test.build.expected(c(NA, NA, 1L), c(1L, NA, 1L, NA, 0L), c(1L, NA, NA, NA)), - cumulative = network.covariates.test.build.expected(c(NA, NA, 1L), c(2L, NA, 1L, NA, 0L), c(3L, NA, NA, NA)), - all.ranges = network.covariates.test.build.expected(c(NA, NA, 3L), c(3L, NA, 1L, NA, 0L), c(3L, NA, NA, NA)), - project.cumulative = network.covariates.test.build.expected(c(NA, NA, 1L), c(2L, NA, 1L, NA, 1L), c(3L, NA, NA, NA)), - project.all.ranges = network.covariates.test.build.expected(c(NA, NA, 3L), c(3L, NA, 1L, NA, 1L), c(3L, NA, NA, NA)), - complete = network.covariates.test.build.expected(c(NA, NA, 3L), c(3L, NA, 1L, NA, 
1L), c(3L, NA, NA, NA)) + range = network.covariates.test.build.expected(c(NA, NA, 1L), c(1L, NA, 1L, NA, 0L), c(1L, NA, NA, 0L, NA)), + cumulative = network.covariates.test.build.expected(c(NA, NA, 1L), c(2L, NA, 1L, NA, 0L), c(3L, NA, NA, 0L, NA)), + all.ranges = network.covariates.test.build.expected(c(NA, NA, 3L), c(3L, NA, 1L, NA, 0L), c(3L, NA, NA, 0L, NA)), + project.cumulative = network.covariates.test.build.expected(c(NA, NA, 1L), c(2L, NA, 1L, NA, 1L), c(3L, NA, NA, 1L, NA)), + project.all.ranges = network.covariates.test.build.expected(c(NA, NA, 3L), c(3L, NA, 1L, NA, 1L), c(3L, NA, NA, 1L, NA)), + complete = network.covariates.test.build.expected(c(NA, NA, 3L), c(3L, NA, 1L, NA, 1L), c(3L, NA, NA, 1L, NA)) ) expected.attributes.both = merge.expected.attributes(expected.attributes.issues.only, expected.attributes.prs.only) @@ -1991,6 +1992,7 @@ test_that("Test add.vertex.attribute.issue.opened.date", { c(NA, "2016-07-12 15:59:25", "2016-07-12 16:01:30", + NA, "2016-07-12 14:30:13")) expected.attributes.prs.only = network.covariates.test.build.expected( @@ -2005,6 +2007,7 @@ test_that("Test add.vertex.attribute.issue.opened.date", { c("2016-07-14 13:37:00", NA, NA, + "2016-07-12 14:59:25", NA)) expected.attributes.both = network.covariates.test.build.expected( @@ -2019,6 +2022,7 @@ test_that("Test add.vertex.attribute.issue.opened.date", { c("2016-07-14 13:37:00", "2016-07-12 15:59:25", "2016-07-12 16:01:30", + "2016-07-12 14:59:25", "2016-07-12 14:30:13")) ## convert date strings to POSIXct @@ -2088,6 +2092,7 @@ test_that("Test add.vertex.attribute.issue.closed.date", { c(NA, "2016-07-12 16:06:30", NA, + NA, NA)) expected.attributes.prs.only = network.covariates.test.build.expected( @@ -2102,6 +2107,7 @@ test_that("Test add.vertex.attribute.issue.closed.date", { c(NA, NA, NA, + "2016-07-12 16:04:59", NA)) expected.attributes.both = network.covariates.test.build.expected( @@ -2116,6 +2122,7 @@ test_that("Test add.vertex.attribute.issue.closed.date", { 
c(NA, "2016-07-12 16:06:30", NA, + "2016-07-12 16:04:59", NA)) ## convert date strings to POSIXct @@ -2177,79 +2184,79 @@ test_that("Test add.vertex.attribute.issue.last.activity.date", { range = network.covariates.test.build.expected( c("2016-07-12 15:30:02", "2016-07-12 15:59:59", NA), c(NA , "2016-07-12 16:02:30", NA , "2016-07-12 16:03:59", NA), - c(NA , "2016-08-31 16:45:09", "2016-07-28 06:27:52", "2016-08-31 15:30:02")), + c(NA , "2016-08-31 16:45:09", "2016-07-28 06:27:52", NA , "2016-08-31 15:30:02")), cumulative = network.covariates.test.build.expected( c("2016-07-12 15:30:02", "2016-07-12 15:59:59", NA), c(NA , "2016-07-12 16:02:30", NA , "2016-07-12 16:03:59", NA), - c(NA , "2016-08-31 16:45:09", "2016-07-28 06:27:52", "2016-08-31 15:30:02")), + c(NA , "2016-08-31 16:45:09", "2016-07-28 06:27:52", NA , "2016-08-31 15:30:02")), all.ranges = network.covariates.test.build.expected( c("2016-08-31 15:30:02", "2016-08-31 16:45:09", NA), c(NA , "2016-07-28 06:27:52", NA , "2016-08-31 15:30:02", NA), - c(NA , "2016-08-31 16:45:09", "2016-07-28 06:27:52", "2016-08-31 15:30:02")), + c(NA , "2016-08-31 16:45:09", "2016-07-28 06:27:52", NA , "2016-08-31 15:30:02")), project.cumulative = network.covariates.test.build.expected( c("2016-07-12 15:30:02", "2016-07-12 15:59:59", NA), c(NA , "2016-07-12 16:02:30", NA , "2016-07-12 16:03:59", NA), - c(NA , "2016-08-31 16:45:09", "2016-07-28 06:27:52", "2016-08-31 15:30:02")), + c(NA , "2016-08-31 16:45:09", "2016-07-28 06:27:52", NA , "2016-08-31 15:30:02")), project.all.ranges = network.covariates.test.build.expected( c("2016-08-31 15:30:02", "2016-08-31 16:45:09", NA), c(NA , "2016-07-28 06:27:52", NA , "2016-08-31 15:30:02", NA), - c(NA , "2016-08-31 16:45:09", "2016-07-28 06:27:52", "2016-08-31 15:30:02")), + c(NA , "2016-08-31 16:45:09", "2016-07-28 06:27:52", NA , "2016-08-31 15:30:02")), complete = network.covariates.test.build.expected( c("2017-05-23 12:32:39", "2016-10-05 16:45:09", NA), - c(NA , "2016-07-28 
06:27:52", NA , "2017-05-23 12:32:39", NA), - c(NA , "2016-10-05 16:45:09", "2016-07-28 06:27:52", "2017-05-23 12:32:39"))) + c(NA , "2017-05-21 12:00:00", NA , "2017-05-23 12:32:39", NA), + c(NA , "2016-10-05 16:45:09", "2017-05-21 12:00:00", NA , "2017-05-23 12:32:39"))) expected.attributes.prs.only = list( range = network.covariates.test.build.expected( c(NA , NA , "2016-07-12 15:59:59"), c("2016-07-12 16:01:01", NA , "2016-07-12 16:02:02", NA , "2016-07-12 16:04:59"), - c("2016-07-14 13:37:00", NA , NA , NA)), + c("2016-07-14 13:37:00", NA , NA , "2016-08-07 15:30:00", NA)), cumulative = network.covariates.test.build.expected( c(NA , NA , "2016-07-12 15:59:59"), c("2016-07-12 16:01:01", NA , "2016-07-12 16:02:02", NA , "2016-07-12 16:04:59"), - c("2016-07-14 13:37:00", NA , NA , NA)), + c("2016-07-14 13:37:00", NA , NA , "2016-08-07 15:30:00", NA)), all.ranges = network.covariates.test.build.expected( c(NA , NA , "2016-07-14 13:37:00"), - c("2016-07-14 13:37:00", NA , "2016-07-12 16:02:02", NA , "2016-07-12 16:04:59"), - c("2016-07-14 13:37:00", NA , NA , NA)), + c("2016-07-14 13:37:00", NA , "2016-07-12 16:02:02", NA , "2016-08-07 15:30:00"), + c("2016-07-14 13:37:00", NA , NA , "2016-08-07 15:30:00", NA)), project.cumulative = network.covariates.test.build.expected( c(NA , NA , "2016-07-12 15:59:59"), c("2016-07-12 16:01:01", NA , "2016-07-12 16:02:02", NA , "2016-07-12 16:04:59"), - c("2016-07-14 13:37:00", NA , NA , NA)), + c("2016-07-14 13:37:00", NA , NA , "2016-08-07 15:30:00", NA)), project.all.ranges = network.covariates.test.build.expected( c(NA , NA , "2016-07-14 13:37:00"), - c("2016-07-14 13:37:00", NA , "2016-07-12 16:02:02", NA , "2016-07-12 16:04:59"), - c("2016-07-14 13:37:00", NA , NA , NA)), + c("2016-07-14 13:37:00", NA , "2016-07-12 16:02:02", NA , "2016-08-07 15:30:00"), + c("2016-07-14 13:37:00", NA , NA , "2016-08-07 15:30:00", NA)), complete = network.covariates.test.build.expected( c(NA , NA , "2016-07-14 13:37:00"), - c("2016-07-14 
13:37:00", NA , "2016-07-12 16:02:02", NA , "2016-07-12 16:04:59"), - c("2016-07-14 13:37:00", NA , NA , NA))) + c("2016-07-14 13:37:00", NA , "2016-07-12 16:02:02", NA , "2016-08-07 15:30:00"), + c("2016-07-14 13:37:00", NA , NA , "2016-08-07 15:30:00", NA))) expected.attributes.both = list( range = network.covariates.test.build.expected( c("2016-07-12 15:30:02", "2016-07-12 15:59:59", "2016-07-12 15:59:59"), c("2016-07-12 16:01:01", "2016-07-12 16:02:30", "2016-07-12 16:02:02", "2016-07-12 16:03:59", "2016-07-12 16:04:59"), - c("2016-07-14 13:37:00", "2016-08-31 16:45:09", "2016-07-28 06:27:52", "2016-08-31 15:30:02")), + c("2016-07-14 13:37:00", "2016-08-31 16:45:09", "2016-07-28 06:27:52", "2016-08-07 15:30:00", "2016-08-31 15:30:02")), cumulative = network.covariates.test.build.expected( c("2016-07-12 15:30:02", "2016-07-12 15:59:59", "2016-07-12 15:59:59"), c("2016-07-12 16:01:01", "2016-07-12 16:02:30", "2016-07-12 16:02:02", "2016-07-12 16:03:59", "2016-07-12 16:04:59"), - c("2016-07-14 13:37:00", "2016-08-31 16:45:09", "2016-07-28 06:27:52", "2016-08-31 15:30:02")), + c("2016-07-14 13:37:00", "2016-08-31 16:45:09", "2016-07-28 06:27:52", "2016-08-07 15:30:00", "2016-08-31 15:30:02")), all.ranges = network.covariates.test.build.expected( c("2016-08-31 15:30:02", "2016-08-31 16:45:09", "2016-07-14 13:37:00"), - c("2016-07-14 13:37:00", "2016-07-28 06:27:52", "2016-07-12 16:02:02", "2016-08-31 15:30:02", "2016-07-12 16:04:59"), - c("2016-07-14 13:37:00", "2016-08-31 16:45:09", "2016-07-28 06:27:52", "2016-08-31 15:30:02")), + c("2016-07-14 13:37:00", "2016-07-28 06:27:52", "2016-07-12 16:02:02", "2016-08-31 15:30:02", "2016-08-07 15:30:00"), + c("2016-07-14 13:37:00", "2016-08-31 16:45:09", "2016-07-28 06:27:52", "2016-08-07 15:30:00", "2016-08-31 15:30:02")), project.cumulative = network.covariates.test.build.expected( c("2016-07-12 15:30:02", "2016-07-12 15:59:59", "2016-07-12 15:59:59"), c("2016-07-12 16:01:01", "2016-07-12 16:02:30", "2016-07-12 
16:02:02", "2016-07-12 16:03:59", "2016-07-12 16:04:59"), - c("2016-07-14 13:37:00", "2016-08-31 16:45:09", "2016-07-28 06:27:52", "2016-08-31 15:30:02")), + c("2016-07-14 13:37:00", "2016-08-31 16:45:09", "2016-07-28 06:27:52", "2016-08-07 15:30:00", "2016-08-31 15:30:02")), project.all.ranges = network.covariates.test.build.expected( c("2016-08-31 15:30:02", "2016-08-31 16:45:09", "2016-07-14 13:37:00"), - c("2016-07-14 13:37:00", "2016-07-28 06:27:52", "2016-07-12 16:02:02", "2016-08-31 15:30:02", "2016-07-12 16:04:59"), - c("2016-07-14 13:37:00", "2016-08-31 16:45:09", "2016-07-28 06:27:52", "2016-08-31 15:30:02")), + c("2016-07-14 13:37:00", "2016-07-28 06:27:52", "2016-07-12 16:02:02", "2016-08-31 15:30:02", "2016-08-07 15:30:00"), + c("2016-07-14 13:37:00", "2016-08-31 16:45:09", "2016-07-28 06:27:52", "2016-08-07 15:30:00", "2016-08-31 15:30:02")), complete = network.covariates.test.build.expected( c("2017-05-23 12:32:39", "2016-10-05 16:45:09", "2016-07-14 13:37:00"), - c("2016-07-14 13:37:00", "2016-07-28 06:27:52", "2016-07-12 16:02:02", "2017-05-23 12:32:39", "2016-07-12 16:04:59"), - c("2016-07-14 13:37:00", "2016-10-05 16:45:09", "2016-07-28 06:27:52", "2017-05-23 12:32:39"))) + c("2016-07-14 13:37:00", "2017-05-21 12:00:00", "2016-07-12 16:02:02", "2017-05-23 12:32:39", "2016-08-07 15:30:00"), + c("2016-07-14 13:37:00", "2016-10-05 16:45:09", "2017-05-21 12:00:00", "2016-08-07 15:30:00", "2017-05-23 12:32:39"))) ## convert date strings to POSIXct expected.attributes.issues.only = lapply(expected.attributes.issues.only, function(times) { @@ -2328,6 +2335,7 @@ test_that("Test add.vertex.attribute.issue.title", { c(NA, "Error in construct.networks.from.list for openssl function networks", "[ZEPPELIN-332] CNFE when running SQL query against Cassandra temp table", + NA, "Distinguish directedness of networks and edge-construction algorithm")) expected.attributes.prs.only = network.covariates.test.build.expected( @@ -2342,6 +2350,7 @@ test_that("Test 
add.vertex.attribute.issue.title", { c("Example pull request 1", NA, NA, + "Example pull request 2", NA)) expected.attributes.both = network.covariates.test.build.expected( @@ -2356,6 +2365,7 @@ test_that("Test add.vertex.attribute.issue.title", { c("Example pull request 1", "Error in construct.networks.from.list for openssl function networks", "[ZEPPELIN-332] CNFE when running SQL query against Cassandra temp table", + "Example pull request 2", "Distinguish directedness of networks and edge-construction algorithm")) ## Test issues only @@ -2403,7 +2413,7 @@ test_that("Test add.vertex.attribute.pr.open.merged.or.closed", { networks.and.data = get.network.covariates.test.networks("artifact", artifact.relation = "issue") expected.attributes = network.covariates.test.build.expected( - c(NA, NA, "open"), c("open", NA, "open", NA, "merged"), c("open", NA, NA, NA) + c(NA, NA, "open"), c("open", NA, "open", NA, "merged"), c("open", NA, NA, "merged", NA) ) ## Test @@ -2424,7 +2434,7 @@ test_that("Test add.vertex.attribute.issue.is.pull.request", { networks.and.data = get.network.covariates.test.networks("artifact", artifact.relation = "issue") expected.attributes = network.covariates.test.build.expected( - c(FALSE, FALSE, TRUE), c(TRUE, FALSE, TRUE, FALSE, TRUE), c(TRUE, FALSE, FALSE, FALSE) + c(FALSE, FALSE, TRUE), c(TRUE, FALSE, TRUE, FALSE, TRUE), c(TRUE, FALSE, FALSE, TRUE, FALSE) ) ## Test diff --git a/tests/test-networks-misc.R b/tests/test-networks-misc.R new file mode 100644 index 000000000..3e7d72351 --- /dev/null +++ b/tests/test-networks-misc.R @@ -0,0 +1,814 @@ +## This file is part of coronet, which is free software: you +## can redistribute it and/or modify it under the terms of the GNU General +## Public License as published by the Free Software Foundation, version 2. 
+## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License along +## with this program; if not, write to the Free Software Foundation, Inc., +## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +## +## Copyright 2024 by Leo Sendelbach +## Copyright 2024 by Thomas Bock +## All Rights Reserved. + + +context("Tests for the file 'util-networks-misc.R'") + +## +## Context +## + +CF.DATA = file.path(".", "codeface-data") +CF.SELECTION.PROCESS = "testing" +CASESTUDY = "test" +ARTIFACT = "feature" + +## use only when debugging this file independently +if (!dir.exists(CF.DATA)) CF.DATA = file.path(".", "tests", "codeface-data") + +test_that("getting all authors of a list of networks, list length 0", { + + ## Act + result = get.author.names.from.networks(networks = list(), globally = TRUE) + + ## Assert + expected = list(c()) + + expect_equal(expected, result) +}) + +test_that("getting all authors of a list of networks, list length 1", { + + ## Arrange + vertices = data.frame( + name = c("Heinz", "Dieter", "Klaus"), + kind = TYPE.AUTHOR, + type = TYPE.AUTHOR + ) + edges = data.frame( + from = "Heinz", + to = "Dieter" + ) + network = igraph::graph.data.frame(edges, directed = FALSE, vertices = vertices) + + ## Act + result = get.author.names.from.networks(networks = list(network)) + + ## Assert + expected = list(c("Dieter", "Heinz", "Klaus")) + + expect_equal(expected, result) + +}) + +test_that("getting all authors of a list of networks, list length 1, not global", { + + ## Arrange + vertices = data.frame( + name = c("Heinz", "Dieter", "Klaus"), + kind = TYPE.AUTHOR, + type = TYPE.AUTHOR + ) + edges = data.frame( + from = "Heinz", + to = "Dieter" + ) + network = igraph::graph.data.frame(edges, 
directed = FALSE, vertices = vertices) + + ## Act + result = get.author.names.from.networks(networks = list(network), globally = FALSE) + + ## Assert + expected = list(c("Dieter", "Heinz", "Klaus")) + + expect_equal(expected, result) + +}) + +test_that("getting all authors of a list of networks, list length 2", { + + ## Arrange + vertices = data.frame( + name = c("Heinz", "Dieter", "Klaus"), + kind = TYPE.AUTHOR, + type = TYPE.AUTHOR + ) + edges = data.frame( + from = "Heinz", + to = "Dieter" + ) + first.network = igraph::graph.data.frame(edges, directed = FALSE, vertices = vertices) + + second.vertices = data.frame( + name = c("Detlef", "Dieter"), + kind = TYPE.AUTHOR, + type = TYPE.AUTHOR + ) + second.edges = data.frame( + from = "Detlef", + to = "Dieter" + ) + second.network = igraph::graph.data.frame(second.edges, directed = FALSE, vertices = second.vertices) + + ## Act + result = get.author.names.from.networks(networks = list(first.network, second.network)) + + ## Assert + expected = list(c("Detlef", "Dieter", "Heinz", "Klaus")) + + expect_equal(expected, result) +}) + +test_that("getting all authors of a list of networks, list length 2, not global", { + + ## Arrange + vertices = data.frame( + name = c("Heinz", "Dieter", "Klaus"), + kind = TYPE.AUTHOR, + type = TYPE.AUTHOR + ) + edges = data.frame( + from = "Heinz", + to = "Dieter" + ) + first.network = igraph::graph.data.frame(edges, directed = FALSE, vertices = vertices) + + second.vertices = data.frame( + name = c("Detlef", "Dieter"), + kind = TYPE.AUTHOR, + type = TYPE.AUTHOR + ) + second.edges = data.frame( + from = "Detlef", + to = "Dieter" + ) + second.network = igraph::graph.data.frame(second.edges, directed = FALSE, vertices = second.vertices) + + ## Act + result = get.author.names.from.networks(networks = list(first.network, second.network), globally = FALSE) + + ## Assert + expected = list(c("Dieter", "Heinz", "Klaus"), c("Detlef", "Dieter")) + + expect_equal(expected, result) +}) + 
+test_that("getting all authors of a list of data ranges, list length 0", { + + ## Act + result = get.author.names.from.data(data.ranges = list()) + + ## Assert + expected = list(c()) + + expect_equal(expected, result) +}) + +test_that("getting all authors of a list of data ranges, list length 1", { + + ## Arrange + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.data.base = ProjectData$new(project.conf = proj.conf) + range.data = proj.data.base$get.data.cut.to.same.date("mails") + + ## Act + result = get.author.names.from.data(data.ranges = list(range.data)) + + ## Assert + expected = list(c("Björn", "Fritz fritz@example.org","georg", "Hans", + "Karl", "Olaf", "Thomas", "udo")) + + expect_equal(expected, result) +}) + +test_that("getting all authors of a list of data ranges, list length 1, not global", { + + ## Arrange + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.data.base = ProjectData$new(project.conf = proj.conf) + range.data = proj.data.base$get.data.cut.to.same.date("mails") + + ## Act + result = get.author.names.from.data(data.ranges = list(range.data), globally = FALSE) + + ## Assert + expected = list(c("Björn", "Fritz fritz@example.org","georg", "Hans", + "Karl", "Olaf", "Thomas", "udo")) + + expect_equal(expected, result) +}) + +test_that("getting all authors of a list of data ranges, list length 2", { + + ## Arrange + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.data.base = ProjectData$new(project.conf = proj.conf) + range.data.one = proj.data.base$get.data.cut.to.same.date("mails") + range.data.two = proj.data.base$get.data.cut.to.same.date("issues") + + ## Act + result = get.author.names.from.data(data.ranges = list(range.data.one, range.data.two)) + + ## Assert + expected = list(c("Björn", "Fritz fritz@example.org","georg", "Hans", + "Karl", "Max", "Olaf", "Thomas", "udo")) + + expect_equal(expected, result) +}) + 
+test_that("getting all authors of a list of data ranges, list length 2, not global", { + + ## Arrange + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.data.base = ProjectData$new(project.conf = proj.conf) + range.data.one = proj.data.base$get.data.cut.to.same.date("mails") + range.data.two = proj.data.base$get.data.cut.to.same.date("issues") + + ## Act + result = get.author.names.from.data(data.ranges = list(range.data.one, range.data.two), globally = FALSE) + + ## Assert + expected = list(c("Björn", "Fritz fritz@example.org","georg", "Hans", "Karl", "Olaf", + "Thomas", "udo"), c("Björn", "Karl", "Max", "Olaf", "Thomas")) + + expect_equal(expected, result) +}) + +test_that("getting all authors of a list of data ranges by data source 'mails', list length 2, not global", { + + ## Arrange + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.data.base = ProjectData$new(project.conf = proj.conf) + range.data.one = proj.data.base$get.data.cut.to.same.date("mails") + range.data.two = proj.data.base$get.data.cut.to.same.date("issues") + + ## Act + result = get.author.names.from.data(data.ranges = list(range.data.one, range.data.two), + data.sources = "mails", globally = FALSE) + + ## Assert + + expected = list(c("Björn", "Fritz fritz@example.org","georg", "Hans", "Olaf", + "Thomas", "udo"), c("Björn", "Olaf", "Thomas")) + + expect_equal(expected, result) +}) + +test_that("getting all authors of a list of data ranges by data source 'issues', list length 2, not global", { + + ## Arrange + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.data.base = ProjectData$new(project.conf = proj.conf) + range.data.one = proj.data.base$get.data.cut.to.same.date("mails") + range.data.two = proj.data.base$get.data.cut.to.same.date("issues") + + ## Act + result = get.author.names.from.data(data.ranges = list(range.data.one, range.data.two), + data.sources = "issues", globally = 
FALSE) + + ## Assert + expected = list(c("Björn", "Karl", "Olaf", "Thomas"), c("Björn","Karl", "Max", "Olaf", "Thomas")) + + expect_equal(expected, result) +}) + +test_that("getting all authors of a list of data ranges by data source 'commits', list length 2, not global", { + + ## Arrange + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.data.base = ProjectData$new(project.conf = proj.conf) + range.data.one = proj.data.base$get.data.cut.to.same.date("mails") + range.data.two = proj.data.base$get.data.cut.to.same.date("issues") + + ## Act + result = get.author.names.from.data(data.ranges = list(range.data.one, range.data.two), + data.sources = "commits", globally = FALSE) + + ## Assert + + expected = list(c("Björn", "Olaf"), c("Björn", "Olaf", "Thomas")) + + expect_equal(expected, result) +}) + +test_that("getting a sparse adjacency matrix for a network, single edge, matching author list", { + + ## Arrange + vertices = data.frame( + name = c("Heinz", "Dieter", "Klaus"), + kind = TYPE.AUTHOR, + type = TYPE.AUTHOR + ) + edges = data.frame( + from = "Heinz", + to = "Dieter" + ) + network.in = igraph::graph.data.frame(edges, directed = FALSE, vertices = vertices) + authors.in = c("Heinz", "Dieter", "Klaus") + + matrix.out = Matrix::sparseMatrix(i = c(), j = c(), x = 0, dims = c(length(authors.in), + length(authors.in)), repr = "T") + rownames(matrix.out) = authors.in + colnames(matrix.out) = authors.in + + matrix.out["Heinz", "Dieter"] = 1 + matrix.out["Dieter", "Heinz"] = 1 + + ## Act + result = get.expanded.adjacency(network = network.in, authors = authors.in) + + ## Assert + + expect_equal(matrix.out, result) + +}) + +test_that("getting a sparse adjacency matrix for a network, single edge, fewer authors than network", { + + ## Arrange + vertices = data.frame( + name = c("Heinz", "Dieter", "Klaus"), + kind = TYPE.AUTHOR, + type = TYPE.AUTHOR + ) + edges = data.frame( + from = "Heinz", + to = "Dieter" + ) + network.in = 
igraph::graph.data.frame(edges, directed = FALSE, vertices = vertices) + authors.in = c("Dieter", "Heinz") + + matrix.out = Matrix::sparseMatrix(i = c(), j = c(), x = 0, dims = c(length(authors.in), + length(authors.in)), repr = "T") + rownames(matrix.out) = authors.in + colnames(matrix.out) = authors.in + + matrix.out["Heinz", "Dieter"] = 1 + matrix.out["Dieter", "Heinz"] = 1 + + ## Act + result = get.expanded.adjacency(network = network.in, authors = authors.in) + + ## Assert + + expect_equal(matrix.out, result) + +}) + +test_that("getting a sparse adjacency matrix for a network, single edge, more authors than network", { + + ## Arrange + vertices = data.frame( + name = c("Heinz", "Dieter", "Klaus"), + kind = TYPE.AUTHOR, + type = TYPE.AUTHOR + ) + edges = data.frame( + from = "Heinz", + to = "Dieter" + ) + network.in = igraph::graph.data.frame(edges, directed = FALSE, vertices = vertices) + authors.in = c("Gerhardt", "Bob", "Dieter", "Heinz", "Klaus") + + matrix.out = Matrix::sparseMatrix(i = c(), j = c(), x = 0, dims = c(length(authors.in), + length(authors.in)), repr = "T") + rownames(matrix.out) = authors.in + colnames(matrix.out) = authors.in + + matrix.out["Heinz", "Dieter"] = 1 + matrix.out["Dieter", "Heinz"] = 1 + + ## Act + result = get.expanded.adjacency(network = network.in, authors = authors.in) + + ## Assert + + expect_equal(matrix.out, result) + +}) + +test_that("getting a sparse adjacency matrix for a network, single edge, no matching author list", { + + ## Arrange + vertices = data.frame( + name = c("Heinz", "Dieter", "Klaus"), + kind = TYPE.AUTHOR, + type = TYPE.AUTHOR + ) + edges = data.frame( + from = "Heinz", + to = "Dieter" + ) + network.in = igraph::graph.data.frame(edges, directed = FALSE, vertices = vertices) + authors.in = c("Gerhardt", "Bob", "Dieter", "Heinz") + + matrix.out = Matrix::sparseMatrix(i = c(), j = c(), x = 0, dims = c(length(authors.in), + length(authors.in)), repr = "T") + rownames(matrix.out) = authors.in + 
colnames(matrix.out) = authors.in + + matrix.out["Heinz", "Dieter"] = 1 + matrix.out["Dieter", "Heinz"] = 1 + + ## Act + result = get.expanded.adjacency(network = network.in, authors = authors.in) + + ## Assert + + expect_equal(matrix.out, result) + +}) + +test_that("getting a sparse adjacency matrix for a network, single edge, no overlap in author list", { + + ## Arrange + vertices = data.frame( + name = c("Heinz", "Dieter", "Klaus"), + kind = TYPE.AUTHOR, + type = TYPE.AUTHOR + ) + edges = data.frame( + from = "Heinz", + to = "Dieter" + ) + network.in = igraph::graph.data.frame(edges, directed = FALSE, vertices = vertices) + authors.in = c("Gerhardt", "Bob") + + matrix.out = Matrix::sparseMatrix(i = c(), j = c(), x = 0, dims = c(length(authors.in), + length(authors.in)), repr = "T") + rownames(matrix.out) = authors.in + colnames(matrix.out) = authors.in + + ## Act + result = get.expanded.adjacency(network = network.in, authors = authors.in) + + ## Assert + + expect_equal(matrix.out, result) + +}) + +test_that("getting a sparse adjacency matrix for a network, two edges, more authors than network", { + + ## Arrange + vertices = data.frame( + name = c("Heinz", "Dieter", "Klaus"), + kind = TYPE.AUTHOR, + type = TYPE.AUTHOR + ) + edges = data.frame( + from = c("Heinz", "Dieter"), + to = c("Dieter", "Klaus") + ) + network.in = igraph::graph.data.frame(edges, directed = FALSE, vertices = vertices) + authors.in = c("Klaus", "Gerhardt", "Bob", "Dieter", "Heinz") + + matrix.out = Matrix::sparseMatrix(i = c(), j = c(), x = 0, dims = c(length(authors.in), + length(authors.in)), repr = "T") + rownames(matrix.out) = authors.in + colnames(matrix.out) = authors.in + + # order these statements so that the second arguments are ordered alphabetically + # or use the helper function as used below + matrix.out["Heinz", "Dieter"] = 1 + matrix.out["Klaus", "Dieter"] = 1 + matrix.out["Dieter", "Heinz"] = 1 + matrix.out["Dieter", "Klaus"] = 1 + + ## Act + result = 
get.expanded.adjacency(network = network.in, authors = authors.in) + + ## Assert + expect_equal(matrix.out, result) + +}) + +test_that("getting a sparse adjacency matrix for a network, three edges, more authors than network, weighted", { + + ## Arrange + vertices = data.frame( + name = c("Heinz", "Dieter", "Klaus"), + kind = TYPE.AUTHOR, + type = TYPE.AUTHOR + ) + edges = data.frame( + from = c("Heinz", "Dieter", "Dieter"), + to = c("Dieter", "Klaus", "Heinz"), + weight = c(1, 3, 4) + ) + network.in = igraph::graph.data.frame(edges, directed = FALSE, vertices = vertices) + authors.in = c("Klaus", "Gerhardt", "Bob", "Dieter", "Heinz") + + matrix.out = Matrix::sparseMatrix(i = c(), j = c(), x = 0, dims = c(length(authors.in), + length(authors.in)), repr = "T") + rownames(matrix.out) = authors.in + colnames(matrix.out) = authors.in + + # order these statements so that the second arguments are ordered alphabetically + # or use the helper function as used below + matrix.out["Heinz", "Dieter"] = 5 + matrix.out["Klaus", "Dieter"] = 3 + matrix.out["Dieter", "Heinz"] = 5 + matrix.out["Dieter", "Klaus"] = 3 + + ## Act + result = get.expanded.adjacency(network = network.in, authors = authors.in, weighted = TRUE) + + ## Assert + expect_equal(matrix.out, result) +}) + +test_that("getting a sparse adjacency matrix per network, zero networks", { + ## Act + result = get.expanded.adjacency.matrices(networks = list()) + + ## Assert + expect_equal(list(), result) +}) + +test_that("getting a sparse adjacency matrix per network, one network", { + + ## Arrange + vertices = data.frame( + name = c("Heinz", "Dieter", "Klaus"), + kind = TYPE.AUTHOR, + type = TYPE.AUTHOR + ) + edges = data.frame( + from = c("Heinz", "Dieter", "Dieter"), + to = c("Dieter", "Klaus", "Heinz") + ) + network.in = igraph::graph.data.frame(edges, directed = FALSE, vertices = vertices) + authors.in = sort(c("Heinz", "Dieter", "Klaus")) + + matrix.out = Matrix::sparseMatrix(i = c(), j = c(), x = 0, dims = 
c(length(authors.in), + length(authors.in)), repr = "T") + rownames(matrix.out) = authors.in + colnames(matrix.out) = authors.in + + # order these statements so that the second arguments are ordered alphabetically + # or use the helper function as used below + matrix.out["Heinz", "Dieter"] = 1 + matrix.out["Klaus", "Dieter"] = 1 + matrix.out["Dieter", "Heinz"] = 1 + matrix.out["Dieter", "Klaus"] = 1 + + ## Act + result = get.expanded.adjacency.matrices(networks = list(network.in)) + + ## Assert + expect_equal(list(matrix.out), result) +}) + +test_that("getting a sparse adjacency matrix per network, two networks", { + + ## Arrange + vertices = data.frame( + name = c("Heinz", "Dieter", "Klaus"), + kind = TYPE.AUTHOR, + type = TYPE.AUTHOR + ) + edges = data.frame( + from = c("Heinz", "Dieter", "Dieter"), + to = c("Dieter", "Klaus", "Heinz") + ) + network.in.one = igraph::graph.data.frame(edges, directed = FALSE, vertices = vertices) + + vertices = data.frame( + name = c("Klaus", "Tobias"), + kind = TYPE.AUTHOR, + type = TYPE.AUTHOR + ) + edges = data.frame( + from = c("Klaus"), + to = c("Tobias") + ) + network.in.two = igraph::graph.data.frame(edges, directed = FALSE, vertices = vertices) + + all.authors = sort(c("Heinz", "Dieter", "Klaus", "Tobias")) + + matrix.out.one = Matrix::sparseMatrix(i = c(), j = c(), x = 0, dims = c(length(all.authors), + length(all.authors)), repr = "T") + rownames(matrix.out.one) = all.authors + colnames(matrix.out.one) = all.authors + + # order these statements so that the second arguments are ordered alphabetically + # or use the helper function as used below + matrix.out.one["Heinz", "Dieter"] = 1 + matrix.out.one["Klaus", "Dieter"] = 1 + matrix.out.one["Dieter", "Heinz"] = 1 + matrix.out.one["Dieter", "Klaus"] = 1 + + matrix.out.two = Matrix::sparseMatrix(i = c(), j = c(), x = 0, dims = c(length(all.authors), + length(all.authors)), repr = "T") + rownames(matrix.out.two) = all.authors + colnames(matrix.out.two) = all.authors + + # 
order these statements so that the second arguments are ordered alphabetically + # or use the helper function as used below + matrix.out.two["Tobias", "Klaus"] = 1 + matrix.out.two["Klaus", "Tobias"] = 1 + + ## Act + result = get.expanded.adjacency.matrices(networks = list(network.in.one, network.in.two)) + + ## Assert + expect_equal(list(matrix.out.one, matrix.out.two), result) +}) + +test_that("getting cumulative sums of adjacency matrices generated from networks, two networks", { + + ## Arrange + vertices = data.frame( + name = c("Heinz", "Dieter", "Klaus"), + kind = TYPE.AUTHOR, + type = TYPE.AUTHOR + ) + edges = data.frame( + from = c("Heinz", "Dieter", "Dieter"), + to = c("Dieter", "Klaus", "Heinz") + ) + network.in.one = igraph::graph.data.frame(edges, directed = FALSE, vertices = vertices) + authors.in.one = sort(c("Heinz", "Dieter", "Klaus")) + + matrix.out.one = Matrix::sparseMatrix(i = c(), j = c(), x = 0, dims = c(length(authors.in.one), + length(authors.in.one)), repr = "T") + rownames(matrix.out.one) = authors.in.one + colnames(matrix.out.one) = authors.in.one + + matrix.out.one["Heinz", "Dieter"] = 1 + matrix.out.one["Klaus", "Dieter"] = 1 + matrix.out.one["Dieter", "Heinz"] = 1 + matrix.out.one["Dieter", "Klaus"] = 1 + + edges = data.frame( + from = c("Klaus"), + to = c("Dieter") + ) + network.in.two = igraph::graph.data.frame(edges, directed = FALSE, vertices = vertices) + authors.in.two = sort(c("Heinz", "Dieter", "Klaus")) + + matrix.out.two = Matrix::sparseMatrix(i = c(), j = c(), x = 0, dims = c(length(authors.in.two), + length(authors.in.two)), repr = "T") + rownames(matrix.out.two) = authors.in.two + colnames(matrix.out.two) = authors.in.two + + matrix.out.two["Heinz", "Dieter"] = 1 + matrix.out.two["Klaus", "Dieter"] = 1 + matrix.out.two["Dieter", "Heinz"] = 1 + matrix.out.two["Dieter", "Klaus"] = 1 + + ## Act + result = get.expanded.adjacency.cumulated(networks = list(network.in.one, network.in.two)) + + ## Assert + 
assert.sparse.matrices.equal(matrix.out.one, result[[1]]) + assert.sparse.matrices.equal(matrix.out.two, result[[2]]) +}) + +test_that("getting cumulative sums of adjacency matrices generated from networks, two networks, weighted", { + + ## Arrange + vertices = data.frame( + name = c("Heinz", "Dieter", "Klaus"), + kind = TYPE.AUTHOR, + type = TYPE.AUTHOR + ) + edges = data.frame( + from = c("Heinz", "Dieter", "Dieter"), + to = c("Dieter", "Klaus", "Heinz"), + weight = c(1, 2, 1) + ) + network.in.one = igraph::graph.data.frame(edges, directed = FALSE, vertices = vertices) + authors.in.one = sort(c("Heinz", "Dieter", "Klaus")) + + matrix.out.one = Matrix::sparseMatrix(i = c(), j = c(), x = 0, dims = c(length(authors.in.one), + length(authors.in.one)), repr = "T") + rownames(matrix.out.one) = authors.in.one + colnames(matrix.out.one) = authors.in.one + + matrix.out.one["Heinz", "Dieter"] = 2 + matrix.out.one["Klaus", "Dieter"] = 2 + matrix.out.one["Dieter", "Heinz"] = 2 + matrix.out.one["Dieter", "Klaus"] = 2 + + edges = data.frame( + from = c("Klaus"), + to = c("Dieter"), + weight = c(1) + ) + network.in.two = igraph::graph.data.frame(edges, directed = FALSE, vertices = vertices) + authors.in.two = sort(c("Heinz", "Dieter", "Klaus")) + + matrix.out.two = Matrix::sparseMatrix(i = c(), j = c(), x = 0, dims = c(length(authors.in.two), + length(authors.in.two)), repr = "T") + rownames(matrix.out.two) = authors.in.two + colnames(matrix.out.two) = authors.in.two + + matrix.out.two["Heinz", "Dieter"] = 2 + matrix.out.two["Klaus", "Dieter"] = 3 + matrix.out.two["Dieter", "Heinz"] = 2 + matrix.out.two["Dieter", "Klaus"] = 3 + + ## Act + result = get.expanded.adjacency.cumulated(networks = list(network.in.one, network.in.two), weighted = TRUE) + + ## Assert + assert.sparse.matrices.equal(matrix.out.one, result[[1]]) + assert.sparse.matrices.equal(matrix.out.two, result[[2]]) +}) + +test_that("getting cumulative sums of adjacency matrices generated from networks, + two 
networks, then convert to array", { + + ## Arrange + vertices = data.frame( + name = c("Heinz", "Dieter", "Klaus"), + kind = TYPE.AUTHOR, + type = TYPE.AUTHOR + ) + edges = data.frame( + from = c("Heinz", "Dieter", "Dieter"), + to = c("Dieter", "Klaus", "Heinz") + ) + network.in.one = igraph::graph.data.frame(edges, directed = FALSE, vertices = vertices) + authors.in.one = sort(c("Heinz", "Dieter", "Klaus")) + + edges = data.frame( + from = c("Klaus"), + to = c("Dieter") + ) + network.in.two = igraph::graph.data.frame(edges, directed = FALSE, vertices = vertices) + + expected.array = array(data = 0, dim = c(3, 3, 2)) + rownames(expected.array) = authors.in.one + colnames(expected.array) = authors.in.one + + expected.array[1, 2, 1] = 1 + expected.array[1, 3, 1] = 1 + expected.array[2, 1, 1] = 1 + expected.array[3, 1, 1] = 1 + + expected.array[1, 2, 2] = 1 + expected.array[1, 3, 2] = 1 + expected.array[2, 1, 2] = 1 + expected.array[3, 1, 2] = 1 + + ## Act + result.adjacency = get.expanded.adjacency.cumulated(networks = list(network.in.one, network.in.two)) + result.array = convert.adjacency.matrix.list.to.array(result.adjacency) + + ## Assert + expect_equal(expected.array, result.array) +}) + +test_that("getting cumulative sums of adjacency matrices generated from networks, + two networks, weighted, then convert to array", { + + ## Arrange + vertices = data.frame( + name = c("Heinz", "Dieter", "Klaus"), + kind = TYPE.AUTHOR, + type = TYPE.AUTHOR + ) + edges = data.frame( + from = c("Heinz", "Dieter", "Dieter"), + to = c("Dieter", "Klaus", "Heinz"), + weight = c(1, 2, 1) + ) + network.in.one = igraph::graph.data.frame(edges, directed = FALSE, vertices = vertices) + authors.in.one = sort(c("Heinz", "Dieter", "Klaus")) + + edges = data.frame( + from = c("Klaus"), + to = c("Dieter"), + weight = c(1) + ) + network.in.two = igraph::graph.data.frame(edges, directed = FALSE, vertices = vertices) + authors.in.two = sort(c("Heinz", "Dieter", "Klaus")) + + expected.array = 
array(data = 0, dim = c(3, 3, 2)) + rownames(expected.array) = authors.in.one + colnames(expected.array) = authors.in.one + + expected.array[1, 2, 1] = 2 + expected.array[1, 3, 1] = 2 + expected.array[2, 1, 1] = 2 + expected.array[3, 1, 1] = 2 + + expected.array[1, 2, 2] = 2 + expected.array[1, 3, 2] = 3 + expected.array[2, 1, 2] = 2 + expected.array[3, 1, 2] = 3 + + ## Act + result.adjacency = get.expanded.adjacency.cumulated(networks = list(network.in.one, network.in.two), weighted = TRUE) + result.array = convert.adjacency.matrix.list.to.array(result.adjacency) + + ## Assert + expect_equal(expected.array, result.array) +}) diff --git a/tests/test-networks-multi-relation.R b/tests/test-networks-multi-relation.R index c724d155f..5b0d3b42d 100644 --- a/tests/test-networks-multi-relation.R +++ b/tests/test-networks-multi-relation.R @@ -20,6 +20,7 @@ ## Copyright 2021 by Johannes Hostert ## Copyright 2022 by Jonathan Baumann ## Copyright 2023 by Maximilian Löffler +## Copyright 2024 by Leo Sendelbach ## All Rights Reserved. 
@@ -224,8 +225,8 @@ test_that("Construction of the multi network for the feature artifact with autho ## 1) construct expected vertices vertices = data.frame( name = c("Björn", "Olaf", "Karl", "Thomas", "udo", "Fritz fritz@example.org", "georg", "Hans", - "Base_Feature", "foo", "A", "", "", "", "", - "", "", ""), + "Base_Feature", "foo", "A", "", "", "", + "", "", "", ""), kind = c(rep(TYPE.AUTHOR, 8), rep("Feature", 3), rep("Issue", 7)), type = c(rep(TYPE.AUTHOR, 8), rep(TYPE.ARTIFACT, 10)) ) @@ -312,7 +313,7 @@ test_that("Construction of the multi network for the feature artifact with autho network.expected = igraph::graph.data.frame(edges, vertices = vertices, directed = net.conf$get.value("author.directed")) - compare.networks(network.expected, network.built) + assert.networks.equal(network.expected, network.built) }) test_that("Construction of the multi-artifact bipartite network with artifact relations 'cochange' and 'issue'", { @@ -407,7 +408,7 @@ test_that("Construction of the multi-artifact bipartite network with artifact re net.expected = igraph::graph.data.frame(edges, directed = FALSE, vertices = vertices) - compare.networks(net.expected, net.combined) + assert.networks.equal(net.expected, net.combined) }) @@ -494,7 +495,7 @@ test_that("Construction of the multi-artifact bipartite network with artifact re net.expected = igraph::graph.data.frame(edges, directed = FALSE, vertices = vertices) - compare.networks(net.expected, net.combined) + assert.networks.equal(net.expected, net.combined) }) @@ -596,7 +597,7 @@ test_that("Construction of the multi-artifact bipartite network with artifact re net.expected = igraph::graph.data.frame(edges, directed = FALSE, vertices = vertices) - compare.networks(net.expected, net.combined) + assert.networks.equal(net.expected, net.combined) }) @@ -716,6 +717,6 @@ test_that("Construction of the multi-artifact bipartite network with artifact re net.expected = igraph::graph.data.frame(edges, directed = FALSE, vertices = 
vertices) - compare.networks(net.expected, net.combined) + assert.networks.equal(net.expected, net.combined) }) diff --git a/tests/test-networks-multi.R b/tests/test-networks-multi.R index bbc93894d..70f26f631 100644 --- a/tests/test-networks-multi.R +++ b/tests/test-networks-multi.R @@ -16,6 +16,7 @@ ## Copyright 2018 by Barbara Eckl ## Copyright 2022 by Jonathan Baumann ## Copyright 2023 by Maximilian Löffler +## Copyright 2024 by Leo Sendelbach ## All Rights Reserved. @@ -92,6 +93,6 @@ test_that("Construction of the multi network for the feature artifact with autho network.expected = igraph::graph.data.frame(edges, directed = FALSE, vertices = vertices) - compare.networks(network.expected, network.built) + assert.networks.equal(network.expected, network.built) }) diff --git a/tests/test-networks.R b/tests/test-networks.R index 24a1a0988..62e117be4 100644 --- a/tests/test-networks.R +++ b/tests/test-networks.R @@ -13,6 +13,7 @@ ## ## Copyright 2018-2019 by Claus Hunsen ## Copyright 2021 by Niklas Schneider +## Copyright 2024 by Maximilian Löffler ## All Rights Reserved. 
@@ -78,6 +79,198 @@ test_that("Simplify network with more than one relation", { }) +test_that("Simplify basic multi-relational network", { + + ## + ## Simplify networks with vertices connected by multi-relational edges + ## + + ## create artifact network with vertices connected by "cochange" and "mail" edges + network = + igraph::make_empty_graph(n = 0, directed = FALSE) + + igraph::vertices("A", "B", type = TYPE.ARTIFACT, kind = "feature") + for (i in 1:3) { + network = igraph::add.edges(network, c("A", "B"), type = TYPE.EDGES.INTRA, relation = "mail") + network = igraph::add.edges(network, c("A", "B"), type = TYPE.EDGES.INTRA, relation = "cochange") + } + + network.expected = igraph::make_empty_graph(n = 0, directed = FALSE) + + igraph::vertices("A", "B", type = TYPE.ARTIFACT, kind = "feature") + + igraph::edges("A", "B", type = TYPE.EDGES.INTRA, relation = "mail") + + igraph::edges("A", "B", type = TYPE.EDGES.INTRA, relation = "cochange") + + ## simplify network without simplifying multiple relations into single edges + network.simplified = simplify.network(network, simplify.multiple.relations = FALSE) + assert.networks.equal(network.simplified, network.expected) + + ## simplify network with simplifying multiple relations into single edges + network.simplified = simplify.network(network, simplify.multiple.relations = TRUE) + expect_identical(igraph::ecount(network.simplified), 1) + expect_identical(igraph::E(network.simplified)$type[[1]], "Unipartite") + expect_identical(igraph::E(network.simplified)$relation[[1]], c("cochange", "mail")) +}) + +test_that("Simplify author-network with relation = c('cochange', 'mail') using both algorithms", { + + ## configurations + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.conf$update.value("commits.filter.base.artifact", FALSE) + net.conf = NetworkConf$new() + net.conf$update.values(updated.values = list(author.relation = c("cochange", "mail"), simplify = TRUE)) + + ## construct 
objects + proj.data = ProjectData$new(project.conf = proj.conf) + network.builder = NetworkBuilder$new(project.data = proj.data, network.conf = net.conf) + + ## vertex attributes + authors = data.frame(name = c("Björn", "Olaf", "Karl", "Thomas", "udo", "Fritz fritz@example.org", "georg", "Hans"), + kind = TYPE.AUTHOR, + type = TYPE.AUTHOR) + + + ## ---------------------- simplify.multiple.relations == FALSE -------------------------- ## + + ## edge attributes + data = data.frame(comb.1. = c("Björn", "Olaf", "Olaf", "Karl", # cochange + "Björn", "Olaf"), # mail + comb.2. = c("Olaf", "Karl", "Thomas", "Thomas", # cochange + "Olaf", "Thomas")) # mail + data$date = list(get.date.from.string(c("2016-07-12 15:58:59", "2016-07-12 16:00:45")), + get.date.from.string(c("2016-07-12 16:05:41", "2016-07-12 16:06:10")), + get.date.from.string(c("2016-07-12 16:05:41", "2016-07-12 16:06:32")), + get.date.from.string(c("2016-07-12 16:06:10", "2016-07-12 16:06:32")), # cochange + get.date.from.string(c("2016-07-12 15:58:40", "2016-07-12 15:58:50")), + get.date.from.string(c("2016-07-12 16:04:40", "2016-07-12 16:05:37"))) # mail + data$artifact.type = list(c("Feature", "Feature"), c("Feature", "Feature"), + c("Feature", "Feature"), c("Feature", "Feature"), # cochange + c("Mail", "Mail"), c("Mail", "Mail")) # mail + data$hash = list(c("72c8dd25d3dd6d18f46e2b26a5f5b1e2e8dc28d0", "5a5ec9675e98187e1e92561e1888aa6f04faa338"), + c("3a0ed78458b3976243db6829f63eba3eead26774", "1143db502761379c2bfcecc2007fc34282e7ee61"), + c("3a0ed78458b3976243db6829f63eba3eead26774", "0a1a5c523d835459c42f33e863623138555e2526"), + c("1143db502761379c2bfcecc2007fc34282e7ee61", "0a1a5c523d835459c42f33e863623138555e2526"), + as.character(c(NA, NA)), as.character(c(NA, NA))) + data$file = list(c("test.c", "test.c"), c("test2.c", "test3.c"), c("test2.c", "test2.c"), c("test3.c", "test2.c"), + as.character(c(NA, NA)), as.character(c(NA, NA))) + data$artifact = list(c("A", "A"), c("Base_Feature", "Base_Feature"), 
c("Base_Feature", "Base_Feature"), + c("Base_Feature", "Base_Feature"), as.character(c(NA, NA)), as.character(c(NA, NA))) + data$weight = rep(2, 6) + data$type = rep(TYPE.EDGES.INTRA, 6) + data$relation = c(rep("cochange", 4), rep("mail", 2)) + data$message.id = list(as.character(c(NA, NA)), as.character(c(NA, NA)), as.character(c(NA, NA)), as.character(c(NA, NA)), + c("<4cbaa9ef0802201124v37f1eec8g89a412dfbfc8383a@mail.gmail.com>", + "<6784529b0802032245r5164f984l342f0f0dc94aa420@mail.gmail.com>"), + c("<65a1sf31sagd684dfv31@mail.gmail.com>", + "<9b06e8d20801220234h659c18a3g95c12ac38248c7e0@mail.gmail.com>")) + data$thread = list(as.character(c(NA, NA)), as.character(c(NA, NA)), as.character(c(NA, NA)), as.character(c(NA, NA)), + c("", ""), c("", "")) + + ## build expected network + network.expected = igraph::graph.data.frame(data, vertices = authors, + directed = net.conf$get.value("author.directed")) + + ## build simplified network + network.built = network.builder$get.author.network() + + assert.networks.equal(network.built, network.expected) + + + ## ---------------------- simplify.multiple.relations == TRUE --------------------------- ## + + data = data.frame(comb.1. = c("Björn", "Olaf", "Olaf", "Karl"), + comb.2. 
= c("Olaf", "Karl", "Thomas", "Thomas")) + + data$date = list(get.date.from.string(c("2016-07-12 15:58:59", "2016-07-12 16:00:45", # cochange + "2016-07-12 15:58:40", "2016-07-12 15:58:50")), # mail + get.date.from.string(c("2016-07-12 16:05:41", "2016-07-12 16:06:10")), # cochange + get.date.from.string(c("2016-07-12 16:05:41", "2016-07-12 16:06:32", # cochange + "2016-07-12 16:04:40", "2016-07-12 16:05:37")), # mail + get.date.from.string(c("2016-07-12 16:06:10", "2016-07-12 16:06:32"))) # cochange + data$artifact.type = list(c("Feature", "Feature", "Mail", "Mail"), + c("Feature", "Feature"), + c("Feature", "Feature", "Mail", "Mail"), + c("Feature", "Feature")) + data$hash = list(as.character(c("72c8dd25d3dd6d18f46e2b26a5f5b1e2e8dc28d0", "5a5ec9675e98187e1e92561e1888aa6f04faa338", NA, NA)), + c("3a0ed78458b3976243db6829f63eba3eead26774", "1143db502761379c2bfcecc2007fc34282e7ee61"), + as.character(c("3a0ed78458b3976243db6829f63eba3eead26774", "0a1a5c523d835459c42f33e863623138555e2526", NA, NA)), + c("1143db502761379c2bfcecc2007fc34282e7ee61", "0a1a5c523d835459c42f33e863623138555e2526")) + data$file = list(as.character(c("test.c", "test.c", NA, NA)), c("test2.c", "test3.c"), + as.character(c("test2.c", "test2.c", NA, NA)), c("test3.c", "test2.c")) + data$artifact = list(as.character(c("A", "A", NA, NA)), c("Base_Feature", "Base_Feature"), + as.character(c("Base_Feature", "Base_Feature", NA, NA)), c("Base_Feature", "Base_Feature")) + data$weight = c(4, 2, 4, 2) + data$type = rep(TYPE.EDGES.INTRA, 4) + data$relation = list(c("cochange", "mail"), c("cochange"), c("cochange", "mail"), c("cochange")) + data$message.id = list(as.character(c(NA, NA, "<4cbaa9ef0802201124v37f1eec8g89a412dfbfc8383a@mail.gmail.com>", + "<6784529b0802032245r5164f984l342f0f0dc94aa420@mail.gmail.com>")), + as.character(c(NA, NA)), + as.character(c(NA, NA, "<65a1sf31sagd684dfv31@mail.gmail.com>", + "<9b06e8d20801220234h659c18a3g95c12ac38248c7e0@mail.gmail.com>")), + as.character(c(NA, NA))) + 
data$thread = list(as.character(c(NA, NA, "", "")), + as.character(c(NA, NA)), + as.character(c(NA, NA, "", "")), + as.character(c(NA, NA))) + + ## build expected network + network.expected = igraph::graph.data.frame(data, vertices = authors, + directed = net.conf$get.value("author.directed")) + + ## build simplified network + network.builder$update.network.conf(updated.values = list(simplify.multiple.relations = TRUE)) + network.built = network.builder$get.author.network() + + assert.networks.equal(network.built, network.expected) + +}) + +test_that("Simplify multiple basic multi-relational networks", { + + ## + ## Simplify networks with vertices connected by multi-relational edges + ## + + ## create artifact network with vertices connected by "cochange" and "mail edges" + network.A = + igraph::make_empty_graph(n = 0, directed = FALSE) + + igraph::vertices("A", "B", type = TYPE.ARTIFACT, kind = "feature") + network.B = + igraph::make_empty_graph(n = 0, directed = FALSE) + + igraph::vertices("C", "D", type = TYPE.AUTHOR, kind = TYPE.AUTHOR) + for (i in 1:3) { + network.A = igraph::add.edges(network.A, c("A", "B"), type = TYPE.EDGES.INTRA, relation = "mail") + network.A = igraph::add.edges(network.A, c("A", "B"), type = TYPE.EDGES.INTRA, relation = "cochange") + network.B = igraph::add.edges(network.B, c("C", "D"), type = TYPE.EDGES.INTRA, relation = "mail") + network.B = igraph::add.edges(network.B, c("C", "D"), type = TYPE.EDGES.INTRA, relation = "cochange") + } + + network.A.expected = igraph::make_empty_graph(n = 0, directed = FALSE) + + igraph::vertices("A", "B", type = TYPE.ARTIFACT, kind = "feature") + + igraph::edges("A", "B", type = TYPE.EDGES.INTRA, relation = "mail") + + igraph::edges("A", "B", type = TYPE.EDGES.INTRA, relation = "cochange") + network.B.expected = igraph::make_empty_graph(n = 0, directed = FALSE) + + igraph::vertices("C", "D", type = TYPE.AUTHOR, kind = TYPE.AUTHOR) + + igraph::edges("C", "D", type = TYPE.EDGES.INTRA, relation = "mail") + 
+ igraph::edges("C", "D", type = TYPE.EDGES.INTRA, relation = "cochange") + networks = list(A = network.A, B = network.B) + + ## simplify networks without simplifying multiple relations into single edges + networks.simplified = simplify.networks(networks, simplify.multiple.relations = FALSE) + expect_true(length(networks.simplified) == 2) + expect_identical(names(networks.simplified), names(networks)) + assert.networks.equal(networks.simplified[["A"]], network.A.expected) + assert.networks.equal(networks.simplified[["B"]], network.B.expected) + + ## simplify networks with simplifying multiple relations into single edges + networks.simplified = simplify.networks(networks, simplify.multiple.relations = TRUE) + expect_true(length(networks.simplified) == 2) + expect_identical(names(networks.simplified), names(networks)) + for (i in 1:2) { + expect_identical(igraph::ecount(networks.simplified[[i]]), 1) + expect_identical(igraph::E(networks.simplified[[i]])$type[[1]], "Unipartite") + expect_identical(igraph::E(networks.simplified[[i]])$relation[[1]], c("cochange", "mail")) + } +}) + ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / ## Merge ------------------------------------------------------------------- @@ -727,3 +920,24 @@ test_that("Get the data sources from a network with one relation", { expect_identical(expected.data.sources, get.data.sources.from.relations(network), info = "data sources: mails") }) + +test_that("Get the data sources from a network with multiple relations on a single edge", { + expected.data.sources = c("commits", "mails") + + ## configurations + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + proj.conf$update.value("commits.filter.base.artifact", FALSE) + ## construct data object + proj.data = ProjectData$new(project.conf = proj.conf) + + ## construct network builder + net.conf = NetworkConf$new() + network.builder = NetworkBuilder$new(project.data = proj.data, network.conf = 
net.conf) + network.builder$update.network.conf(updated.values = list(author.relation = c("mail", "cochange"))) + + ## build network + network = network.builder$get.author.network() + network = simplify.network(network, simplify.multiple.relations = TRUE) + + expect_identical(expected.data.sources, get.data.sources.from.relations(network), info = "data sources: commits, mails") +}) diff --git a/tests/test-read.R b/tests/test-read.R index 9a597f230..db3645d4d 100644 --- a/tests/test-read.R +++ b/tests/test-read.R @@ -21,7 +21,7 @@ ## Copyright 2021 by Johannes Hostert ## Copyright 2021 by Mirabdulla Yusifli ## Copyright 2022 by Jonathan Baumann -## Copyright 2022-2023 by Maximilian Löffler +## Copyright 2022-2024 by Maximilian Löffler ## All Rights Reserved. @@ -351,107 +351,110 @@ test_that("Read and parse the issue data.", { issue.data.read.github = read.issues(proj.conf$get.value("datapath.issues"), proj.conf$get.value("issues.from.source")) ## build the expected data.frame - issue.data.expected = data.frame(issue.id = c(rep("", 13), rep("", 6), - rep("", 7), rep("", 10), - rep("", 6), rep("", 4), rep("", 3)), - issue.title = c(rep("[ZEPPELIN-328] Interpreter page should clarify the % magic syntax for interpreter group.name", 13), - rep("[ZEPPELIN-332] CNFE when running SQL query against Cassandra temp table", 6), - rep("Error in construct.networks.from.list for openssl function networks", 7), - rep("Distinguish directedness of networks and edge-construction algorithm", 10), + issue.data.expected = data.frame(issue.id = c(rep("", 15), rep("", 8), + rep("", 9), rep("", 11), + rep("", 6), rep("", 5), rep("", 3)), + issue.title = c(rep("[ZEPPELIN-328] Interpreter page should clarify the % magic syntax for interpreter group.name", 15), + rep("[ZEPPELIN-332] CNFE when running SQL query against Cassandra temp table", 8), + rep("Error in construct.networks.from.list for openssl function networks", 9), + rep("Distinguish directedness of networks and edge-construction 
algorithm", 11), rep("Example pull request 1", 6), - rep("Example pull request 2", 4), + rep("Example pull request 2", 5), rep("Example pull request 4", 3)), - issue.type = I(c(rep(list(list("issue" , "bug")), 13), rep(list(list("issue" , "bug")), 6), - rep(list(list("issue" , "bug")), 7), rep(list(list("issue", "bug", "enhancement")), 10), - rep(list(list("pull request")), 6), rep(list(list("pull request")), 4), rep(list(list("pull request", "enhancement")), 3))), - issue.state = c(rep("closed", 13), rep("open", 6), rep("closed", 7), rep("open", 10), - rep("reopened", 6), rep("closed", 4), rep("open", 3)), - issue.resolution = I(c(rep(list(list("fixed")), 13), rep(list(list("unresolved")), 6), - rep(list(list()), 7), rep(list(list()), 10), - rep(list(list()), 6), rep(list(list()), 4), rep(list(list()), 3))), - creation.date = get.date.from.string(c(rep("2013-04-21 23:52:09", 13), - rep("2016-07-12 16:01:30", 6), - rep("2016-07-12 15:59:25", 7), - rep("2016-07-12 14:30:13", 10), + issue.type = I(c(rep(list(list("issue" , "bug")), 15), rep(list(list("issue" , "bug")), 8), + rep(list(list("issue" , "bug")), 9), rep(list(list("issue", "bug", "enhancement")), 11), + rep(list(list("pull request")), 6), rep(list(list("pull request")), 5), rep(list(list("pull request", "enhancement")), 3))), + issue.state = c(rep("closed", 15), rep("open", 8), rep("closed", 9), rep("open", 11), + rep("reopened", 6), rep("closed", 5), rep("open", 3)), + issue.resolution = I(c(rep(list(list("fixed")), 15), rep(list(list("unresolved")), 8), + rep(list(list()), 9), rep(list(list()), 11), + rep(list(list()), 6), rep(list(list()), 5), rep(list(list()), 3))), + creation.date = get.date.from.string(c(rep("2013-04-21 23:52:09", 15), + rep("2016-07-12 16:01:30", 8), + rep("2016-07-12 15:59:25", 9), + rep("2016-07-12 14:30:13", 11), rep("2016-07-14 13:37:00", 6), - rep("2016-07-12 14:59:25", 4), + rep("2016-07-12 14:59:25", 5), rep("2016-07-12 16:02:02", 3))), - closing.date = 
get.date.from.string(c(rep("2013-05-25 20:02:08", 13), rep(NA, 6), - rep("2016-07-12 16:06:30", 7), rep(NA, 10), + closing.date = get.date.from.string(c(rep("2013-05-25 20:02:08", 15), rep(NA, 8), + rep("2016-07-12 16:06:30", 9), rep(NA, 11), rep(NA, 6), - rep("2016-07-12 16:04:59", 4), + rep("2016-07-12 16:04:59", 5), rep(NA, 3))), - issue.components = I(c(rep(list(list("GUI" , "Interpreters")), 13), rep(list(list("Interpreters")), 6), - rep(list(list()), 7), rep(list(list()), 10), - rep(list(list()), 6), rep(list(list()), 4), rep(list(list()), 3))), + issue.components = I(c(rep(list(list("GUI" , "Interpreters")), 15), rep(list(list("Interpreters")), 8), + rep(list(list()), 9), rep(list(list()), 11), + rep(list(list()), 6), rep(list(list()), 5), rep(list(list()), 3))), event.name = c("created", "commented", "commented", "commented", "commented", "commented", "commented", "commented", "commented", "commented", "commented", "commented", - "resolution_updated", "created", "commented", "commented", "commented", "commented", - "commented", "created", "assigned", "commented", "state_updated", "add_link", - "referenced", "referenced", "mentioned", "subscribed", "commented", "mentioned", - "subscribed", "add_link", "mentioned", "subscribed", "labeled", "commented", - "created", "commented", "state_updated", "commented", "commented", "state_updated", - "created", "commented", "merged", "state_updated", - "commit_added", "created", "commented"), - author.name = c("Thomas", "Thomas", "Björn", "Björn", "Björn", "Björn", "Olaf", "Björn", - "Björn", "Olaf", "Olaf", "Olaf", "Björn", "Björn", "Björn", "Björn", "Max", - "Max", "Max", "Karl", "Olaf", "Karl", "Olaf", "Karl", "Karl", "Thomas", "udo", - "udo", "Thomas", "Björn", "Björn", "Thomas", "Björn", "Björn", "Olaf", "Björn", - "Thomas", "Thomas", "Thomas", "Olaf", "Björn", "Olaf", - "Björn", "Björn", "Olaf", "Olaf", "Björn", "Olaf", "Olaf"), + "resolution_updated", "referenced_by", "add_link", "referenced_by", "add_link", 
"created", + "commented", "commented", "commented", "commented", "commented", "created", "assigned", "commented", + "state_updated", "add_link", "referenced", "referenced", "add_link", "add_link", "mentioned", "subscribed", + "commented", "mentioned", "subscribed", "add_link", "mentioned", "subscribed", "labeled", "commented", + "referenced_by", "created", "commented", "state_updated", "commented", "commented", "state_updated", + "created", "commented", "merged", "state_updated", "referenced_by", "commit_added", "created", "commented"), + author.name = c("Thomas", "Thomas", "Björn", "Björn", "Björn", "Björn", "Olaf", "Björn", "Björn", "Olaf", "Olaf", "Olaf", + "Björn", "Thomas", "Thomas", "Thomas", "Thomas", "Björn", "Björn", "Björn", "Max", "Max", "Max", "Karl", + "Olaf", "Karl", "Olaf", "Karl", "Karl", "Thomas", "Karl", "Thomas", "udo", "udo", "Thomas", "Björn", "Björn", + "Thomas", "Björn", "Björn", "Olaf", "Björn", "Karl", "Thomas", "Thomas", "Thomas", "Olaf", "Björn", "Olaf", + "Björn", "Björn", "Olaf", "Olaf", "Thomas", "Björn", "Olaf", "Olaf"), author.email = c("thomas@example.org", "thomas@example.org", "bjoern@example.org", "bjoern@example.org", "bjoern@example.org", "bjoern@example.org", "olaf@example.org", "bjoern@example.org", "bjoern@example.org", "olaf@example.org", "olaf@example.org", "olaf@example.org", - "bjoern@example.org", "bjoern@example.org", "bjoern@example.org", - "bjoern@example.org", "max@example.org", "max@example.org", + "bjoern@example.org", "thomas@example.org", "thomas@example.org", + "thomas@example.org", "thomas@example.org", "bjoern@example.org", + "bjoern@example.org", "bjoern@example.org", "max@example.org", "max@example.org", "max@example.org", "karl@example.org", "olaf@example.org", - "karl@example.org", "olaf@example.org", "karl@example.org", - "karl@example.org", "thomas@example.org", "udo@example.org", + "karl@example.org", "olaf@example.org", "karl@example.org", "karl@example.org", + "thomas@example.org", 
"karl@example.org", "thomas@example.org", "udo@example.org", "udo@example.org", "thomas@example.org", "bjoern@example.org", - "bjoern@example.org", "thomas@example.org", "bjoern@example.org", - "bjoern@example.org", "olaf@example.org", "bjoern@example.org", + "bjoern@example.org", "thomas@example.org", "bjoern@example.org", "bjoern@example.org", + "olaf@example.org", "bjoern@example.org", "karl@example.org", "thomas@example.org", "thomas@example.org", "thomas@example.org", "olaf@example.org", "bjoern@example.org", "olaf@example.org", "bjoern@example.org", "bjoern@example.org", "olaf@example.org", - "olaf@example.org", "bjoern@example.org", "olaf@example.org", - "olaf@example.org"), + "olaf@example.org", "thomas@example.org", "bjoern@example.org", + "olaf@example.org", "olaf@example.org"), date = get.date.from.string(c("2013-04-21 23:52:09", "2013-04-21 23:52:09", "2013-05-05 21:46:30", "2013-05-05 21:49:21", "2013-05-05 21:49:34", "2013-05-06 01:04:34", "2013-05-25 03:25:06", "2013-05-25 03:48:41", "2013-05-25 04:08:07", "2013-05-25 06:06:53", "2013-05-25 06:22:23", "2013-06-01 06:50:26", - "2013-06-01 06:53:06", "2016-07-12 16:01:30", + "2013-06-01 06:53:06", "2017-05-21 12:00:00", + "2017-05-21 12:00:00", "2017-05-21 12:00:00", + "2017-05-21 12:00:00", "2016-07-12 16:01:30", "2016-07-12 16:02:30", "2016-07-15 19:55:39", "2016-07-15 20:07:47", "2016-07-27 20:12:08", "2016-07-28 06:27:52", "2016-07-12 15:59:25", "2016-07-12 15:59:25", "2016-07-12 15:59:59", "2016-07-12 16:06:30", "2016-08-07 15:37:02", "2016-08-31 16:45:09", "2016-10-05 16:45:09", + "2016-08-07 15:37:02", "2016-08-07 15:30:00", "2016-07-12 15:30:02", "2016-07-12 15:30:02", "2016-07-12 16:03:59", "2016-08-31 15:30:02", "2016-10-05 15:30:02", "2016-10-13 15:30:02", "2016-12-07 15:30:02", "2016-12-07 15:30:02", "2017-05-23 12:31:34", "2017-05-23 12:32:39", - "2016-07-12 15:59:25", "2016-07-12 15:59:25", - "2016-07-12 15:59:59", "2016-07-12 16:01:01", - "2016-07-12 16:06:01", "2016-07-14 13:37:00", - 
"2016-07-12 14:59:25", "2016-07-12 14:59:25", - "2016-07-12 16:04:59", "2016-07-12 16:04:59", + "2016-08-07 15:37:02", "2016-07-12 15:59:25", + "2016-07-12 15:59:25", "2016-07-12 15:59:59", + "2016-07-12 16:01:01", "2016-07-12 16:06:01", + "2016-07-14 13:37:00", "2016-07-12 14:59:25", + "2016-07-12 14:59:25", "2016-07-12 16:04:59", + "2016-07-12 16:04:59", "2016-08-07 15:30:00", "2016-07-12 16:02:02", "2016-07-12 16:02:02", "2016-07-12 16:02:02")), event.info.1 = c("open", "open", "open", "open", "open", "open", "open", "open", "open", - "open", "open", "open", "fixed", "open", "open", "open", "open", "open", - "open", "open", "", "open", "closed", "930af63a030fb92e48eddff01f53284c3eeba80e", - "", "", "Thomas", "Thomas", "open", "Thomas", "Thomas", "fb52357f05958007b867da06f4077abdc04fa0d8", - "udo", "udo", "decided", "open", - "open", "open", "closed", "closed", "closed", "open", - "open", "open", "", "closed", - "72c8dd25d3dd6d18f46e2b26a5f5b1e2e8dc28d0", "open", "open"), + "open", "open", "open", "fixed", "", "", + "", "", "open", "open", "open", "open", "open", "open", "open", + "", "open", "closed", "930af63a030fb92e48eddff01f53284c3eeba80e", "", "", "", + "", "Thomas", "Thomas", "open", "Thomas", "Thomas", "fb52357f05958007b867da06f4077abdc04fa0d8", + "udo", "udo", "decided", "open", "", "open", "open", "closed", "closed", "closed", "open", + "open", "open", "", "closed", "", + "72c8dd25d3dd6d18f46e2b26a5f5b1e2e8dc28d0", "open", "open"), event.info.2 = NA, # is assigned later event.id = NA, # is assigned later - issue.source = c(rep("jira", 19), rep("github", 17), rep("github", 13)), + issue.source = c(rep("jira", 23), rep("github", 20), rep("github", 14)), artifact.type = "IssueEvent" ) @@ -459,13 +462,12 @@ test_that("Read and parse the issue data.", { list("unresolved"), list("unresolved"), list("unresolved"), list("unresolved"), list("unresolved"), list("unresolved"), list("unresolved"), list("unresolved"), list("unresolved"), list("unresolved"), 
list("unresolved"), list("unresolved"), - "unresolved", list("unresolved"), list("unresolved"), list("unresolved"), - list("unresolved"), list("unresolved"), list("unresolved"), list(), "", list(), "open", - "commit", "", "", "thomas@example.org", "thomas@example.org", list(), - "thomas@example.org", "thomas@example.org", "commit", "udo@example.org", - "udo@example.org", "", list(), - list(), list(), "open", list(), list(), "closed", - list(), list(), "", "open", + "unresolved", "issue", "issue", "issue", "issue", list("unresolved"), list("unresolved"), + list("unresolved"), list("unresolved"), list("unresolved"), list("unresolved"), list(), "", list(), + "open", "commit", "", "", "issue", "issue", "thomas@example.org", "thomas@example.org", list(), + "thomas@example.org", "thomas@example.org", "commit", "udo@example.org", "udo@example.org", + "", list(), "issue", list(), list(), "open", list(), list(), "closed", + list(), list(), "", "open", "issue", "2016-07-12 15:58:59", list(), list() )) diff --git a/tests/test-split-data-activity-based.R b/tests/test-split-data-activity-based.R index c654446f0..f0c2812cf 100644 --- a/tests/test-split-data-activity-based.R +++ b/tests/test-split-data-activity-based.R @@ -19,6 +19,7 @@ ## Copyright 2021 by Niklas Schneider ## Copyright 2021 by Johannes Hostert ## Copyright 2022 by Jonathan Baumann +## Copyright 2023-2024 by Maximilian Löffler ## All Rights Reserved. 
context("Splitting functionality, activity-based splitting of data.") @@ -86,14 +87,16 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity info = "Splitting must not modify the original ProjectConf.") ## test that the config contains the correct splitting information + revisions = c("2016-07-12 15:58:59", "2016-07-12 16:06:10", "2016-07-12 16:06:32", "2016-07-12 16:06:33") expected.config = list( split.type = "activity-based", split.length = 3, split.basis = "commits", split.sliding.window = FALSE, - split.revisions = c("2016-07-12 15:58:59", "2016-07-12 16:06:10", "2016-07-12 16:06:32", "2016-07-12 16:06:33"), - split.revision.dates = NULL + split.revisions = revisions, + split.revisions.dates = get.date.from.string(revisions) ) + lapply(results, function(res) { actual = lapply(names(expected.config), res$get.project.conf()$get.value) names(actual) = names(expected.config) @@ -113,8 +116,8 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$commit.messages ), issues = list( - "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$issues[rownames(data$issues) %in% c(14:15, 20:22, 29, 37:41, 45:49), ], - "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$issues[rownames(data$issues) == 23, ], + "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$issues[rownames(data$issues) %in% c(18:19, 24:26, 35, 44:48, 52:53, 55:57), ], + "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$issues[rownames(data$issues) == 27, ], "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$issues[0, ] ), mails = list( @@ -169,13 +172,14 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity info = "Splitting must not modify the original ProjectConf.") ## test that the config contains the correct splitting information + revisions = c("2016-07-12 15:58:59", "2016-07-12 16:06:33") expected.config = list( split.type = "activity-based", split.length = 18, split.basis = 
"commits", split.sliding.window = FALSE, - split.revisions = c("2016-07-12 15:58:59", "2016-07-12 16:06:33"), - split.revision.dates = NULL + split.revisions = revisions, + split.revisions.dates = get.date.from.string(revisions) ) lapply(results, function(res) { actual = lapply(names(expected.config), res$get.project.conf()$get.value) @@ -192,7 +196,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$commit.messages ), issues = list( - "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$issues[rownames(data$issues) %in% c(14:15, 20:23, 29, 37:41, 45:49), ] + "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$issues[rownames(data$issues) %in% c(18:19, 24:27, 35, 44:48, 52:53, 55:57), ] ), mails = list( "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$mails[15:16, ] # when pasta is not configured: rownames(data$mails) %in% 16:17 @@ -268,15 +272,16 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity info = "Splitting must not modify the original ProjectConf.") ## test that the config contains the correct splitting information + revisions = c("2004-10-09 18:38:13", "2010-07-12 11:05:35", "2010-07-12 12:05:41", + "2010-07-12 12:05:44" ,"2016-07-12 15:58:40", "2016-07-12 16:05:37", + "2016-07-12 16:05:38") expected.config = list( split.type = "activity-based", split.length = 3, split.basis = "mails", split.sliding.window = FALSE, - split.revisions = c("2004-10-09 18:38:13", "2010-07-12 11:05:35", "2010-07-12 12:05:41", - "2010-07-12 12:05:44" ,"2016-07-12 15:58:40", "2016-07-12 16:05:37", - "2016-07-12 16:05:38"), - split.revision.dates = NULL + split.revisions = revisions, + split.revisions.dates = get.date.from.string(revisions) ) lapply(results, function(res) { actual = lapply(names(expected.config), res$get.project.conf()$get.value) @@ -306,8 +311,8 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2004-10-09 
18:38:13-2010-07-12 11:05:35" = data$issues[0, ], "2010-07-12 11:05:35-2010-07-12 12:05:41" = data$issues[0, ], "2010-07-12 12:05:41-2010-07-12 12:05:44" = data$issues[0, ], - "2010-07-12 12:05:44-2016-07-12 15:58:40" = data$issues[rownames(data$issues) %in% c(1:13, 27:28, 43:44), ], - "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$issues[rownames(data$issues) %in% c(14:15, 20:22, 29, 37:40, 45:49), ], + "2010-07-12 12:05:44-2016-07-12 15:58:40" = data$issues[rownames(data$issues) %in% c(1:13, 33:34, 50:51), ], + "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$issues[rownames(data$issues) %in% c(18:19, 24:26, 35, 44:47, 52:53, 55:57), ], "2016-07-12 16:05:37-2016-07-12 16:05:38" = data$issues[0, ] ), mails = list( @@ -372,13 +377,14 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity info = "Splitting must not modify the original ProjectConf.") ## test that the config contains the correct splitting information + revisions = c("2004-10-09 18:38:13", "2016-07-12 16:05:38") expected.config = list( split.type = "activity-based", split.length = 26, split.basis = "mails", split.sliding.window = FALSE, - split.revisions = c("2004-10-09 18:38:13", "2016-07-12 16:05:38"), - split.revision.dates = NULL + split.revisions = revisions, + split.revisions.dates = get.date.from.string(revisions) ) lapply(results, function(res) { actual = lapply(names(expected.config), res$get.project.conf()$get.value) @@ -395,7 +401,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$commit.messages ), issues = list( - "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(1:15, 20:22, 27:29, 37:40, 43:45, 46:49), ] + "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(1:13, 18:19, 24:26, 33:35, 44:47, 50:53, 55:57), ] ), mails = list( "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$mails @@ -458,8 +464,9 @@ 
patrick::with_parameters_test_that("Split a data object activity-based (activity "2013-04-21 23:52:09-2013-05-25 06:22:23", "2013-05-25 06:22:23-2016-07-12 15:59:59", "2016-07-12 15:59:59-2016-07-12 16:06:30", - "2016-07-12 16:06:30-2016-10-05 15:30:02", - "2016-10-05 15:30:02-2017-05-23 12:32:40" + "2016-07-12 16:06:30-2016-08-07 15:37:02", + "2016-08-07 15:37:02-2017-05-23 12:31:34", + "2017-05-23 12:31:34-2017-05-23 12:32:40" ) lapply(results, function(res) { expect_equal(res$get.project.conf()$get.value("ranges"), expected, info = "Time ranges.") @@ -470,14 +477,16 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity info = "Splitting must not modify the original ProjectConf.") ## test that the config contains the correct splitting information + revisions = c("2013-04-21 23:52:09", "2013-05-25 06:22:23", "2016-07-12 15:59:59", + "2016-07-12 16:06:30", "2016-08-07 15:37:02", "2017-05-23 12:31:34", + "2017-05-23 12:32:40") expected.config = list( split.type = "activity-based", split.length = 9, split.basis = "issues", split.sliding.window = FALSE, - split.revisions = c("2013-04-21 23:52:09", "2013-05-25 06:22:23", "2016-07-12 15:59:59", - "2016-07-12 16:06:30", "2016-10-05 15:30:02", "2017-05-23 12:32:40"), - split.revision.dates = NULL + split.revisions = revisions, + split.revisions.dates = get.date.from.string(revisions) ) lapply(results, function(res) { actual = lapply(names(expected.config), res$get.project.conf()$get.value) @@ -491,44 +500,50 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$commits[0, ], "2013-05-25 06:22:23-2016-07-12 15:59:59" = data$commits[1, ], "2016-07-12 15:59:59-2016-07-12 16:06:30" = data$commits[2:5, ], - "2016-07-12 16:06:30-2016-10-05 15:30:02" = data$commits[6:8, ], - "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$commits[0, ] + "2016-07-12 16:06:30-2016-08-07 15:37:02" = data$commits[6:8, ], + "2016-08-07 
15:37:02-2017-05-23 12:31:34" = data$commits[0, ], + "2017-05-23 12:31:34-2017-05-23 12:32:40" = data$commits[0, ] ), commit.messages = list( "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$commit.messages, "2013-05-25 06:22:23-2016-07-12 15:59:59" = data$commit.messages, "2016-07-12 15:59:59-2016-07-12 16:06:30" = data$commit.messages, - "2016-07-12 16:06:30-2016-10-05 15:30:02" = data$commit.messages, - "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$commit.messages + "2016-07-12 16:06:30-2016-08-07 15:37:02" = data$commit.messages, + "2016-08-07 15:37:02-2017-05-23 12:31:34" = data$commit.messages, + "2017-05-23 12:31:34-2017-05-23 12:32:40" = data$commit.messages ), issues = list( "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$issues[rownames(data$issues) %in% 1:10, ], - "2013-05-25 06:22:23-2016-07-12 15:59:59" = data$issues[rownames(data$issues) %in% c(11:13, 20:21, 27:28, 43:44, 37:38), ], - "2016-07-12 15:59:59-2016-07-12 16:06:30" = data$issues[rownames(data$issues) %in% c(14:15, 22, 29, 39:41, 45:49), ], - "2016-07-12 16:06:30-2016-10-05 15:30:02" = data$issues[rownames(data$issues) %in% c(16:19, 23:25, 30, 42), ], - "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% c(26, 31:36), ] + "2013-05-25 06:22:23-2016-07-12 15:59:59" = data$issues[rownames(data$issues) %in% c(11:13, 24:26, 33:34, 44:45, 50:51), ], + "2016-07-12 15:59:59-2016-07-12 16:06:30" = data$issues[rownames(data$issues) %in% c(18:19, 35, 46:48, 52:53, 55:57), ], + "2016-07-12 16:06:30-2016-08-07 15:37:02" = data$issues[rownames(data$issues) %in% c(20:23, 27:28, 31:32, 49, 54), ], + "2016-08-07 15:37:02-2017-05-23 12:31:34" = data$issues[rownames(data$issues) %in% c(14:17, 29:30, 36:40, 43), ], + "2017-05-23 12:31:34-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% c(41:42), ] ), mails = list( ## comments indicate row names when pasta is not configured "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$mails[0, ], - "2013-05-25 
06:22:23-2016-07-12 15:59:59" = data$mails[13:14, ], # rownames(data$mails) %in% 14:15 - "2016-07-12 15:59:59-2016-07-12 16:06:30" = data$mails[15:16, ], # rownames(data$mails) %in% 16:17 - "2016-07-12 16:06:30-2016-10-05 15:30:02" = data$mails[0, ], - "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$mails[0, ] + "2013-05-25 06:22:23-2016-07-12 15:59:59" = data$mails[13:14, ], # rownames(data$mails) %in% 13:14 + "2016-07-12 15:59:59-2016-07-12 16:06:30" = data$mails[15:16, ], # rownames(data$mails) %in% 15:16 + "2016-07-12 16:06:30-2016-08-07 15:37:02" = data$mails[0, ], + "2016-08-07 15:37:02-2017-05-23 12:31:34" = data$mails[0, ], + "2017-05-23 12:31:34-2017-05-23 12:32:40" = data$mails[0, ] ), pasta = list( "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$pasta, "2013-05-25 06:22:23-2016-07-12 15:59:59" = data$pasta, "2016-07-12 15:59:59-2016-07-12 16:06:30" = data$pasta, - "2016-07-12 16:06:30-2016-10-05 15:30:02" = data$pasta, - "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$pasta + "2016-07-12 16:06:30-2016-08-07 15:37:02" = data$pasta, + "2016-08-07 15:37:02-2017-05-23 12:31:34" = data$pasta, + "2017-05-23 12:31:34-2017-05-23 12:32:40" = data$pasta ), synchronicity = list( "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$synchronicity, "2013-05-25 06:22:23-2016-07-12 15:59:59" = data$synchronicity, "2016-07-12 15:59:59-2016-07-12 16:06:30" = data$synchronicity, - "2016-07-12 16:06:30-2016-10-05 15:30:02" = data$synchronicity, - "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$synchronicity + "2016-07-12 16:06:30-2016-08-07 15:37:02" = data$synchronicity, + "2016-08-07 15:37:02-2017-05-23 12:31:34" = data$synchronicity, + "2017-05-23 12:31:34-2017-05-23 12:32:40" = data$synchronicity ) ) results.data = list( @@ -567,13 +582,14 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity info = "Splitting must not modify the original ProjectConf.") ## test that the config contains the correct splitting information + revisions = 
c("2013-04-21 23:52:09", "2017-05-23 12:32:40") expected.config = list( split.type = "activity-based", - split.length = 59, + split.length = 67, split.basis = "issues", split.sliding.window = FALSE, - split.revisions = c("2013-04-21 23:52:09", "2017-05-23 12:32:40"), - split.revision.dates = NULL + split.revisions = revisions, + split.revisions.dates = get.date.from.string(revisions) ) lapply(results, function(res) { actual = lapply(names(expected.config), res$get.project.conf()$get.value) @@ -666,14 +682,15 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity info = "Splitting must not modify the original ProjectConf.") ## test that the config contains the correct splitting information + revisions = c("2016-07-12 15:58:59", "2016-07-12 16:00:45", "2016-07-12 16:06:10", + "2016-07-12 16:06:20", "2016-07-12 16:06:32", "2016-07-12 16:06:33") expected.config = list( split.type = "activity-based", split.length = 3, split.basis = "commits", split.sliding.window = TRUE, - split.revisions = c("2016-07-12 15:58:59", "2016-07-12 16:00:45", "2016-07-12 16:06:10", - "2016-07-12 16:06:20", "2016-07-12 16:06:32", "2016-07-12 16:06:33"), - split.revision.dates = NULL + split.revisions = revisions, + split.revisions.dates = get.date.from.string(revisions) ) lapply(results, function(res) { actual = lapply(names(expected.config), res$get.project.conf()$get.value) @@ -696,10 +713,10 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$commit.messages ), issues = list( - "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$issues[rownames(data$issues) %in% c(14:15, 20:22, 29, 37:41, 45:49), ], - "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$issues[rownames(data$issues) %in% c(14:15, 29, 40:41, 45:49), ], - "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$issues[rownames(data$issues) == 23, ], - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$issues[rownames(data$issues) == 
23, ] + "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$issues[rownames(data$issues) %in% c(18:19, 24:26, 35, 44:48, 52:53, 55:57), ], + "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$issues[rownames(data$issues) %in% c(18:19, 35, 47:48, 52:53, 55:57), ], + "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$issues[rownames(data$issues) == 27, ], + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$issues[rownames(data$issues) == 27, ] ), mails = list( ## comments indicate row names when pasta is not configured @@ -757,13 +774,14 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity info = "Splitting must not modify the original ProjectConf.") ## test that the config contains the correct splitting information + revisions = c("2016-07-12 15:58:59", "2016-07-12 16:06:33") expected.config = list( split.type = "activity-based", split.length = 18, split.basis = "commits", - split.sliding.window = TRUE, - split.revisions = c("2016-07-12 15:58:59", "2016-07-12 16:06:33"), - split.revision.dates = NULL + split.sliding.window = FALSE, # The sliding-window approach does not apply if we only have one range or less + split.revisions = revisions, + split.revisions.dates = get.date.from.string(revisions) ) lapply(results, function(res) { actual = lapply(names(expected.config), res$get.project.conf()$get.value) @@ -780,7 +798,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$commit.messages ), issues = list( - "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$issues[rownames(data$issues) %in% c(14:15, 20:23, 29, 37:41, 45:49), ] + "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$issues[rownames(data$issues) %in% c(18:19, 24:27, 35, 44:48, 52:53, 55:57), ] ), mails = list( "2016-07-12 15:58:59-2016-07-12 16:06:33" = data$mails[15:16, ] # when pasta is not configured: rownames(data$mails) %in% 16:17 @@ -847,7 +865,7 @@ patrick::with_parameters_test_that("Split a data 
object activity-based (activity "2016-07-12 15:58:59-2016-07-12 16:06:10", "2016-07-12 16:00:45-2016-07-12 16:06:20", "2016-07-12 16:06:10-2016-07-12 16:06:32", - "2016-07-12 16:06:20-2016-07-12 16:06:33", + "2016-07-12 16:06:20-2016-07-12 16:06:32", "2016-07-12 16:06:32-2016-07-12 16:06:33" ) lapply(results, function(res) { @@ -860,15 +878,16 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity info = "Splitting must not modify the original ProjectConf.") ## test that the config contains the correct splitting information + revisions = c("2016-07-12 15:58:59", "2016-07-12 16:00:45", "2016-07-12 16:06:10", + "2016-07-12 16:06:20", "2016-07-12 16:06:32", "2016-07-12 16:06:32", + "2016-07-12 16:06:33") expected.config = list( split.type = "activity-based", split.length = 3, split.basis = "commits", split.sliding.window = TRUE, - split.revisions = c("2016-07-12 15:58:59", "2016-07-12 16:00:45", "2016-07-12 16:06:10", - "2016-07-12 16:06:20", "2016-07-12 16:06:32", "2016-07-12 16:06:33", - "2016-07-12 16:06:33"), - split.revision.dates = NULL + split.revisions = revisions, + split.revisions.dates = get.date.from.string(revisions) ) lapply(results, function(res) { actual = lapply(names(expected.config), res$get.project.conf()$get.value) @@ -882,21 +901,21 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$commits[1:3, ], "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$commits[2:4, ], "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$commits[4:6, ], - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$commits[5:8, ], + "2016-07-12 16:06:20-2016-07-12 16:06:32" = data$commits[5:8, ], "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$commits[7:9, ] ), commit.messages = list( "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$commit.messages, "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$commit.messages, "2016-07-12 16:06:10-2016-07-12 16:06:32" = 
data$commit.messages, - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$commit.messages, + "2016-07-12 16:06:20-2016-07-12 16:06:32" = data$commit.messages, "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$commit.messages ), issues = list( - "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$issues[rownames(data$issues) %in% c(14:15, 20:22, 29, 37:41, 45:49), ], - "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$issues[rownames(data$issues) %in% c(14:15, 29, 40:41, 45:49), ], - "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$issues[rownames(data$issues) == 23, ], - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$issues[rownames(data$issues) == 23, ], + "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$issues[rownames(data$issues) %in% c(18:19, 24:26, 35, 44:48, 52:53, 55:57), ], + "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$issues[rownames(data$issues) %in% c(18:19, 35, 47:48, 52:53, 55:57), ], + "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$issues[rownames(data$issues) == 27, ], + "2016-07-12 16:06:20-2016-07-12 16:06:32" = data$issues[rownames(data$issues) == 27, ], "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$issues[0, ] ), mails = list( @@ -904,21 +923,21 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$mails[15:16, ], # rownames(data$mails) %in% 16:17 "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$mails[15:16, ], # rownames(data$mails) %in% 16:17 "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$mails[0, ], - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$mails[0, ], + "2016-07-12 16:06:20-2016-07-12 16:06:32" = data$mails[0, ], "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$mails[0, ] ), pasta = list( "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$pasta, "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$pasta, "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$pasta, - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$pasta, + "2016-07-12 16:06:20-2016-07-12 
16:06:32" = data$pasta, "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$pasta ), synchronicity = list( "2016-07-12 15:58:59-2016-07-12 16:06:10" = data$synchronicity, "2016-07-12 16:00:45-2016-07-12 16:06:20" = data$synchronicity, "2016-07-12 16:06:10-2016-07-12 16:06:32" = data$synchronicity, - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$synchronicity, + "2016-07-12 16:06:20-2016-07-12 16:06:32" = data$synchronicity, "2016-07-12 16:06:32-2016-07-12 16:06:33" = data$synchronicity ) ) @@ -991,16 +1010,17 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity info = "Splitting must not modify the original ProjectConf.") ## test that the config contains the correct splitting information + revisions = c("2004-10-09 18:38:13", "2005-02-09 18:49:49", "2010-07-12 11:05:35", + "2010-07-12 12:05:34", "2010-07-12 12:05:41", "2010-07-12 12:05:42", + "2010-07-12 12:05:44", "2010-07-12 12:05:45", "2016-07-12 15:58:40", + "2016-07-12 15:58:50", "2016-07-12 16:05:37", "2016-07-12 16:05:38") expected.config = list( split.type = "activity-based", split.length = 3, split.basis = "mails", split.sliding.window = TRUE, - split.revisions = c("2004-10-09 18:38:13", "2005-02-09 18:49:49", "2010-07-12 11:05:35", - "2010-07-12 12:05:34", "2010-07-12 12:05:41", "2010-07-12 12:05:42", - "2010-07-12 12:05:44", "2010-07-12 12:05:45", "2016-07-12 15:58:40", - "2016-07-12 15:58:50", "2016-07-12 16:05:37", "2016-07-12 16:05:38"), - split.revision.dates = NULL + split.revisions = revisions, + split.revisions.dates = get.date.from.string(revisions) ) lapply(results, function(res) { actual = lapply(names(expected.config), res$get.project.conf()$get.value) @@ -1041,10 +1061,10 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2010-07-12 12:05:34-2010-07-12 12:05:42" = data$issues[0, ], "2010-07-12 12:05:41-2010-07-12 12:05:44" = data$issues[0, ], "2010-07-12 12:05:42-2010-07-12 12:05:45" = data$issues[0, ], - "2010-07-12 
12:05:44-2016-07-12 15:58:40" = data$issues[rownames(data$issues) %in% c(1:13, 27:28, 43:44), ], - "2010-07-12 12:05:45-2016-07-12 15:58:50" = data$issues[rownames(data$issues) %in% c(1:13, 27:28, 43:44), ], - "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$issues[rownames(data$issues) %in% c(14:15, 20:22, 29, 37:40, 45:49), ], - "2016-07-12 15:58:50-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(14:15, 20:22, 29, 37:40, 45:49), ] + "2010-07-12 12:05:44-2016-07-12 15:58:40" = data$issues[rownames(data$issues) %in% c(1:13, 33:34, 50:51), ], + "2010-07-12 12:05:45-2016-07-12 15:58:50" = data$issues[rownames(data$issues) %in% c(1:13, 33:34, 50:51), ], + "2016-07-12 15:58:40-2016-07-12 16:05:37" = data$issues[rownames(data$issues) %in% c(18:19, 24:26, 35, 44:47, 52:53, 55:57), ], + "2016-07-12 15:58:50-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(18:19, 24:26, 35, 44:47, 52:53, 55:57), ] ), mails = list( ## comments indicate row names when pasta is not configured @@ -1120,13 +1140,14 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity info = "Splitting must not modify the original ProjectConf.") ## test that the config contains the correct splitting information + revisions = c("2004-10-09 18:38:13", "2016-07-12 16:05:38") expected.config = list( split.type = "activity-based", split.length = 26, split.basis = "mails", - split.sliding.window = TRUE, - split.revisions = c("2004-10-09 18:38:13", "2016-07-12 16:05:38"), - split.revision.dates = NULL + split.sliding.window = FALSE, # The sliding-window approach does not apply if we only have one range or less + split.revisions = revisions, + split.revisions.dates = get.date.from.string(revisions) ) lapply(results, function(res) { actual = lapply(names(expected.config), res$get.project.conf()$get.value) @@ -1143,7 +1164,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2004-10-09 18:38:13-2016-07-12 16:05:38" = 
data$commit.messages ), issues = list( - "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(1:15, 20:22, 27:29, 37:40, 43:49), ] + "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(1:13, 18:19, 24:26, 33:35, 44:47, 50:53, 55:57), ] ), mails = list( "2004-10-09 18:38:13-2016-07-12 16:05:38" = data$mails @@ -1209,9 +1230,10 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2016-07-12 15:30:02-2016-07-12 16:02:02", "2016-07-12 15:59:59-2016-07-12 16:06:30", "2016-07-12 16:02:02-2016-07-27 20:12:08", - "2016-07-12 16:06:30-2016-10-05 15:30:02", - "2016-07-27 20:12:08-2017-05-23 12:31:34", - "2016-10-05 15:30:02-2017-05-23 12:32:40" + "2016-07-12 16:06:30-2016-08-07 15:37:02", + "2016-07-27 20:12:08-2016-10-05 16:45:09", + "2016-08-07 15:37:02-2017-05-23 12:31:34", + "2016-10-05 16:45:09-2017-05-23 12:32:40" ) lapply(results, function(res) { expect_equal(res$get.project.conf()$get.value("ranges"), expected, info = "Time ranges.") @@ -1222,16 +1244,17 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity info = "Splitting must not modify the original ProjectConf.") ## test that the config contains the correct splitting information + revisions = c("2013-04-21 23:52:09", "2013-05-06 01:04:34", "2013-05-25 06:22:23", + "2016-07-12 15:30:02", "2016-07-12 15:59:59", "2016-07-12 16:02:02", + "2016-07-12 16:06:30", "2016-07-27 20:12:08", "2016-08-07 15:37:02", + "2016-10-05 16:45:09", "2017-05-23 12:31:34", "2017-05-23 12:32:40") expected.config = list( split.type = "activity-based", split.length = 9, split.basis = "issues", split.sliding.window = TRUE, - split.revisions = c("2013-04-21 23:52:09", "2013-05-06 01:04:34", "2013-05-25 06:22:23", - "2016-07-12 15:30:02", "2016-07-12 15:59:59", "2016-07-12 16:02:02", - "2016-07-12 16:06:30", "2016-07-27 20:12:08", "2016-10-05 15:30:02", - "2017-05-23 12:31:34", "2017-05-23 12:32:40"), - 
split.revision.dates = NULL + split.revisions = revisions, + split.revisions.dates = get.date.from.string(revisions) ) lapply(results, function(res) { actual = lapply(names(expected.config), res$get.project.conf()$get.value) @@ -1248,9 +1271,10 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2016-07-12 15:30:02-2016-07-12 16:02:02" = data$commits[1:2, ], "2016-07-12 15:59:59-2016-07-12 16:06:30" = data$commits[2:5, ], "2016-07-12 16:02:02-2016-07-27 20:12:08" = data$commits[3:8, ], - "2016-07-12 16:06:30-2016-10-05 15:30:02" = data$commits[6:8, ], - "2016-07-27 20:12:08-2017-05-23 12:31:34" = data$commits[0, ], - "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$commits[0, ] + "2016-07-12 16:06:30-2016-08-07 15:37:02" = data$commits[6:8, ], + "2016-07-27 20:12:08-2016-10-05 16:45:09" = data$commits[0, ], + "2016-08-07 15:37:02-2017-05-23 12:31:34" = data$commits[0, ], + "2016-10-05 16:45:09-2017-05-23 12:32:40" = data$commits[0, ] ), commit.messages = list( "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$commit.messages, @@ -1259,32 +1283,35 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2016-07-12 15:30:02-2016-07-12 16:02:02" = data$commit.messages, "2016-07-12 15:59:59-2016-07-12 16:06:30" = data$commit.messages, "2016-07-12 16:02:02-2016-07-27 20:12:08" = data$commit.messages, - "2016-07-12 16:06:30-2016-10-05 15:30:02" = data$commit.messages, - "2016-07-27 20:12:08-2017-05-23 12:31:34" = data$commit.messages, - "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$commit.messages + "2016-07-12 16:06:30-2016-08-07 15:37:02" = data$commit.messages, + "2016-07-27 20:12:08-2016-10-05 16:45:09" = data$commit.messages, + "2016-08-07 15:37:02-2017-05-23 12:31:34" = data$commit.messages, + "2016-10-05 16:45:09-2017-05-23 12:32:40" = data$commit.messages ), issues = list( "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$issues[rownames(data$issues) %in% 1:10, ], - "2013-05-06 01:04:34-2016-07-12 
15:30:02" = data$issues[rownames(data$issues) %in% c(6:13, 43:44), ], - "2013-05-25 06:22:23-2016-07-12 15:59:59" = data$issues[rownames(data$issues) %in% c(11:13, 20:21, 27:28, 37:38, 43:44), ], - "2016-07-12 15:30:02-2016-07-12 16:02:02" = data$issues[rownames(data$issues) %in% c(14, 20:22, 27:28, 37:40),], - "2016-07-12 15:59:59-2016-07-12 16:06:30" = data$issues[rownames(data$issues) %in% c(14:15, 22, 29, 39:41, 45:49), ], - "2016-07-12 16:02:02-2016-07-27 20:12:08" = data$issues[rownames(data$issues) %in% c(15:17, 23, 29, 41:42, 45:49),], - "2016-07-12 16:06:30-2016-10-05 15:30:02" = data$issues[rownames(data$issues) %in% c(16:19, 23:25, 30, 42), ], - "2016-07-27 20:12:08-2017-05-23 12:31:34" = data$issues[rownames(data$issues) %in% c(18:19, 24:26, 30:34), ], - "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% c(26, 31:36), ] + "2013-05-06 01:04:34-2016-07-12 15:30:02" = data$issues[rownames(data$issues) %in% c(6:13, 50:51), ], + "2013-05-25 06:22:23-2016-07-12 15:59:59" = data$issues[rownames(data$issues) %in% c(11:13, 24:26, 33:34, 44:45, 50:51), ], + "2016-07-12 15:30:02-2016-07-12 16:02:02" = data$issues[rownames(data$issues) %in% c(18, 24:26, 33:34, 44:47, 55), ], + "2016-07-12 15:59:59-2016-07-12 16:06:30" = data$issues[rownames(data$issues) %in% c(18:19, 35, 46:48, 52:53, 55:57), ], + "2016-07-12 16:02:02-2016-07-27 20:12:08" = data$issues[rownames(data$issues) %in% c(19:21, 27, 35, 48:49, 52:53, 56:57), ], + "2016-07-12 16:06:30-2016-08-07 15:37:02" = data$issues[rownames(data$issues) %in% c(20:23, 27:28, 31:32, 49, 54), ], + "2016-07-27 20:12:08-2016-10-05 16:45:09" = data$issues[rownames(data$issues) %in% c(22:23, 28:29, 31:32, 36:37, 43, 54), ], + "2016-08-07 15:37:02-2017-05-23 12:31:34" = data$issues[rownames(data$issues) %in% c(14:17, 29:30, 36:40, 43), ], + "2016-10-05 16:45:09-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% c(14:17, 30, 38:42), ] ), mails = list( ## comments indicate row names when 
pasta is not configured "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$mails[0, ], "2013-05-06 01:04:34-2016-07-12 15:30:02" = data$mails[0, ], - "2013-05-25 06:22:23-2016-07-12 15:59:59" = data$mails[13:14, ], # rownames(data$mails) %in% 14:15 - "2016-07-12 15:30:02-2016-07-12 16:02:02" = data$mails[13:14, ], # rownames(data$mails) %in% 14:15 - "2016-07-12 15:59:59-2016-07-12 16:06:30" = data$mails[15:16, ], # rownames(data$mails) %in% 16:17 - "2016-07-12 16:02:02-2016-07-27 20:12:08" = data$mails[15:16, ], # rownames(data$mails) %in% 16:17 - "2016-07-12 16:06:30-2016-10-05 15:30:02" = data$mails[0, ], - "2016-07-27 20:12:08-2017-05-23 12:31:34" = data$mails[0, ], - "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$mails[0, ] + "2013-05-25 06:22:23-2016-07-12 15:59:59" = data$mails[13:14, ], # rownames(data$mails) %in% 13:14 + "2016-07-12 15:30:02-2016-07-12 16:02:02" = data$mails[13:14, ], # rownames(data$mails) %in% 13:14 + "2016-07-12 15:59:59-2016-07-12 16:06:30" = data$mails[15:16, ], # rownames(data$mails) %in% 15:16 + "2016-07-12 16:02:02-2016-07-27 20:12:08" = data$mails[15:16, ], # rownames(data$mails) %in% 15:16 + "2016-07-12 16:06:30-2016-08-07 15:37:02" = data$mails[0, ], + "2016-07-27 20:12:08-2016-10-05 16:45:09" = data$mails[0, ], + "2016-08-07 15:37:02-2017-05-23 12:31:34" = data$mails[0, ], + "2016-10-05 16:45:09-2017-05-23 12:32:40" = data$mails[0, ] ), pasta = list( "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$pasta, @@ -1293,9 +1320,10 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2016-07-12 15:30:02-2016-07-12 16:02:02" = data$pasta, "2016-07-12 15:59:59-2016-07-12 16:06:30" = data$pasta, "2016-07-12 16:02:02-2016-07-27 20:12:08" = data$pasta, - "2016-07-12 16:06:30-2016-10-05 15:30:02" = data$pasta, - "2016-07-27 20:12:08-2017-05-23 12:31:34" = data$pasta, - "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$pasta + "2016-07-12 16:06:30-2016-08-07 15:37:02" = data$pasta, + "2016-07-27 
20:12:08-2016-10-05 16:45:09" = data$pasta, + "2016-08-07 15:37:02-2017-05-23 12:31:34" = data$pasta, + "2016-10-05 16:45:09-2017-05-23 12:32:40" = data$pasta ), synchronicity = list( "2013-04-21 23:52:09-2013-05-25 06:22:23" = data$synchronicity, @@ -1304,9 +1332,10 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity "2016-07-12 15:30:02-2016-07-12 16:02:02" = data$synchronicity, "2016-07-12 15:59:59-2016-07-12 16:06:30" = data$synchronicity, "2016-07-12 16:02:02-2016-07-27 20:12:08" = data$synchronicity, - "2016-07-12 16:06:30-2016-10-05 15:30:02" = data$synchronicity, - "2016-07-27 20:12:08-2017-05-23 12:31:34" = data$synchronicity, - "2016-10-05 15:30:02-2017-05-23 12:32:40" = data$synchronicity + "2016-07-12 16:06:30-2016-08-07 15:37:02" = data$synchronicity, + "2016-07-27 20:12:08-2016-10-05 16:45:09" = data$synchronicity, + "2016-08-07 15:37:02-2017-05-23 12:31:34" = data$synchronicity, + "2016-10-05 16:45:09-2017-05-23 12:32:40" = data$synchronicity ) ) results.data = list( @@ -1345,13 +1374,14 @@ patrick::with_parameters_test_that("Split a data object activity-based (activity info = "Splitting must not modify the original ProjectConf.") ## test that the config contains the correct splitting information + revisions = c("2013-04-21 23:52:09", "2017-05-23 12:32:40") expected.config = list( split.type = "activity-based", - split.length = 59, + split.length = 67, split.basis = "issues", - split.sliding.window = TRUE, - split.revisions = c("2013-04-21 23:52:09", "2017-05-23 12:32:40"), - split.revision.dates = NULL + split.sliding.window = FALSE, # The sliding-window approach does not apply if we only have one range or less + split.revisions = revisions, + split.revisions.dates = get.date.from.string(revisions) ) lapply(results, function(res) { actual = lapply(names(expected.config), res$get.project.conf()$get.value) @@ -1443,13 +1473,14 @@ patrick::with_parameters_test_that("Split a data object activity-based (number.w info = 
"Splitting must not modify the original ProjectConf.") ## test that the config contains the correct splitting information + revisions = c("2016-07-12 15:58:59", "2016-07-12 16:06:20", "2016-07-12 16:06:33") expected.config = list( split.type = "activity-based", split.length = 4, split.basis = "commits", split.sliding.window = FALSE, - split.revisions = c("2016-07-12 15:58:59", "2016-07-12 16:06:20", "2016-07-12 16:06:33"), - split.revision.dates = NULL + split.revisions = revisions, + split.revisions.dates = get.date.from.string(revisions) ) lapply(results, function(res) { actual = lapply(names(expected.config), res$get.project.conf()$get.value) @@ -1468,8 +1499,8 @@ patrick::with_parameters_test_that("Split a data object activity-based (number.w "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$commit.messages ), issues = list( - "2016-07-12 15:58:59-2016-07-12 16:06:20" = data$issues[rownames(data$issues) %in% c(14:15, 20:22, 29, 37:41, 45:49), ], - "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$issues[rownames(data$issues) == 23, ] + "2016-07-12 15:58:59-2016-07-12 16:06:20" = data$issues[rownames(data$issues) %in% c(18:19, 24:26, 35, 44:48, 52:53, 55:57), ], + "2016-07-12 16:06:20-2016-07-12 16:06:33" = data$issues[rownames(data$issues) == 27, ] ), mails = list( "2016-07-12 15:58:59-2016-07-12 16:06:20" = data$mails[15:16, ], # when pasta is not configured: rownames(data$mails) %in% 16:17 @@ -1569,13 +1600,14 @@ patrick::with_parameters_test_that("Split a data object activity-based (number.w info = "Splitting must not modify the original ProjectConf.") ## test that the config contains the correct splitting information + revisions = c("2004-10-09 18:38:13", "2010-07-12 12:05:43", "2016-07-12 16:05:38") expected.config = list( split.type = "activity-based", split.length = 8, split.basis = "mails", split.sliding.window = FALSE, - split.revisions = c("2004-10-09 18:38:13", "2010-07-12 12:05:43", "2016-07-12 16:05:38"), - split.revision.dates = NULL + 
split.revisions = revisions, + split.revisions.dates = get.date.from.string(revisions) ) lapply(results, function(res) { actual = lapply(names(expected.config), res$get.project.conf()$get.value) @@ -1595,7 +1627,7 @@ patrick::with_parameters_test_that("Split a data object activity-based (number.w ), issues = list( "2004-10-09 18:38:13-2010-07-12 12:05:43" = data$issues[0, ], - "2010-07-12 12:05:43-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(1:15, 20:22, 27:29, 37:40, 43:45, 46:49), ] + "2010-07-12 12:05:43-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(1:13, 18:19, 24:26, 33:35, 44:47, 50:53, 55:57), ] ), mails = list( ## comments indicate row names when pasta is not configured @@ -1683,8 +1715,8 @@ patrick::with_parameters_test_that("Split a data object activity-based (number.w ## check time ranges expected = c( - "2013-04-21 23:52:09-2016-07-12 16:02:02", - "2016-07-12 16:02:02-2017-05-23 12:32:40" + "2013-04-21 23:52:09-2016-07-12 16:03:59", + "2016-07-12 16:03:59-2017-05-23 12:32:40" ) lapply(results, function(res) { expect_equal(res$get.project.conf()$get.value("ranges"), expected, @@ -1696,13 +1728,14 @@ patrick::with_parameters_test_that("Split a data object activity-based (number.w info = "Splitting must not modify the original ProjectConf.") ## test that the config contains the correct splitting information + revisions = c("2013-04-21 23:52:09", "2016-07-12 16:03:59", "2017-05-23 12:32:40") expected.config = list( split.type = "activity-based", - split.length = 21, + split.length = 24, split.basis = "issues", split.sliding.window = FALSE, - split.revisions = c("2013-04-21 23:52:09", "2016-07-12 16:02:02", "2017-05-23 12:32:40"), - split.revision.dates = NULL + split.revisions = revisions, + split.revisions.dates = get.date.from.string(revisions) ) lapply(results, function(res) { actual = lapply(names(expected.config), res$get.project.conf()$get.value) @@ -1713,29 +1746,29 @@ patrick::with_parameters_test_that("Split a 
data object activity-based (number.w ## check data for all ranges expected.data = list( commits = list( - "2013-04-21 23:52:09-2016-07-12 16:02:02" = data$commits[1:2, ], - "2016-07-12 16:02:02-2017-05-23 12:32:40" = data$commits[3:8, ] + "2013-04-21 23:52:09-2016-07-12 16:03:59" = data$commits[1:2, ], + "2016-07-12 16:03:59-2017-05-23 12:32:40" = data$commits[3:8, ] ), commit.messages = list( - "2013-04-21 23:52:09-2016-07-12 16:02:02" = data$commit.messages, - "2016-07-12 16:02:02-2017-05-23 12:32:40" = data$commit.messages + "2013-04-21 23:52:09-2016-07-12 16:03:59" = data$commit.messages, + "2016-07-12 16:03:59-2017-05-23 12:32:40" = data$commit.messages ), issues = list( - "2013-04-21 23:52:09-2016-07-12 16:02:02" = data$issues[rownames(data$issues) %in% c(1:14, 20:22, 27:28, 37:40, 43:44), ], - "2016-07-12 16:02:02-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% c(15:19, 23:26, 29:36, 41:42, 45:49), ] + "2013-04-21 23:52:09-2016-07-12 16:03:59" = data$issues[rownames(data$issues) %in% c(1:13, 18:19, 24:26, 33:34, 44:47, 50:51, 55:57), ], + "2016-07-12 16:03:59-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% c(14:17, 20:23, 28:31, 32, 35:42, 27, 43, 48:49, 52:54), ] ), mails = list( ## comments indicate row names when pasta is not configured - "2013-04-21 23:52:09-2016-07-12 16:02:02" = data$mails[13:14, ], # rownames(data$mails) %in% 14:15 - "2016-07-12 16:02:02-2017-05-23 12:32:40" = data$mails[15:16, ] # rownames(data$maisl) %in% 16:17 + "2013-04-21 23:52:09-2016-07-12 16:03:59" = data$mails[13:14, ], # rownames(data$mails) %in% 13:14 + "2016-07-12 16:03:59-2017-05-23 12:32:40" = data$mails[15:16, ] # rownames(data$maisl) %in% 15:16 ), pasta = list( - "2013-04-21 23:52:09-2016-07-12 16:02:02" = data$pasta, - "2016-07-12 16:02:02-2017-05-23 12:32:40" = data$pasta + "2013-04-21 23:52:09-2016-07-12 16:03:59" = data$pasta, + "2016-07-12 16:03:59-2017-05-23 12:32:40" = data$pasta ), synchronicity = list( - "2013-04-21 
23:52:09-2016-07-12 16:02:02" = data$synchronicity, - "2016-07-12 16:02:02-2017-05-23 12:32:40" = data$synchronicity + "2013-04-21 23:52:09-2016-07-12 16:03:59" = data$synchronicity, + "2016-07-12 16:03:59-2017-05-23 12:32:40" = data$synchronicity ) ) results.data = list( diff --git a/tests/test-split-data-time-based.R b/tests/test-split-data-time-based.R index 3f28a7907..67945105d 100644 --- a/tests/test-split-data-time-based.R +++ b/tests/test-split-data-time-based.R @@ -20,6 +20,7 @@ ## Copyright 2021 by Niklas Schneider ## Copyright 2021 by Johannes Hostert ## Copyright 2022 by Jonathan Baumann +## Copyright 2023-2024 by Maximilian Löffler ## All Rights Reserved. context("Splitting functionality, time-based splitting of data.") @@ -86,13 +87,14 @@ patrick::with_parameters_test_that("Split a data object time-based (split.basis info = "Splitting must not modify the original ProjectConf.") ## test that the config contains the correct splitting information + revisions = c("2016-07-12 15:58:59", "2016-07-12 16:01:59", "2016-07-12 16:04:59", "2016-07-12 16:06:33") expected.config = list( split.type = "time-based", split.length = "3 min", split.basis = "commits", split.sliding.window = FALSE, - split.revisions = c("2016-07-12 15:58:59", "2016-07-12 16:01:59", "2016-07-12 16:04:59", "2016-07-12 16:06:33"), - split.revision.dates = NULL + split.revisions = revisions, + split.revisions.dates = get.date.from.string(revisions) ) lapply(results, function(res) { actual = lapply(names(expected.config), res$get.project.conf()$get.value) @@ -113,9 +115,9 @@ patrick::with_parameters_test_that("Split a data object time-based (split.basis "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$commit.messages ), issues = list( - "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$issues[rownames(data$issues) %in% c(14, 20:22, 37:40), ], - "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$issues[rownames(data$issues) %in% c(15,29, 47:49), ], - "2016-07-12 16:04:59-2016-07-12 16:06:33" = 
data$issues[rownames(data$issues) %in% c(23,41,45:46), ] + "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$issues[rownames(data$issues) %in% c(18, 24:26, 44:47), ], + "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$issues[rownames(data$issues) %in% c(19, 35, 55:57), ], + "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$issues[rownames(data$issues) %in% c(27, 48, 52:53), ] ), mails = list( ## comments indicate row names when pasta is not configured @@ -197,14 +199,15 @@ patrick::with_parameters_test_that("Split a data object time-based (split.basis info = "Splitting must not modify the original ProjectConf.") ## test that the config contains the correct splitting information + revisions = c("2004-10-09 18:38:13", "2007-10-10 12:38:13", "2010-10-10 06:38:13", "2013-10-10 00:38:13", + "2016-07-12 16:05:38") expected.config = list( split.type = "time-based", split.length = "3 years", split.basis = "mails", split.sliding.window = FALSE, - split.revisions = c("2004-10-09 18:38:13", "2007-10-10 12:38:13", "2010-10-10 06:38:13", - "2013-10-10 00:38:13", "2016-07-12 16:05:38"), - split.revision.dates = NULL + split.revisions = revisions, + split.revisions.dates = get.date.from.string(revisions) ) lapply(results, function(res) { actual = lapply(names(expected.config), res$get.project.conf()$get.value) @@ -230,14 +233,14 @@ patrick::with_parameters_test_that("Split a data object time-based (split.basis "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$issues[0, ], "2007-10-10 12:38:13-2010-10-10 06:38:13" = data$issues[0, ], "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$issues[rownames(data$issues) %in% 1:13, ], - "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(14:15, 20:22, 27:29, 37:40, 43:49), ] + "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(18:19, 24:26, 33:35, 44:47, 50:53, 55:57), ] ), mails = list( ## comments indicate row names when pasta is not configured "2004-10-09 18:38:13-2007-10-10 
12:38:13" = data$mails[1:2, ], # rownames(data$mails) %in% 1:2 "2007-10-10 12:38:13-2010-10-10 06:38:13" = data$mails[3:12, ], # rownames(data$mails) %in% 3:12 "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$mails[0, ], - "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$mails[13:16, ] # rownames(data$mails) %in% 13:17 + "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$mails[13:16, ] # rownames(data$mails) %in% 13:16 ), pasta = list( "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$pasta, @@ -313,13 +316,14 @@ patrick::with_parameters_test_that("Split a data object time-based (split.basis info = "Splitting must not modify the original ProjectConf.") ## test that the config contains the correct splitting information + revisions = c("2013-04-21 23:52:09", "2015-04-22 11:52:09", "2017-04-21 23:52:09", "2017-05-23 12:32:40") expected.config = list( split.type = "time-based", split.length = "2 years", split.basis = "issues", split.sliding.window = FALSE, - split.revisions = c("2013-04-21 23:52:09", "2015-04-22 11:52:09", "2017-04-21 23:52:09", "2017-05-23 12:32:40"), - split.revision.dates = NULL + split.revisions = revisions, + split.revisions.dates = get.date.from.string(revisions) ) lapply(results, function(res) { actual = lapply(names(expected.config), res$get.project.conf()$get.value) @@ -341,12 +345,12 @@ patrick::with_parameters_test_that("Split a data object time-based (split.basis ), issues = list( "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$issues[rownames(data$issues) %in% 1:13, ], - "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$issues[rownames(data$issues) %in% c(14:34, 37:49), ], - "2017-04-21 23:52:09-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% 35:36, ] + "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$issues[rownames(data$issues) %in% c(18:40, 43:57), ], + "2017-04-21 23:52:09-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% c(14:17, 41:42), ] ), mails = list( "2013-04-21 23:52:09-2015-04-22 11:52:09" = 
data$mails[0, ], - "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$mails[13:16, ], # when pasta is not configured: rownames(data$mails) %in% 14:17 + "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$mails[13:16, ], # when pasta is not configured: rownames(data$mails) %in% 13:16 "2017-04-21 23:52:09-2017-05-23 12:32:40" = data$mails[0, ] ), pasta = list( @@ -425,15 +429,15 @@ patrick::with_parameters_test_that("Split a data object time-based (split.basis info = "Splitting must not modify the original ProjectConf.") ## test that the config contains the correct splitting information + revisions = c("2016-07-12 15:58:59", "2016-07-12 16:00:29", "2016-07-12 16:01:59", "2016-07-12 16:03:29", + "2016-07-12 16:04:59", "2016-07-12 16:06:29", "2016-07-12 16:06:33") expected.config = list( split.type = "time-based", split.length = "3 min", split.basis = "commits", split.sliding.window = TRUE, - split.revisions = c("2016-07-12 15:58:59", "2016-07-12 16:00:29", "2016-07-12 16:01:59", - "2016-07-12 16:03:29", "2016-07-12 16:04:59", "2016-07-12 16:06:29", - "2016-07-12 16:06:33"), - split.revision.dates = NULL + split.revisions = revisions, + split.revisions.dates = get.date.from.string(revisions) ) lapply(results, function(res) { actual = lapply(names(expected.config), res$get.project.conf()$get.value) @@ -458,19 +462,19 @@ patrick::with_parameters_test_that("Split a data object time-based (split.basis "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$commit.messages ), issues = list( - "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$issues[rownames(data$issues) %in% c(14, 20:22, 37:40), ], - "2016-07-12 16:00:29-2016-07-12 16:03:29" = data$issues[rownames(data$issues) %in% c(14:15, 40, 47:49), ], - "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$issues[rownames(data$issues) %in% c(15, 29, 47:49), ], - "2016-07-12 16:03:29-2016-07-12 16:06:29" = data$issues[rownames(data$issues) %in% c(29,41,45,46), ], - "2016-07-12 16:04:59-2016-07-12 16:06:33" = 
data$issues[rownames(data$issues) %in% c(23,41,45,46), ] + "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$issues[rownames(data$issues) %in% c(18, 24:26, 44:47), ], + "2016-07-12 16:00:29-2016-07-12 16:03:29" = data$issues[rownames(data$issues) %in% c(18:19, 47, 55:57), ], + "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$issues[rownames(data$issues) %in% c(19, 35, 55:57), ], + "2016-07-12 16:03:29-2016-07-12 16:06:29" = data$issues[rownames(data$issues) %in% c(35, 48, 52:53), ], + "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$issues[rownames(data$issues) %in% c(27, 48, 52:53), ] ), mails = list( ## comments indicate row names when pasta is not configured "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$mails[0, ], "2016-07-12 16:00:29-2016-07-12 16:03:29" = data$mails[0, ], - "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$mails[15, ], # rownames(data$mails) == 16 - "2016-07-12 16:03:29-2016-07-12 16:06:29" = data$mails[15:16, ], # rownames(data$mails) %in% c(16,17) - "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$mails[16, ] # rownames(data$mails) == 17 + "2016-07-12 16:01:59-2016-07-12 16:04:59" = data$mails[15, ], # rownames(data$mails) == 15 + "2016-07-12 16:03:29-2016-07-12 16:06:29" = data$mails[15:16, ], # rownames(data$mails) %in% c(15,16) + "2016-07-12 16:04:59-2016-07-12 16:06:33" = data$mails[16, ] # rownames(data$mails) == 16 ), pasta = list( "2016-07-12 15:58:59-2016-07-12 16:01:59" = data$pasta, @@ -553,15 +557,16 @@ patrick::with_parameters_test_that("Split a data object time-based (split.basis info = "Splitting must not modify the original ProjectConf.") ## test that the config contains the correct splitting information + revisions = c("2004-10-09 18:38:13", "2006-04-10 15:38:13", "2007-10-10 12:38:13", "2009-04-10 09:38:13", + "2010-10-10 06:38:13", "2012-04-10 03:38:13", "2013-10-10 00:38:13", "2015-04-10 21:38:13", + "2016-07-12 16:05:38") expected.config = list( split.type = "time-based", split.length = "3 years", split.basis = "mails", 
split.sliding.window = TRUE, - split.revisions = c("2004-10-09 18:38:13", "2006-04-10 15:38:13", "2007-10-10 12:38:13", - "2009-04-10 09:38:13", "2010-10-10 06:38:13", "2012-04-10 03:38:13", - "2013-10-10 00:38:13", "2015-04-10 21:38:13", "2016-07-12 16:05:38"), - split.revision.dates = NULL + split.revisions = revisions, + split.revisions.dates = get.date.from.string(revisions) ) lapply(results, function(res) { actual = lapply(names(expected.config), res$get.project.conf()$get.value) @@ -597,17 +602,17 @@ patrick::with_parameters_test_that("Split a data object time-based (split.basis "2009-04-10 09:38:13-2012-04-10 03:38:13" = data$issues[0, ], "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$issues[rownames(data$issues) %in% 1:13, ], "2012-04-10 03:38:13-2015-04-10 21:38:13" = data$issues[rownames(data$issues) %in% 1:13, ], - "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(14:15, 20:22, 27:29, 37:40, 43:49), ] + "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(18:19, 24:26, 33:35, 44:47, 50:53, 55:57), ] ), mails = list( ## comments indicate row names when pasta is not configured - "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$mails[1:2, ], # rownames(data$mails) %in% 1:2 + "2004-10-09 18:38:13-2007-10-10 12:38:13" = data$mails[1:2, ], # rownames(data$mails) %in% 1:2 "2006-04-10 15:38:13-2009-04-10 09:38:13" = data$mails[0, ], "2007-10-10 12:38:13-2010-10-10 06:38:13" = data$mails[3:12, ], # rownames(data$mails) %in% 3:12 "2009-04-10 09:38:13-2012-04-10 03:38:13" = data$mails[3:12, ], # rownames(data$mails) %in% 3:12 "2010-10-10 06:38:13-2013-10-10 00:38:13" = data$mails[0, ], "2012-04-10 03:38:13-2015-04-10 21:38:13" = data$mails[0, ], - "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$mails[13:16, ] # rownames(data$mails) %in% 13:17 + "2013-10-10 00:38:13-2016-07-12 16:05:38" = data$mails[13:16, ] # rownames(data$mails) %in% 13:16 ), pasta = list( "2004-10-09 18:38:13-2007-10-10 
12:38:13" = data$pasta, @@ -691,14 +696,15 @@ patrick::with_parameters_test_that("Split a data object time-based (split.basis info = "Splitting must not modify the original ProjectConf.") ## test that the config contains the correct splitting information + revisions = c("2013-04-21 23:52:09", "2014-04-22 05:52:09", "2015-04-22 11:52:09", "2016-04-21 17:52:09", + "2017-04-21 23:52:09", "2017-05-23 12:32:40") expected.config = list( split.type = "time-based", split.length = "2 years", split.basis = "issues", split.sliding.window = TRUE, - split.revisions = c("2013-04-21 23:52:09", "2014-04-22 05:52:09", "2015-04-22 11:52:09", - "2016-04-21 17:52:09", "2017-04-21 23:52:09", "2017-05-23 12:32:40"), - split.revision.dates = NULL + split.revisions = revisions, + split.revisions.dates = get.date.from.string(revisions) ) lapply(results, function(res) { actual = lapply(names(expected.config), res$get.project.conf()$get.value) @@ -723,15 +729,15 @@ patrick::with_parameters_test_that("Split a data object time-based (split.basis issues = list( "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$issues[rownames(data$issues) %in% 1:13, ], "2014-04-22 05:52:09-2016-04-21 17:52:09" = data$issues[0, ], - "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$issues[rownames(data$issues) %in% c(14:34, 37:49), ], - "2016-04-21 17:52:09-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% c(14:36, 37:49), ] + "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$issues[rownames(data$issues) %in% c(18:40, 43:57), ], + "2016-04-21 17:52:09-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% c(14:57), ] ), mails = list( ## comments indicate row names when pasta is not configured "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$mails[0, ], "2014-04-22 05:52:09-2016-04-21 17:52:09" = data$mails[0, ], - "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$mails[13:16, ], # rownames(data$mails) %in% 14:17 - "2016-04-21 17:52:09-2017-05-23 12:32:40" = data$mails[13:16, ] # 
rownames(data$mails) %in% 14:17 + "2015-04-22 11:52:09-2017-04-21 23:52:09" = data$mails[13:16, ], # rownames(data$mails) %in% 13:17 + "2016-04-21 17:52:09-2017-05-23 12:32:40" = data$mails[13:16, ] # rownames(data$mails) %in% 13:17 ), pasta = list( "2013-04-21 23:52:09-2015-04-22 11:52:09" = data$pasta, @@ -806,13 +812,14 @@ patrick::with_parameters_test_that("Split a data object time-based (bins = ... ) info = "Splitting must not modify the original ProjectConf.") ## test that the config contains the correct splitting information + revisions = c("2016-01-01 00:00:00", "2016-12-31 23:59:59") expected.config = list( split.type = "time-based", - split.length = c("2016-01-01 00:00:00", "2016-12-31 23:59:59"), + split.length = revisions, split.basis = NULL, split.sliding.window = FALSE, - split.revisions = c("2016-01-01 00:00:00", "2016-12-31 23:59:59"), - split.revision.dates = NULL + split.revisions = revisions, + split.revisions.dates = get.date.from.string(revisions) ) lapply(results, function(res) { actual = lapply(names(expected.config), res$get.project.conf()$get.value) @@ -830,7 +837,7 @@ patrick::with_parameters_test_that("Split a data object time-based (bins = ... ) "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$commit.messages ), issues = list( - "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$issues[rownames(data$issues) %in% c(14:34, 37:49), ] + "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$issues[rownames(data$issues) %in% c(18:40, 43:57), ] ), mails = list( "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$mails[rownames(data$mails) %in% 13:17, ] @@ -902,13 +909,14 @@ patrick::with_parameters_test_that("Split a data object time-based (bins = ... 
, info = "Splitting must not modify the original ProjectConf.") ## test that the config contains the correct splitting information + revisions = c("2016-01-01 00:00:00", "2016-12-31 23:59:59", "2017-06-03 03:03:03") expected.config = list( split.type = "time-based", - split.length = c("2016-01-01 00:00:00", "2016-12-31 23:59:59", "2017-06-03 03:03:03"), + split.length = revisions, split.basis = NULL, split.sliding.window = FALSE, - split.revisions = c("2016-01-01 00:00:00", "2016-12-31 23:59:59", "2017-06-03 03:03:03"), - split.revision.dates = NULL + split.revisions = revisions, + split.revisions.dates = get.date.from.string(revisions) ) lapply(results, function(res) { actual = lapply(names(expected.config), res$get.project.conf()$get.value) @@ -927,8 +935,8 @@ patrick::with_parameters_test_that("Split a data object time-based (bins = ... , "2016-12-31 23:59:59-2017-06-03 03:03:03" = data$commit.messages ), issues = list( - "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$issues[rownames(data$issues) %in% c(14:34, 37:49), ], - "2016-12-31 23:59:59-2017-06-03 03:03:03" = data$issues[rownames(data$issues) %in% 35:36, ] + "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$issues[rownames(data$issues) %in% c(18:40, 43:57), ], + "2016-12-31 23:59:59-2017-06-03 03:03:03" = data$issues[rownames(data$issues) %in% c(14:17, 41:42), ] ), mails = list( "2016-01-01 00:00:00-2016-12-31 23:59:59" = data$mails[rownames(data$mails) %in% 13:17, ], @@ -960,6 +968,44 @@ patrick::with_parameters_test_that("Split a data object time-based (bins = ... 
, "pasta, synchronicity: TRUE" = list(test.pasta = TRUE, test.synchronicity = TRUE) )) +## +## Verify that split.data.time.based does not accept an invalid bins parameter +## + +test_that("Split a data object time-based with invalid bins parameter.", { + ## configuration objects + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + net.conf = NetworkConf$new() + + ## data object + project.data = ProjectData$new(proj.conf) + data = list( + commits = project.data$get.commits.unfiltered(), + commit.messages = project.data$get.commit.messages(), + issues = project.data$get.issues(), + mails = project.data$get.mails(), + pasta = project.data$get.pasta(), + synchronicity = project.data$get.synchronicity() + ) + + ## define invalid bins + invalid.bins.not.a.date = c("These", "bins", "are", "invalid") + invalid.bins.contains.NA = c("2016-01-01 00:00:00", NA, "2016-12-31 23:59:59", "2017-06-03 03:03:03") + invalid.bins.not.a.list = "2016-01-01 00:00:00 2016-12-31 23:59:59" + invalid.bins.format.of.split.by.bins = list(bins = c("2013-04-21 23:52:09", "2017-05-23 12:32:40"), vector = replicate(24, 1)) + + invalid.bins = list(invalid.bins.not.a.date, invalid.bins.contains.NA, invalid.bins.not.a.list, + invalid.bins.format.of.split.by.bins) + + ## test that all invalid bins produce an error + for (invalid.bin in invalid.bins) { + expect_error(split.data.time.based(project.data, bins = invalid.bin, split.basis = "issues"), + regexp = "Stopped due to incorrect parameter types", + info = "Bins need to be a list of characters representing dates.") + } +}) + + ## * * custom event timestamps ---------------------------------------------------------------- ## @@ -1005,15 +1051,15 @@ patrick::with_parameters_test_that("Split a data object time-based using custom info = "Splitting must not modify the original ProjectConf.") ## test that the config contains the correct splitting information + revisions = c("2016-07-12 15:00:00", "2016-07-12 16:00:00", 
"2016-07-12 16:05:00", "2016-08-08 00:00:00", + "2016-10-05 09:00:00") expected.config = list( split.type = "time-based", - split.length = c("2016-07-12 15:00:00", "2016-07-12 16:00:00", "2016-07-12 16:05:00", - "2016-08-08 00:00:00", "2016-10-05 09:00:00"), + split.length = revisions, split.basis = NULL, split.sliding.window = FALSE, - split.revisions = c("2016-07-12 15:00:00", "2016-07-12 16:00:00", "2016-07-12 16:05:00", - "2016-08-08 00:00:00", "2016-10-05 09:00:00"), - split.revision.dates = NULL + split.revisions = revisions, + split.revisions.dates = get.date.from.string(revisions) ) lapply(results, function(res) { actual = lapply(names(expected.config), res$get.project.conf()$get.value) @@ -1036,16 +1082,16 @@ patrick::with_parameters_test_that("Split a data object time-based using custom "2016-08-08 00:00:00-2016-10-05 09:00:00" = data$commit.messages ), issues = list( - "2016-07-12 15:00:00-2016-07-12 16:00:00" = data$issues[rownames(data$issues) %in% c(20:22, 27, 28, 37:39), ], - "2016-07-12 16:00:00-2016-07-12 16:05:00" = data$issues[rownames(data$issues) %in% c(14, 15, 29, 40, 45:49), ], - "2016-07-12 16:05:00-2016-08-08 00:00:00" = data$issues[rownames(data$issues) %in% c(16:19, 23:24, 41, 42), ], - "2016-08-08 00:00:00-2016-10-05 09:00:00" = data$issues[rownames(data$issues) %in% c(25, 30), ] + "2016-07-12 15:00:00-2016-07-12 16:00:00" = data$issues[rownames(data$issues) %in% c(24:26, 33:34, 44:46), ], + "2016-07-12 16:00:00-2016-07-12 16:05:00" = data$issues[rownames(data$issues) %in% c(18:19, 35, 47, 52:53, 55:57), ], + "2016-07-12 16:05:00-2016-08-08 00:00:00" = data$issues[rownames(data$issues) %in% c(20:23, 27:28, 31:32, 43, 48:49, 54), ], + "2016-08-08 00:00:00-2016-10-05 09:00:00" = data$issues[rownames(data$issues) %in% c(29, 36), ] ), mails = list( ## comments indicate rownames when pasta is not configured - "2016-07-12 15:00:00-2016-07-12 16:00:00" = data$mails[13:14, ], # rownames(data$mails) %in% 14:15 - "2016-07-12 16:00:00-2016-07-12 
16:05:00" = data$mails[15, ], # rownames(data$mails) %in% 16 - "2016-07-12 16:05:00-2016-08-08 00:00:00" = data$mails[16, ], # rownames(data$mails) %in% 17 + "2016-07-12 15:00:00-2016-07-12 16:00:00" = data$mails[13:14, ], # rownames(data$mails) %in% 13:14 + "2016-07-12 16:00:00-2016-07-12 16:05:00" = data$mails[15, ], # rownames(data$mails) %in% 15 + "2016-07-12 16:05:00-2016-08-08 00:00:00" = data$mails[16, ], # rownames(data$mails) %in% 16 "2016-08-08 00:00:00-2016-10-05 09:00:00" = data$mails[0, ] ), pasta = list( @@ -1244,14 +1290,14 @@ patrick::with_parameters_test_that("Split a data object time-based with equal-si info = "Splitting must not modify the original ProjectConf.") ## test that the config contains the correct splitting information - + revisions = c("2016-07-12 15:58:59", "2016-07-12 16:01:30", "2016-07-12 16:04:01", "2016-07-12 16:06:33") expected.config = list( split.type = "time-based", split.length = "2M 31S", split.basis = "commits", split.sliding.window = FALSE, - split.revisions = c("2016-07-12 15:58:59", "2016-07-12 16:01:30", "2016-07-12 16:04:01", "2016-07-12 16:06:33"), - split.revision.dates = NULL + split.revisions = revisions, + split.revisions.dates = get.date.from.string(revisions) ) lapply(results, function(res) { actual = lapply(names(expected.config), res$get.project.conf()$get.value) @@ -1272,14 +1318,14 @@ patrick::with_parameters_test_that("Split a data object time-based with equal-si "2016-07-12 16:04:01-2016-07-12 16:06:33" = data$commit.messages ), issues = list( - "2016-07-12 15:58:59-2016-07-12 16:01:30" = data$issues[rownames(data$issues) %in% c(20:22, 37:40), ], - "2016-07-12 16:01:30-2016-07-12 16:04:01" = data$issues[rownames(data$issues) %in% c(14, 15, 29, 47:49), ], - "2016-07-12 16:04:01-2016-07-12 16:06:33" = data$issues[rownames(data$issues) %in% c(23, 41, 45:46), ] + "2016-07-12 15:58:59-2016-07-12 16:01:30" = data$issues[rownames(data$issues) %in% c(24:26, 44:47), ], + "2016-07-12 16:01:30-2016-07-12 16:04:01" = 
data$issues[rownames(data$issues) %in% c(18:19, 35, 55:57), ], + "2016-07-12 16:04:01-2016-07-12 16:06:33" = data$issues[rownames(data$issues) %in% c(27, 48, 52:53), ] ), mails = list( "2016-07-12 15:58:59-2016-07-12 16:01:30" = data$mails[0, ], "2016-07-12 16:01:30-2016-07-12 16:04:01" = data$mails[0, ], - "2016-07-12 16:04:01-2016-07-12 16:06:33" = data$mails[15:16, ] # when pasta is not configured: rownames(data$mails) %in% 16:17 + "2016-07-12 16:04:01-2016-07-12 16:06:33" = data$mails[15:16, ] # when pasta is not configured: rownames(data$mails) %in% 15:16 ), pasta = list( "2016-07-12 15:58:59-2016-07-12 16:01:30" = data$pasta, @@ -1355,14 +1401,15 @@ patrick::with_parameters_test_that("Split a data object time-based with equal-si info = "Splitting must not modify the original ProjectConf.") ## test that the config contains the correct splitting information + revisions = c("2004-10-09 18:38:13", "2007-09-18 06:00:04", "2010-08-26 17:21:55", "2013-08-04 04:43:46", + "2016-07-12 16:05:38") expected.config = list( split.type = "time-based", split.length = "2y 0m 342d 23H 21M 51S", split.basis = "mails", split.sliding.window = FALSE, - split.revisions = c("2004-10-09 18:38:13", "2007-09-18 06:00:04", "2010-08-26 17:21:55", - "2013-08-04 04:43:46", "2016-07-12 16:05:38"), - split.revision.dates = NULL + split.revisions = revisions, + split.revisions.dates = get.date.from.string(revisions) ) lapply(results, function(res) { actual = lapply(names(expected.config), res$get.project.conf()$get.value) @@ -1388,14 +1435,14 @@ patrick::with_parameters_test_that("Split a data object time-based with equal-si "2004-10-09 18:38:13-2007-09-18 06:00:04" = data$issues[0, ], "2007-09-18 06:00:04-2010-08-26 17:21:55" = data$issues[0, ], "2010-08-26 17:21:55-2013-08-04 04:43:46" = data$issues[rownames(data$issues) %in% 1:13, ], - "2013-08-04 04:43:46-2016-07-12 16:05:38" = data$issues[rownames(data$issues) %in% c(14:15, 20:22, 27:29, 37:40, 43:49), ] + "2013-08-04 04:43:46-2016-07-12 
16:05:38" = data$issues[rownames(data$issues) %in% c(18:19, 24:26, 33:35, 44:47, 50:53, 55:57), ] ), mails = list( ## comments indicate row names when pasta is not configured "2004-10-09 18:38:13-2007-09-18 06:00:04" = data$mails[1:2, ], "2007-09-18 06:00:04-2010-08-26 17:21:55" = data$mails[3:12, ], # rownames(data$mails) %in% 3:12 "2010-08-26 17:21:55-2013-08-04 04:43:46" = data$mails[0, ], - "2013-08-04 04:43:46-2016-07-12 16:05:38" = data$mails[13:16, ] # rownames(data$mails) %in% 13:17 + "2013-08-04 04:43:46-2016-07-12 16:05:38" = data$mails[13:16, ] # rownames(data$mails) %in% 13:16 ), pasta = list( "2004-10-09 18:38:13-2007-09-18 06:00:04" = data$pasta, @@ -1471,13 +1518,14 @@ patrick::with_parameters_test_that("Split a data object time-based with equal-si info = "Splitting must not modify the original ProjectConf.") ## test that the config contains the correct splitting information + revisions = c("2013-04-21 23:52:09", "2014-09-01 12:05:39", "2016-01-12 00:19:09", "2017-05-23 12:32:40") expected.config = list( split.type = "time-based", split.length = "1y 0m 132d 6H 13M 30S", split.basis = "issues", split.sliding.window = FALSE, - split.revisions = c("2013-04-21 23:52:09", "2014-09-01 12:05:39", "2016-01-12 00:19:09", "2017-05-23 12:32:40"), - split.revision.dates = NULL + split.revisions = revisions, + split.revisions.dates = get.date.from.string(revisions) ) lapply(results, function(res) { actual = lapply(names(expected.config), res$get.project.conf()$get.value) @@ -1500,12 +1548,12 @@ patrick::with_parameters_test_that("Split a data object time-based with equal-si issues = list( "2013-04-21 23:52:09-2014-09-01 12:05:39" = data$issues[rownames(data$issues) %in% 1:13, ], "2014-09-01 12:05:39-2016-01-12 00:19:09" = data$issues[0, ], - "2016-01-12 00:19:09-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% 14:49, ] + "2016-01-12 00:19:09-2017-05-23 12:32:40" = data$issues[rownames(data$issues) %in% 14:57, ] ), mails = list( "2013-04-21 
23:52:09-2014-09-01 12:05:39" = data$mails[0, ], "2014-09-01 12:05:39-2016-01-12 00:19:09" = data$mails[0, ], - "2016-01-12 00:19:09-2017-05-23 12:32:40" = data$mails[13:16, ] # when pasta is not configured: rownames(data$mails) %in% 13:17 + "2016-01-12 00:19:09-2017-05-23 12:32:40" = data$mails[13:16, ] # when pasta is not configured: rownames(data$mails) %in% 13:16 ), pasta = list( "2013-04-21 23:52:09-2014-09-01 12:05:39" = data$pasta, diff --git a/tests/test-split-misc.R b/tests/test-split-misc.R index c2a9c7234..7a2e42b66 100644 --- a/tests/test-split-misc.R +++ b/tests/test-split-misc.R @@ -14,6 +14,7 @@ ## Copyright 2017-2019 by Claus Hunsen ## Copyright 2018 by Jakob Kronawitter ## Copyright 2022 by Jonathan Baumann +## Copyright 2023 by Maximilian Löffler ## All Rights Reserved. @@ -42,10 +43,10 @@ if (!dir.exists(CF.DATA)) CF.DATA = file.path(".", "tests", "codeface-data") ## Split raw data (data and networks by bins) ------------------------------ ## -## Tests for split.data.by.bins and split.network.by.bins +## Tests for split.dataframe.by.bins and split.network.by.bins ## -test_that("Split network and data on low level (split.data.by.bins, split.network.by.bins).", { +test_that("Split network and data on low level (split.dataframe.by.bins, split.network.by.bins).", { length.dates = 15 length.bins = 5 @@ -69,7 +70,7 @@ test_that("Split network and data on low level (split.data.by.bins, split.networ ## sprintf("c(\"%s\")", paste( sample(bins, size = length.dates, replace = TRUE), collapse = "', '") ) ## - ## split.data.by.bins + ## split.dataframe.by.bins ## ## generate data frame with dates and IDs @@ -86,7 +87,7 @@ test_that("Split network and data on low level (split.data.by.bins, split.networ "4" = df[ c(4, 11, 13), ], "5" = df[ c(3, 10, 15), ] ) - results = split.data.by.bins(df, bins.vector) + results = split.dataframe.by.bins(df, bins.vector) ## check result expect_equal(results, expected, info = "Split data by bins.") @@ -124,6 +125,50 @@ 
test_that("Split network and data on low level (split.data.by.bins, split.networ }) +## +## Verify that split.data.by.bins does not accept an invalid bins parameter +## + +test_that("Split a data object by activity-based bins with invalid bins parameter.", { + ## configuration objects + proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) + net.conf = NetworkConf$new() + + ## data object + project.data = ProjectData$new(proj.conf) + data = list( + commits = project.data$get.commits.unfiltered(), + commit.messages = project.data$get.commit.messages(), + issues = project.data$get.issues(), + mails = project.data$get.mails(), + pasta = project.data$get.pasta(), + synchronicity = project.data$get.synchronicity() + ) + + ## define invalid bins + invalid.bins.not.a.date = list(bins = c("These", "bins", "are", "invalid"), vector = replicate(24, 1)) + invalid.bins.not.a.number = list(bins = c("2013-04-21 23:52:09", "2017-05-23 12:32:40"), vector = replicate(24, "NaN")) + invalid.bins.contains.NA = list(bins = c("2013-04-21 23:52:09", "2017-05-23 12:32:40", NA), vector = replicate(24, 1)) + invalid.bins.missing.bins = list(vector = replicate(24, 1)) + invalid.bins.missing.vector = list(bins = c("2013-04-21 23:52:09", "2017-05-23 12:32:40")) + invalid.bins.format.of.split.time.based = list("2013-04-21 23:52:09", "2017-05-23 12:32:40") + + invalid.bins = list(invalid.bins.not.a.date, invalid.bins.contains.NA, invalid.bins.missing.bins, + invalid.bins.missing.vector, invalid.bins.format.of.split.time.based) + + ## test that all invalid bins produce an error + for (invalid.bin in invalid.bins) { + expect_error(split.data.by.bins(project.data, + bins = invalid.bin, + split.basis = "issues", + activity.amount = 3000, + sliding.window = FALSE), + regexp = "Stopped due to incorrect parameter types", + info = "Bins need to be a named list with a 'bins' component including characters representing dates" + + " and a 'vector' including numerics.") + } +}) + 
## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / ## Bin identification ------------------------------------------------------ diff --git a/tests/test-split-network-activity-based.R b/tests/test-split-network-activity-based.R index 52d7b8f03..5c9036416 100644 --- a/tests/test-split-network-activity-based.R +++ b/tests/test-split-network-activity-based.R @@ -15,6 +15,7 @@ ## Copyright 2020 by Thomas Bock ## Copyright 2018 by Jakob Kronawitter ## Copyright 2022 by Jonathan Baumann +## Copyright 2024 by Maximilian Löffler ## All Rights Reserved. context("Splitting functionality, activity-based splitting of networks.") @@ -71,6 +72,11 @@ patrick::with_parameters_test_that("Split a network activity-based (number.edges ## check ranges (labels) expect_equal(names(results), names(expected), info = "Time ranges (number.edges (1)).") + ## check bins + expected.bins = get.date.from.string(c("2016-07-12 15:58:59", "2016-07-12 16:05:41", "2016-07-12 16:06:10", + "2016-07-12 16:06:32", "2016-07-12 16:06:33")) + expect_equal(expected.bins, attr(results, "bins")) + ## check networks check.identical = mapply(results, expected, FUN = function(r, e) { igraph::identical_graphs(r, e) @@ -90,6 +96,10 @@ patrick::with_parameters_test_that("Split a network activity-based (number.edges ## check ranges (labels) expect_equal(names(results), names(expected), info = "Time ranges (number.edges (2)).") + ## check bins + expected.bins = get.date.from.string(c("2016-07-12 15:58:59", "2016-07-12 16:06:33")) + expect_equal(expected.bins, attr(results, "bins")) + ## check networks check.identical = mapply(results, expected, FUN = function(r, e) { igraph::identical_graphs(r, e) @@ -111,6 +121,11 @@ patrick::with_parameters_test_that("Split a network activity-based (number.edges ## check ranges (labels) expect_equal(names(results), names(expected), info = "Time ranges (number.windows (1)).") + ## check bins + expected.bins = get.date.from.string(c("2016-07-12 15:58:59", 
"2016-07-12 16:05:41", "2016-07-12 16:06:32", + "2016-07-12 16:06:33")) + expect_equal(expected.bins, attr(results, "bins")) + ## check networks check.identical = mapply(results, expected, FUN = function(r, e) { igraph::identical_graphs(r, e) @@ -162,7 +177,7 @@ patrick::with_parameters_test_that("Split a network activity-based (number.edges "2016-07-12 16:05:41-2016-07-12 16:06:10" = igraph::subgraph.edges(author.net, c(3, 5)), "2016-07-12 16:05:41-2016-07-12 16:06:10" = igraph::subgraph.edges(author.net, c(5, 4)), "2016-07-12 16:06:10-2016-07-12 16:06:32" = igraph::subgraph.edges(author.net, c(4, 7)), - "2016-07-12 16:06:10-2016-07-12 16:06:33" = igraph::subgraph.edges(author.net, c(7, 6)), + "2016-07-12 16:06:10-2016-07-12 16:06:32" = igraph::subgraph.edges(author.net, c(7, 6)), "2016-07-12 16:06:32-2016-07-12 16:06:33" = igraph::subgraph.edges(author.net, c(6, 8)) ) results = split.network.activity.based(author.net, number.edges = 2, sliding.window = TRUE) @@ -170,6 +185,12 @@ patrick::with_parameters_test_that("Split a network activity-based (number.edges ## check ranges (labels) expect_equal(names(results), names(expected), info = "Time ranges (number.edges (1)).") + ## check bins + expected.bins = get.date.from.string(c("2016-07-12 15:58:59", "2016-07-12 16:00:45", "2016-07-12 16:05:41", + "2016-07-12 16:05:41", "2016-07-12 16:06:10", "2016-07-12 16:06:10", + "2016-07-12 16:06:32", "2016-07-12 16:06:32", "2016-07-12 16:06:33")) + expect_equal(expected.bins, attr(results, "bins")) + ## check networks check.identical = mapply(results, expected, FUN = function(r, e) { igraph::identical_graphs(r, e) @@ -190,6 +211,10 @@ patrick::with_parameters_test_that("Split a network activity-based (number.edges ## check ranges (labels) expect_equal(names(results), names(expected), info = "Time ranges (number.edges (2)).") + ## check bins + expected.bins = get.date.from.string(c("2016-07-12 15:58:59", "2016-07-12 16:06:33")) + expect_equal(expected.bins, attr(results, 
"bins")) + ## check networks check.identical = mapply(results, expected, FUN = function(r, e) { igraph::identical_graphs(r, e) @@ -211,6 +236,11 @@ patrick::with_parameters_test_that("Split a network activity-based (number.edges ## check ranges (labels) expect_equal(names(results), names(expected), info = "Time ranges (number.windows (1)).") + ## check bins + expected.bins = get.date.from.string(c("2016-07-12 15:58:59", "2016-07-12 16:05:41", "2016-07-12 16:06:32", + "2016-07-12 16:06:33")) + expect_equal(expected.bins, attr(results, "bins")) + ## check networks check.identical = mapply(results, expected, FUN = function(r, e) { igraph::identical_graphs(r, e) @@ -268,6 +298,13 @@ patrick::with_parameters_test_that("Split a network activity-based (number.edges ## check ranges (labels) expect_equal(names(results), names(expected), info = "Time ranges (number.edges (1)).") + ## check bins + expected.bins = get.date.from.string(c("2016-07-12 15:58:59", "2016-07-12 16:00:45", "2016-07-12 16:05:41", + "2016-07-12 16:05:41", "2016-07-12 16:06:10", "2016-07-12 16:06:10", + "2016-07-12 16:06:32", "2016-07-12 16:06:32", "2020-02-20 20:20:20", + "2020-02-20 20:20:21")) + expect_equal(expected.bins, attr(results, "bins")) + ## check networks check.identical = mapply(results, expected, FUN = function(r, e) { igraph::identical_graphs(r, e) diff --git a/tests/test-split-network-time-based.R b/tests/test-split-network-time-based.R index bdcd21d35..c878d4cbe 100644 --- a/tests/test-split-network-time-based.R +++ b/tests/test-split-network-time-based.R @@ -16,6 +16,7 @@ ## Copyright 2020 by Thomas Bock ## Copyright 2018 by Jakob Kronawitter ## Copyright 2022 by Jonathan Baumann +## Copyright 2024 by Maximilian Löffler ## All Rights Reserved. context("Splitting functionality, time-based splitting of networks.") @@ -76,6 +77,12 @@ patrick::with_parameters_test_that("Split a network time-based (time.period = .. 
## check ranges (labels) expect_equal(names(results), names(expected), info = "Time ranges.") + ## check bins + expected.bins = get.date.from.string(c("2016-07-12 15:58:59", "2016-07-12 16:00:59", + "2016-07-12 16:02:59", "2016-07-12 16:04:59", + "2016-07-12 16:06:33")) + expect_equal(expected.bins, attr(results, "bins")) + ## check networks check.identical = mapply(results, expected, FUN = function(r, e) { igraph::identical_graphs(r, e) @@ -195,6 +202,14 @@ patrick::with_parameters_test_that("Split a network time-based (time.period = .. ## check ranges (labels) expect_equal(names(results), names(expected), info = "Time ranges.") + ## check bins + expected.bins = get.date.from.string(c("2016-07-12 15:58:59", "2016-07-12 15:59:59", + "2016-07-12 16:00:59", "2016-07-12 16:01:59", + "2016-07-12 16:02:59", "2016-07-12 16:03:59", + "2016-07-12 16:04:59", "2016-07-12 16:05:59", + "2016-07-12 16:06:33")) + expect_equal(expected.bins, attr(results, "bins")) + ## check networks check.identical = mapply(results, expected, FUN = function(r, e) { igraph::identical_graphs(r, e) @@ -259,6 +274,12 @@ patrick::with_parameters_test_that("Split a network time-based (bins = ...), ", ## check ranges (labels) expect_equal(names(results), names(expected), info = "Time ranges.") + ## check bins + expected.bins = get.date.from.string(c("2016-07-12 15:58:00", "2016-07-12 16:00:59", + "2016-07-12 16:02:59", "2016-07-12 16:04:59", + "2016-07-12 17:21:43")) + expect_equal(expected.bins, attr(results, "bins")) + ## check networks check.identical = mapply(results, expected, FUN = function(r, e) { igraph::identical_graphs(r, e) @@ -366,6 +387,12 @@ patrick::with_parameters_test_that("Split a network time-based with equal-sized ## check ranges (labels) expect_equal(names(results), names(expected), info = "Time ranges.") + ## check bins + expected.bins = get.date.from.string(c("2016-07-12 15:58:59", "2016-07-12 16:00:53", + "2016-07-12 16:02:47", "2016-07-12 16:04:41", + "2016-07-12 16:06:33")) + 
expect_equal(expected.bins, attr(results, "bins")) + ## check networks check.identical = mapply(results, expected, FUN = function(r, e) { igraph::identical_graphs(r, e) diff --git a/tests/testing-utils.R b/tests/testing-utils.R index 71ac36f79..567cdd784 100644 --- a/tests/testing-utils.R +++ b/tests/testing-utils.R @@ -12,6 +12,7 @@ ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. ## ## Copyright 2022 by Jonathan Baumann +## Copyright 2024 by Leo Sendelbach ## All Rights Reserved. ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / @@ -117,11 +118,11 @@ remove.row.names.from.inner.list.of.dfs = function(list.of.lists.of.dfs) { return(lapply(list.of.lists.of.dfs, remove.row.names.from.data)) } -#' Compare edges and vertices of two networks +#' Assert that two networks are equal. Used for testing purposes. #' #' @param network.expected the expected network #' @param network.actual the actual network -compare.networks = function(network.expected, network.actual) { +assert.networks.equal = function(network.expected, network.actual) { ## TODO as soon as the bug in igraph is fixed switch to the expect_true function below # expect_true(igraph::identical_graphs(network.expected, network.actual)) expected.edges = igraph::as_data_frame(network.expected, what = "edges") @@ -133,3 +134,20 @@ compare.networks = function(network.expected, network.actual) { expect_identical(expected.edges, actual.edges, info = "network edges") expect_identical(expected.vertices, actual.vertices, info = "network vertices") } + +#' Assert that two sparse matrices are equal. Used for testing purposes. 
+#' +#' @param matrix.expected the expected matrix +#' @param matrix.actual the actual matrix +assert.sparse.matrices.equal = function(matrix.expected, matrix.actual) { + # check if colnames and rownames are equal + expect_equal(colnames(matrix.expected), colnames(matrix.actual)) + expect_equal(rownames(matrix.expected), rownames(matrix.actual)) + # check if matrices have the same size + expected.size = length(matrix.expected) + expect_equal(expected.size, length(matrix.actual)) + # check if contents are the same + for (i in seq_len(expected.size)) { + expect_equal(matrix.expected[i], matrix.actual[i]) + } +} \ No newline at end of file diff --git a/util-conf.R b/util-conf.R index f05c2b924..0031771a4 100644 --- a/util-conf.R +++ b/util-conf.R @@ -18,7 +18,7 @@ ## Copyright 2020-2021 by Christian Hechtl ## Copyright 2017 by Felix Prasse ## Copyright 2017-2019 by Thomas Bock -## Copyright 2021, 2023 by Thomas Bock +## Copyright 2021, 2023-2024 by Thomas Bock ## Copyright 2018 by Barbara Eckl ## Copyright 2018-2019 by Jakob Kronawitter ## Copyright 2019 by Anselm Fehnker @@ -863,6 +863,12 @@ NetworkConf = R6::R6Class("NetworkConf", inherit = Conf, allowed = c(TRUE, FALSE), allowed.number = 1 ), + simplify.multiple.relations = list( + default = FALSE, + type = "logical", + allowed = c(TRUE, FALSE), + allowed.number = 1 + ), skip.threshold = list( default = Inf, type = "numeric", diff --git a/util-data-misc.R b/util-data-misc.R index 6d0d1803c..61d5bb840 100644 --- a/util-data-misc.R +++ b/util-data-misc.R @@ -20,6 +20,7 @@ ## Copyright 2021 by Johannes Hostert ## Copyright 2021 by Christian Hechtl ## Copyright 2022 by Jonathan Baumann +## Copyright 2024 by Thomas Bock ## All Rights Reserved. 
## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / @@ -630,7 +631,7 @@ get.issue.comment.count = function(proj.data, type = c("all", "issues", "pull.re issue.id.to.events = get.key.to.value.from.df(df, "issue.id", "event.name") issue.id.to.comment.count = lapply(issue.id.to.events, function(df) { event.names = df[["data.vertices"]] - return (length(event.names[event.names == "commented"])) + return(length(event.names[event.names == "commented"])) }) logging::logdebug("get.issue.comment.count: finished") return(issue.id.to.comment.count) @@ -745,9 +746,9 @@ get.pr.open.merged.or.closed = function(proj.data, use.unfiltered.data = TRUE) { retained.cols = c("issue.id", "issue.state", "event.name")) issue.id.to.events = get.key.to.value.from.df(df, "issue.id", "event.name") issue.id.to.state = lapply(issue.id.to.events, function(df) { - return (if ("open" %in% df[["issue.state"]] || "reopened" %in% df[["issue.state"]]) "open" - else if ("merged" %in% df[["event.name"]]) "merged" - else "closed") + return(if ("open" %in% df[["issue.state"]] || "reopened" %in% df[["issue.state"]]) "open" + else if ("merged" %in% df[["event.name"]]) "merged" + else "closed") }) logging::logdebug("get.pr.open.merged.or.closed: finished") return(issue.id.to.state) diff --git a/util-data.R b/util-data.R index 80470d750..e8c9ee4d1 100644 --- a/util-data.R +++ b/util-data.R @@ -13,7 +13,7 @@ ## ## Copyright 2016-2019 by Claus Hunsen ## Copyright 2017-2019 by Thomas Bock -## Copyright 2020-2021, 2023 by Thomas Bock +## Copyright 2020-2021, 2023-2024 by Thomas Bock ## Copyright 2017 by Raphael Nömmer ## Copyright 2017-2018 by Christian Hechtl ## Copyright 2020 by Christian Hechtl @@ -90,7 +90,7 @@ DATASOURCE.TO.ADDITIONAL.ARTIFACT.FUNCTION = list( #' @return \code{lst}, with the keys changed rename.list.keys = function(lst, map.function) { names(lst) = lapply(names(lst), map.function) - return (lst) + return(lst) } ## Combine \code{DATASOURCE.TO.ARTIFACT.FUNCTION}, 
\code{DATASOURCE.TO.UNFILTERED.ARTIFACT.FUNCTION} @@ -288,8 +288,8 @@ ProjectData = R6::R6Class("ProjectData", } ## return the mails of the thread with all patchstack mails but the first one being removed - return (list(keep = thread[setdiff(seq_len(nrow(thread)), seq_len(i)[-1]), ], - patchstack = thread[seq_len(i), ])) + return(list(keep = thread[setdiff(seq_len(nrow(thread)), seq_len(i)[-1]), ], + patchstack = thread[seq_len(i), ])) }) ## override thread data with filtered thread data @@ -579,13 +579,15 @@ ProjectData = R6::R6Class("ProjectData", if (private$project.conf$get.value("pasta")) { ## merge PaStA data private$mails.unfiltered = merge(private$mails.unfiltered, private$pasta.mails, - by = "message.id", all.x = TRUE, sort = FALSE) + by = "message.id", all.x = TRUE, sort = FALSE) ## sort by date again because 'merge' disturbs the order - private$mails.unfiltered = private$mails.unfiltered[order(private$mails.unfiltered[["date"]], decreasing = FALSE), ] + private$mails.unfiltered = private$mails.unfiltered[order(private$mails.unfiltered[["date"]], + decreasing = FALSE), ] ## remove duplicated revision set ids - private$mails.unfiltered[["revision.set.id"]] = lapply(private$mails.unfiltered[["revision.set.id"]], function(rev.id) { + private$mails.unfiltered[["revision.set.id"]] = lapply(private$mails.unfiltered[["revision.set.id"]], + function(rev.id) { return(unique(rev.id)) }) } @@ -669,10 +671,11 @@ ProjectData = R6::R6Class("ProjectData", if (private$project.conf$get.value("synchronicity")) { ## merge synchronicity data private$commits.unfiltered = merge(private$commits.unfiltered, private$synchronicity, - by = "hash", all.x = TRUE, sort = FALSE) + by = "hash", all.x = TRUE, sort = FALSE) ## sort by date again because 'merge' disturbs the order - private$commits.unfiltered = private$commits.unfiltered[order(private$commits.unfiltered[["date"]], decreasing = FALSE), ] + private$commits.unfiltered = 
private$commits.unfiltered[order(private$commits.unfiltered[["date"]], + decreasing = FALSE), ] } ## remove previous synchronicity data private$commits["synchronicity"] = NULL @@ -685,16 +688,15 @@ ProjectData = R6::R6Class("ProjectData", by = "hash", all.x = TRUE, sort = FALSE) ## sort by date again because 'merge' disturbs the order - private$commits = private$commits[order(private$commits[["date"]], - decreasing = FALSE), ] + private$commits = private$commits[order(private$commits[["date"]], decreasing = FALSE), ] } ## get the caller function as a string stacktrace = get.stacktrace(sys.calls()) caller = get.second.last.element(stacktrace) - ## only print warning if this function has not been called by 'cleanup.synchronicity.data' including the case - ## that it is called manually, i.e. the stack is too short. + ## only print warning if this function has not been called by 'cleanup.synchronicity.data' including the + ## case that it is called manually, i.e. the stack is too short. if (all(is.na(caller)) || paste(caller, collapse = " ") != "cleanup.synchronicity.data()") { logging::logwarn("There might be synchronicity data that does not appear in the commit data. 
To clean this up you can call the function 'cleanup.synchronicity.data()'.") @@ -894,7 +896,7 @@ ProjectData = R6::R6Class("ProjectData", params.keep.environment = params %in% CONF.PARAMETERS.NO.RESET.ENVIRONMENT ## only reset if at least one of them should cause a reset - if(!all(params.keep.environment)) { + if (!all(params.keep.environment)) { self$reset.environment() } else { ## if the 'commit.messages' parameter has been changed, update the commit message data, since we want to @@ -1017,7 +1019,7 @@ ProjectData = R6::R6Class("ProjectData", #' @seealso get.commits get.commits.uncached = function(remove.untracked.files, remove.base.artifact, filter.bots = FALSE) { logging::loginfo("Getting commit data (uncached).") - return (private$filter.commits(self$get.commits.unfiltered(), remove.untracked.files, remove.base.artifact, filter.bots)) + return(private$filter.commits(self$get.commits.unfiltered(), remove.untracked.files, remove.base.artifact, filter.bots)) }, #' Get the list of commits which have the artifact kind configured in the \code{project.conf}. 
@@ -1513,7 +1515,7 @@ ProjectData = R6::R6Class("ProjectData", if (!self$is.data.source.cached("authors")) { ## read author data - author.data = read.authors(self$get.data.path()); + author.data = read.authors(self$get.data.path()) ## set author data and add gender data (if configured in the 'project.conf') self$set.authors(author.data) @@ -1567,7 +1569,7 @@ ProjectData = R6::R6Class("ProjectData", authors[["author.email"]]), "is.bot"] ## retain if entry is FALSE or NA bot.indices = !bot.indices | is.na(bot.indices) - return (data.to.filter[bot.indices,]) + return(data.to.filter[bot.indices,]) }, #' Get the issue data, filtered according to options in the project configuration: @@ -2137,7 +2139,7 @@ ProjectData = R6::R6Class("ProjectData", data = lapply(data.sources, function(data.source){ data.source.func = DATASOURCE.TO.ARTIFACT.FUNCTION[[data.source]] data.source.authors = self[[data.source.func]]()[c("author.name", "author.email")] - return (data.source.authors) + return(data.source.authors) }) data = plyr::rbind.fill(data) @@ -2145,7 +2147,7 @@ ProjectData = R6::R6Class("ProjectData", ## remove duplicates data = unique(data) - return (data) + return(data) }, #' Get the list of custom event timestamps, @@ -2158,14 +2160,14 @@ ProjectData = R6::R6Class("ProjectData", && !private$project.conf$get.value("custom.event.timestamps.locked")) { file.name = self$get.project.conf.entry("custom.event.timestamps.file") - if(is.na(file.name)) { + if (is.na(file.name)) { logging::logwarn("get.custom.event.timestamps: No file configured") - return (list()) + return(list()) } timestamps = read.custom.event.timestamps(self$get.data.path(), file.name) self$set.custom.event.timestamps(timestamps) } - return (private$custom.event.timestamps) + return(private$custom.event.timestamps) }, #' Set the list of custom event timestamps. 
@@ -2178,7 +2180,7 @@ ProjectData = R6::R6Class("ProjectData", logging::logerror(error.message) stop(error.message) } - if(length(custom.event.timestamps) != 0){ + if (length(custom.event.timestamps) != 0){ private$custom.event.timestamps = custom.event.timestamps[ order(unlist(get.date.from.string(custom.event.timestamps))) ] @@ -2305,7 +2307,7 @@ RangeData = R6::R6Class("RangeData", inherit = ProjectData, #' or of type character if input was a commit hash or version; #' or NULL if the string could not be parsed get.range.bounds = function() { - return (get.range.bounds(private$range)) + return(get.range.bounds(private$range)) }, #' Get the 'revision.callgraph' of the current instance diff --git a/util-misc.R b/util-misc.R index 152f13ca2..4722ccb2e 100644 --- a/util-misc.R +++ b/util-misc.R @@ -16,11 +16,11 @@ ## Copyright 2017 by Christian Hechtl ## Copyright 2017 by Felix Prasse ## Copyright 2017-2018 by Thomas Bock -## Copyright 2020-2021, 2023 by Thomas Bock +## Copyright 2020-2021, 2023-2024 by Thomas Bock ## Copyright 2018-2019 by Jakob Kronawitter ## Copyright 2021 by Niklas Schneider ## Copyright 2022 by Jonathan Baumann -## Copyright 2022-2023 by Maximilian Löffler +## Copyright 2022-2024 by Maximilian Löffler ## All Rights Reserved. @@ -977,13 +977,23 @@ get.range.bounds = function(range) { start.end = regmatches(range, gregexpr(pattern = pattern[[1]], range))[[1]] if (length(start.end) == 2) { - return (pattern[[2]](start.end)) + return(pattern[[2]](start.end)) } } - return (range) + return(range) } +#' Obtain the start and end dates from given ranges. +#' +#' @param ranges the ranges to get the dates from +#' +#' @return a vector that contains the start and end dates of all given ranges +#' sorted and disambiguated +get.bin.dates.from.ranges = function(ranges) { + dates = sort(unique(get.date.from.unix.timestamp(unlist(ranges)))) + return(dates) +} #' Get the data from a data frame in a specific range. 
#' @@ -998,7 +1008,7 @@ get.data.from.range = function(range, data) { ## split data by this bin; this gives a list of three data frames, "0" contains the data before the range, "1" the ## data within the range and "2" the holds the data after the range - split.data = split.data.by.bins(data, df.bins) + split.data = split.dataframe.by.bins(data, df.bins) ## look for the element with name "1", as we are interested in the data within the range ## if there is no data, return an empty data frame corresponding to the data we want to cut @@ -1011,3 +1021,4 @@ get.data.from.range = function(range, data) { return(data.between) } } + diff --git a/util-networks-covariates.R b/util-networks-covariates.R index 9d560fed0..5b68cbffe 100644 --- a/util-networks-covariates.R +++ b/util-networks-covariates.R @@ -14,7 +14,7 @@ ## Copyright 2017 by Felix Prasse ## Copyright 2018-2019 by Claus Hunsen ## Copyright 2018-2019 by Thomas Bock -## Copyright 2021, 2023 by Thomas Bock +## Copyright 2021, 2023-2024 by Thomas Bock ## Copyright 2018-2019 by Klara Schlüter ## Copyright 2018 by Jakob Kronawitter ## Copyright 2020 by Christian Hechtl @@ -136,7 +136,7 @@ add.vertex.attribute = function(net.to.range.list, attr.name, default.value, com } ) - return (nets.with.attr) + return(nets.with.attr) } diff --git a/util-networks-metrics.R b/util-networks-metrics.R index faa7c4f69..dcdbbcf17 100644 --- a/util-networks-metrics.R +++ b/util-networks-metrics.R @@ -12,7 +12,7 @@ ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. ## ## Copyright 2015, 2019 by Thomas Bock -## Copyright 2021, 2023 by Thomas Bock +## Copyright 2021, 2023-2024 by Thomas Bock ## Copyright 2017 by Raphael Nömmer ## Copyright 2017-2019 by Claus Hunsen ## Copyright 2017-2018 by Christian Hechtl @@ -167,7 +167,7 @@ metrics.smallworldness = function(network) { if (!is.simple(network)) { ## if this is not the case, raise an error and stop the execution error.message = "The input network has too many edges. 
Try again with a simplified network." - logging::error(error.message) + logging::logerror(error.message) stop(error.message) } @@ -193,7 +193,7 @@ metrics.smallworldness = function(network) { ## indicator s.delta s.delta = gamma / lambda - return (c(smallworldness = s.delta)) + return(c(smallworldness = s.delta)) } #' Decide, whether a network is smallworld or not. @@ -217,8 +217,19 @@ metrics.is.smallworld = function(network) { #' @param minimum.number.vertices the minimum number of vertices with which #' a network can be scale free [default: 30] #' -#' @return A dataframe containing the different values, connected to scale-freeness. +#' @return If the network is empty (i.e., has no vertices), \code{NA}. +#' Otherwise, a dataframe containing the different values, connected to scale-freeness. metrics.scale.freeness = function(network, minimum.number.vertices = 30) { + + ## check whether the network is empty, i.e., if it has no vertices + if (igraph::vcount(network) == 0) { + ## print user warning instead of igraph error + logging::logwarn("The input network has no vertices. 
Will return NA right away.") + + ## cancel the execution and return NA + return(NA) + } + v.degree = sort(igraph::degree(network, mode = "total"), decreasing = TRUE) ## Power-law fiting @@ -235,7 +246,7 @@ metrics.scale.freeness = function(network, minimum.number.vertices = 30) { ## If less than minimum.number.vertices vertices are in the power law, set x_min manually ## to include a minimum of number of vertices and recompute the powerlaw fit non.zero.degree.v.count = length(v.degree[v.degree > 0]) - if(res[["num.power.law"]] < minimum.number.vertices + if (res[["num.power.law"]] < minimum.number.vertices & non.zero.degree.v.count >= minimum.number.vertices) { ## vertex degree is sorted above x.min = v.degree[[minimum.number.vertices]] @@ -248,7 +259,7 @@ metrics.scale.freeness = function(network, minimum.number.vertices = 30) { } ## Remove non conclusive sample sizes - if(res[["num.power.law"]] < minimum.number.vertices) { + if (res[["num.power.law"]] < minimum.number.vertices) { res[["KS.p"]] = 0 # 0 instead of NA } @@ -263,10 +274,15 @@ metrics.scale.freeness = function(network, minimum.number.vertices = 30) { #' a network can be scale free [default: 30] #' #' @return \code{TRUE}, if the network is scale free, -#' \code{FALSE}, otherwise. +#' \code{FALSE}, if it is not scale free, +#' \code{NA}, if the network is empty (i.e., has no vertices). 
metrics.is.scale.free = function(network, minimum.number.vertices = 30) { df = metrics.scale.freeness(network, minimum.number.vertices) - return(df[["KS.p"]] >= 0.05) + if (is.single.na(df)) { + return(NA) + } else { + return(df[["KS.p"]] >= 0.05) + } } #' Calculate the hierarchy values for a network, i.e., the vertex degrees and the local diff --git a/util-networks-misc.R b/util-networks-misc.R index c2ebc509f..a183f6039 100644 --- a/util-networks-misc.R +++ b/util-networks-misc.R @@ -14,10 +14,11 @@ ## Copyright 2016-2017 by Sofie Kemper ## Copyright 2016-2017 by Claus Hunsen ## Copyright 2016-2018 by Thomas Bock -## Copyright 2020, 2023 by Thomas Bock +## Copyright 2020, 2023-2024 by Thomas Bock ## Copyright 2017 by Angelika Schmid ## Copyright 2019 by Jakob Kronawitter ## Copyright 2019-2020 by Anselm Fehnker +## Copyright 2024 by Leo Sendelbach ## All Rights Reserved. @@ -36,12 +37,13 @@ requireNamespace("Matrix") # for sparse matrices #' @param networks the list of networks from which the author names are wanted #' @param globally decides if all author names are in one list or in separate lists for each network [default: TRUE] #' -#' @return the list of author names +#' @return the list of author names as a list of sorted vectors get.author.names.from.networks = function(networks, globally = TRUE) { ## for each network, get a list of authors that are in this network active.authors.list = lapply(networks, function(network) { active.authors = igraph::V(network)$name + active.authors = sort(active.authors) return(active.authors) }) @@ -55,7 +57,8 @@ get.author.names.from.networks = function(networks, globally = TRUE) { ## remove duplicates and order alphabetically ascending active.authors = active.authors[!duplicated(active.authors)] active.authors = sort(active.authors) - return(active.authors) + ## return as a list + return(list(active.authors)) } else { return(active.authors.list) } @@ -69,7 +72,7 @@ get.author.names.from.networks = function(networks, 
globally = TRUE) { #' or any combination of them [default: c("commits", "mails", "issues")] #' @param globally decides if all author names are in one list or in separate for each network [default: TRUE] #' -#' @return the list of author names +#' @return the list of author names as a list of sorted vectors get.author.names.from.data = function(data.ranges, data.sources = c("commits", "mails", "issues"), globally = TRUE) { data.sources = match.arg.or.default(data.sources, several.ok = TRUE) @@ -79,7 +82,7 @@ get.author.names.from.data = function(data.ranges, data.sources = c("commits", " active.authors = range.data$get.authors.by.data.source(data.sources) - active.authors.names = active.authors[["author.name"]] + active.authors.names = sort(active.authors[["author.name"]]) return(active.authors.names) @@ -95,7 +98,8 @@ get.author.names.from.data = function(data.ranges, data.sources = c("commits", " ## remove duplicates and order alphabetically ascending active.authors = active.authors[!duplicated(active.authors)] active.authors = sort(active.authors) - return(active.authors) + ## return as a list + return(list(active.authors)) } else { return(active.authors.list) } @@ -132,13 +136,22 @@ get.expanded.adjacency = function(network, authors, weighted = FALSE) { ## get the weighted adjacency matrix for the current network matrix.data = igraph::get.adjacency(network, attr = "weight") } else { - ## get the unweighted adjacency matrix for the current network + ## get the unweighted sparse adjacency matrix for the current network matrix.data = igraph::get.adjacency(network) } - ## order the adjacency matrix + network.authors.num = nrow(matrix.data) + ## order the adjacency matrix and filter out authors that were not in authors list if (nrow(matrix.data) > 1) { # for a 1x1 matrix ordering does not work - matrix.data = matrix.data[order(rownames(matrix.data)), order(colnames(matrix.data))] + matrix.data = matrix.data[order((rownames(matrix.data)[rownames(matrix.data) %in% 
authors])), + order((rownames(matrix.data)[rownames(matrix.data) %in% authors]))] + } + + if (network.authors.num > nrow(matrix.data)) { + # write a warning with the number of authors from the network that we ignore + warning.string = sprintf("The network had %d authors that will not be displayed in the matrix!", + network.authors.num - nrow(matrix.data)) + warning(warning.string) } ## save the activity data per author @@ -156,7 +169,7 @@ get.expanded.adjacency = function(network, authors, weighted = FALSE) { } #' Calculates a sparse adjacency matrix for each network in the list. -#' All adjacency matrices are expanded in such a way that the use the same set +#' All adjacency matrices are expanded in such a way that they use the same set #' of authors derived from all networks in the list. #' #' @param networks list of networks @@ -165,7 +178,7 @@ get.expanded.adjacency = function(network, authors, weighted = FALSE) { #' @return the list of adjacency matrices get.expanded.adjacency.matrices = function(networks, weighted = FALSE){ - authors = get.author.names.from.networks(networks) + authors = get.author.names.from.networks(networks)[[1]] adjacency.matrices = parallel::mclapply(networks, get.expanded.adjacency, authors, weighted) @@ -199,11 +212,15 @@ get.expanded.adjacency.cumulated = function(networks, weighted = FALSE) { ## search for a non-zero entry and set them to an arbitray number (e.g., 42) ## to force that all non-zero entries are correctly set to 1 afterwards - not.zero.idxs = which(matrices.cumulated[[m]] >= 1, arr.ind = TRUE) - if (nrow(not.zero.idxs) > 0) { - first.not.zero.idx = not.zero.idxs[1, ] - names(first.not.zero.idx) = c("row", "col") - matrices.cumulated[[m]][first.not.zero.idx[["row"]], first.not.zero.idx[["col"]]] = 42 + if (length(matrices.cumulated[[m]]@i) > 0) { + + ## the first non-zero entry of a sparse matrix is at the first position pointed to by + ## the lists @i and @j of the matrix. 
Since these lists store the position 0-based, + ## but the list access we use them for is 1-based, we need to add 1 to both values. + row = matrices.cumulated[[m]]@i[[1]] + 1 + col = matrices.cumulated[[m]]@j[[1]] + 1 + + matrices.cumulated[[m]][row][col] = 42 matrices.cumulated[[m]]@x = rep(1, length(matrices.cumulated[[m]]@i)) } } @@ -214,16 +231,36 @@ get.expanded.adjacency.cumulated = function(networks, weighted = FALSE) { } #' Converts a list of adjacency matrices to an array. +#' Expects matrices of equal dimension with equal column- and rownames. #' #' @param adjacency.list the list of adjacency matrices #' #' @return the converted array convert.adjacency.matrix.list.to.array = function(adjacency.list){ + if (length(adjacency.list) < 1) { + logging::logerror("The method 'convert.adjacency.matrix.list.to.array received' an empty list!") + stop("The method 'convert.adjacency.matrix.list.to.array' received an empty list!") + } + ## Check if all matrices have equal column- and rownames + rownames = rownames(adjacency.list[[1]]) + colnames = colnames(adjacency.list[[1]]) + + if (length(adjacency.list) > 1) { + for (i in 2:length(adjacency.list)) { + + if (!identical(rownames, rownames(adjacency.list[[i]])) || !identical(colnames, colnames(adjacency.list[[i]]))) { + error.string = sprintf("The matrix at position %d has different col or rownames from the first!", i) + logging::logerror(error.string) + stop(error.string) + } + } + } + ## create a 3-dimensional array representing the adjacency matrices (SIENA data format) as result array = array(data = 0, dim = c(nrow(adjacency.list[[1]]), nrow(adjacency.list[[1]]), length(adjacency.list))) - rownames(array) = rownames(adjacency.list[[1]]) - colnames(array) = colnames(adjacency.list[[1]]) + rownames(array) = rownames + colnames(array) = colnames ## copy the activity values from the adjacency matrices in the list to the corresponding array slices for (i in seq_along(adjacency.list)) { diff --git a/util-networks.R 
b/util-networks.R index 9f205bba5..b02eab694 100644 --- a/util-networks.R +++ b/util-networks.R @@ -15,13 +15,13 @@ ## Copyright 2017 by Raphael Nömmer ## Copyright 2017-2018 by Christian Hechtl ## Copyright 2017-2019 by Thomas Bock -## Copyright 2021, 2023 by Thomas Bock +## Copyright 2021, 2023-2024 by Thomas Bock ## Copyright 2018 by Barbara Eckl ## Copyright 2018-2019 by Jakob Kronawitter ## Copyright 2020 by Anselm Fehnker ## Copyright 2021 by Niklas Schneider ## Copyright 2022 by Jonathan Baumann -## Copyright 2023 by Maximilian Löffler +## Copyright 2023-2024 by Maximilian Löffler ## All Rights Reserved. @@ -56,7 +56,7 @@ EDGE.ATTR.HANDLING = list( ## network-analytic data weight = "sum", type = "first", - relation = "first", + relation = function(relation) sort(unique(relation)), ## commit data changed.files = "sum", @@ -475,17 +475,117 @@ NetworkBuilder = R6::R6Class("NetworkBuilder", return(private$artifacts.network.issue) } - ## log warning as we do not have relations among issues right now - logging::logwarn(paste( - "There exist no actual artifact network with the relation 'issue'.", - "Return an edge-less network now." - )) + if (private$proj.data$get.project.conf()$get.entry("issues.only.comments")) { + logging::logwarn(paste( + "Create an edge-less artifact network as 'issues.only.comments' is set.", + "Comments in issues cannot create issue edges." 
+ )) + } - ## construct edgeless network with mandatory edge and vertex attributes - directed = private$network.conf$get.value("artifact.directed") - artifacts = private$proj.data$get.artifacts("issues") # issue IDs - artifacts.net = create.empty.network(directed = directed, add.attributes = TRUE) + - igraph::vertices(artifacts) + ## construct edge list based on issue-artifact data + artifacts.net.data.raw = private$proj.data[[DATASOURCE.TO.ARTIFACT.FUNCTION[["issues"]]]]() + + ## obtain issue-connecting events + add.links = artifacts.net.data.raw[artifacts.net.data.raw$event.name == "add_link" & + artifacts.net.data.raw$event.info.2 == "issue", ] + referenced.bys = artifacts.net.data.raw[artifacts.net.data.raw$event.name == "referenced_by" & + artifacts.net.data.raw$event.info.2 == "issue", ] + + ## the codeface extraction for jira issues creates duplicate events, linking the referenced issue + ## to the referencing issue, in addition to the correct events, linking the referencing issue to + ## the referenced issue. We can only deduplicate them, if we build an undirected network, as otherwise, + ## we would need to guess the correct direction. 
+ if (!private$network.conf$get.entry("artifact.directed")) { + + ## obtain 'add_link' events from jira + jira.add.links = add.links[add.links$issue.source == "jira", ] + matched = list() + + ## iterate over all add_link events from jira + for (i in 1:nrow(jira.add.links)) { + + add.link = jira.add.links[i, ] + + ## ensure not to remove both duplicate edges + if (any(sapply(matched, function(entry) identical(entry, add.link)))) { + next + } + + ## match any 'add_link' events, that are the reverse direction of 'add.link', + ## but have the same timestamp and author information + match = jira.add.links[( + jira.add.links$issue.id == add.link$event.info.1 & + jira.add.links$event.info.1 == add.link$issue.id & + jira.add.links$date == add.link$date & + jira.add.links$author.name == add.link$author.name), ] + + ## if a match is found, remove 'add.link' and its corresponding 'referenced_by' event + if (nrow(match) > 0) { + add.links = add.links[!( + add.links$issue.id == match$issue.id & + add.links$event.info.1 == match$event.info.1 & + add.links$date == match$date & + add.links$author.name == match$author.name), ] + referenced.bys = referenced.bys[!( + referenced.bys$issue.id == add.link$issue.id & + referenced.bys$event.info.1 == add.link$event.info.1 & + referenced.bys$date == add.link$date & + referenced.bys$author.name == add.link$author.name), ] + matched = append(matched, list(match)) + } + } + } + + + if (nrow(add.links) != nrow(referenced.bys)) { + logging::logwarn("Inconsistent issue data. 
Unequally many 'add_link' and 'referenced_by' issue-events.") + } + + vertices = unique(artifacts.net.data.raw["issue.id"]) + edge.list = data.frame() + + # edges in artifact networks can not have the 'artifact' attribute but should instead have + # the 'author.name' attribute as events caused by authors connect issues + edge.attributes = private$network.conf$get.value("edge.attributes") + artifact.index = match("artifact", edge.attributes, nomatch = NA) + if (!is.na(artifact.index)) { + edge.attributes = edge.attributes[-artifact.index] + edge.attributes = c(edge.attributes, c("author.name")) + } + + ## connect corresponding add_link and referenced_by issue-events + edge.list = plyr::rbind.fill(parallel::mclapply(split(add.links, seq_along(add.links)), function(from) { + ## get edge attributes + cols.which = edge.attributes %in% colnames(from) + edge.attrs = from[, edge.attributes[cols.which], drop = FALSE] + + ## construct edge + to = subset(referenced.bys, + event.info.1 == from[["issue.id"]] & + author.name == from[["author.name"]] & + date == from[["date"]]) + if (!all(is.na(to))) { + combination = list("from" = from[["issue.id"]], "to" = to[["issue.id"]]) + combination = cbind(combination, edge.attrs, row.names = NULL) # add edge attributes + return(combination) # return the combination for this row + } + })) + + artifacts.net.data = list( + vertices = data.frame( + name = vertices + ), + edges = edge.list + ) + + ## construct network from obtained data + artifacts.net = construct.network.from.edge.list( + artifacts.net.data[["vertices"]], + artifacts.net.data[["edges"]], + network.conf = private$network.conf, + directed = private$network.conf$get.value("artifact.directed"), + available.edge.attributes = private$proj.data$get.data.columns.for.data.source("issues") + ) ## store network private$artifacts.network.issue = artifacts.net @@ -519,7 +619,7 @@ NetworkBuilder = R6::R6Class("NetworkBuilder", attr(bip.relation, "vertex.kind") = 
private$get.vertex.kind.for.relation(relation) attr(bip.relation, "relation") = relation - return (bip.relation) + return(bip.relation) }) names(bip.relations) = relations @@ -681,6 +781,12 @@ NetworkBuilder = R6::R6Class("NetworkBuilder", igraph::V(net)$kind = TYPE.AUTHOR igraph::V(net)$type = TYPE.AUTHOR + ## simplify network if wanted + if (private$network.conf$get.value("simplify")) { + net = simplify.network(net, simplify.multiple.relations = + private$network.conf$get.value("simplify.multiple.relations")) + } + ## add range attribute for later analysis (if available) if ("RangeData" %in% class(private$proj.data)) { attr(net, "range") = private$proj.data$get.range() @@ -722,6 +828,12 @@ NetworkBuilder = R6::R6Class("NetworkBuilder", ## set vertex and edge attributes for identifaction igraph::V(net)$type = TYPE.ARTIFACT + ## simplify network if wanted + if (private$network.conf$get.value("simplify")) { + net = simplify.network(net, simplify.multiple.relations = + private$network.conf$get.value("simplify.multiple.relations")) + } + ## add range attribute for later analysis (if available) if ("RangeData" %in% class(private$proj.data)) { attr(net, "range") = private$proj.data$get.range() @@ -822,6 +934,12 @@ NetworkBuilder = R6::R6Class("NetworkBuilder", network = igraph::delete.vertices(network, authors.to.remove) } + ## simplify network if wanted + if (private$network.conf$get.value("simplify")) { + network = simplify.network(network, simplify.multiple.relations = + private$network.conf$get.value("simplify.multiple.relations")) + } + ## add range attribute for later analysis (if available) if ("RangeData" %in% class(private$proj.data)) { attr(network, "range") = private$proj.data$get.range() @@ -899,8 +1017,15 @@ NetworkBuilder = R6::R6Class("NetworkBuilder", artifacts.to.add.kind = artifacts.all[ artifacts.all[["data.vertices"]] %in% artifacts.to.add, "artifact.type" ] - artifacts.net = artifacts.net + igraph::vertices(artifacts.to.add, type = TYPE.ARTIFACT, - 
kind = artifacts.to.add.kind) + + ## Adjust vertex attribute to 'Issue' in multi networks + ## to be consistent with bipartite networks + artifacts.to.add.kind[artifacts.to.add.kind == "IssueEvent"] = "Issue" + + if (length(artifacts.to.add) > 0) { + artifacts.net = artifacts.net + igraph::vertices(artifacts.to.add, type = TYPE.ARTIFACT, + kind = artifacts.to.add.kind) + } ## check directedness and adapt artifact network if needed if (igraph::is.directed(authors.net) && !igraph::is.directed(artifacts.net)) { @@ -922,9 +1047,9 @@ NetworkBuilder = R6::R6Class("NetworkBuilder", ## 1) merge the existing networks u = igraph::disjoint_union(authors.net, artifacts.net) - ## As there is a bug in 'igraph::disjoint_union' in igraph versions 1.4.0, 1.4.1, and 1.4.2 - ## (see https://github.com/igraph/rigraph/issues/761), we need to adjust the type of the date attribute - ## of the outcome of 'igraph::disjoint_union'. + ## As there is a bug in 'igraph::disjoint_union' in igraph from its version 1.4.0 on, which is still + ## present, at least, until its version 2.0.3 (see https://github.com/igraph/rigraph/issues/761), we need + ## to adjust the type of the date attribute of the outcome of 'igraph::disjoint_union'. ## Note: The following temporary fix only considers the 'date' attribute. However, this problem could also ## affect several other attributes, whose classes are not adjusted in our temporary fix. ## The following code block should be redundant as soon as igraph has fixed their bug. 
@@ -998,8 +1123,8 @@ construct.edge.list.from.key.value.list = function(list, network.conf, directed keys.number = length(list) - ## if edges in an artifact network contain the \code{artifact} attribute - ## replace it with the \code{author.name} attribute as artifacts cannot cause + ## if edges in an artifact network contain the \code{artifact} attribute + ## replace it with the \code{author.name} attribute as artifacts cannot cause ## edges in artifact networks, authors can edge.attributes = network.conf$get.value("edge.attributes") if (artifact.edges) { @@ -1199,11 +1324,6 @@ construct.network.from.edge.list = function(vertices, edge.list, network.conf, d ## initialize edge weights net = igraph::set.edge.attribute(net, "weight", value = 1) - ## transform multiple edges to edge weights - if (network.conf$get.value("simplify")) { - net = simplify.network(net) - } - logging::logdebug("construct.network.from.edge.list: finished.") return(net) @@ -1477,16 +1597,19 @@ add.attributes.to.network = function(network, type = c("vertex", "edge"), attrib #' @param network the given network #' @param remove.multiple whether to contract multiple edges between the same pair of vertices [default: TRUE] #' @param remove.loops whether to remove loops [default: TRUE] +#' @param simplify.multiple.relations whether to combine edges of multiple relations into +#' one simplified edge [default: FALSE] #' #' @return the simplified network -simplify.network = function(network, remove.multiple = TRUE, remove.loops = TRUE) { +simplify.network = function(network, remove.multiple = TRUE, remove.loops = TRUE, + simplify.multiple.relations = FALSE) { logging::logdebug("simplify.network: starting.") logging::loginfo("Simplifying network.") ## save network attributes, otherwise they get lost network.attributes = igraph::get.graph.attribute(network) - if (length(unique(igraph::get.edge.attribute(network, "relation"))) > 1) { + if (!simplify.multiple.relations && 
length(unique(igraph::get.edge.attribute(network, "relation"))) > 1) { ## data frame of the network edge.data = igraph::as_data_frame(network, what = "edges") vertex.data = igraph::as_data_frame(network, what = "vertices") @@ -1528,9 +1651,12 @@ simplify.network = function(network, remove.multiple = TRUE, remove.loops = TRUE #' @param networks the list of networks #' @param remove.multiple whether to contract multiple edges between the same pair of vertices [default: TRUE] #' @param remove.loops whether to remove loops [default: TRUE] +#' @param simplify.multiple.relations whether to combine edges of multiple relations into +#' one simplified edge [default: FALSE] #' #' @return the simplified networks -simplify.networks = function(networks, remove.multiple = TRUE, remove.loops = TRUE) { +simplify.networks = function(networks, remove.multiple = TRUE, remove.loops = TRUE, + simplify.multiple.relations = FALSE) { logging::logdebug("simplify.networks: starting.") logging::loginfo( "Simplifying networks (names = [%s]).", @@ -1538,7 +1664,7 @@ simplify.networks = function(networks, remove.multiple = TRUE, remove.loops = TR ) nets = parallel::mclapply(networks, simplify.network, remove.multiple = remove.multiple, - remove.loops = remove.loops) + remove.loops = remove.loops, simplify.multiple.relations = simplify.multiple.relations) logging::logdebug("simplify.networks: finished.") return(nets) @@ -1672,14 +1798,14 @@ delete.authors.without.specific.edges = function(network, specific.edge.types = #' empty relation, i.e. 
\code{character(0)} get.data.sources.from.relations = function(network) { ## get all relations in the network - data.sources = unique(igraph::E(network)$relation) + data.sources = unique(unlist(igraph::E(network)$relation)) ## map them to data sources respectively using the defined translation constant data.sources = sapply(data.sources, function(relation) { ## check for a \code{character(0)} relation and abort if there is one if (length(relation) == 0) { logging::logwarn("There seems to be an empty relation in the network. Cannot proceed.") - return (NA) + return(NA) } ## use the translation constant to get the appropriate data source diff --git a/util-plot.R b/util-plot.R index 67f638eb7..25e2a24c7 100644 --- a/util-plot.R +++ b/util-plot.R @@ -15,6 +15,7 @@ ## Copyright 2018 by Barbara Eckl ## Copyright 2018 by Thomas Bock ## Copyright 2020-2021 by Thomas Bock +## Copyright 2024 by Maximilian Löffler ## All Rights Reserved. @@ -142,7 +143,7 @@ plot.get.plot.for.network = function(network, labels = TRUE) { if (igraph::ecount(network) > 0) { p = p + ggraph::geom_edge_fan( - mapping = ggplot2::aes(colour = relation, linetype = edge.type, width = 0.3 + 0.5 * log(weight)), + mapping = ggplot2::aes(colour = paste(relation, sep = " "), linetype = edge.type, width = 0.3 + 0.5 * log(weight)), end_cap = ggraph::circle(PLOT.VERTEX.SIZE + 3, "pt"), start_cap = ggraph::circle(PLOT.VERTEX.SIZE + 3, "pt"), arrow = if (igraph::is.directed(network)) { diff --git a/util-read.R b/util-read.R index 8f1b4fd9b..8cfe1a802 100644 --- a/util-read.R +++ b/util-read.R @@ -17,7 +17,7 @@ ## Copyright 2020-2022 by Christian Hechtl ## Copyright 2017 by Felix Prasse ## Copyright 2017-2018 by Thomas Bock -## Copyright 2023 by Thomas Bock +## Copyright 2023-2024 by Thomas Bock ## Copyright 2018 by Jakob Kronawitter ## Copyright 2018-2019 by Anselm Fehnker ## Copyright 2020-2021, 2023 by Niklas Schneider @@ -207,7 +207,7 @@ read.commits = function(data.path, artifact) { #' #' @return the empty 
dataframe create.empty.commits.list = function() { - return (create.empty.data.frame(COMMITS.LIST.COLUMNS, COMMITS.LIST.DATA.TYPES)) + return(create.empty.data.frame(COMMITS.LIST.COLUMNS, COMMITS.LIST.DATA.TYPES)) } ## * Mail data ------------------------------------------------------------- @@ -293,7 +293,7 @@ read.mails = function(data.path) { #' #' @return the empty dataframe create.empty.mails.list = function() { - return (create.empty.data.frame(MAILS.LIST.COLUMNS, MAILS.LIST.DATA.TYPES)) + return(create.empty.data.frame(MAILS.LIST.COLUMNS, MAILS.LIST.DATA.TYPES)) } ## * Issue data ------------------------------------------------------------ @@ -375,7 +375,7 @@ read.issues = function(data.path, issues.sources = c("jira", "github")) { } ## set pattern for issue ID for better recognition - issue.data[["issue.id"]] = sprintf("", issue.data[["issue.source"]], issue.data[["issue.id"]]) + issue.data[["issue.id"]] = sprintf(ISSUE.ID.FORMAT, issue.data[["issue.source"]], issue.data[["issue.id"]]) ## properly parse and store data in list-type columns issue.data[["issue.type"]] = I(unname(lapply(issue.data[["issue.type"]], jsonlite::fromJSON, simplifyVector = FALSE))) @@ -388,6 +388,13 @@ read.issues = function(data.path, issues.sources = c("jira", "github")) { issue.data[["creation.date"]] = get.date.from.string(issue.data[["creation.date"]]) issue.data[["closing.date"]] = get.date.from.string(issue.data[["closing.date"]]) + ## if other issues are referenced, convert names to ID format + matches = issue.data[issue.data[["event.name"]] %in% c("add_link", "remove_link", "referenced_by") & + issue.data[["event.info.2"]] == "issue", ] + formatted.matches = sprintf(ISSUE.ID.FORMAT, matches[["issue.source"]], matches[["event.info.1"]]) + issue.data[issue.data[["event.name"]] %in% c("add_link", "remove_link", "referenced_by") & + issue.data[["event.info.2"]] == "issue", ][["event.info.1"]] = formatted.matches + if (nrow(issue.data) > 0) { ## fix all dates to be after the 
creation date ## violations can happen for "commit_added" events if the commit was made before the PR was opened @@ -421,7 +428,7 @@ read.issues = function(data.path, issues.sources = c("jira", "github")) { #' #' @return the empty dataframe create.empty.issues.list = function() { - return (create.empty.data.frame(ISSUES.LIST.COLUMNS, ISSUES.LIST.DATA.TYPES)) + return(create.empty.data.frame(ISSUES.LIST.COLUMNS, ISSUES.LIST.DATA.TYPES)) } @@ -548,7 +555,7 @@ read.authors = function(data.path) { #' #' @return the empty dataframe create.empty.authors.list = function() { - return (create.empty.data.frame(AUTHORS.LIST.COLUMNS, AUTHORS.LIST.DATA.TYPES)) + return(create.empty.data.frame(AUTHORS.LIST.COLUMNS, AUTHORS.LIST.DATA.TYPES)) } @@ -636,7 +643,7 @@ read.gender = function(data.path) { #' #' @return the empty dataframe create.empty.gender.list = function() { - return (create.empty.data.frame(GENDER.LIST.COLUMNS, GENDER.LIST.DATA.TYPES)) + return(create.empty.data.frame(GENDER.LIST.COLUMNS, GENDER.LIST.DATA.TYPES)) } @@ -746,7 +753,7 @@ read.commit.messages = function(data.path) { #' #' @return the empty dataframe create.empty.commit.message.list = function() { - return (create.empty.data.frame(COMMIT.MESSAGE.LIST.COLUMNS, COMMIT.MESSAGE.LIST.DATA.TYPES)) + return(create.empty.data.frame(COMMIT.MESSAGE.LIST.COLUMNS, COMMIT.MESSAGE.LIST.DATA.TYPES)) } ## * PaStA data ------------------------------------------------------------ @@ -833,7 +840,7 @@ read.pasta = function(data.path) { #' #' @return the empty dataframe create.empty.pasta.list = function() { - return (create.empty.data.frame(PASTA.LIST.COLUMNS, PASTA.LIST.DATA.TYPES)) + return(create.empty.data.frame(PASTA.LIST.COLUMNS, PASTA.LIST.DATA.TYPES)) } ## * Synchronicity data ---------------------------------------------------- @@ -900,7 +907,7 @@ read.synchronicity = function(data.path, artifact, time.window) { #' #' @return the empty dataframe create.empty.synchronicity.list = function() { - return 
(create.empty.data.frame(SYNCHRONICITY.LIST.COLUMNS, SYNCHRONICITY.LIST.DATA.TYPES)) + return(create.empty.data.frame(SYNCHRONICITY.LIST.COLUMNS, SYNCHRONICITY.LIST.DATA.TYPES)) } @@ -948,7 +955,7 @@ read.custom.event.timestamps = function(data.path, file.name) { } logging::logdebug("read.custom.event.timestamps: finished.") - return (timestamps) + return(timestamps) } ## Helper functions -------------------------------------------------------- @@ -962,7 +969,10 @@ COMMIT.ID.FORMAT = "" #' #' @return a vector with the formatted commit ids format.commit.ids = function(commit.ids) { - return (sprintf(COMMIT.ID.FORMAT, commit.ids)) + return(sprintf(COMMIT.ID.FORMAT, commit.ids)) } +## declare a global format for issue.ids in several data frame columns +ISSUE.ID.FORMAT = "" + diff --git a/util-split.R b/util-split.R index 1c0ea9e92..d68f9caee 100644 --- a/util-split.R +++ b/util-split.R @@ -18,10 +18,11 @@ ## Copyright 2020 by Christian Hechtl ## Copyright 2017 by Felix Prasse ## Copyright 2017-2018 by Thomas Bock -## Copyright 2020 by Thomas Bock +## Copyright 2020, 2024 by Thomas Bock ## Copyright 2021 by Niklas Schneider ## Copyright 2021 by Johannes Hostert ## Copyright 2022 by Jonathan Baumann +## Copyright 2023-2024 by Maximilian Löffler ## All Rights Reserved. 
@@ -64,183 +65,68 @@ split.data.time.based = function(project.data, time.period = "3 months", bins = number.windows = NULL, split.basis = c("commits", "mails", "issues"), sliding.window = FALSE, project.conf.new = NULL) { - ## get basis for splitting process - split.basis = match.arg(split.basis) - - ## if the data used by the split basis is not present, load it automatically - if (!(split.basis %in% project.data$get.cached.data.sources("only.unfiltered"))) { - function.name = DATASOURCE.TO.UNFILTERED.ARTIFACT.FUNCTION[[split.basis]] - project.data[[function.name]]() + # validate existence and type of the 'bins' parameter + if (!is.null(bins) && !lubridate::is.POSIXct(bins)) { + dates = parallel::mclapply(unlist(bins), get.date.from.string) + if (any(is.na(dates))) { + logging::logerror(paste("The bins parameter, if present, needs to be a vector", + "whose elements represent dates")) + stop("Stopped due to incorrect parameter types") + } } - ## get actual raw data - data.to.split = project.data$get.cached.data.sources("only.unfiltered") - - data = lapply(data.to.split, function(ds) { - ## build the name of the respective getter and call it - function.name = DATASOURCE.TO.UNFILTERED.ARTIFACT.FUNCTION[[ds]] - return(project.data[[function.name]]()) - }) - names(data) = data.to.split - - ## load available additional data sources - additional.data.sources = project.data$get.cached.data.sources("only.additional") - additional.data = lapply(additional.data.sources, function(ds) { - ## build the name of the respective getter and call it - function.name = DATASOURCE.TO.ADDITIONAL.ARTIFACT.FUNCTION[[ds]] - return(project.data[[function.name]]()) - }) - names(additional.data) = additional.data.sources + split = split.data.by.time.or.bins(project.data, splitting.length = time.period, bins, split.by.time = TRUE, + number.windows, split.basis, sliding.window, project.conf.new) + return(split) +} - ## number of windows given (ignoring time period and bins) - if 
(!is.null(number.windows)) { - ## reset bins for the later algorithm - bins = NULL - ## remove sliding windows - sliding.window = FALSE - } - ## if bins are NOT given explicitly - if (is.null(bins)) { - ## get bins based on split.basis - bins = split.get.bins.time.based(data[[split.basis]][["date"]], time.period, number.windows)$bins - bins.labels = head(bins, -1) - split.by.bins = FALSE - ## logging - logging::loginfo("Splitting data '%s' into time ranges of %s based on '%s' data.", - project.data$get.class.name(), time.period, split.basis) - } - ## when bins are given explicitly - else { - ## remove sliding windows - sliding.window = FALSE - ## get bins based on parameter - split.basis = NULL - bins = get.date.from.string(bins) - bins = get.date.string(bins) - bins.labels = head(bins, -1) - split.by.bins = TRUE - ## logging - logging::loginfo("Splitting data '%s' into time ranges [%s].", - project.data$get.class.name(), paste(bins, collapse = ", ")) +#' Split project data in activity-bin-based ranges as specified +#' +#' @param project.data the project data object from which the data is retrieved +#' @param activity.amount the amount of data elements with unique ids to be considered in a bin, an integer. +#' @param bins the bins by which data should be split. Comprises of two components: +#' \code{vector}: Assigns elements of the \code{split.basis} column of \code{project.data} to bins. +#' \code{bins}: Dates defining the start of bins (the last date defines the end of the last bin, in an +#' *exclusive* manner). +#' The expected format of \code{bins} is produced by \code{split.get.bins.activity.based}. +#' @param split.basis the data name to use as the basis for split bins, either 'commits', 'mails', or 'issues' +#' [default: "commits"] +#' @param sliding.window logical indicating whether a sliding-window approach was used when obtaining the \code{bins}. 
+#' +#' @return the list of RangeData objects, each referring to one bin +#' +#' @seealso split.get.bins.activity.based +split.data.by.bins = function(project.data, activity.amount, bins, split.basis = c("commits", "mails", "issues"), + sliding.window) { + + # validate type of the 'bins' parameter + if (is.null(bins) || !is.list(bins)) { + logging::logerror("The bins parameter needs to be of type list, (is %s)", class(bins)) + stop("Stopped due to incorrect parameter types") } - bins.date = get.date.from.string(bins) - ## construct ranges - bins.ranges = construct.ranges(bins) - names(bins.ranges) = bins.ranges - - if ((length(bins.ranges) <= 1) && sliding.window) { - logging::logwarn("Sliding-window approach does not apply for one range or less.") - sliding.window = FALSE + # validate existence and type of the 'bins' component of the 'bins' parameter + if (!("bins" %in% names(bins))) { + logging::logerror("The 'bins' parameter needs to include a component 'bins'") + stop("Stopped due to incorrect parameter types") } - if (is.null(project.conf.new)) { - ## Clone the project configuration, so that splitting repeatedly does not interfere - ## with the same configuration. 
- project.conf.new = project.data$get.project.conf()$clone() + dates = parallel::mclapply(bins[["bins"]], get.date.from.string) + if (any(is.na(dates))) { + logging::logerror(paste("The 'bins' component of the 'bins' parameter, needs to be a vector", + "whose elements represent dates")) + stop("Stopped due to incorrect parameter types") } - if (!sliding.window) { - ## split data - data.split = parallel::mclapply(data.to.split, function(df.name) { - logging::logdebug("Splitting %s.", df.name) - ## identify bins for data - df = data[[df.name]] - df.bins = findInterval(df[["date"]], bins.date, all.inside = FALSE) - ## split data according to df.bins - df.split = split(df, df.bins) - ## add proper labels/names - names(df.split) = sapply(as.integer(names(df.split)), function(bin) bins[bin]) - return(df.split) - }) - ## set the names to the data sources obtained earlier - names(data.split) = data.to.split - - ## re-arrange data to get the proper list of data per range - logging::logdebug("Re-arranging data.") - data.split = parallel::mclapply(bins.labels, function(bin) lapply(data.split, `[[`, bin)) - names(data.split) = bins.ranges - - ## adapt project configuration - project.conf.new$set.revisions(bins, bins.date) - - ## construct RangeData objects - logging::logdebug("Constructing RangeData objects.") - - cf.data = parallel::mclapply(bins.ranges, function(range) { - logging::logdebug("Constructing data for range %s.", range) - ## construct object for current range - cf.range.data = RangeData$new(project.conf.new, range) - ## get data for current range - df.list = data.split[[range]] - - ## set main data sources: commits, mails, issues - for (data.source in data.to.split) { - setter.name = sprintf("set.%s", data.source) - cf.range.data[[setter.name]](df.list[[data.source]]) - } - ## set additional data sources: authors, commit.messages, pasta, synchronicity - for (data.source in additional.data.sources) { - setter.name = sprintf("set.%s", data.source) - 
cf.range.data[[setter.name]](additional.data[[data.source]]) - } - - return(cf.range.data) - }) - - } else { - ## perform different steps for sliding-window approach - - ranges = construct.overlapping.ranges(start = min(bins.date), end = max(bins.date), - time.period = time.period, overlap = 0.5, raw = FALSE, - include.end.date = FALSE) # bins have already been prepared correctly - bins.info = construct.overlapping.ranges(start = min(bins.date), end = max(bins.date), - time.period = time.period, overlap = 0.5, raw = TRUE, - include.end.date = FALSE) # bins have already been prepared correctly - bins.date = sort(unname(unique(get.date.from.unix.timestamp(unlist(bins.info))))) - bins = get.date.string(bins.date) - - logging::loginfo("Splitting data '%s' into time ranges using sliding windows [%s].", - project.data$get.class.name(), ranges) - cf.data = split.data.time.based.by.ranges(project.data, ranges) - - ## update project configuration - project.conf.new$set.revisions(bins, bins.date, sliding.window = TRUE) - for (cf in cf.data) { - ## re-set project configuration due to object duplication - cf.conf = cf$set.project.conf(project.conf.new) - } + # validate existence and type of the 'vector' component of the 'bins' parameter + if (!inherits(bins[["vector"]], "numeric")) { + logging::logerror("The 'vector' component of the bins parameter needs to be a numeric vector") + stop("Stopped due to incorrect parameter types") } - ## add splitting information to project configuration - project.conf.new$set.splitting.info( - type = "time-based", - length = if (split.by.bins) { - bins - } - else { - if (!is.null(number.windows)) { - as.character(lubridate::as.period( - get.time.period.by.amount( - min(data[[split.basis]][["date"]]), - max(data[[split.basis]][["date"]]), - number.windows - ) - )) - } - else time.period - }, - basis = split.basis, - sliding.window = sliding.window, - revisions = bins, - revisions.dates = bins.date - ) - - ## set bin attribute - attr(cf.data, 
"bins") = bins.date - - ## return list of RangeData objects - return(cf.data) + split = split.data.by.time.or.bins(project.data, activity.amount, bins, split.by.time = FALSE, + sliding.window = sliding.window, split.basis = split.basis) + return(split) } #' Split project data by timestamps @@ -249,7 +135,7 @@ split.data.time.based = function(project.data, time.period = "3 months", bins = #' and the last range ends with the last timestamp. #' #' If timestamps are not provided, the custom event timestamps in \code{project.data} are -#' used instead. +#' used instead. If no custom event timestamps are available in \code{project.data}, an error is thrown. #' #' @param project.data the *Data object from which the data is retrieved #' @param bins a vector of timestamps [default: NULL] @@ -262,9 +148,15 @@ split.data.time.based.by.timestamps = function(project.data, bins = NULL, projec if (is.null(bins)) { # bins were not provided, use custom timestamps from project bins = unlist(project.data$get.custom.event.timestamps()) + + if (is.null(bins)) { # stop if no custom timestamps are available + logging::logerror("There are no custom timestamps available for splitting (configured file: %s).", + project.data$get.project.conf.entry("custom.event.timestamps.file")) + stop("Stopping due to missing data.") + } } - return (split.data.time.based(project.data, bins = bins, project.conf.new)); + return(split.data.time.based(project.data, bins = bins, project.conf.new)) } #' Split project data in activity-based ranges as specified @@ -360,17 +252,19 @@ split.data.activity.based = function(project.data, activity.type = c("commits", logging::loginfo("Splitting data '%s' into activity ranges of %s %s (%s windows).", project.data$get.class.name(), activity.amount, activity.type, number.windows) - ## get bins based on split.basis + ## get bins based on 'split.basis'. 
Here the 'include.duplicate.ids' parameter flag must be set, to + ## retrieve bins which map every event to a bin including events with non-unique ids. This is important + ## to ensure that every range really has 'activity.amount' many entries after splitting logging::logdebug("Getting activity-based bins.") bins.data = split.get.bins.activity.based(data[[activity.type]], id.column[[activity.type]], - activity.amount, remove.duplicate.bins = TRUE) + activity.amount, remove.duplicate.bins = TRUE, include.duplicate.ids = TRUE) bins = bins.data[["bins"]] bins.date = get.date.from.string(bins) ## split the data based on the extracted timestamps logging::logdebug("Splitting data based on time windows arising from activity bins.") - cf.data = split.data.time.based(project.data, bins = bins.date, split.basis = activity.type, - project.conf.new = project.conf.new) + cf.data = split.data.by.bins(project.data, bins = bins.data, activity.amount = activity.amount, + sliding.window = sliding.window, split.basis = activity.type) ## perform additional steps for sliding-window approach: ## for activity-based sliding-window bins to work, we need to crop the data appropriately and, @@ -384,23 +278,9 @@ split.data.activity.based = function(project.data, activity.type = c("commits", items.unique = unique(data[[activity.type]][[ id.column[[activity.type]] ]]) items.unique.count = length(items.unique) - ## offsets used for cropping (half the first/last bin) + ## offsets used for cropping (half of the first bin) offset.start = floor(activity.amount / 2) - offset.end = (items.unique.count - offset.start) %% activity.amount - ## cut the data appropriately - if (offset.end > 0) { - items.cut = c( - items.unique[seq_len(offset.start)], - items.unique[seq(from = (items.unique.count - offset.end + 1), to = items.unique.count)] - ) - } else { - items.cut = items.unique[seq_len(offset.start)] - } - - ## determine end bin of last sliding-window range - end.event.id = 
items.unique[(items.unique.count - offset.end + 1)]
-        end.event.logical = (data[[activity.type]][[ id.column[[activity.type]] ]] == end.event.id)
-        end.event.date = unique(data[[activity.type]][end.event.logical, ][["date"]])
+        items.cut = items.unique[seq_len(offset.start)]
 
         ## store the data again
         data.to.cut = data[[activity.type]][[ id.column[[activity.type]] ]] %in% items.cut
@@ -417,12 +297,35 @@ split.data.activity.based = function(project.data, activity.type = c("commits",
                                                  activity.amount = activity.amount,
                                                  sliding.window = FALSE,
                                                  project.conf.new = project.conf.new)
 
+        ## extract bins
+        bins.date.middle = get.date.string(attr(cf.data.sliding, "bins"))
+
+        ## Both the last sliding range and the last regular range end at the very last item.
+        ## This is the case because the end of the data is never cropped (like the beginning is).
+        ## 'split.data.activity.based', which is invoked to obtain both sets of ranges, creates
+        ## ranges until all elements are in one.
+        ##
+        ## The conditional below inspects whether the very last item is in the first or the second
+        ## half of the last regular range. If it is in the first half, there will be a sliding
+        ## window which covers all items of the last regular range which makes the last regular
+        ## range obsolete.
+        ## Similarly, if the last item is in the second half of the last regular range, there
+        ## will be a sliding range (which started at the half of the last regular range) which
+        ## contains only items also included in the last regular range, which makes the sliding
+        ## range obsolete. 
+ length.of.last.range = items.unique.count %% activity.amount + if (length.of.last.range > offset.start || length.of.last.range == 0) { + cf.data.sliding = cf.data.sliding[-length(cf.data.sliding)] + bins.date.middle = bins.date.middle[-length(bins.date.middle)] + } else { + cf.data = cf.data[-length(cf.data)] + bins.date = bins.date[-length(bins.date)] + bins = bins[-length(bins)] + } + ## append data to normally-split data cf.data = append(cf.data, cf.data.sliding) - ## compute bins for sliding windows: pairwise middle between dates - bins.date.middle = attr(cf.data.sliding, "bins") - ## sort data object properly by bin starts bins.ranges.start = c(head(bins.date, -1), head(bins.date.middle, -1)) cf.data = cf.data[ order(bins.ranges.start) ] @@ -431,38 +334,6 @@ split.data.activity.based = function(project.data, activity.type = c("commits", bins.date = sort(c(bins.date, bins.date.middle)) bins = get.date.string(bins.date) - ## if the last regular range and the last sliding-window range end at the same time - ## and the data of the last regular range is contained in the last sliding-window range, then: - ## remove the last regular range as it is not complete and we don't loose data when removing it - last.regular.range = cf.data[[length(cf.data)]] - last.sliding.range = cf.data[[length(cf.data) - 1]] - get.activity.data = DATASOURCE.TO.UNFILTERED.ARTIFACT.FUNCTION[[activity.type]] - - last.regular.range.ids = (last.regular.range[[get.activity.data]]())[[ id.column[[activity.type]] ]] - last.sliding.range.ids = (last.sliding.range[[get.activity.data]]())[[ id.column[[activity.type]] ]] - if (bins.date[length(bins.date)] == bins.date.middle[length(bins.date.middle)] - && all(last.regular.range.ids %in% last.sliding.range.ids) ) { - - cf.data = cf.data[-length(cf.data)] - bins.date = bins.date[-length(bins.date)] - bins = bins[-length(bins)] - } else if (bins.date[length(bins.date)] != bins.date.middle[length(bins.date.middle)]) { - ## adjust the end date of the 
last sliding-window range, as it might be shorter than it should be: - ## The end of the last range usually is one second after the last event (as end dates are exclusive). - ## In case of sliding windows, the end of the last sliding range needs to be extended to the date of the - ## next event after that range (as end dates are exclusive) to get a full range as for all the previous - ## ranges which end at the beginning of the next range, which is the date of the first event after the - ## actual range. - - ## When we have sliding windows, there are, at least, three ranges (two regular ranges and one - ## sliding-window range. Hence, there are always more than three elements in the bins vector, so accessing - ## bins[length(bins) - 3] cannot throw errors in this case. - name.last.sliding.window = construct.ranges(c(bins[length(bins) - 3], get.date.string(end.event.date))) - names(cf.data)[length(cf.data) - 1] = name.last.sliding.window - bins.date[length(bins.date) - 1] = end.event.date - bins[length(bins) - 1] = get.date.string(end.event.date) - } - ## update project configuration project.conf.new$set.revisions(bins, bins.date, sliding.window = TRUE) for (cf in cf.data) { @@ -602,13 +473,12 @@ split.data.time.based.by.ranges = function(project.data, ranges) { range.data = split.data.time.based(project.data, bins = start.end, sliding.window = FALSE)[[1]] ## 2) return the data - return (range.data) + return(range.data) }) } return(data.split) } - ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / ## Split networks ---------------------------------------------------------- @@ -670,28 +540,20 @@ split.network.time.based = function(network, time.period = "3 months", bins = NU if (sliding.window) { ranges = construct.overlapping.ranges(start = min(bins.date), end = max(bins.date), time.period = time.period, overlap = 0.5, raw = FALSE, - include.end.date = FALSE) # bins have already been prepared correctly - bins.info = 
construct.overlapping.ranges(start = min(bins.date), end = max(bins.date), - time.period = time.period, overlap = 0.5, raw = TRUE, - include.end.date = FALSE) # bins have already been prepared correctly - bins.date = sort(unname(unique(get.date.from.unix.timestamp(unlist(bins.info))))) - - logging::loginfo("Splitting network into time ranges [%s].", + include.end.date = FALSE) + logging::loginfo("Splitting network into overlapping time ranges [%s].", paste(ranges, collapse = ", ")) nets = split.network.time.based.by.ranges(network, ranges, remove.isolates) } else { - logging::loginfo("Splitting network into bins [%s].", - paste(bins.date, collapse = ", ")) - nets = split.network.by.bins(network, bins, bins.vector, remove.isolates) + revs = get.date.string(bins.date) + ranges = construct.ranges(revs, sliding.window = FALSE) + logging::loginfo("Splitting network into non-overlapping time ranges [%s].", + paste(ranges, collapse = ", ")) + nets = split.network.by.bins(network, bins, bins.vector, bins.date, remove.isolates) } - ## set bin attribute - attr(nets, "bins") = bins.date - ## set ranges as names - revs = get.date.string(bins.date) - names(nets) = construct.ranges(revs, sliding.window = sliding.window) - + names(nets) = ranges return(nets) } @@ -751,10 +613,6 @@ split.networks.time.based = function(networks, time.period = "3 months", bins = ranges = construct.overlapping.ranges(start = min(dates), end = max(dates), time.period = time.period, overlap = 0.5, raw = FALSE, include.end.date = TRUE) - bins.info = construct.overlapping.ranges(start = min(dates), end = max(dates), - time.period = time.period, overlap = 0.5, raw = TRUE, - include.end.date = TRUE) - bins.date = sort(unname(unique(get.date.from.unix.timestamp(unlist(bins.info))))) } else { bins.info = split.get.bins.time.based(dates, time.period, number.windows) bins.date = get.date.from.string(bins.info[["bins"]]) @@ -772,7 +630,6 @@ split.networks.time.based = function(networks, time.period = "3 
months", bins = if (sliding.window) { nets = split.network.time.based.by.ranges(network = net, ranges = ranges, remove.isolates = remove.isolates) - attr(nets, "bins") = bins.date } else { nets = split.network.time.based(network = net, bins = bins.date, sliding.window = sliding.window, remove.isolates = remove.isolates) @@ -853,7 +710,7 @@ split.network.activity.based = function(network, number.edges = 5000, number.win bins.vector = bins.vector[ with(df, order(my.unique.id)) ] # re-order to get igraph ordering bins = sort(unique(bins.vector)) ## split network by bins - networks = split.network.by.bins(network, bins, bins.vector, remove.isolates) + networks = split.network.by.bins(network, bins, bins.vector, remove.isolates = remove.isolates) if (number.edges >= edge.count) { logging::logwarn("Sliding-window approach does not apply: not enough edges (%s) for number of edges %s", @@ -871,16 +728,7 @@ split.network.activity.based = function(network, number.edges = 5000, number.win ## offsets used for cropping (half the first/last bin) offset.start = floor(number.edges / 2) - offset.end = (edge.count - offset.start) %% number.edges - ## cut the data appropriately - if (offset.end > 0) { - edges.cut = c( - edges.by.date[seq_len(offset.start)], - edges.by.date[seq(from = (edge.count - offset.end + 1), to = edge.count)] - ) - } else { - edges.cut = edges.by.date[seq_len(offset.start)] - } + edges.cut = edges.by.date[seq_len(offset.start)] ## delete edges from the network and create a new network network.cut = igraph::delete.edges(network, igraph::E(network)[edges.cut]) @@ -889,37 +737,45 @@ split.network.activity.based = function(network, number.edges = 5000, number.win networks.sliding = split.network.activity.based(network.cut, number.edges = number.edges, sliding.window = FALSE) - ## append data to normally-split data - networks = append(networks, networks.sliding) - ## compute bins for sliding windows: pairwise middle between dates - bins.date.middle = 
attr(networks.sliding, "bins")
+        bins.date.middle = get.date.string(attr(networks.sliding, "bins"))
+
+        ## Both the last sliding network and the last regular network end at the very last edge.
+        ## This is the case because the end of the edges is never cropped (like the beginning is).
+        ## Both 'split.network.activity.based' and 'split.network.by.bins', which are invoked to obtain
+        ## the two sets of networks, create networks until all edges are contained.
+        ##
+        ## The conditional below inspects whether the very last edge is in the first or the second
+        ## half of the last regular network. If it is in the first half, there will be a sliding
+        ## network which covers all edges of the last regular network which makes the last regular
+        ## network obsolete.
+        ## Similarly, if the last edge is in the second half of the last regular network, there
+        ## will be a sliding network (which started at the half of the last regular network) which
+        ## contains only edges also included in the last regular network, which makes the sliding
+        ## network obsolete. 
+ length.of.last.range = edge.count %% number.edges + if (length.of.last.range > offset.start || length.of.last.range == 0) { + networks.sliding = networks.sliding[-length(networks.sliding)] + bins.date.middle = bins.date.middle[-length(bins.date.middle)] + } else { + networks = networks[-length(networks)] + bins.date = bins.date[-length(bins.date)] + bins = bins[-length(bins)] + } - ## sort data object properly by bin starts + ## append sliding networks to normally-split networks + networks = append(networks, networks.sliding) + + ## sort networks properly by bin starts bins.ranges.start = c(head(bins.date, -1), head(bins.date.middle, -1)) networks = networks[ order(bins.ranges.start) ] ## construct proper bin vectors for configuration bins.date = sort(c(bins.date, bins.date.middle)) - - ## if the last regular range and the last sliding-window range end at the same time - ## and the latter contains the former's edges, then: - ## remove the last regular range as it is not complete and we don't loose data when removing it - edges.last.regular = igraph::E(networks[[length(networks)]]) - edges.last.sliding = igraph::E(networks[[length(networks) - 1]]) - if (bins.date[length(bins.date)] == bins.date.middle[length(bins.date.middle)] - && all(edges.last.regular %in% edges.last.sliding) - && table(edges.last.regular$date) %in% table(edges.last.sliding$date) ) { - - networks = networks[-length(networks)] - bins.date = bins.date[-length(bins.date)] - bins = bins[-length(bins)] - } - } ## set bin attribute - attr(networks, "bins") = bins.date + attr(networks, "bins") = get.date.from.string(bins.date) ## set ranges as names revs = get.date.string(bins.date) @@ -954,18 +810,20 @@ split.network.time.based.by.ranges = function(network, ranges, remove.isolates = ranges.bounds = lapply(ranges, get.range.bounds) ## loop over all ranges and split the network accordingly: - nets.split = mapply( - ranges, ranges.bounds, SIMPLIFY = FALSE, - FUN = function(range, start.end) { + 
nets.split = lapply(ranges.bounds, function(bounds) {
         ## 1) split the network to the current range
-            range.net = split.network.time.based(network, bins = start.end, sliding.window = FALSE,
+        range.net = split.network.time.based(network, bins = bounds, sliding.window = FALSE,
                                                 remove.isolates = remove.isolates)[[1]]
 
         ## 2) return the network
-            return (range.net)
+        return(range.net)
     }
     )
 
+    ## convert ranges to bins
+    bins = get.bin.dates.from.ranges(ranges.bounds)
+    attr(nets.split, "bins") = bins
+
     return(nets.split)
 }
 
@@ -973,16 +831,16 @@ split.network.time.based.by.ranges = function(network, ranges, remove.isolates =
 ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / /
 ## Split raw data ----------------------------------------------------------
 
-#' Split the given data by the given bins.
+#' Split the given dataframe by the given bins.
 #'
 #' @param df a data.frame to be split
 #' @param bins a vector with the length of 'nrow(df)' assigning a bin for each row of 'df'
 #'
 #' @return a list of data.frames, with the length of 'unique(bins)'
-split.data.by.bins = function(df, bins) {
-    logging::logdebug("split.data.by.bins: starting.")
+split.dataframe.by.bins = function(df, bins) {
+    logging::logdebug("split.dataframe.by.bins: starting.")
     df.split = split(df, bins)
-    logging::logdebug("split.data.by.bins: finished.")
+    logging::logdebug("split.dataframe.by.bins: finished.")
     return(df.split)
 }
 
@@ -991,10 +849,12 @@ split.data.by.bins = function(df, bins) {
 #' @param network a network
 #' @param bins a vector with the unique bin identifiers, describing the order in which the bins are created
 #' @param bins.vector a vector of length 'ecount(network)' assigning a bin for each edge of 'network'
+#' @param bins.date a vector of dates representing the start of each bin. 
If present, then the dates will be set +#' as an attribute on the returned networks [default: NULL] #' @param remove.isolates whether to remove isolates in the resulting split networks [default: TRUE] #' #' @return a list of networks, with the length of 'unique(bins.vector)' -split.network.by.bins = function(network, bins, bins.vector, remove.isolates = TRUE) { +split.network.by.bins = function(network, bins, bins.vector, bins.date = NULL, remove.isolates = TRUE) { logging::logdebug("split.network.by.bins: starting.") ## create a network for each bin of edges nets = parallel::mclapply(bins, function(bin) { @@ -1005,11 +865,236 @@ split.network.by.bins = function(network, bins, bins.vector, remove.isolates = T g = igraph::subgraph.edges(network, edges, delete.vertices = remove.isolates) return(g) }) + ## set 'bins' attribute, if specified + if (!is.null(bins.date)) { + attr(nets, "bins") = get.date.from.string(bins.date) + } logging::logdebug("split.network.by.bins: finished.") return(nets) } +## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / +## Internal helper functions for data splitting ---------------------------- + +#' Split project data in time-based or activity-bin-based ranges as specified +#' +#' @param project.data the *Data object from which the data is retrieved +#' @param splitting.length either \code{time.period} from \code{split.data.time.based} +#' or \code{activity.amount} from \code{split.data.by.bins} +#' @param bins either formatted as the \code{bins} parameter of \code{split.data.time.based} +#' or as the \code{bins} parameter of \code{split.data.by.bins} +#' @param split.by.time logical indicating whether splitting is done time-based or activity-bins-based +#' @param number.windows see \code{number.windows} from \code{split.data.time.based} +#' [default: NULL] +#' @param split.basis the data source to use as the basis for split bins, either 'commits', 'mails', or 'issues' +#' [default: "commits"] +#' @param 
sliding.window logical indicating whether the splitting should be performed using a sliding-window approach +#' [default: FALSE] +#' @param project.conf.new the new project config to construct the \code{RangeData} objects. +#' If \code{NULL}, a clone of \code{project.data$get.project.conf()} will be used. +#' [default: NULL] +#' +#' @return the list of RangeData objects, each referring to one time period +#' +#' @seealso split.data.time.based +#' @seealso split.data.by.bins +split.data.by.time.or.bins = function(project.data, splitting.length, bins, split.by.time, + number.windows = NULL, split.basis = c("commits", "mails", "issues"), + sliding.window = FALSE, project.conf.new = NULL) { + + ## get basis for splitting process + split.basis = match.arg(split.basis) + + ## if the data used by the split basis is not present, load it automatically + if (!(split.basis %in% project.data$get.cached.data.sources("only.unfiltered"))) { + function.name = DATASOURCE.TO.UNFILTERED.ARTIFACT.FUNCTION[[split.basis]] + project.data[[function.name]]() + } + + ## get actual raw data + data.to.split = project.data$get.cached.data.sources("only.unfiltered") + + data = lapply(data.to.split, function(ds) { + ## build the name of the respective getter and call it + function.name = DATASOURCE.TO.UNFILTERED.ARTIFACT.FUNCTION[[ds]] + return(project.data[[function.name]]()) + }) + names(data) = data.to.split + + ## load available additional data sources + additional.data.sources = project.data$get.cached.data.sources("only.additional") + additional.data = lapply(additional.data.sources, function(ds) { + ## build the name of the respective getter and call it + function.name = DATASOURCE.TO.ADDITIONAL.ARTIFACT.FUNCTION[[ds]] + return(project.data[[function.name]]()) + }) + names(additional.data) = additional.data.sources + + ## number of windows given (ignoring time period and bins) + if (!is.null(number.windows)) { + ## reset bins for the later algorithm + bins = NULL + ## remove sliding 
windows + sliding.window = FALSE + } + + ## indicates if time-based splitting is performed using bins + split.time.based.with.bins = FALSE + + ## if bins are NOT given explicitly + if (is.null(bins)) { + ## get bins based on split.basis + bins = split.get.bins.time.based(data[[split.basis]][["date"]], splitting.length, number.windows)$bins + bins.labels = head(bins, -1) + ## logging + logging::loginfo("Splitting data '%s' into time ranges of %s based on '%s' data.", + project.data$get.class.name(), splitting.length, split.basis) + } + ## when bins are given explicitly, get bins based on parameter + else { + if (split.by.time) { + split.time.based.with.bins = TRUE + split.basis = NULL + bins = get.date.from.string(bins) + bins = get.date.string(bins) + ## remove sliding windows + sliding.window = FALSE + } else { + ## sliding windows do not need to be removed here, as sliding windows and bins + ## are not contradicting in activity-based splitting + bins.vector = bins[["vector"]] + bins = bins[["bins"]] + } + bins.labels = head(bins, -1) + ## logging + logging::loginfo("Splitting data '%s' into time ranges [%s].", + project.data$get.class.name(), paste(bins, collapse = ", ")) + } + bins.date = get.date.from.string(bins) + + ## construct ranges + bins.ranges = construct.ranges(bins) + names(bins.ranges) = bins.ranges + + if ((length(bins.ranges) <= 1) && sliding.window) { + logging::logwarn("Sliding-window approach does not apply for one range or less.") + sliding.window = FALSE + } + + if (is.null(project.conf.new)) { + ## Clone the project configuration, so that splitting repeatedly does not interfere + ## with the same configuration. 
+ project.conf.new = project.data$get.project.conf()$clone() + } + + if (!sliding.window || !split.by.time) { + ## split data + data.split = parallel::mclapply(data.to.split, function(df.name) { + logging::logdebug("Splitting %s.", df.name) + ## identify bins for data + df = data[[df.name]] + df.bins = if (!split.by.time && (df.name == split.basis)) + bins.vector + else + findInterval(df[["date"]], bins.date, all.inside = FALSE) + ## split data according to df.bins + df.split = split(df, df.bins) + ## add proper labels/names + names(df.split) = sapply(as.integer(names(df.split)), function(bin) bins[bin]) + return(df.split) + }) + ## set the names to the data sources obtained earlier + names(data.split) = data.to.split + + ## re-arrange data to get the proper list of data per range + logging::logdebug("Re-arranging data.") + data.split = parallel::mclapply(bins.labels, function(bin) lapply(data.split, `[[`, bin)) + names(data.split) = bins.ranges + + ## adapt project configuration + project.conf.new$set.revisions(bins, bins.date) + + ## construct RangeData objects + logging::logdebug("Constructing RangeData objects.") + + cf.data = parallel::mclapply(bins.ranges, function(range) { + logging::logdebug("Constructing data for range %s.", range) + ## construct object for current range + cf.range.data = RangeData$new(project.conf.new, range) + ## get data for current range + df.list = data.split[[range]] + + ## set main data sources: commits, mails, issues + for (data.source in data.to.split) { + setter.name = sprintf("set.%s", data.source) + cf.range.data[[setter.name]](df.list[[data.source]]) + } + ## set additional data sources: authors, commit.messages, pasta, synchronicity + for (data.source in additional.data.sources) { + setter.name = sprintf("set.%s", data.source) + cf.range.data[[setter.name]](additional.data[[data.source]]) + } + + return(cf.range.data) + }) + + } else { + ## perform different steps for sliding-window approach of time-based splitting + + ranges 
= construct.overlapping.ranges(start = min(bins.date), end = max(bins.date), + time.period = splitting.length, overlap = 0.5, raw = FALSE, + include.end.date = FALSE) # bins have already been prepared correctly + bins.info = construct.overlapping.ranges(start = min(bins.date), end = max(bins.date), + time.period = splitting.length, overlap = 0.5, raw = TRUE, + include.end.date = FALSE) # bins have already been prepared correctly + bins.date = get.bin.dates.from.ranges(bins.info) + bins = get.date.string(bins.date) + + logging::loginfo("Splitting data '%s' into time ranges using sliding windows [%s].", + project.data$get.class.name(), ranges) + cf.data = split.data.time.based.by.ranges(project.data, ranges) + + ## update project configuration + project.conf.new$set.revisions(bins, bins.date, sliding.window = TRUE) + for (cf in cf.data) { + ## re-set project configuration due to object duplication + cf.conf = cf$set.project.conf(project.conf.new) + } + } + + ## add splitting information to project configuration + project.conf.new$set.splitting.info( + type = if (split.by.time) "time-based" else "activity-based", + length = if (split.time.based.with.bins) { + bins + } + else { + if (!is.null(number.windows)) { + as.character(lubridate::as.period( + get.time.period.by.amount( + min(data[[split.basis]][["date"]]), + max(data[[split.basis]][["date"]]), + number.windows + ) + )) + } + else splitting.length + }, + basis = split.basis, + sliding.window = sliding.window, + revisions = bins, + revisions.dates = bins.date + ) + + ## set bin attribute + attr(cf.data, "bins") = bins.date + + ## return list of RangeData objects + return(cf.data) +} + + ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / ## Unification of range names ---------------------------------------------- @@ -1102,13 +1187,18 @@ split.get.bins.time.based = function(dates, time.period, number.windows = NULL) #' @param activity.amount the amount of activity denoting the number of 
unique items
+#'                        in each split bin [default: 5000]
 #' @param remove.duplicate.bins remove duplicate bin borders? [default: FALSE]
+#' @param include.duplicate.ids include entries of the \code{df} with non-unique ids
+#'                              in the creation of the bins. This should not change bin borders
+#'                              as entries with the same id should share the same \code{date} attribute.
+#'                              [default: FALSE]
 #'
 #' @return a list,
 #'         the item 'vector': the bins each row in 'df' belongs to (increasing integers),
 #'         the item 'bins': the bin labels, described by dates, each bin containing
-#'                          'acitivity.amount' many unique items; each item in the vector indicates
+#'                          'activity.amount' many unique items; each item in the vector indicates
 #'                          the start of a bin, although the last item indicates the end of the last bin
-split.get.bins.activity.based = function(df, id, activity.amount, remove.duplicate.bins = FALSE) {
+split.get.bins.activity.based = function(df, id, activity.amount, remove.duplicate.bins = FALSE,
+                                         include.duplicate.ids = FALSE) {
     logging::logdebug("split.get.bins.activity.based: starting")
     ## get the unique integer IDs for each item in 'id' column
     ids = df[[id]]
@@ -1120,11 +1210,23 @@ split.get.bins.activity.based = function(df, id, activity.amount, remove.duplica
         if (bins.number.complete != 0) rep(seq_len(bins.number.complete), each = activity.amount),
         rep(bins.number.complete + 1, bins.number.incomplete)
     )
+
+    ## pad bins with entries for all duplicate ids
+    if (include.duplicate.ids) {
+        bins.activity.padded = c()
+        for (i in seq_along(ids)) {
+            ## create an extra entry for every duplicate id in the same bin as
+            ## the first occurrence of the id
+            current.bin = bins.activity[ which(ids.unique == ids[i]) ]
+            bins.activity.padded = c(bins.activity.padded, current.bin)
+        }
+        bins.activity = bins.activity.padded
+    }
     bins.number = max(bins.activity)
 
     ## join ids and bin numbers
     bins.mapping = data.frame(
-        id = ids.unique,
+        id = if (include.duplicate.ids) ids else ids.unique,
         bin = 
bins.activity ) diff --git a/util-tensor.R b/util-tensor.R index c0d9777fa..a1790e331 100644 --- a/util-tensor.R +++ b/util-tensor.R @@ -12,6 +12,7 @@ ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. ## ## Copyright 2019-2020 by Anselm Fehnker +## Copyright 2024 by Leo Sendelbach ## All Rights Reserved. @@ -56,8 +57,7 @@ FourthOrderTensor = R6::R6Class("FourthOrderTensor", build.tensor.from.networks = function(networks, weighted = FALSE) { ## get adjacency matrices from networks - adjacency.matrices = parallel::mclapply(networks, get.expanded.adjacency, private$authors, weighted) - + adjacency.matrices = get.expanded.adjacency.matrices(networks, weighted) ## create an array with the size of the fourth-order tensor that only contains zeros array = array(0, dim = private$dim) @@ -93,7 +93,7 @@ FourthOrderTensor = R6::R6Class("FourthOrderTensor", initialize = function(networks, weighted = FALSE) { private$relations = names(networks) - private$authors = get.author.names.from.networks(networks) + private$authors = get.author.names.from.networks(networks)[[1]] private$dim = c(length(private$authors), length(private$relations), length(private$authors), length(private$relations)) private$tensor = private$build.tensor.from.networks(networks, weighted)