diff --git a/materials/.Rbuildignore b/materials/.Rbuildignore new file mode 100644 index 00000000..5163d0b5 --- /dev/null +++ b/materials/.Rbuildignore @@ -0,0 +1 @@ +^LICENSE\.md$ diff --git a/materials/DESCRIPTION b/materials/DESCRIPTION index 910cb2b4..3cdd8dc7 100755 --- a/materials/DESCRIPTION +++ b/materials/DESCRIPTION @@ -3,39 +3,53 @@ Type: Book Title: Does not matter. Version: 0.0.2 Imports: + base64enc, bibtex, bookdown, broom, + contentid, curl, + dataone, + datapack, dplyr, DT, EML, + filelock, forcats, ggplot2, ggforce, ggmap, ggpmisc, googlesheets4, + htmlwidgets, httr, janitor, + jsonlite, kableExtra, knitr, knitcitations, leaflet, + parsedate, pdftools, + pins, + plyt, qualtRics, readr, reshape2, udunits2, + uuid, sf, scales, + solrium, + stringi, stringr, tidyr, tidytext, textdata, - htmlwidgets, wordcloud, viridis, - dataone + xml + Remotes: rstudio/bookdown +License: Apache License (>= 2) diff --git a/materials/LICENSE.md b/materials/LICENSE.md new file mode 100644 index 00000000..b62a9b5f --- /dev/null +++ b/materials/LICENSE.md @@ -0,0 +1,194 @@ +Apache License +============== + +_Version 2.0, January 2004_ +_<>_ + +### Terms and Conditions for use, reproduction, and distribution + +#### 1. Definitions + +“License” shall mean the terms and conditions for use, reproduction, and +distribution as defined by Sections 1 through 9 of this document. + +“Licensor” shall mean the copyright owner or entity authorized by the copyright +owner that is granting the License. + +“Legal Entity” shall mean the union of the acting entity and all other entities +that control, are controlled by, or are under common control with that entity. +For the purposes of this definition, “control” means **(i)** the power, direct or +indirect, to cause the direction or management of such entity, whether by +contract or otherwise, or **(ii)** ownership of fifty percent (50%) or more of the +outstanding shares, or **(iii)** beneficial ownership of such entity. + +“You” (or “Your”) shall mean an individual or Legal Entity exercising +permissions granted by this License. + +“Source” form shall mean the preferred form for making modifications, including +but not limited to software source code, documentation source, and configuration +files. + +“Object” form shall mean any form resulting from mechanical transformation or +translation of a Source form, including but not limited to compiled object code, +generated documentation, and conversions to other media types. + +“Work” shall mean the work of authorship, whether in Source or Object form, made +available under the License, as indicated by a copyright notice that is included +in or attached to the work (an example is provided in the Appendix below). + +“Derivative Works” shall mean any work, whether in Source or Object form, that +is based on (or derived from) the Work and for which the editorial revisions, +annotations, elaborations, or other modifications represent, as a whole, an +original work of authorship. For the purposes of this License, Derivative Works +shall not include works that remain separable from, or merely link (or bind by +name) to the interfaces of, the Work and Derivative Works thereof. 
+ +“Contribution” shall mean any work of authorship, including the original version +of the Work and any modifications or additions to that Work or Derivative Works +thereof, that is intentionally submitted to Licensor for inclusion in the Work +by the copyright owner or by an individual or Legal Entity authorized to submit +on behalf of the copyright owner. For the purposes of this definition, +“submitted” means any form of electronic, verbal, or written communication sent +to the Licensor or its representatives, including but not limited to +communication on electronic mailing lists, source code control systems, and +issue tracking systems that are managed by, or on behalf of, the Licensor for +the purpose of discussing and improving the Work, but excluding communication +that is conspicuously marked or otherwise designated in writing by the copyright +owner as “Not a Contribution.” + +“Contributor” shall mean Licensor and any individual or Legal Entity on behalf +of whom a Contribution has been received by Licensor and subsequently +incorporated within the Work. + +#### 2. Grant of Copyright License + +Subject to the terms and conditions of this License, each Contributor hereby +grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, +irrevocable copyright license to reproduce, prepare Derivative Works of, +publicly display, publicly perform, sublicense, and distribute the Work and such +Derivative Works in Source or Object form. + +#### 3. Grant of Patent License + +Subject to the terms and conditions of this License, each Contributor hereby +grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, +irrevocable (except as stated in this section) patent license to make, have +made, use, offer to sell, sell, import, and otherwise transfer the Work, where +such license applies only to those patent claims licensable by such Contributor +that are necessarily infringed by their Contribution(s) alone or by combination +of their Contribution(s) with the Work to which such Contribution(s) was +submitted. If You institute patent litigation against any entity (including a +cross-claim or counterclaim in a lawsuit) alleging that the Work or a +Contribution incorporated within the Work constitutes direct or contributory +patent infringement, then any patent licenses granted to You under this License +for that Work shall terminate as of the date such litigation is filed. + +#### 4. 
Redistribution + +You may reproduce and distribute copies of the Work or Derivative Works thereof +in any medium, with or without modifications, and in Source or Object form, +provided that You meet the following conditions: + +* **(a)** You must give any other recipients of the Work or Derivative Works a copy of +this License; and +* **(b)** You must cause any modified files to carry prominent notices stating that You +changed the files; and +* **(c)** You must retain, in the Source form of any Derivative Works that You distribute, +all copyright, patent, trademark, and attribution notices from the Source form +of the Work, excluding those notices that do not pertain to any part of the +Derivative Works; and +* **(d)** If the Work includes a “NOTICE” text file as part of its distribution, then any +Derivative Works that You distribute must include a readable copy of the +attribution notices contained within such NOTICE file, excluding those notices +that do not pertain to any part of the Derivative Works, in at least one of the +following places: within a NOTICE text file distributed as part of the +Derivative Works; within the Source form or documentation, if provided along +with the Derivative Works; or, within a display generated by the Derivative +Works, if and wherever such third-party notices normally appear. The contents of +the NOTICE file are for informational purposes only and do not modify the +License. You may add Your own attribution notices within Derivative Works that +You distribute, alongside or as an addendum to the NOTICE text from the Work, +provided that such additional attribution notices cannot be construed as +modifying the License. + +You may add Your own copyright statement to Your modifications and may provide +additional or different license terms and conditions for use, reproduction, or +distribution of Your modifications, or for any such Derivative Works as a whole, +provided Your use, reproduction, and distribution of the Work otherwise complies +with the conditions stated in this License. + +#### 5. Submission of Contributions + +Unless You explicitly state otherwise, any Contribution intentionally submitted +for inclusion in the Work by You to the Licensor shall be under the terms and +conditions of this License, without any additional terms or conditions. +Notwithstanding the above, nothing herein shall supersede or modify the terms of +any separate license agreement you may have executed with Licensor regarding +such Contributions. + +#### 6. Trademarks + +This License does not grant permission to use the trade names, trademarks, +service marks, or product names of the Licensor, except as required for +reasonable and customary use in describing the origin of the Work and +reproducing the content of the NOTICE file. + +#### 7. Disclaimer of Warranty + +Unless required by applicable law or agreed to in writing, Licensor provides the +Work (and each Contributor provides its Contributions) on an “AS IS” BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, +including, without limitation, any warranties or conditions of TITLE, +NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are +solely responsible for determining the appropriateness of using or +redistributing the Work and assume any risks associated with Your exercise of +permissions under this License. + +#### 8. 
Limitation of Liability + +In no event and under no legal theory, whether in tort (including negligence), +contract, or otherwise, unless required by applicable law (such as deliberate +and grossly negligent acts) or agreed to in writing, shall any Contributor be +liable to You for damages, including any direct, indirect, special, incidental, +or consequential damages of any character arising as a result of this License or +out of the use or inability to use the Work (including but not limited to +damages for loss of goodwill, work stoppage, computer failure or malfunction, or +any and all other commercial damages or losses), even if such Contributor has +been advised of the possibility of such damages. + +#### 9. Accepting Warranty or Additional Liability + +While redistributing the Work or Derivative Works thereof, You may choose to +offer, and charge a fee for, acceptance of support, warranty, indemnity, or +other liability obligations and/or rights consistent with this License. However, +in accepting such obligations, You may act only on Your own behalf and on Your +sole responsibility, not on behalf of any other Contributor, and only if You +agree to indemnify, defend, and hold each Contributor harmless for any liability +incurred by, or claims asserted against, such Contributor by reason of your +accepting any such warranty or additional liability. + +_END OF TERMS AND CONDITIONS_ + +### APPENDIX: How to apply the Apache License to your work + +To apply the Apache License to your work, attach the following boilerplate +notice, with the fields enclosed by brackets `[]` replaced with your own +identifying information. (Don't include the brackets!) The text should be +enclosed in the appropriate comment syntax for the file format. We also +recommend that a file or class name and description of purpose be included on +the same “printed page” as the copyright notice for easier identification within +third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/materials/_quarto.yml b/materials/_quarto.yml index 9cb51add..51ddef09 100755 --- a/materials/_quarto.yml +++ b/materials/_quarto.yml @@ -3,7 +3,7 @@ project: output-dir: _book book: - title: " NCEAS Learning Hub's coreR Course" + title: "NCEAS Open Science Synthesis for the Delta Science Program" # date: "April 3, 2023" reader-mode: false @@ -23,30 +23,31 @@ book: - icon: twitter href: https://twitter.com/ucsb_nceas - icon: github - href: https://github.com/NCEAS/core-r-course + href: https://github.com/NCEAS/nceas-training/tree/2023-06-delta chapters: - index.qmd # preface - - session_01.qmd # rstudio server setup + - session_01.qmd # rstudio server setup + mini intro to R programming - session_02.qmd # git setup - - session_03.qmd # why r programming +mini intro to R programming - - session_04.qmd # intro to rmd - - session_05.qmd # FAIR and CARE - - session_06.qmd # intro to git + github - - session_07.qmd # cleaning and wrangling data - - session_08.qmd # tidy data - - session_09.qmd # r practice: joins, tidy data - - session_10.qmd # git collab - - session_11.qmd # publish github pages - - session_12.qmd # intro data viz - - session_13.qmd # r practice: wrangling, data viz, github pages - - session_14.qmd # combined dmp + metadata + publishing - - session_15.qmd # intro to sf + map making - - session_16.qmd # r practice: wrangling, sf, map - - session_17.qmd # git workflows - - session_18.qmd # reproducibility and provenance + - session_03.qmd # r intro to quarto + - session_04.qmd # LEGO® reproducibility activity + - session_05.qmd # Accessing and publishing data + - session_06.qmd # logic modeling + - session_10.qmd # synthesis + - session_09.qmd # data modeling essentials + - session_16.qmd # FAIR and CARE + - session_07.qmd # git and github intro + - session_08.qmd # programmatic data access + - session_13.qmd # synthesis + - session_11.qmd # clean and wrangle data + - session_12.qmd # r practice: clean and wrangling + - session_14.qmd # r functions + - session_15.qmd # r packages + - session_13.qmd # synthesis + - session_19.qmd # reproducible papers + -# bibliography: references.bib +bibliography: book.bib format: html: diff --git a/materials/book.bib b/materials/book.bib index cf831c83..1b04fe81 100644 --- a/materials/book.bib +++ b/materials/book.bib @@ -233,4 +233,43 @@ @article{carroll_care_2020 note = {Number: 1 Publisher: Ubiquity Press}, keywords = {{FAIR} principles, Indigenous, data governance, data principles, data sovereignty}, -} \ No newline at end of file +} + + +@misc{warren2021, + title = {Ecological and social Interactions in urban parks: bird surveys in local parks in the central Arizona-Phoenix metropolitan area}, + author = {Warren, Paige S.
and Kinzig, Ann and Martin, Chris A and Machabee, Louis}, + year = {2021}, + date = {2021}, + publisher = {Environmental Data Initiative}, + doi = {10.6073/PASTA/F6F004BC7112CE266FDE2B80FAD19FF4}, + url = {https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-cap.256.10}, + langid = {en} +} + +@misc{lter2022, + title = {SBC LTER: Reef: Abundance, size and fishing effort for California Spiny Lobster (Panulirus interruptus), ongoing since 2012}, + author = {LTER, Santa Barbara Coastal and Reed, Daniel C and Miller, Robert J}, + year = {2022}, + date = {2022}, + publisher = {Environmental Data Initiative}, + doi = {10.6073/PASTA/25AA371650A671BAFAD64DD25A39EE18}, + url = {https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-sbc.77.8}, + langid = {en} +} + +@article{michener2015, + doi = {10.1371/journal.pcbi.1004525}, + author = {Michener, William K.}, + journal = {PLOS Computational Biology}, + publisher = {Public Library of Science}, + title = {Ten Simple Rules for Creating a Good Data Management Plan}, + year = {2015}, + month = {10}, + volume = {11}, + url = {https://doi.org/10.1371/journal.pcbi.1004525}, + pages = {1-9}, + abstract = {null}, + number = {10}, + +} diff --git a/materials/images/allison-horst-git-workflow.png b/materials/images/allison-horst-git-workflow.png new file mode 100644 index 00000000..35e3cc2f Binary files /dev/null and b/materials/images/allison-horst-git-workflow.png differ diff --git a/materials/images/allison-horst-jenny-bryan-quote.png b/materials/images/allison-horst-jenny-bryan-quote.png new file mode 100644 index 00000000..4c44b266 Binary files /dev/null and b/materials/images/allison-horst-jenny-bryan-quote.png differ diff --git a/materials/images/delta-meeting-schedule.png b/materials/images/delta-meeting-schedule.png new file mode 100644 index 00000000..5ba3373f Binary files /dev/null and b/materials/images/delta-meeting-schedule.png differ diff --git a/materials/images/delta-synthesis-slack.png b/materials/images/delta-synthesis-slack.png new file mode 100644 index 00000000..4fe14e6d Binary files /dev/null and b/materials/images/delta-synthesis-slack.png differ diff --git a/materials/images/delta/delta-logo.png b/materials/images/delta/delta-logo.png new file mode 100644 index 00000000..606dd02c Binary files /dev/null and b/materials/images/delta/delta-logo.png differ diff --git a/materials/images/git-collab-repos.svg b/materials/images/git-collab-repos.svg new file mode 100644 index 00000000..4db4c4c1 --- /dev/null +++ b/materials/images/git-collab-repos.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/materials/images/git-intro.png b/materials/images/git-intro.png new file mode 100644 index 00000000..f0dc80f3 Binary files /dev/null and b/materials/images/git-intro.png differ diff --git a/materials/images/github-intro.png b/materials/images/github-intro.png new file mode 100644 index 00000000..d7ae8d6a Binary files /dev/null and b/materials/images/github-intro.png differ diff --git a/materials/images/non-verbal-feedback.png b/materials/images/non-verbal-feedback.png new file mode 100644 index 00000000..d0aa751b Binary files /dev/null and b/materials/images/non-verbal-feedback.png differ diff --git a/materials/images/quarto-code-options.png b/materials/images/quarto-code-options.png new file mode 100644 index 00000000..615232d6 Binary files /dev/null and b/materials/images/quarto-code-options.png differ diff --git a/materials/images/quarto-rmd-logo.png b/materials/images/quarto-rmd-logo.png new file mode 100644 
index 00000000..a191a3c4 Binary files /dev/null and b/materials/images/quarto-rmd-logo.png differ diff --git a/materials/images/quarto-rmd-rendered.png b/materials/images/quarto-rmd-rendered.png new file mode 100644 index 00000000..1f38181b Binary files /dev/null and b/materials/images/quarto-rmd-rendered.png differ diff --git a/materials/images/quarto-rmd-structure.png b/materials/images/quarto-rmd-structure.png new file mode 100644 index 00000000..e259f622 Binary files /dev/null and b/materials/images/quarto-rmd-structure.png differ diff --git a/materials/images/quarto-side-by-side.png b/materials/images/quarto-side-by-side.png new file mode 100644 index 00000000..7264ca02 Binary files /dev/null and b/materials/images/quarto-side-by-side.png differ diff --git a/materials/images/regions-split-apply-combine.png b/materials/images/regions-split-apply-combine.png new file mode 100644 index 00000000..330fe1b8 Binary files /dev/null and b/materials/images/regions-split-apply-combine.png differ diff --git a/materials/images/rmd-code-options.png b/materials/images/rmd-code-options.png new file mode 100644 index 00000000..3d2f2fbd Binary files /dev/null and b/materials/images/rmd-code-options.png differ diff --git a/materials/images/schedule.png b/materials/images/schedule.png index 43e3e5c3..a138e857 100644 Binary files a/materials/images/schedule.png and b/materials/images/schedule.png differ diff --git a/materials/images/tidy-data-images/data_normalization/denormalized.png b/materials/images/tidy-data-images/data_normalization/denormalized.png new file mode 100644 index 00000000..8999a0af Binary files /dev/null and b/materials/images/tidy-data-images/data_normalization/denormalized.png differ diff --git a/materials/images/tidy-data-images/data_normalization/normalized.png b/materials/images/tidy-data-images/data_normalization/normalized.png new file mode 100644 index 00000000..0194df3a Binary files /dev/null and b/materials/images/tidy-data-images/data_normalization/normalized.png differ diff --git a/materials/images/tidy-data-images/data_normalization/two_entities.png b/materials/images/tidy-data-images/data_normalization/two_entities.png new file mode 100644 index 00000000..e2841f89 Binary files /dev/null and b/materials/images/tidy-data-images/data_normalization/two_entities.png differ diff --git a/materials/images/tidy-data-images/merging_data/join-diagrams-full.png b/materials/images/tidy-data-images/merging_data/join-diagrams-full.png new file mode 100644 index 00000000..4a4bce80 Binary files /dev/null and b/materials/images/tidy-data-images/merging_data/join-diagrams-full.png differ diff --git a/materials/images/tidy-data-images/merging_data/join-diagrams-inner.png b/materials/images/tidy-data-images/merging_data/join-diagrams-inner.png new file mode 100644 index 00000000..ce66bc53 Binary files /dev/null and b/materials/images/tidy-data-images/merging_data/join-diagrams-inner.png differ diff --git a/materials/images/tidy-data-images/merging_data/join-diagrams-left.png b/materials/images/tidy-data-images/merging_data/join-diagrams-left.png new file mode 100644 index 00000000..2e13e9c9 Binary files /dev/null and b/materials/images/tidy-data-images/merging_data/join-diagrams-left.png differ diff --git a/materials/images/tidy-data-images/merging_data/join-diagrams-right.png b/materials/images/tidy-data-images/merging_data/join-diagrams-right.png new file mode 100644 index 00000000..d99c4a93 Binary files /dev/null and b/materials/images/tidy-data-images/merging_data/join-diagrams-right.png 
differ diff --git a/materials/images/tidy-data-images/merging_data/join-diagrams-separate.png b/materials/images/tidy-data-images/merging_data/join-diagrams-separate.png new file mode 100644 index 00000000..257820ac Binary files /dev/null and b/materials/images/tidy-data-images/merging_data/join-diagrams-separate.png differ diff --git a/materials/images/tidy-data-images/merging_data/join-venn.png b/materials/images/tidy-data-images/merging_data/join-venn.png new file mode 100644 index 00000000..023cd2f2 Binary files /dev/null and b/materials/images/tidy-data-images/merging_data/join-venn.png differ diff --git a/materials/images/ERD_Relationship_Symbols_Quick_Reference-1.png b/materials/images/tidy-data-images/relational_data_models/ERD_Relationship_Symbols_Quick_Reference.png similarity index 100% rename from materials/images/ERD_Relationship_Symbols_Quick_Reference-1.png rename to materials/images/tidy-data-images/relational_data_models/ERD_Relationship_Symbols_Quick_Reference.png diff --git a/materials/images/tidy-data-images/relational_data_models/ER_diagram_1.png b/materials/images/tidy-data-images/relational_data_models/ER_diagram_1.png new file mode 100644 index 00000000..730cce13 Binary files /dev/null and b/materials/images/tidy-data-images/relational_data_models/ER_diagram_1.png differ diff --git a/materials/images/tidy-data-images/relational_data_models/ER_diagram_2.png b/materials/images/tidy-data-images/relational_data_models/ER_diagram_2.png new file mode 100644 index 00000000..199b02ee Binary files /dev/null and b/materials/images/tidy-data-images/relational_data_models/ER_diagram_2.png differ diff --git a/materials/images/tidy-data-images/relational_data_models/ER_diagram_3.png b/materials/images/tidy-data-images/relational_data_models/ER_diagram_3.png new file mode 100644 index 00000000..68064ff8 Binary files /dev/null and b/materials/images/tidy-data-images/relational_data_models/ER_diagram_3.png differ diff --git a/materials/images/tidy-data-images/relational_data_models/ER_diagram_4.png b/materials/images/tidy-data-images/relational_data_models/ER_diagram_4.png new file mode 100644 index 00000000..c5a58867 Binary files /dev/null and b/materials/images/tidy-data-images/relational_data_models/ER_diagram_4.png differ diff --git a/materials/images/tidy-data-images/relational_data_models/ER_diagram_5.png b/materials/images/tidy-data-images/relational_data_models/ER_diagram_5.png new file mode 100644 index 00000000..0f206d00 Binary files /dev/null and b/materials/images/tidy-data-images/relational_data_models/ER_diagram_5.png differ diff --git a/materials/images/tidy-data-images/relational_data_models/compound_key.png b/materials/images/tidy-data-images/relational_data_models/compound_key.png new file mode 100644 index 00000000..0e31acfb Binary files /dev/null and b/materials/images/tidy-data-images/relational_data_models/compound_key.png differ diff --git a/materials/images/tidy-data-images/relational_data_models/primary_foreign_keys.png b/materials/images/tidy-data-images/relational_data_models/primary_foreign_keys.png new file mode 100644 index 00000000..f927c922 Binary files /dev/null and b/materials/images/tidy-data-images/relational_data_models/primary_foreign_keys.png differ diff --git a/materials/images/tidy-data-images/relational_data_models/surrogate_natural_keys.png b/materials/images/tidy-data-images/relational_data_models/surrogate_natural_keys.png new file mode 100644 index 00000000..7d5cb431 Binary files /dev/null and 
b/materials/images/tidy-data-images/relational_data_models/surrogate_natural_keys.png differ diff --git a/materials/images/excel-org-01.png b/materials/images/tidy-data-images/tidy_data/excel-org-01.png similarity index 100% rename from materials/images/excel-org-01.png rename to materials/images/tidy-data-images/tidy_data/excel-org-01.png diff --git a/materials/images/excel-org-02.png b/materials/images/tidy-data-images/tidy_data/excel-org-02.png similarity index 100% rename from materials/images/excel-org-02.png rename to materials/images/tidy-data-images/tidy_data/excel-org-02.png diff --git a/materials/images/excel-org-03.png b/materials/images/tidy-data-images/tidy_data/excel-org-03.png similarity index 100% rename from materials/images/excel-org-03.png rename to materials/images/tidy-data-images/tidy_data/excel-org-03.png diff --git a/materials/images/excel-org-04.png b/materials/images/tidy-data-images/tidy_data/excel-org-04.png similarity index 100% rename from materials/images/excel-org-04.png rename to materials/images/tidy-data-images/tidy_data/excel-org-04.png diff --git a/materials/images/excel-org-05.png b/materials/images/tidy-data-images/tidy_data/excel-org-05.png similarity index 100% rename from materials/images/excel-org-05.png rename to materials/images/tidy-data-images/tidy_data/excel-org-05.png diff --git a/materials/images/tidy-data-images/tidy_data/tidy_data.png b/materials/images/tidy-data-images/tidy_data/tidy_data.png new file mode 100644 index 00000000..afe092b8 Binary files /dev/null and b/materials/images/tidy-data-images/tidy_data/tidy_data.png differ diff --git a/materials/images/tidy-data-images/tidy_data/tidy_not_normal.png b/materials/images/tidy-data-images/tidy_data/tidy_not_normal.png new file mode 100644 index 00000000..5fcaa2eb Binary files /dev/null and b/materials/images/tidy-data-images/tidy_data/tidy_not_normal.png differ diff --git a/materials/images/tidy-data-images/tidy_data/tidy_observations.png b/materials/images/tidy-data-images/tidy_data/tidy_observations.png new file mode 100644 index 00000000..75dd7234 Binary files /dev/null and b/materials/images/tidy-data-images/tidy_data/tidy_observations.png differ diff --git a/materials/images/tidy-data-images/tidy_data/tidy_values.png b/materials/images/tidy-data-images/tidy_data/tidy_values.png new file mode 100644 index 00000000..48992c26 Binary files /dev/null and b/materials/images/tidy-data-images/tidy_data/tidy_values.png differ diff --git a/materials/images/tidy-data-images/tidy_data/tidy_variables.png b/materials/images/tidy-data-images/tidy_data/tidy_variables.png new file mode 100644 index 00000000..f0db28aa Binary files /dev/null and b/materials/images/tidy-data-images/tidy_data/tidy_variables.png differ diff --git a/materials/images/tidy-data-images/tidy_data/untidy_1.png b/materials/images/tidy-data-images/tidy_data/untidy_1.png new file mode 100644 index 00000000..bc2a0eb5 Binary files /dev/null and b/materials/images/tidy-data-images/tidy_data/untidy_1.png differ diff --git a/materials/images/tidy-data-images/tidy_data/untidy_2.png b/materials/images/tidy-data-images/tidy_data/untidy_2.png new file mode 100644 index 00000000..55a65e8d Binary files /dev/null and b/materials/images/tidy-data-images/tidy_data/untidy_2.png differ diff --git a/materials/index.qmd b/materials/index.qmd index a42c0b09..a4051cdf 100644 --- a/materials/index.qmd +++ b/materials/index.qmd @@ -1,38 +1,85 @@ -# Preface {.unnumbered} - -*April 3 - April 7, 2023* - -## Welcome to the coreR Course +# Overview 
{.unnumbered} ::: column-margin ![](cover.png){width="80%" fig-align="center"} + +![](images/delta/delta-logo.png){width="80%" fig-align="center"} + ::: -A five-day immersion in R programming for environmental data science. Researchers will gain experience with essential data science tools and best practices to increase their capacity as collaborators, reproducible coders, and open scientists. This course is taught both in-person and virtually. +## About this training + +NCEAS Open Science Synthesis training consists of three week-long workshops, geared towards early-career researchers. Participants engage in a mix of lectures, exercises, and synthesis research groups to undertake synthesis while learning and implementing best practices for open data science. + + + -### Course Learning Objectives: +## Why NCEAS +The [National Center for Ecological Analysis and Synthesis (NCEAS)](https://www.nceas.ucsb.edu/), a research affiliate of UCSB, is a leading expert on interdisciplinary data science and works collaboratively to answer the world's largest and most complex questions. The NCEAS approach leverages existing data and employs a team science philosophy to squeeze out all potential insights and solutions efficiently - this is called [synthesis science](https://www.nceas.ucsb.edu/our-approach). -- Effectively manage data using `tidy` data practices and developing quality metadata -- Implement reproducible scientific workflows throughout all aspects of a project -- Build data visualizations and reusable reports using `ggplot`, `markdown`, and GitHub -- Increase your familiarity and confidence with data science tools +NCEAS has over 25 years of success with this model among working groups and environmental professionals. Together with the Delta Science Program and the Delta Stewardship Council, we are excited to pass along the skills, workflows, and mindsets learned throughout the years. + + +## Week 1: Open Data and Synthesis + +*June 26-30, 2023* + +Learning Objectives: + + - Implement reproducible scientific workflows throughout all aspects of a project + - Increase your familiarity and confidence with data science tools + - Effectively manage and wrangle data using `tidy` data practices + - Access, interpret, and develop metadata for synthesis research + - Organize and initiate synthesis projects + ## Schedule -![](images/schedule-coreR-2023-04.png) + + +:::{.column-page} ![](images/schedule.png) ::: + + + +## Next trainings + +### Week 2: Open Tools for Analysis and Visualization + +*Aug 28 - Sep 1, 2023* + +- Strengthen core knowledge of version control and workflow +- Introduce meta-analysis concepts and tools +- Approaches for geospatial visualization +- Data tools for qualitative data + + +### Week 3: Scaling up and presenting synthesis + +*October 23 – 27, 2023* + +- Handling missing data +- Big data workflows and parallel computing +- Building scientific websites with R and Shiny +- Synthesis presentations and next steps + ## Code of Conduct By participating in this activity you agree to abide by the [NCEAS Code of Conduct](https://www.nceas.ucsb.edu/sites/default/files/2021-11/NCEAS_Code-of-Conduct_Nov2021_0.pdf). + + ## About this book -These written materials are the result of a continuous and collaborative effort at NCEAS to help researchers make their work more transparent and reproducible. This work began in the early 2000's, and reflects the expertise and diligence of many, many individuals.
The primary authors are listed in the citation below, with additional contributors recognized for their role in developing previous iterations of these or similar materials. +These written materials are the result of a continuous and collaborative effort at NCEAS, with the support of DataONE, to help researchers make their work more transparent and reproducible. This work began in the early 2000's, and reflects the expertise and diligence of many, many individuals. The primary authors for this version are listed in the citation below, with additional contributors recognized for their role in developing previous iterations of these or similar materials. This work is licensed under a [Creative Commons Attribution 4.0 International License](http://creativecommons.org/licenses/by/4.0/). -**Citation:** Halina Do-Linh, Camila Vargas Poulsen, Samantha Csik, Daphne Virlar-Knight. 2023. coreR Course. NCEAS Learning Hub. -**Additional contributors:** Ben Bolker, Amber E. Budden, Julien Brun, Natasha Haycock-Chavez, S. Jeanette Clark, Julie Lowndes, Stephanie Hampton, Matthew B. Jones, Samanta Katz, Erin McLean, Bryce Mecum, Deanna Pennington, Karthik Ram, Jim Regetz, Tracy Teal, Leah Wasser. +**Citation:** Halina Do-Linh, Carmen Galaz García, Matthew B. Jones, Camila Vargas Poulsen. 2023. Open Science Synthesis training Week 1. NCEAS Learning Hub & Delta Stewardship Council. + +**Additional contributors:** Ben Bolker, Julien Brun, Amber E. Budden, Jeanette Clark, Samantha Csik, Stephanie Hampton, Natasha Haycock-Chavez, Samanta Katz, Julie Lowndes, Erin McLean, Bryce Mecum, Deanna Pennington, Karthik Ram, Jim Regetz, Tracy Teal, Daphne Virlar-Knight, Leah Wasser. This is a Quarto book. To learn more about Quarto books visit <https://quarto.org/docs/books>. diff --git a/materials/sections/accessing-and-publishing-data.qmd b/materials/sections/accessing-and-publishing-data.qmd new file mode 100644 index 00000000..4545d2dd --- /dev/null +++ b/materials/sections/accessing-and-publishing-data.qmd @@ -0,0 +1,403 @@ +## Learning Objectives {.unnumbered} + +- Understand the importance of data management for successfully preserving data +- Learn about metadata guidelines and best practices for reproducibility +- Become familiar with environmental data repositories for accessing and publishing data + +## The Big Idea + +The ultimate goal of this lesson is to provide an overview of a reproducible open science framework for your research, whether you are accessing published data (as a data user) -- for example, to use it in a synthesis project -- or publishing your own data (as a data author). To achieve this, we are going to talk about the following topics. + +- The Data Life Cycle +- The importance of data management +- Metadata best practices +- Data preservation + +We will discuss how these topics relate to each other and why they are the building blocks that allow you to use others' data and allow others to access, interpret, and use your data in the future. + +## The Data Life Cycle + +The Data Life Cycle gives you an overview of meaningful steps in a research project. This step-by-step breakdown facilitates successful management and preservation of data throughout a project. Some research activities might use only part of the life cycle. For example, a meta-analysis might focus on the Discover, Integrate, and Analyze steps, while a project focused on primary data collection and analysis might bypass the Discover and Integrate steps.
+ +[![Source: DataONE](images/data-life-cycle.png){fig-alt="Data Life Cycle graphic with each stage following the next to create a circle." fig-align="right"}](https://dataoneorg.github.io/Education/bestpractices/) + +The first step to working with data is identifying where your project is starting in the Data Life Cycle. Using the data life cycle stages, create your own cycle that best fits your project needs. + +**A way to use the Data Life Cycle in practice is to:** + +- Think about the end goal, outcomes, and products of your project +- Think about and decide which steps in the Data Life Cycle you need to include in your project +- Review [best practices](https://dataoneorg.github.io/Education/bestpractices/) for each step in your cycle and start outlining action items for those steps. + +DataONE's [Data Management Skillbuilding Hub](https://dataoneorg.github.io/Education/bestpractices/) offers several best practices on how to effectively work with your data throughout all stages of the data life cycle. + +No matter what your data life cycle looks like, *Plan* should be at the top of the cycle. It is advisable to initiate your data management planning at the beginning of your research process, before any data has been collected or discovered. The following section discusses data management in more depth and how to plan accordingly. + +## Managing your data + +Successfully managing your data throughout a research project helps ensure its preservation for future use. + +### Why manage your data? + +**From a Researcher Perspective** + +- Keep yourself organized -- be able to find your files (data inputs, analytic scripts, outputs at various stages of the analytic process, etc.) +- Track your science processes for reproducibility -- be able to match up your outputs with the exact inputs and transformations that produced them +- Better control versions of data -- easily identify versions that can be periodically purged +- Quality control your data more efficiently +- Avoid data loss (e.g., by making backups) +- Format your data for re-use (by yourself or others) +- Be prepared: Document your data for your own recollection, accountability, and re-use (by yourself or others) +- Gain credibility and recognition for your science efforts through data sharing! + +**Advancement of Science** + +- Data is a valuable asset -- it is expensive and time-consuming to collect + +- Maximize the effective use and value of data and information assets + +- Continually improve data quality, including accuracy, integrity, integration, timeliness of data capture and presentation, relevance, and usefulness + +- Ensure appropriate use of data and information + +- Facilitate data sharing + +- Ensure sustainability and accessibility in the long term for re-use in science + +### Tools to Manage your Data + +A Data Management Plan (DMP) is a document that describes how you will use your data during a research project, as well as what you will do with your data long after the project ends. DMPs are living documents and should be updated as research plans change to ensure new data management practices are captured ([Environmental Data Initiative](https://edirepository.org/resources/data-management-planning)).
+ +A well-thought-out plan means you are more likely to: + +- stay organized +- work efficiently +- truly share data +- engage your team +- meet funder requirements, as DMPs are becoming common in the submission process for proposals + +A DMP is both a straightforward blueprint for how you will manage your data *and* a set of guidelines for you and your team on policies, access, roles, and more. While it is important to plan, it is equally important to recognize that no plan is perfect, as change is inevitable. To make your DMP as robust as possible, treat it as a "living document" that you periodically review with your team and adjust as the needs of the project change. + +### How to Plan + +- **Plan early:** research shows that information is inevitably lost over time, so it's important to think about long-term plans for your research at the beginning, before you're deep in your project. Ultimately, you'll save time. +- **Plan in collaboration:** high engagement of your team and other important contributors is not only a benefit to your project, but it also makes your DMP more resilient. When you include diverse expertise and perspectives in the planning stages, you're more likely to overcome obstacles in the future. +- **Utilize existing resources:** don't reinvent the wheel! There are many great DMP resources out there. Consider the article *Ten Simple Rules for Creating a Good Data Management Plan* [@michener2015], which has succinct guidelines on what to include in a DMP. Or use an online tool like [DMPTool](https://dmptool.org/), which provides official DMP templates from funders like NSF, offers example answers, and allows for collaboration. +- **Make revising part of the process:** Don't let your DMP collect dust after you initially write it. Make revising the DMP part of your research project and use it as a guide to ensure you're keeping on track. +- **Include a tidy and ethical lens:** It is important to start thinking through these lenses while planning your DMP; doing so will make it easier to include and maintain tidy and ethical principles throughout the entire project. We will discuss in depth [tidy data](https://learning.nceas.ucsb.edu/2023-06-delta/session_09.html), [FAIR principles](https://learning.nceas.ucsb.edu/2023-06-delta/session_16.html#what-is-fair), and data ethics through the [CARE principles](https://learning.nceas.ucsb.edu/2023-06-delta/session_16.html#what-is-care) later this week. + +More details on what to include in a Data Management Plan can be found in [Additional Resources](https://learning.nceas.ucsb.edu/2023-06-delta/session_05.html#additional-resouces). + +## Metadata Best Practices + +Within the data life cycle you can be collecting data (creating new data) or integrating data that has already been collected. Either way, **metadata** plays a major role in successfully moving through the cycle because it enables data reuse long after the original collection. + +Imagine that you're writing your metadata for a typical researcher (who might even be you!) 30+ years from now - what will they need to understand what's inside your data files? + +The goal is to have enough information for the researcher to **understand the data**, **interpret the data**, and then **reuse the data** in another study. + +### Overall Guidelines + +Another way to think about metadata is to answer the following questions with the documentation: + +- What was measured? +- Who measured it? +- When was it measured? +- Where was it measured? +- How was it measured?
+ +- How is the data structured? +- Why was the data collected? +- Who should get credit for this data (researcher AND funding agency)? +- How can this data be reused (licensing)? + +### Bibliographic Guidelines + +The details that will help your data be cited correctly are: + +- **Global identifier** like a digital object identifier (DOI) +- Descriptive **title** that includes information about the topic, the geographic location, the dates, and, if applicable, the scale of the data +- Descriptive **abstract** that serves as a brief overview of the specific contents and purpose of the data package +- **Funding information** like the award number and the sponsor +- **People and organizations** like the creator of the dataset (i.e. who should be cited), the person to **contact** about the dataset (if different from the creator), and the contributors to the dataset + +### Discovery Guidelines + +The details that will help your data be discovered correctly are: + +- **Geospatial coverage** of the data, including the field and laboratory sampling locations, place names, and precise coordinates +- **Temporal coverage** of the data, including when the measurements were made and what time period (i.e., the calendar time or the geologic time) the measurements apply to +- **Taxonomic coverage** of the data, including what species were measured and what taxonomy standards and procedures were followed +- Any other **contextual information** as needed + +### Interpretation Guidelines + +The details that will help your data be interpreted correctly are: + +- **Collection methods** for both field and laboratory data, the full experimental and project design, as well as how the data in the dataset fit into the overall project +- **Processing methods** for both field and laboratory samples +- All sample **quality control procedures** +- **Provenance** information to support your analysis and modeling methods +- Information about the **hardware and software** used to process your data, including the make, model, and version +- **Computing quality control** procedures like testing or code review + +### Data Structure and Contents + +- **Everything needs a description**: the data model, the data objects (like tables, images, matrices, spatial layers, etc.), and the variables all need to be described so that there is no room for misinterpretation. +- **Variable information** includes the definition of a variable, a standardized unit of measurement, definitions of any coded values (e.g., 0 = not collected), and any missing values (e.g., 999 = NA). + +Not only is this information helpful to you and any other researcher in the future using your data, but it is also helpful to search engines. The semantics of your dataset are crucial to ensure your data is both discoverable by others and interoperable (that is, reusable). + +For example, if you were to search for the character string "carbon dioxide flux" in a data repository, not all relevant results will be shown due to varying vocabulary conventions (e.g., "CO2 flux" instead of "carbon dioxide flux") across disciplines --- only datasets containing the exact words "carbon dioxide flux" are returned. With correct semantic annotation of the variables, your dataset that includes information about carbon dioxide flux but that calls it CO2 flux WOULD be included in that search. + +### Rights and Attribution + +Correctly **assigning a way for your datasets to be cited** and reused is the last piece of a complete metadata document.
This section sets the scientific rights and expectations for the future use of your data, such as: + +- Citation format to be used when giving credit for the data +- Attribution expectations for the dataset +- Reuse rights, which describe who may use the data and for what purpose +- Redistribution rights, which describe who may copy and redistribute the metadata and the data +- Legal terms and conditions like how the data are licensed for reuse. + +### Metadata Standards + +So, **how does a computer organize all this information?** There are a number of metadata standards that make your metadata machine-readable, making it easier for data curators to publish your data. + +- [Ecological Metadata Language (EML)](https://eml.ecoinformatics.org/) +- [Geospatial Metadata Standards (ISO 19115 and ISO 19139)](https://www.fgdc.gov/metadata/iso-standards) + - See [NOAA's ISO Workbook](http://www.ncei.noaa.gov/sites/default/files/2020-04/ISO%2019115-2%20Workbook_Part%20II%20Extentions%20for%20imagery%20and%20Gridded%20Data.pdf) +- [Biological Data Profile (BDP)](https://www.fgdc.gov/standards/projects/FGDC-standards-projects/metadata/biometadata/biodatap.pdf) +- [Dublin Core](https://www.dublincore.org/) +- [Darwin Core](https://dwc.tdwg.org/) +- [PREservation Metadata: Implementation Strategies (PREMIS)](https://www.loc.gov/standards/premis/) +- [Metadata Encoding Transmission Standard (METS)](https://www.loc.gov/standards/mets/) + +*Note: this is not an exhaustive list.* + +### Data Identifiers + +Many journals require that a DOI (a digital object identifier) be assigned to the published data before the paper can be accepted for publication. The reason is so that the data can easily be found and linked to. + +Some data repositories assign a DOI to each dataset you publish in their repository. But if you need to update the dataset, check the policy of the data repository. Some repositories assign a new DOI after you update the dataset. If this is the case, researchers should cite the exact version of the dataset that they used in their analysis, even if there is a newer version of the dataset available. + +### Data Citation + +Researchers should get in the habit of citing the data that they use (even if it's their own data!) in each publication that uses that data. + +## Data Sharing & Preservation + +![](images/WhyManage-small.png) + +### Data Packages + +> We define a data package as a scientifically useful collection of data and metadata that a researcher wants to preserve. + +Sometimes a data package represents all of the data from a particular experiment, while at other times it might be all of the data from a grant, or on a topic, or associated with a paper. Whatever the extent, we define a data package as having one or more data files, software files, and other scientific products such as graphs and images, all tied together with a descriptive metadata document. + +Many data repositories assign a unique identifier to every version of every data file, similar to how source code commits work in GitHub. Those identifiers usually take one of two forms: a DOI, often assigned to the metadata record, which becomes a publicly citable identifier for the package; and a globally unique identifier, often a UUID, assigned to each of the other files. This allows every digital entity within a data package to be identified.
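As a quick aside (this sketch is not part of the original lesson), here is how you might mint the kind of per-file identifier described above using the `uuid` package, which is already listed in this book's DESCRIPTION. The UUID in the comment is an illustrative placeholder, and the package-level DOI shown is the example DOI cited in the next paragraph; in practice the repository assigns it when you publish.

```{r}
#| eval: false
library(uuid)

# Mint a globally unique identifier (UUID) for a single file in a data package,
# similar to the per-file identifiers many repositories assign automatically.
file_id <- UUIDgenerate()
file_id
#> [1] "2f9d0c3a-6b1e-4f7a-9c2d-8e5b1a4d7c90"  # placeholder value; yours will differ

# The package-level DOI is assigned by the repository, not minted locally;
# it is recorded alongside the per-file identifiers, e.g.:
package_doi <- "doi:10.5063/F1Z1899CZ"
```

Minting identifiers locally is mostly useful for tracking files before submission; the repository assigns or confirms the authoritative identifiers when the data package is published.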
+ +In the graphic to the side, the package can be cited with the DOI `doi:10.5063/F1Z1899CZ`, and each of the individual files has its own identifier as well. + +![](images/data-package.png) + +### Data Repositories: Built for Data (and code) + +- GitHub is not an archival location +- Examples of dedicated data repositories: + - KNB + - Arctic Data Center + - tDAR + - EDI + - Zenodo +- Dedicated data repositories are: + - Rich in metadata + - Archival in their mission + - [Certified](https://www.coretrustseal.org/) +- Data papers, e.g., Scientific Data +- [re3data](https://www.re3data.org/) is a global registry of research data repositories +- [Repository Finder](https://repositoryfinder.datacite.org/) is a pilot project and tool to help researchers find an appropriate repository for their work + +#### DataONE Federation + +DataONE is a federation of dozens of data repositories that work together to make their systems interoperable and to provide a single unified search system that spans the repositories. DataONE aims to make it simpler for researchers to publish data to one of its member repositories, and then to discover and download that data for reuse in synthetic analyses. + +DataONE can be [searched on the web](https://search.dataone.org/), which effectively allows a single search to find data from the dozens of members of DataONE, rather than visiting each of the (currently 44!) repositories one at a time. A brief code sketch of this kind of programmatic search appears just before the exercise below. + +![](images/DataONECNs.png) + +## Summary + +- The Data Life Cycle helps us see the big picture of our data project. +- Once we identify the necessary steps, it is helpful to think through each one and plan accordingly. +- Developing a data management plan is extremely helpful for staying organized. +- Document everything. Having rich metadata is a key factor in enabling data reuse. Describe your data and files and use an appropriate metadata standard. +- Publish your data in a stable, long-lived repository and assign it a unique identifier. + +## Data Users Example + +**Data Life Cycle** + +- Plan + +- Discover: Finding and understanding data + +- Integrate: Accessing Data + +- Analyze + +- Describe: Metadata, Citing data (provenance) + +- Preserve: Publishing a derived data package + +## Exercise: Evaluate a Data Package on the EDI Repository + +Explore data packages published on EDI and assess the quality of their metadata. Imagine you are planning on using this data for a synthesis project.
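Before diving into the exercise, here is a brief sketch (not part of the original lesson) of what the Discover and Integrate steps above can look like programmatically, using the `dataone` R package listed in this book's dependencies to run a single federated search across DataONE member repositories. The search term, returned fields, and row limit are illustrative assumptions to adapt to your own project.

```{r}
#| eval: false
library(dataone)

# Connect to the DataONE Coordinating Node for the production environment
cn <- CNode("PROD")

# Run one Solr query across all DataONE member repositories at once;
# the query and fields below are placeholders, not part of the lesson.
results <- query(cn,
                 solrQuery = list(q    = "abstract:lobster",
                                  fl   = "id,title,dateUploaded",
                                  rows = "10"),
                 as = "data.frame")

head(results)
```

From there, functions in the same package (for example, `getObject()`) can retrieve matching files for the Integrate step, or you can follow the returned identifiers to each repository's landing page.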
::: callout-tip +### Setup + +Break into groups and use the following data packages: + +- **Group A:** [SBC LTER: Reef: Abundance, size and fishing effort for California Spiny Lobster (Panulirus interruptus), ongoing since 2012](https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-sbc.77.8) + +- **Group B:** [Physiological stress of American pika (Ochotona princeps) and associated habitat characteristics for Niwot Ridge, 2018 - 2019](https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-nwt.268.1) + +- **Group C:** [Ecological and social interactions in urban parks: bird surveys in local parks in the central Arizona-Phoenix metropolitan area](https://portal.edirepository.org/nis/mapbrowse?scope=knb-lter-cap&identifier=256&revision=10) + +- **Group D:** [Interagency Ecological Program: Fish catch and water quality data from the Sacramento River floodplain and tidal slough, collected by the Yolo Bypass Fish Monitoring Program, 1998-2021.](https://portal.edirepository.org/nis/mapbrowse?scope=edi&identifier=233&revision=3) +::: + +```{=html} + +``` +You and your group will evaluate a data package for (1) its metadata quality and (2) its data documentation quality for reusability. + +::: callout-note +### Exercise: Evaluate a data package + +1. Open our [Data Package Assessment Rubric](https://docs.google.com/document/d/1PQpw9ohOMY7K1yBWaknMHV0dGEm0GnZ9pCB8_i2JPoU/edit?usp=sharing) (Note: Evaluate only the Metadata Documentation and Quality section), make a copy, and: + a. **Investigate the metadata in the provided data** + i. Does the metadata meet the standards we talked about? How so? + ii. If not, how would you improve the metadata based on the standards we talked about? + b. **Investigate the overall data documentation in the data package** + i. Is the documentation sufficient for reusing the data? Why or why not? + ii. If not, how would you improve the data documentation? What's missing? +2. Elect someone to share back to the group the following: + a. How easy or challenging was it to find the metadata and other data documentation you were evaluating? Why or why not? + b. What documentation stood out to you? What did you like or not like about it? + c. Do you feel like you understand the research project enough to use the data yourself (aka reproducibility)? + +*If you and your group finish early, check out more datasets in the bonus question.* +::: + +## Bonus: Investigate metadata and data documentation in other Data Repositories + +Not all environmental data repositories document and publish datasets and data packages in the same way. Nor do they have the same submission requirements. It's helpful to become familiar with metadata and data documentation jargon so it's easier to identify the information you're looking for. It's also helpful when you're nearing the end of your project and are getting ready to publish your datasets. + +Evaluate the following data packages at these data repositories: + +1. KNB [Arthropod pitfall trap biomass captured (weekly) and pitfall biomass model predictions (daily) near Toolik Field Station, Alaska, summers 2012-2016](https://knb.ecoinformatics.org/view/https%3A%2F%2Fpasta.lternet.edu%2Fpackage%2Fmetadata%2Feml%2Fknb-lter-arc%2F20052%2F1) +2. DataONE [USDA-NOAA NWS Daily Climatological Data](https://search.dataone.org/view/knb-lter-jrn.20020210.9893) +3.
Arctic Data Center [Landscape evolution and adapting to change in ice-rich permafrost systems 2021-2022](https://arcticdata.io/catalog/view/doi%3A10.18739%2FA2JQ0SW7X) + +How different are these data repositories from the EDI Data Portal? Would you consider publishing your data at one or more of these repositories? + +## Additional Resources + +### What to include in a DMP + ++---------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | DMP Section | Guiding Questions | +=======================================+===================================================================================================================================================================================================+ | Funder Requirements | - Does the funder have a template or specific DMP guidelines? | | | | | | - Do you thoroughly understand all the requirements? Or do you need to reach out for clarification? | | | | | | - Is there a page-limit to what you can submit in your proposal? Would it be beneficial to have an appendix or a longer version of your DMP for internal use elsewhere (and not for submission)? | +---------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | Study Design | - What analytical methods do you plan to use? | | | | | | - What experiments, if any, are needed to answer your research question? | | | | | | - What are the end products you plan to produce? | | | | | | - What ethical considerations do you have about your project? | +---------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | Data Collection | - What type of data do you plan to collect (text, audio files, images, models, spreadsheets)? | | | | | | - Where do you plan to source your data? Is it observational, already existing, or does it need to be collected? Do you need to obtain a license to access the data? Do you need an IRB review? | | | | | | - How much data do you plan to collect or use? | | | | | | - What format is the data in? Is it open source or is it proprietary? | +---------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | Data Organization | - How will you manage your data? Will you be using open source or proprietary software programs? | | | | | | - Do you need a database to manage your data? Are there existing databases you can utilize or do you need to build one? | | | | | | - What software tools do you plan to use to manage and organize your data? | +---------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | Quality Assurance and Quality Control | - How will you ensure that your data is of quality? | | | | | | - How will you maintain data integrity throughout your analysis?
| +| | | +| | - What tests will you run on your raw data and processed data? | +| | | +| | - Will you be utilizing outside partners to implement testing or QA/QC measures? | ++---------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| Data Policies | - What licenses do you plan to use for your data? Are there open source licenses that meet your funders requirements? | +| | | +| | - What are the policies for sharing, retaining, and licensing the data? Whose responsibility is that? | +| | | +| | - Are there any legal or ethical restrictions on your data? Do you have sensitive data that cannot be shared? Is a metadata documentation appropriate? | ++---------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| Data documentation & Metadata | - What information is required for you and others to accurately interpret, reuse, and access your data? | +| | | +| | - Will you be using a metadata standard? | +| | | +| | - What information is needed for you to write comprehensive metadata? | +| | | +| | - Where and how will you maintain this documentation? Is it possible for you to have the documentation open source? | ++---------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| Data Sharing | - How will the data be shared after the project ends? Is this an accessible location? | +| | | +| | - When will the data and project be available? Immediately after the project ends or a time period after? | +| | | +| | - Will you be publishing the project and the data to a journal? | +| | | +| | - What data products do you plan to share? | ++---------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| Roles and Responsibilities | - Who is in charge of collecting the data? Managing it? Storing it? Archiving it? Running quality control? Overall project management? There are lots of roles to consider here. | +| | | +| | - What kind of expertise is needed for these roles? | +| | | +| | - What happens if a role needs to change? How do you plan to handle this kind of change? | ++---------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| Long-term Storage & Data Preservation | - Where do you plan to archive your data? | +| | | +| | - How long will the data be accessible? | +| | | +| | - How will the data be accessed for future use? | +| | | +| | - How will you be storing the data during your project? Is this different than where you will store it after the project ends? | +| | | +| | - Does your institution or funder have long-term storage options for you to use? 
| ++---------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| Budget | - Do you need to purchase any proprietary software? | +| | | +| | - Do you need to purchase any hardware? | +| | | +| | - Do you need to pay for any services? | +| | | +| | - Will you need to hire employees? Consultants? | +| | | +| | - Do you anticipate that you will need to pay for any professional development or training either for yourself or your team? | ++---------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + +: {tbl-colwidths="\[20,80\]"} diff --git a/materials/sections/activity-reproducibility-lego.qmd b/materials/sections/activity-reproducibility-lego.qmd new file mode 100644 index 00000000..26f0531c --- /dev/null +++ b/materials/sections/activity-reproducibility-lego.qmd @@ -0,0 +1,41 @@ +--- +format: html +--- + +## Reproducibility activity using LEGO® + +### Learning Objectives {.unnumbered} + +- Illustrate elements of good reproducibility through the medium of LEGO® +- Discuss what is needed and what is not needed for good reproducibility + +::: callout-note +## Acknowledgements +This activity is largely based on the [LEGO® Metadata for Reproducibility game pack](http://eprints.gla.ac.uk/196477/), which was developed by [Mary Donaldson](https://orcid.org/0000-0002-1936-3499) and [Matt Mahon](https://orcid.org/0000-0001-8950-8422). +::: + +## Getting started + +::: callout-tip +#### Setup + +1. Gather into small groups +2. Get LEGO® blocks and worksheets (instructions + metadata documentation) +3. Follow directions on worksheets +::: + +At the end, we will discuss as a group. + +## Discussion + +
+ +Discussion Questions + +- Did you find this a simple way to document your process? +- Was there anything you found difficult to capture? +- Did those replicating the builds find it straightforward to follow? +- Did you encounter any ambiguity in the instructions? + +
+ diff --git a/materials/sections/clean-wrangle-data.qmd b/materials/sections/clean-wrangle-data.qmd index e64f59d3..ad934fb7 100644 --- a/materials/sections/clean-wrangle-data.qmd +++ b/materials/sections/clean-wrangle-data.qmd @@ -1,42 +1,33 @@ ## Learning Objectives {.unnumbered} -- Learn about the Split-Apply-Combine strategy is and how it applies to data -- Describe the difference between wide vs. long table formats and how to convert between them - Introduce `dplyr` and `tidyr` functions to clean and wrangle data for analysis +- Learn about the Split-Apply-Combine strategy and how it applies to data wrangling +- Describe the difference between wide vs. long table formats and how to convert between them -## Introduction - -The data we get to work with are rarely, if ever, in the format we need to do our analyses. It’s often the case that one package requires data in one format, while another package requires the data to be in another format. To be efficient analysts, we should have good tools for reformatting data for our needs so we can do our actual work like making plots and fitting models. The `dplyr` and `tidyr` R packages provide a fairly complete and extremely powerful set of functions for us to do this reformatting quickly and learning these tools well will greatly increase your efficiency as an analyst. - -Analyses take many shapes, but they often conform to what is known as the Split-Apply-Combine strategy. This strategy follows a usual set of steps: - -- **Split**: Split the data into logical groups (e.g., area, stock, year) -- **Apply**: Calculate some summary statistic on each group (e.g. mean total length by year) -- **Combine**: Combine the groups back together into a single table - -![](images/split-apply-combine-diagram.png){width="90%" fig-align="center"} +## Introduction -As shown above, our original table is split into groups by `year`, we calculate the mean length for each group, and finally combine the per-year means into a single table. +The data we get to work with are rarely, if ever, in the format we need to do our analyses. +It’s often the case that one package requires data in one format, while another package requires the data to be in another format. +To be efficient analysts, we should have good tools for reformatting data for our needs so we can do further work like making plots and fitting models. +The `dplyr` and `tidyr` R packages provide a fairly complete and extremely powerful set of functions for us to do this reformatting quickly. +Learning these tools well will greatly increase your efficiency as an analyst. -`dplyr` provides a fast and powerful way to express this. Let’s look at a simple example of how this is done: +Let's look at two motivating examples. -Assuming our length data is already loaded in a `data.frame` called `length_data`: +::: {.callout-note appearance="minimal" icon=false} +## Example 1 +Suppose you have the following `data.frame` called `length_data` with data about salmon length and want to calculate the average length per year. 
| year| length\_cm| |-----:|-----------:| -| 1991| 5.673318| +| 1990| 5.673318| | 1991| 3.081224| | 1991| 4.592696| | 1992| 4.381523| | 1992| 5.597777| | 1992| 4.900052| -| 1992| 4.139282| -| 1992| 5.422823| -| 1992| 5.905247| -| 1992| 5.098922| - -We can do this calculation using `dplyr` like this: +The `dplyr` R library provides a fast and powerful way to do this calculation in a few lines of code: ```{r} #| eval: false @@ -44,8 +35,12 @@ length_data %>% group_by(year) %>% summarize(mean_length_cm = mean(length_cm)) ``` +::: + -Another exceedingly common thing we need to do is "reshape" our data. Let's look at an example table that is in what we will call "wide" format: +::: {.callout-note appearance="minimal" icon=false} +## Example 2 + Another process we often need to do is to "reshape" our data. Consider the following table that is in what we call "wide" format: | site | 1990 | 1991 | ... | 1993 | |--------|------|------|-----|------| @@ -54,17 +49,12 @@ Another exceedingly common thing we need to do is "reshape" our data. Let's look | ... | ... | ... | ... | ... | | dredge | 100 | 118 | ... | 112 | -You are probably quite familiar with data in the above format, where values of the variable being observed are spread out across columns (Here: columns for each year). Another way of describing this is that there is more than one measurement per row. This wide format works well for data entry and sometimes works well for analysis but we quickly outgrow it when using R. For example, how would you fit a model with year as a predictor variable? In an ideal world, we'd be able to just run: - -```{r} -#| eval: false -lm(length ~ year) -``` - +You are probably familiar with data in the above format, where values of the variable being observed are spread out across columns. +In this example we have a different column per year. +This wide format works well for data entry and sometimes works well for analysis but we quickly outgrow it when using R (and know it is not tidy data!). +For example, how would you fit a model with year as a predictor variable? In an ideal world, we'd be able to just run `lm(length ~ year)`. But this won't work on our wide data because `lm()` needs `length` and `year` to be columns in our table. -Or how would we make a separate plot for each year? We could call `plot()` one time for each year but this is tedious if we have many years of data and hard to maintain as we add more years of data to our data set. - The `tidyr` package allows us to quickly switch between wide format and long format using the `pivot_longer()` function: ```{r} @@ -80,8 +70,9 @@ site_data %>% | dredge | 1990 | 144| | ... | ... | ...| | dredge | 1993 | 145| +::: -In this lesson we're going to walk through the functions you'll most commonly use from the `dplyr` and `tidyr` packages: +This lesson will cover examples to learn about the functions you'll most commonly use from the `dplyr` and `tidyr` packages: | Function name | Description | |--------|------| @@ -105,13 +96,13 @@ In this lesson we're going to walk through the functions you'll most commonly us : Common `tidyr` functions {tbl-colwidths="[25,75]"} -## Data Cleaning Basics +## Data cleaning basics To demonstrate, we'll be working with a tidied up version of a data set from [Alaska Department of Fish & Game containing commercial catch data from 1878-1997](https://knb.ecoinformatics.org/#view/df35b.304.2). The data set and reference to the original source can be found at its [public archive](https://knb.ecoinformatics.org/#view/df35b.304.2). 
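+Before we set up the real data set, here is a small self-contained sketch of the two motivating examples above, in case you want to run them yourself. The values below are made up for illustration, modeled on the example tables shown earlier:
+
+```{r}
+#| eval: false
+library(dplyr)
+library(tidyr)
+
+# Example 1: made-up salmon lengths, mirroring the length_data table above
+length_data <- data.frame(
+  year      = c(1990, 1991, 1991, 1992, 1992, 1992),
+  length_cm = c(5.673318, 3.081224, 4.592696, 4.381523, 5.597777, 4.900052)
+)
+
+# Split by year, apply mean(), and combine the results into one summary table
+length_data %>%
+  group_by(year) %>%
+  summarize(mean_length_cm = mean(length_cm))
+
+# Example 2: made-up site observations in wide format, mirroring the site_data table above
+site_data <- data.frame(
+  site   = c("gold", "lake", "dredge"),
+  `1990` = c(100, 100, 100),
+  `1991` = c(118, 118, 118),
+  check.names = FALSE
+)
+
+# Pivot the year columns into long format: one row per site-year observation
+site_data %>%
+  pivot_longer(-site, names_to = "year", values_to = "length")
+```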
::: callout-tip ## Setup -First, open a new RMarkdown document. Delete everything below the setup chunk, and add a library chunk that calls `dplyr`, `tidyr`, and `readr` +First, open a new Quarto document. Delete everything below the setup chunk, and add a library chunk that calls `dplyr`, `tidyr`, and `readr` ```{r} #| message: false @@ -144,23 +135,25 @@ The following objects are masked from ‘package:base’: These are important warnings. They are letting you know that certain functions from the `stats` and `base` packages (which are loaded by default when you start R) are masked by *different functions* with the same name in the `dplyr` package. It turns out, the order that you load the packages in matters. Since we loaded `dplyr` after `stats`, R will assume that if you call `filter()`, you mean the `dplyr` version unless you specify otherwise. -Being specific about which version of `filter()`, for example, you call is easy. To explicitly call a function by its unambiguous name, you use the syntax `package_name::function_name(...)`. So, if I wanted to call the `stats` version of `filter()` in this Rmarkdown document, I would use the syntax `stats::filter(...)`. +Being specific about which version of `filter()`, for example, you call is easy. +To explicitly call a function by its unambiguous name, we use the syntax `package_name::function_name(...)`. +So, if we wanted to call the `stats` version of `filter()` in this Rmarkdown document, I would use the syntax `stats::filter(...)`. ::: ::: callout-note -## Exercise +## Note -Warnings are important, but we might not want them in our final document. After you have read the packages in, **adjust the chunk settings in your library chunk** to suppress warnings and messages. +Warnings are important, but we might not want them in our final document. After you have read the packages in, **adjust the chunk settings in your library chunk** to suppress warnings and messages by adding `#| warning: false`. ::: -Now that we have introduce some data wrangling functions, let's get the data that we are going to use for this lesson. +Now that we have introduced some data wrangling libraries, let's get the data that we are going to use for this lesson. ::: callout-tip ## Setup -1. Obtain data from the [KNB Data Package Alaska commercial salmon catches by management region (1886- 1997)](https://knb.ecoinformatics.org/view/df35b.304.2) +1. Go to [KNB Data Package Alaska commercial salmon catches by management region (1886- 1997)](https://knb.ecoinformatics.org/view/df35b.304.2) 2. Find the data file `byerlySalmonByRegion.csv`. Right click the "Download" button and select "Copy Link Address" @@ -200,8 +193,11 @@ Before we get too much further, spend a minute or two outlining your RMarkdown d - Reshape data ::: -## Explore data -Similar to what we did in our [Intro to Rmd](https://learning.nceas.ucsb.edu/2023-04-coreR/session_05.html) lesson, it is good practice to skim through the data you just read in. This is important to make sure the data is read as you were expecting and also it allows you to get familiar with the data. +## Data exploration +Similar to what we did in our [Intro to Quarto](https://learning.nceas.ucsb.edu/2023-04-coreR/session_03.html) lesson, it is good practice to skim through the data you just read in. +Doing so is important to make sure the data is read as you were expecting and to familiarize yourself with the data. 
+ +Some of the basic ways to explore your data are: ```{r} #| eval: false @@ -215,8 +211,8 @@ head(catch_original) ## Summary of each column of data summary(catch_original) -## Prints unique values in a column (in this case Date) -unique(catch_original$Year) +## Prints unique values in a column (in this case, the region) +unique(catch_original$Region) ## Opens data frame in its own tab to see each row and column of the data View(catch_original) @@ -227,7 +223,7 @@ View(catch_original) ## About the pipe (`%>%`) operator -Before we jump into learning `tidyr` and `dplyr`, we first need to explain the `%>%`. +Before we jump into learning `tidyr` and `dplyr`, we first need to explain the pipeline operator `%>%`. Both the `tidyr` and the `dplyr` packages use the pipe operator (`%>%`), which may look unfamiliar. The pipe is a powerful way to efficiently chain together operations. The pipe will take the output of a previous statement, and use it as the input to the next statement. @@ -269,6 +265,7 @@ RStudio has a keyboard shortcut for `%>%` ## Selecting or removing columns using `select()` +We're ready to go back to our salmon dataset. The first issue is the extra columns `All` and `notesRegCode`. Let's select only the columns we want, and assign this to a variable called `catch_data`. ```{r} @@ -280,7 +277,7 @@ head(catch_data) Much better! -`select()` also allows you to say which columns you *don't* want, by passing unquoted column names preceded by minus (`-`) signs: +The `select()` function also allows you to say which columns you *don't* want, by passing unquoted column names preceded by minus (`-`) signs: ```{r} #| eval: false @@ -288,9 +285,10 @@ catch_data <- catch_original %>% select(-All,-notesRegCode) ``` -## Quality Check +## Quality check -Now that we have the data we are interested in using, we should do a little quality check to see that it seems as expected. One nice way of doing this is the `glimpse()` function. +Now that we have the data we are interested in using, we should do a little quality check to see that everything seems as expected. +One nice way of doing this is the `glimpse()` function. ```{r} dplyr::glimpse(catch_data) @@ -299,7 +297,7 @@ dplyr::glimpse(catch_data) ::: callout-note ## Exercise -Notice the output of the `glimpse()` function call. Does anything seem amiss with this data set that might warrant fixing? +Examine the output of the `glimpse()` function call. Does anything seem amiss with this data set that might warrant fixing?
**Answer:** @@ -309,7 +307,8 @@ Notice the output of the `glimpse()` function call. Does anything seem amiss wit ## Changing column content using `mutate()` -We can use the `mutate()` function to change a column, or to create a new column. First Let's try to just convert the Chinook catch values to `numeric` type using the `as.numeric()` function, and overwrite the old Chinook column. +We can use the `mutate()` function to change a column, or to create a new column. +First, let's try to convert the Chinook catch values to `numeric` type using the `as.numeric()` function, and overwrite the old Chinook column. ```{r} catch_clean <- catch_data %>% @@ -318,7 +317,7 @@ catch_clean <- catch_data %>% head(catch_clean) ``` -We get a warning ```NAs introduced by coercion``` which is R telling us that it couldn't convert every value to an integer and, for those values it couldn't convert, it put an `NA` in its place. This is behavior we commonly experience when cleaning data sets and it's important to have the skills to deal with it when it comes up. +We get a warning ``"NAs introduced by coercion"`` which is R telling us that it couldn't convert every value to an integer and, for those values it couldn't convert, it put an `NA` in its place. This is behavior we commonly experience when cleaning data sets and it's important to have the skills to deal with it when it comes up. To investigate, let's isolate the issue. We can find out which values are `NA`s with a combination of `is.na()` and `which()`, and save that to a variable called `i`. @@ -334,7 +333,7 @@ It looks like there is only one problem row, lets have a look at it in the origi catch_data[i,] ``` -Well that's odd: The value in `catch_thousands` is `I`. It turns out that this data set is from a PDF which was automatically converted into a `csv` and this value of `I` is actually a 1. +Well that's odd: The value in `catch_thousands` is the letter `I`. It turns out that this data set is from a PDF which was automatically converted into a `csv` and this value of `I` is actually a 1. Let's fix it by incorporating the `if_else()` function to our `mutate()` call, which will change the value of the `Chinook` column to 1 if the value is equal to `I`, then will use `as.numeric()` to turn the character representations of numbers into numeric typed values. @@ -343,7 +342,7 @@ catch_clean <- catch_data %>% mutate(Chinook = if_else(condition = Chinook == "I", true = "1", false = Chinook), - Chinook = as.integer(Chinook)) + Chinook = as.numeric(Chinook)) ##check catch_clean[i, ] @@ -351,7 +350,9 @@ catch_clean[i, ] ## Changing shape using `pivot_longer()` and `pivot_wider()` -The next issue is that the data are in a wide format and, we want the data in a long format instead. `pivot_longer()` from the `tidyr` package helps us do just this conversion. How does did work, if you do not remember all the arguments that go into this function you can always call the `help` page by typing `?pivot_longer` in the console. +The next issue is that the data are in a wide format and we want the data in a long format instead. +The function `pivot_longer()` from the `tidyr` package helps us do this conversion. +If you do not remember all the arguments that go into `pivot_longer()` you can always call the `help` page by typing `?pivot_longer` in the console. ```{r} catch_long <- catch_clean %>% @@ -367,9 +368,13 @@ head(catch_long) The syntax we used above for `pivot_longer()` might be a bit confusing so let's walk though it. 
-The first argument to `pivot_longer` is the columns over which we are pivoting. You can select these by listing either the names of the columns you do want to pivot, or in this case, the names of the columns you are not pivoting over. The `names_to` argument takes the name of the column that you are creating from the column **names** you are pivoting over. The `values_to` argument takes the name of the column that you are creating from the **values** in the columns you are pivoting over. +- The first argument to `pivot_longer` is the columns over which we are pivoting. You can select these by listing either the names of the columns you do want to pivot, or in this case, the names of the columns you are not pivoting over. + +- The `names_to` argument: this is the name of the column that you are creating from the column **names** of the columns you are pivoting over. -The opposite of `pivot_longer()`, `pivot_wider()`, works in a similar declarative fashion: +- The `values_to` argument: the name of the column that you are creating from the **values** in the columns you are pivoting over. + +The opposite of `pivot_longer()` is the `pivot_wider()` function. It works in a similar declarative fashion: ```{r} catch_wide <- catch_long %>% @@ -383,7 +388,8 @@ Same than we did above we can pull up the documentation of the function to remin ## Renaming columns with `rename()` -If you scan through the data, you may notice the values in the `catch` column are very small (these are supposed to be annual catches). If we look at [the metadata](https://knb.ecoinformatics.org/#view/df35b.304.2) we can see that the `catch` column is in thousands of fish so let's convert it before moving on. +If you scan through the data, you may notice the values in the `catch` column are very small (these are supposed to be annual catches). +If we look at [the metadata](https://knb.ecoinformatics.org/#view/df35b.304.2) we can see that the `catch` column is in thousands of fish, so let's convert it before moving on. Let's first rename the `catch` column to be called `catch_thousands`: @@ -398,7 +404,8 @@ head(catch_long) ## `names()` versus `rename()` -Many people use the base R function `names()` to rename columns, often in combination with column indexing that relies on columns being in a particular order. Column indexing is often also used to select columns instead of the `select()` function from `dplyr`. Although these methods both work just fine, they do have one major drawback: in most implementations they rely on you knowing exactly the column order your data is in. +Many people use the base R function `names()` to rename columns, often in combination with column indexing that relies on columns being in a particular order. Column indexing is often also used to select columns instead of the `select()` function from `dplyr`. +Although these methods work just fine, they do have one major drawback: in most implementations they rely on you knowing exactly the column order your data is in. **To illustrate why your knowledge of column order isn't reliable enough for these operations, considering the following scenario:** @@ -406,7 +413,8 @@ Your colleague emails you letting you know that she has an updated version of th Unbeknownst to you, your colleagues bought a new sensor this year that measures dissolved oxygen. Because of the new variables in the data set, the column order is different. Your script which previously renamed the fourth column, `SAL_PSU` to `salinity` now renames the fourth column, `O2_MGpL` to `salinity`. 
No wonder your results looked so weird, good thing you caught it! -If you had written your code so that it doesn't rely on column order, but instead renames columns using the `rename()` function, the code would have run just fine (assuming the name of the original salinity column didn't change, in which case the code would have thrown an error in an obvious way). This is an example of a defensive coding strategy, where you try to anticipate issues before they arise, and write your code in such a way as to keep the issues from happening. +If you had written your code so that it doesn't rely on column order, but instead renames columns using the `rename()` function, the code would have run just fine (assuming the name of the original salinity column didn't change, in which case the code would have thrown an error in an obvious way). +This is an example of a *defensive coding strategy*, where you try to anticipate issues before they arise, and write your code in such a way as to keep the issues from happening. ::: @@ -424,7 +432,7 @@ catch_long <- catch_long %>% head(catch_long) ``` -Now let's remove the `catch_thousands` column for now since we don't need it. Note that here we have added to the expression we wrote above by adding another function call (mutate) to our expression. This takes advantage of the pipe operator by grouping together a similar set of statements, which all aim to clean up the `catch_clean` data frame. +Let's remove the `catch_thousands` column for now since we don't need it. Note that here we have added to the expression we wrote above by adding another function call (mutate) to our expression. This takes advantage of the pipe operator by grouping together a similar set of statements, which all aim to clean up the `catch_clean` data frame. ```{r} catch_long <- catch_long %>% @@ -438,16 +446,56 @@ We're now ready to start analyzing the data. ## Summary statistics using `group_by()` and `summarize()` -As we outlined in the Introduction, `dplyr` lets us employ the **Split-Apply-Combine** strategy and this is exemplified through the use of the `group_by()` and `summarize()` functions: +Suppose we are now interested in getting the average catch per region. +In our initial data exploration we saw there are 18 regions, we can easily see their names again: + +```{r} +unique(catch_original$Region) +``` + +Think about how we would calculate the average catch per region "by hand". It would be something like this: + +0. We start with our table and notice there are multiple regions in the "Regions" column. + + +1. We split our original table to group all observations from the same region together. + + +2. We calculate the average catch for each of the groups we form. + + +3. Then we combine the values for average catch per region into a single table. + +:::{.column-body-outset} +![](images/regions-split-apply-combine.png){ fig-align="center"} +::: + +Analyses like this conform to what is known as the **Split-Apply-Combine strategy**. This strategy follows the three steps we explained above: + +1. **Split**: Split the data into logical groups (e.g., region, species, etc.) +2. **Apply**: Calculate some summary statistic on each group (e.g. mean catch *by* year, number of individuals *per* species) +3. 
**Combine**: Combine the statistic calculated on each group back together into a single table + +The `dplyr` library lets us easily employ the Split-Apply-Combine strategy by using the `group_by()` and `summarize()` functions: ```{r} mean_region <- catch_long %>% group_by(Region) %>% - summarize(catch_mean = mean(catch)) + summarize(mean_catch = mean(catch)) head(mean_region) ``` +Let's see how the previous code implements the Split-Apply-Combine strategy: + +1. `group_by(Region)`: this is telling R to **split** the dataframe and create a group for each different value in the column `Region`. R just keeps track of the groups, it doesn't return separate dataframes per region. + + +2. `mean(catch)`: here `mean` is the function we want to **apply** to the column `catch` in each group. + + +3. `summarize(catch = mean(catch))` the function `summarize()` is used to **combine** the results of `mean(catch)` in each group into a single table. The argument `mean_catch = mean(catch)` indicates that the column having the results of `mean(catch)` will be named `mean_catch`. + Another common use of `group_by()` followed by `summarize()` is to count the number of rows in each group. We have to use a special function from `dplyr`, `n()`. ```{r} @@ -458,16 +506,17 @@ n_region <- catch_long %>% head(n_region) ``` -::: column-margin -**Quick Tip** +::: callout-note +## Tip + If you are finding that you are reaching for this combination of `group_by()`, `summarize()` and `n()` a lot, there is a helpful `dplyr` function `count()` that accomplishes this in one function! ::: ::: callout-note ## Exercise -- Find another grouping and statistic to calculate for each group -- Find out if you can group by multiple variables +- Find another grouping and statistic to calculate for each group. +- Find out if you can group by multiple variables. ::: @@ -476,7 +525,7 @@ If you are finding that you are reaching for this combination of `group_by()`, ` #| code-summary: "Answer" #| eval: false -## one example can be +## for example: catch_year_sp <- catch_long %>% group_by(Year, species) %>% summarize(total_year = sum(catch, na.rm = T)) @@ -486,7 +535,7 @@ catch_year_sp <- catch_long %>% ## Filtering rows using `filter()` -`filter()` is the verb we use to filter our `data.frame` to rows matching some condition. It's similar to `subset()` from base R. +We use the `filter()` function to filter our `data.frame` to rows matching some condition. It's similar to `subset()` from base R. Let's go back to our original `data.frame` and do some `filter()`ing: @@ -526,7 +575,7 @@ chinook_see <- catch_long %>% ## Sorting your data using `arrange()` -`arrange()` is how we sort the rows of a `data.frame`. Two common case to use `arrange()` are: +The `arrange()` function is used to sort the rows of a `data.frame`. Two common case to use `arrange()` are: - To calculate a cumulative sum (with `cumsum()`) so row order matters - To display a table (like in an `.Rmd` document) in sorted order @@ -556,14 +605,14 @@ head(mean_region) ## Splitting a column using `separate()` and `unite()` -`separate()` and its complement, `unite()` allow us to easily split a single column into numerous (or numerous into a single). +The `separate()` function allow us to easily split a single column into numerous. Its complement, the `unite()` function, allows ys to combine multiple columns into a single one. -This can come in really handle when we need to split a column into two pieces by a consistent separator (like a dash). 
+This can come in really handy when we need to split a column into two pieces by a consistent separator (like a dash). -Let's make a new `tibble` with fake data to illustrate this. Here we have a set of site identification codes. with information about the island where the site is (the first 3 letters) and a site number (the 3 numbers). If we want to group and summarize by island, we need a column with just the island information. +Let's make a new `data.frame` with fake data to illustrate this. Here we have a set of site identification codes with information about the island where the site is (the first 3 letters) and a site number (the 3 numbers). If we want to group and summarize by island, we need a column with just the island information. ```{r} -sites_df <- tibble(site = c("HAW-101", +sites_df <- data.frame(site = c("HAW-101", "HAW-103", "OAH-320", "OAH-219", @@ -599,7 +648,7 @@ cities_clean <- cities_df %>% ``` -`unite()` does just the reverse of `separate()`. If we have a data.frame that contains columns for year, month, and day, we might want to unite these into a single date column. +The `unite()` function does just the reverse of `separate()`. If we have a `data.frame` that contains columns for year, month, and day, we might want to unite these into a single date column. ```{r} dates_df <- data.frame( @@ -628,10 +677,6 @@ We just ran through the various things we can do with `dplyr` and `tidyr` but if catch_original <- read_csv(url("https://knb.ecoinformatics.org/knb/d1/mn/v2/object/df35b.302.1", method = "libcurl")) -region_defs <- read_csv(url("https://knb.ecoinformatics.org/knb/d1/mn/v2/object/df35b.303.1", - method = "libcurl")) %>% - select(code, mgmtArea) - mean_region <- catch_original %>% select(-All, -notesRegCode) %>% mutate(Chinook = ifelse(Chinook == "I", 1, Chinook)) %>% @@ -641,7 +686,8 @@ mean_region <- catch_original %>% values_to = "catch") %>% mutate(catch = catch*1000) %>% group_by(Region) %>% - summarize(mean_catch = mean(catch)) + summarize(mean_catch = mean(catch)) %>% + arrange(desc(mean_catch)) head(mean_region) ``` diff --git a/materials/sections/fair-care-principles.qmd b/materials/sections/fair-care-principles.qmd index 91ed3a3f..962d2057 100644 --- a/materials/sections/fair-care-principles.qmd +++ b/materials/sections/fair-care-principles.qmd @@ -1,38 +1,9 @@ ---- -format: html ---- - ## Learning Objectives {.unnumbered} - Introduce FAIR and CARE principles and the value it provides to data - Provide a FAIR and CARE lens that can be applied to your data-focused work - Evaluate the FAIRness and CAREness of your work and the work of others -## But first, a reproducibility activity using LEGO® - -This activity is largely based on the [LEGO® Metadata for Reproducibility game pack](http://eprints.gla.ac.uk/196477/), which was developed by [Mary Donaldson](https://orcid.org/0000-0002-1936-3499) and [Matt Mahon](https://orcid.org/0000-0001-8950-8422). - -::: callout-tip -## Setup - -1. Get into groups -2. Get LEGO® blocks and template -3. Follow directions on template -::: - -We'll return in about an hour to discuss as a group. - -
- -Activity Discussion - -- Did you find this a simple way to document your process? -- Was there anything you found difficult to capture? -- Did those replicating the builds find it straightforward to follow? -- Did you encounter any ambiguity in the instructions? - -
- ## The FAIR and CARE Principles [![Source: Global Indigenous Data Alliance](images/FAIR_CARE.png){width="92%" fig-align="center"}](https://www.gida-global.org/whoweare) @@ -51,20 +22,20 @@ With the rise of open science and more accessible data, it is becoming increasin [![Source: Fair Teaching Handbook](images/FAIRsFAIR.png){width="80%" fig-align="center"}](https://fairsfair.gitbook.io/fair-teaching-handbook/) -| FAIR | Definition | -|-----------------|-------------------------------------------------------| -| **(F) Findable** | Metadata and data should be easy to find for both humans and computers. | -| **(A) Accessible** | Once someone finds the required data, they need to know how the data can be accessed. | -| **(I) Interoperable** | The data needs to be easily integrated with other data for analysis, storage, and processing. | | | -| **(R) Reusable** | Data should be well-described so they can be reused and replicated in different settings. | +| FAIR | Definition | +|-----------------------|-----------------------------------------------------------------------------------------------| +| **(F) Findable** | Metadata and data should be easy to find for both humans and computers. | +| **(A) Accessible** | Once someone finds the required data, they need to know how the data can be accessed. | +| **(I) Interoperable** | The data needs to be easily integrated with other data for analysis, storage, and processing. | +| **(R) Reusable** | Data should be well-described so they can be reused and replicated in different settings. | ### FAIR Principles in Practice -This is not an exhaustive list of actions for applying FAIR Principles to your research, but these are important big picture concepts you should always keep in mind. We'll be going through the resources linked below so that you know how to use them in your own work. +This is not an exhaustive list of actions for applying FAIR Principles to your research, but these are important big picture concepts you should always keep in mind. We'll be going through the resources linked below so that you know how to use them in your own work. -- **It's all about the metadata.** To make your data and research as findable and as accessible as possible, it's crucial that you are providing rich metadata. This includes, using a field-specific metadata standard (i.e. EML or Ecological Metadata Language for earth and environmental sciences), adding a globally unique identifier (i.e. a Digital Object Identifier) to your datasets, and more. We'll be discussing more metadata best practices in a [later lesson](https://learning.nceas.ucsb.edu/2023-04-coreR/session_14.html), but it's important to understand that quality metadata goes a long way in making your data FAIR. To help with this, researchers use a workflow called the [**FAIRification process**](https://www.go-fair.org/fair-principles/fairification-process/). +- **It's all about the metadata.** To make your data and research as findable and as accessible as possible, it's crucial that you are providing rich metadata. This includes, using a field-specific metadata standard (i.e. EML or Ecological Metadata Language for earth and environmental sciences), adding a globally unique identifier (i.e. a Digital Object Identifier) to your datasets, and more. As discussed in a [previous lesson](https://learning.nceas.ucsb.edu/2023-06-delta/session_05.html), quality metadata goes a long way in making your data FAIR. 
One tool to help implement FAIR principles to non-FAIR data is the [**FAIRification process**](https://www.go-fair.org/fair-principles/fairification-process/). This workflow was developed by GoFAIR, a self-governed initiative that aims to help implement the FAIR data principles. - **Assess the FAIRness of your research.** The FAIR Principles are a lens to apply to your work. And it's important to ask yourself questions about finding and accessing your data, about how machine-readable your datasets and metadata are, and how reusable it is throughout the entirety of your project. This means you should be re-evaluating the FAIRness of your work over and over again. One way to check the FAIRness of your work, is to use tools like [FAIR-Aware](https://fairaware.dans.knaw.nl/) and the [FAIR Data Maturity Model](https://zenodo.org/record/3909563#.ZCC5kuzMJb_). These tools are self-assessments and can be thought of as a checklists for FAIR and will provide guidance if you're missing anything. -- **Make FAIR decisions during the planning process.** You can ensure FAIR Principles are going to implemented in your work by thinking about it and making FAIR decisions early on. A way to document this planning process, is by writing a Data Management Plan (DMP). We'll talk more about DMPs in a [later lesson](https://learning.nceas.ucsb.edu/2023-04-coreR/session_14.html#writing-data-management-plans-dmps), but know that all aspects of FAIR can be documented in a DMP. +- **Make FAIR decisions during the planning process.** You can ensure FAIR Principles are going to implemented in your work by thinking about it and making FAIR decisions early on and throughout the data life cycle. As you document your data always keep in mind the FAIR lense. ### What is CARE? @@ -73,7 +44,7 @@ The [CARE Principles](https://www.gida-global.org/care) for Indigenous Data Gove [![Source: Carroll, S.R., et al, 2020. The CARE Principles for Indigenous Data Governance](images/CAREsCARE.png){width="80%" fig-align="center"}](https://datascience.codata.org/articles/10.5334/dsj-2020-043/) | CARE | Definition | -|-------------|-----------------------------------------------------------| +|------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | **(C) Collective Benefit** | Data ecosystems shall be designed and function in ways that enable Indigenous Peoples to derive benefit from the data. | | **(A) Authority to Control** | Indigenous Peoples' rights and interests in Indigenous data must be recognized and their authority to control such data be empowered. Indigenous data governance enables Indigenous Peoples and governing bodies to determine how Indigenous Peoples, as well as Indigenous lands, territories, resources, knowledge and geographical indicators, are represented and identified within data. | | **(R) Responsibility** | Those working with Indigenous data have a responsibility to share how those data are used to support Indigenous Peoples' self-determination and collective benefit. Accountability requires meaningful and openly available evidence of these efforts and the benefits accruing to Indigenous Peoples. 
| @@ -81,18 +52,37 @@ The [CARE Principles](https://www.gida-global.org/care) for Indigenous Data Gove ### CARE Principles in Practice -- **Make your data access to Indigenous groups**. Much of the CARE Principles are about sharing and making data accessible to Indigenous Peoples. To do so, consider publish your data on Indigenous founded data repositories such as: - - [Collaborative Indigenous Research Digital Garden (CIRDG)](https://www.oise.utoronto.ca/collaborativeindigenousresearch/research) - - [Mukurtu Wumpurrarni-kari Archive](https://mukurtu.org/about/) - -- **Use Traditional Knowledge (TK) and Biocultural (BC) Labels** How do we think of intellectual property for Traditional and Biocultural Knowledge? Knowledge that outdates any intellectual property system. In many cases institution, organizations, outsiders hold the copy rights of this knowledge and data that comes from their lands, territories, waters and traditions. Traditional Knowledge and Biocultural Labels are digital tags that establish Indigenous cultural authority and governance over Indigenous data and collections by adding provenance information and contextual metadata (including community names), protocols, and permissions for access, use, and circulation. This way mark cultural authority so is recorded in a way that recognizes the inherent sovereignty that Indigenous communities have over knowledge. Giving Indigenous groups more control over their cultural material and guide users what an appropriate behavior looks like. A global initiative that support Indigenous communities with tools that attribute their cultural heritage is [Local Contexts](https://localcontexts.org/). +- **Make your data access to Indigenous groups**. Much of the CARE Principles are about sharing and making data accessible to Indigenous Peoples. To do so, consider publish your data on Indigenous founded data repositories such as: + - [Collaborative Indigenous Research Digital Garden (CIRDG)](https://www.oise.utoronto.ca/collaborativeindigenousresearch/research) + - [Mukurtu Wumpurrarni-kari Archive](https://mukurtu.org/about/) -- **Assess the CAREness of your research.** Like FAIR, CARE Principles are a lens to apply to your work. With CARE, it's important to center human well-being in addition to open science and data sharing. To do this, reflect on how you're giving access to Indigenous groups, on who your data impacts and the relationships you have with them, and the ethical concerns in your work. The Arctic Data Center, a data repository for Arctic research, now requires an [Ethical Research Practices Statement](https://arcticdata.io/all/blog/2022/04/applying-care-documenting-ethical-data-procedures-and-sensitive-data-at-the-arctic-data-center/) when submitting data to them. They also have multiple [guidelines](https://arcticdata.io/data-ethics/) on how to write and what to include in an Ethical Research Practices Statement. +- **Use Traditional Knowledge (TK) and Biocultural (BC) Labels** How do we think of intellectual property for Traditional and Biocultural Knowledge? Knowledge that outdates any intellectual property system. In many cases institution, organizations, outsiders hold the copy rights of this knowledge and data that comes from their lands, territories, waters and traditions. 
Traditional Knowledge and Biocultural Labels are digital tags that establish Indigenous cultural authority and governance over Indigenous data and collections by adding provenance information and contextual metadata (including community names), protocols, and permissions for access, use, and circulation. In this way, cultural authority is recorded in a way that recognizes the inherent sovereignty that Indigenous communities have over knowledge, giving Indigenous groups more control over their cultural material and guiding users on what appropriate behavior looks like. A global initiative that supports Indigenous communities with tools that attribute their cultural heritage is [Local Contexts](https://localcontexts.org/).
+
+- **Assess the CAREness of your research.** Like FAIR, CARE Principles are a lens to apply to your work. With CARE, it's important to center human well-being in addition to open science and data sharing. To do this, reflect on how you're giving access to Indigenous groups, on who your data impacts and the relationships you have with them, and the ethical concerns in your work. The Arctic Data Center, a data repository for Arctic research, now requires an [Ethical Research Practices Statement](https://arcticdata.io/all/blog/2022/04/applying-care-documenting-ethical-data-procedures-and-sensitive-data-at-the-arctic-data-center/) when submitting data to them. They also have multiple [guidelines](https://arcticdata.io/data-ethics/) on how to write and what to include in an Ethical Research Practices Statement.
+
+## Thinking with CARE lenses

::: callout-note
## CARE Exercise

-Explore the Arctic Data Center's guidelines on writing an Ethical Research Practices Statement. Then write an Ethical Research Practices Statement for your current research. Switch statements with a partner and assess their statement.
+Explore the [Arctic Data Center's guidelines on writing an Ethical Research Practices Statement](https://arcticdata.io/data-ethics/). Then write an Ethical Research Practices Statement for your current research. Switch statements with a partner and assess their statement.
:::
+
+## Evaluating FAIR
+
+::: callout-note
+## FAIR Exercise
+
+Now that we have reviewed the FAIR principles in detail, go back to the EDI data package you looked into at the beginning of the week and carefully evaluate the FAIRness of the data package using our [Data package assessment rubric, Evidence for FAIR and CARE](https://docs.google.com/document/d/1PQpw9ohOMY7K1yBWaknMHV0dGEm0GnZ9pCB8_i2JPoU/edit).
+ +- **Group A:** [SBC LTER: Reef: Abundance, size and fishing effort for California Spiny Lobster (Panulirus interruptus), ongoing since 2012](https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-sbc.77.8) + +- **Group B:** [Physiological stress of American pika (Ochotona princeps) and associated habitat characteristics for Niwot Ridge, 2018 - 2019](https://portal.edirepository.org/nis/mapbrowse?packageid=knb-lter-nwt.268.1) + +- **Group C:** [Ecological and social interactions in urban parks: bird surveys in local parks in the central Arizona-Phoenix metropolitan area](https://portal.edirepository.org/nis/mapbrowse?scope=knb-lter-cap&identifier=256&revision=10) + +- **Group D:** [Interagency Ecological Program: Fish catch and water quality data from the Sacramento River floodplain and tidal slough, collected by the Yolo Bypass Fish Monitoring Program, 1998-2021.](https://portal.edirepository.org/nis/mapbrowse?scope=edi&identifier=233&revision=3) + + +::: diff --git a/materials/sections/git-collab-merge-conflicts.qmd b/materials/sections/git-collab-merge-conflicts.qmd index 731a442e..98e8b537 100644 --- a/materials/sections/git-collab-merge-conflicts.qmd +++ b/materials/sections/git-collab-merge-conflicts.qmd @@ -1,71 +1,43 @@ ## Learning Objectives {.unnumbered} -- How to use `Git` and GitHub to collaborate with colleagues on code -- What typically causes conflicts when collaborating -- How to resolve a conflict -- Workflows to avoid conflicts +- Apply the principles, features, and collaboration tools of `Git` and GitHub to effectively collaborate with colleagues on code +- Analyze and evaluate common causes of conflicts that arise when collaborating on repositories +- Demonstrate the ability to resolve conflicts using `Git` conflict resolution techniques +- Apply workflows and best practices that minimize conflicts on collaborative repositories -## Introduction +## Introduction to `Git` and GitHub Tools for Collaboration -`Git` is a great tool for working on your own, but even better for working with friends -and colleagues. `Git` allows you to work with confidence on your own local copy of files -with the confidence that you will be able to successfully synchronize your changes -with the changes made by others. +[![Artwork by Allison Horst](images/allison-horst-jenny-bryan-quote.png)](https://twitter.com/allison_horst) -The simplest way to collaborate with `Git` is to use a shared repository on a hosting -service such as [GitHub](https://github.com), and use this shared repository as -the mechanism to move changes from one Collaborator to another. While there are other -more advanced ways to sync `Git` repositories, this "hub and spoke" model works really -well due to its simplicity. +`Git` is not only a powerful tool for individual work but also an excellent choice for collaborating with friends and colleagues. `Git` ensures that after you've completed your contributions to a repository, you can confidently synchronize your changes with changes made by others. -In this model, the Collaborator will `clone` a copy of the Owner's repository from +One of the easiest and most effective ways to collaborate using `Git` is by utilizing a shared repository on a hosting service like [GitHub](https://github.com). This shared repository acts as a central hub, enabling collaborators to effortlessly exchange and merge their changes. 
With `Git` and a shared repository, you can collaborate seamlessly and work confidently, knowing that your changes will be integrated smoothly with those of your collaborators. + +[![Graphic from Atlassian](images/git-collab-repos.svg)](https://www.atlassian.com/git/tutorials/syncing){width="80%" fig-align="center"} + +There are many advanced techniques for synchronizing `Git` repositories, but let's start with a simple example. + +In this example, the Collaborator will `clone` a copy of the Owner's repository from GitHub, and the Owner will grant them Collaborator status, enabling the Collaborator to directly pull and push from the Owner's GitHub repository. -![](images/github-workflows-collab.png){width="80%" fig-align="center"} - ## Collaborating with a trusted colleague **without conflicts** -We start by enabling collaboration with a trusted colleague. We will designate the Owner as the person who owns the shared repository, and the Collaborator as the person that they wish to grant the ability to make changes to their repository. We start by giving that person access to our GitHub repository. +We start our collaboration by giving a trusted colleague access to our repository on GitHub. In this example, we define the **Owner as the individual who owns the repository**, and the **Collaborator as the person whom the Owner chooses to give permission to make changes to their repository**. -::: callout-tip -### Setup +The Collaborator will make changes to the repository and then `push` those changes to the shared repository on GitHub. The Owner will then use `pull` to retrieve the changes without encountering any conflicts. This is the most ideal workflow. -- Get into pairs, then choose one person as the Owner and one as the Collaborator -- Both logon to [GitHub](https://github.com) +The instructors will demonstrate this process in the next section. -These next steps are for the Owner: +### Step 0: Owner adds Collaborator to shared repository {.unnumbered} -- Navigate to the `{FIRSTNAME}_test` repository -- Go to "Settings" and navigate to "Collaborators" in the "Access" section on the left-hand side -- Under "Manage Access" click the button "Add people" and type the username of your Collaborator in the search box -- Once you've found the correct username, click "Add {Collaborator username} to this repository +The Owner must change the settings of the repository and give the Collaborator access to the repository by inviting them as a collaborator to the repository. Once the Collaborator has accepted the invite, they can contribute to the repository. ![](images/github-collaborators.png){width="80%" fig-align="center"} -Now, the Collaborator will follow this step: - -- Check your email for an invitation to GitHub or check your notifications (likely under "Your Organizations") on GitHub to accept the invite to collaborate. -::: - -::: {.callout-caution icon=false} -### Last thing, some `Git` configuration - -When `Git` released version 2.27, a new feature they incorporated allows users to specify how to pull, essentially, otherwise a warning will appear. 
To suppress this warning we need to configure our `Git` with this line of code: - -```{.bash} -git config pull.rebase false -``` - -`pull.rebase false` is a default strategy for pulling where it will try to auto-merge the files if possible, and if it can’t it will show a merge conflict -::: - -We will start by having the Collaborator make some changes and share those with the Owner without generating any conflicts, In an ideal world, this would be the normal workflow. The instructors are going to demonstrate this in the next section. - ### Step 1: Collaborator clone {.unnumbered} -To be able to contribute to a repository, the Collaborator -must clone the repository from the **Owner's** GitHub account. To do this, the Collaborator should visit the GitHub page for the Owner's repository, and then copy the clone URL. In R Studio, the Collaborator will create a new project from version control by pasting this clone URL into the appropriate dialog (see the earlier chapter introducing GitHub). +To be able to contribute to a repository, the Collaborator must clone the repository from the **Owner's** GitHub account. To do this, the Collaborator should visit the GitHub page for the Owner's repository, and then copy the clone URL. In R Studio, the Collaborator will create a new project from version control by pasting this clone URL into the appropriate dialog (see the earlier chapter introducing GitHub). ![](images/github-clone-url-owner.png) @@ -99,13 +71,44 @@ while editing, and then `add`, `commit`, and `push` the Owner changes to GitHub. The Collaborator can now `pull` down those Owner changes, and all copies are once again fully synced. And you're off to collaborating. -### Exercise: With a partner, collaborate in a repository using a conflict-free process {.unnumbered} +## Exercise 1: With a partner collaborate in a repository without a merge conflict{#ex1-no-conflict} + +::: callout-tip +### Setup + +- Get into pairs, then choose one person as the Owner and one as the Collaborator +- Both logon to [GitHub](https://github.com) + +These next steps are for the Owner: + +- Navigate to the `{FIRSTNAME}_test` repository +- Go to "Settings" and navigate to "Collaborators" in the "Access" section on the left-hand side +- Under "Manage Access" click the button "Add people" and type the username of your Collaborator in the search box +- Once you've found the correct username, click "Add {Collaborator username} to this repository + +![](images/github-collaborators.png){width="80%" fig-align="center"} + +Now, the Collaborator will follow this step: + +- Check your email for an invitation to GitHub or check your notifications (likely under "Your Organizations") on GitHub to accept the invite to collaborate. +::: + +::: {.callout-caution icon=false} +### Last thing, some `Git` configuration + +When `Git` released version 2.27, a new feature they incorporated allows users to specify how to pull, essentially, otherwise a warning will appear. To suppress this warning we need to configure our `Git` with this line of code: + +```{.bash} +git config pull.rebase false +``` + +`pull.rebase false` is a default strategy for pulling where it will try to auto-merge the files if possible, and if it can’t it will show a merge conflict +::: ::: callout-note -#### Instructions +### Instructions -Now that the instructors have demonstrated this conflict-free process, break into -pairs and try the same with your partner. 
You will do the exercise twice, where each person will get to practice being both the Owner and the Collaborator roles. +You will do the exercise twice, where each person will get to practice being both the Owner and the Collaborator roles. - Step 0: Designate one person as the Owner and one as the Collaborator. @@ -128,38 +131,43 @@ pairs and try the same with your partner. You will do the exercise twice, where - Step 1: Owner adds Collaborator to `{FIRSTNAME}_test` repository - Step 2: Collaborator clones the Owner's `{FIRSTNAME}_test` repository - Step 3: Collaborator edits the `README` file: - - Collaborator adds a new level 2 heading to `README` titled "How to Create a `Git` Repository" and adds the high level steps for creating a `Git` repository on GitHub + - Collaborator adds a new level 2 heading to `README` titled "How to Create a `Git` Repository from an existing project" and adds the high level steps for this workflow - Step 4: Collaborator commits and pushes the `README` file with the new changes to GitHub - Step 5: Owner pulls the changes that the Collaborator made - Step 6: Owner edits the `README` file: - - Under "How to Create a `Git` Repository", Owner adds the high level steps for creating a `Git` repository from an existing project on RStudio + - Under "How to Create a `Git` Repository", Owner adds the high level steps for this workflow - Step 7: Owner commits and pushes the `README` file with the new changes to GitHub - Step 8: Collaborator pulls the `Owners` changes from GitHub **Hint:** If you don't remember how to create a `Git` repository, refer to the chapter [Intro to `Git` and GitHub](https://learning.nceas.ucsb.edu/2023-04-coreR/session_07.html) where we created two `Git` repositories ::: +## A Note on Advanced Collaboration Techniques + +There are many `Git` and GitHub collaboration techniques, some more advanced than others. We won't be covering advanced strategies in this course. But here is a table for your reference on a few popular `Git` collaboration workflow strategies and tools. + +| Collaboration Technique | Benefits | When to Use | When Not to Use | +|-------------------------|----------|-------------|-----------------| +| Branch Management Strategies | 1. Enables parallel development and experimentation
2. Facilitates isolation of features or bug fixes
3. Provides flexibility and control over project workflows | When working on larger projects with multiple features or bug fixes simultaneously.
When you want to maintain a stable main branch while developing new features or resolving issues on separate branches.
When collaborating with teammates on different aspects of a project and later integrating their changes. | When working on small projects with a single developer or limited codebase.
When the project scope is simple and doesn't require extensive branch management.
When there is no need to isolate features or bug fixes. | +| Code Review Practices | 1. Enhances code quality and correctness through feedback
2. Promotes knowledge sharing and learning within the team
3. Helps identify bugs, improve performance, and ensure adherence to coding standards | When collaborating on a codebase with team members to ensure code quality and maintain best practices.
When you want to receive feedback and suggestions on your code to improve its readability, efficiency, or functionality.
When working on critical or complex code that requires an extra layer of scrutiny before merging it into the main branch. | When working on personal projects or small codebases with no collaboration involved.
When time constraints or project size make it impractical to conduct code reviews.
When the codebase is less critical or has low complexity. | +| Forking | 1. Enables independent experimentation and development
2. Provides a way to contribute to a project without direct access
3. Allows for creating separate, standalone copies of a repository | When you want to contribute to a project without having direct write access to the original repository.
When you want to work on an independent variation or extension of an existing project.
When experimenting with changes or modifications to a project while keeping the original repository intact. | When collaborating on a project with direct write access to the original repository.
When the project does not allow external contributions or forking.
When the project size or complexity doesn't justify the need for independent variations. | +| Pull Requests | 1. Facilitates code review and discussion
2. Allows for collaboration and feedback from team members
3. Enables better organization and tracking of proposed changes | When working on a shared repository with a team and wanting to contribute changes in a controlled and collaborative manner.
When you want to propose changes to a project managed by others and seek review and approval before merging them into the main codebase. | When working on personal projects or individual coding tasks without the need for collaboration.
When immediate changes or fixes are required without review processes.
When working on projects with a small team or single developer with direct write access to the repository. | + +: {tbl-colwidths="[15,28,28,28]"} + +The "When Not to Use" column provides insights into situations where it may be less appropriate to use each collaboration technique, helping you make informed decisions based on the specific context and requirements of your project. + +These techniques provide different benefits and are used in various collaboration scenarios, depending on the project's needs and team dynamics. ## Merge conflicts -So things can go wrong, which usually starts with a **merge conflict**, due to -both collaborators making incompatible changes to a file. While the error messages -from merge conflicts can be daunting, getting things back to a normal state can be -straightforward once you've got an idea where the problem lies. +**Merge conflicts** occur when both collaborators make conflicting changes to the same file. Resolving merge conflicts involves identifying the root of the problem and restoring the project to a normal state. Good communication, discussing file sections to work on, and avoiding overlaps can help prevent merge conflicts. However, if conflicts do arise, `Git` warns about potential issues and ensures that changes from different collaborators based on the same file version are not overwritten. To resolve conflicts, you need to explicitly specify whose changes should be used for each conflicting line in the file. -A merge conflict occurs when both the Owner and Collaborator change the same -lines in the same file without first pulling the changes that the other has made. -This is most easily avoided by good communication about who is working on various -sections of each file, and trying to avoid overlaps. But sometimes it happens, -and `Git` is there to warn you about potential problems. And `Git` will not allow -you to overwrite one person's changes to a file with another's changes to the same -file if they were based on the same version. +In this image, we see collaborators `mbjones` and `metamattj` have both made changes to the same line in the same `README.md` file. This is causing a merge conflict because `Git` doesn't know whose changes came first. To resolve it, we need to tell `Git` whose changes to keep for that line, and whose changes to discard. ![](images/git-conflict-00-lines-changed.png) -The main problem with merge conflicts is that, when the Owner and Collaborator -both make changes to the same line of a file, `Git` doesn't know whose changes -take precedence. You have to tell `Git` whose changes to use for that line. + ### Common ways to resolve a merge conflict @@ -307,11 +315,13 @@ branch, and the merged changes are clearly visible in the history. ![](images/git-conflict-08-history.png) -### Exercise: With a partner, collaborate in a repository and resolve a merge conflict {.unnumbered} +## Exercise 2: With a partner collaborate in a repository and resolve a merge conflict + +Note you will only need to complete the Setup and `Git` configuration steps again if you are working in a new repository. Return to [Exercise 1](@ex1-no-conflict) for Setup and `Git` configuration steps. ::: callout-note -#### Instructions +### Instructions Now it's your turn. In pairs, intentionally create a merge conflict, and then go through the steps needed to resolve the issues and continue developing with the merged files. See the sections above for help with each of the steps below. 
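+As a reference while you work through the steps, an unresolved conflict is easy to recognize: `Git` writes conflict markers directly into the affected file. The `README.md` lines below are only hypothetical placeholders showing the shape of those markers.
+
+```
+<<<<<<< HEAD
+This is your local version of the conflicting line.
+=======
+This is the incoming version of the same line that you just pulled.
+>>>>>>> origin/main
+```
+
+Everything between `<<<<<<< HEAD` and `=======` is your local change; everything between `=======` and the `>>>>>>>` line (which may show a commit id or a branch reference, depending on how the merge was started) is your partner's change. Resolving the conflict means deleting the marker lines, keeping the content you want, and then staging and committing the file.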
You will do the exercise twice, where each person will get to practice being both the Owner and the Collaborator roles. @@ -348,27 +358,26 @@ Now it's your turn. In pairs, intentionally create a merge conflict, and then go ::: -## Workflows to avoid merge conflicts +## Best practices to avoid merge conflicts -Some basic rules of thumb can avoid the vast majority of merge conflicts, saving -a lot of time and frustration. These are words our teams live by: +Some basic rules of thumb can avoid the vast majority of merge conflicts, saving a lot of time and frustration. These are words our teams live by: ::: column-margin [![XKCD 1597](images/git-xkcd-comic.png)](https://xkcd.com/1597/) ::: -- Communicate often +- Communicate often and set up effective communication channels - Tell each other what you are working on -- Start you working session with a pull -- Pull immediately before you commit or push -- Commit often in small chunks -- Make sure you and who your collaborating with all **fully** understand the `Git` workflow you're using aka make sure you're on the same page before you start! +- Start your working session with a `pull` +- `Pull` immediately before you `commit` or `push` +- `Commit` often in small chunks (this helps you organize your work!) +- Make sure you and who you are collaborating with all **fully** understand the `Git` workflow you're using aka make sure you're on the same page before you start! A good workflow is encapsulated as follows: ```Pull -> Edit -> Save -> Add (stage) -> Commit -> Pull -> Push``` -**Always start your working sessions with a `Pull` to get any outstanding changes, then -start your work**. `Stage` your changes, but before you `Commit`, `Pull` again to see if any new changes have arrived. If so, they should merge in easily if you are working in different parts of the program. You can then `Commit` and immediately `Push` your changes safely. +**Always start your working sessions with a `pull` to get any outstanding changes, then +start your work**. `Stage` your changes, but before you `commit`, `pull` again to see if any new changes have arrived. If so, they should merge in easily if you are working in different parts of the program. You can then `commit` and immediately `push` your changes safely. Good luck, and try to not get frustrated. Once you figure out how to handle merge conflicts, they can be avoided or dispatched when they occur, but it does take a bit of practice. diff --git a/materials/sections/git-github-intro.qmd b/materials/sections/git-github-intro.qmd index e625c159..849371d9 100644 --- a/materials/sections/git-github-intro.qmd +++ b/materials/sections/git-github-intro.qmd @@ -1,134 +1,225 @@ ## Learning Objectives {.unnumbered} -- Practice using `Git` to track changes of your project -- Practice the `Git` workflow: `pull`, `stage`, `commit`,`pull`, `push` -- Practice setting up a `Git` repository using different workflows +- Apply the principles of `Git` to track and manage changes of a project +- Utilize the `Git` workflow including pulling changes, staging modified files, committing changes, pulling again to incorporate remote changes, and pushing changes to a remote repository +- Create and configure `Git` repositories using different workflows -## Introduction to `Git` +## Introduction to Version Control ![](images/phd_comics_final.png){width="70%" fig-align="center"} -Every file in the scientific process changes. Manuscripts are edited. Figures get revised. Code gets fixed when problems are discovered. 
Data files get combined together, then errors are fixed, and then they are split and combined again. In the course of a single analysis, one can expect thousands of changes to files. And yet, all we use to track this are simplistic filenames. +Every file in the scientific process changes. Manuscripts are edited. Figures get revised. Code gets fixed when bugs are discovered. Sometimes those fixes lead to even more bugs, leading to more changes in the codebase. Data files get combined together. Sometimes those same files are split and combined again. All that to say - in just one research project, we can expect thousands of changes to occur. -You might think there is a better way, and you'd be right: **version control**. +These changes are important to track, and yet, we often use simplistic filenames to track them. Many of us have experienced renaming a document or script multiple times with the ingenuine addition of "final" to the filename (like the comic above demonstrates). -## A Motivating Example +You might think there is a better way, and you'd be right: **version control**. Version control provides an organized and transparent way to track changes in code and additional files. This practice was designed for software development, but is easily applicable to scientific programming. + +There are many benefits to using a version control software including: + +- Maintain a history of your research project's development while keeping your workspace clean +- Facilitate collaboration and transparency when working on teams +- Explore bugs or new features without disrupting your team members' work +- and more! + +The version control system we'll be diving into is `Git`, the most widely used modern version control system in the world. + +## Introduction to `Git` + GitHub Before diving into the details of `Git` and how to use it, let's start with a motivating example that's representative of the types of problems `Git` can help us solve. +### A Motivating Example + Say, for example, you're working on an analysis in R and you've got it into a state you're pretty happy with. We'll call this version 1: -![](images/git-intro-slide02.png) +:::{.column-body-outset-right} +![](images/git-intro-slide01.png) +::: You come into the office the following day and you have an email from your boss, "Hey, you know what this model needs?" -![](images/git-intro-slide03.png) +:::{.column-body-outset} +![](images/git-intro-slide02.png) +::: You're not entirely sure what she means but you figure there's only one thing she could be talking about: more cowbell. So you add it to the model in order to really explore the space. But you're worried about losing track of the old model so, instead of editing the code in place, you comment out the old code and put as serious a warning as you can muster in a comment above it. -![](images/git-intro-slide04.png) +:::{.column-body-outset} +![](images/git-intro-slide03.png) +::: Commenting out code you don't want to lose is something probably all of us have done at one point or another but it's really hard to understand why you did this when you come back years later or you when you send your script to a colleague. Luckily, there's a better way: Version control. Instead of commenting out the old code, we can change the code in place and tell `Git` to commit our change. So now we have two distinct versions of our analysis and we can always see what the previous version(s) look like. 
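+In practice, "telling `Git` to commit our change" is just a couple of commands in the Terminal (or a couple of clicks in RStudio's `Git` tab). Here is a minimal sketch, assuming the edited script is `analysis.R`; the commit message is only an example:
+
+``` bash
+# Stage the edited script so Git knows this change belongs in the next version
+git add analysis.R
+
+# Record the new version, with a message explaining what changed and why
+git commit -m "Add more cowbell to the model"
+```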
-![](images/git-intro-slide05.png) +:::{.column-body-outset} +![](images/git-intro-slide04.png) +::: You may have noticed something else in the diagram above: Not only can we save a new version of our analysis, we can also write as much text as we like about the change in the commit message. In addition to the commit message, `Git` also tracks who, when, and where the change was made. Imagine that some time has gone by and you've committed a third version of your analysis, version 3, and a colleague emails with an idea: What if you used machine learning instead? -![](images/git-intro-slide06.png) +:::{.column-body-outset} +![](images/git-intro-slide05.png) +::: Maybe you're not so sure the idea will work out and this is where a tool like `Git` shines. Without a tool like `Git`, we might copy analysis.R to another file called analysis-ml.R which might end up having mostly the same code except for a few lines. This isn't particularly problematic until you want to make a change to a bit of shared code and now you have to make changes in two files, if you even remember to. Instead, with `Git`, we can start a branch. Branches allow us to confidently experiment on our code, all while leaving the old code in tact and recoverable. -![](images/git-intro-slide07.png) +:::{.column-body-outset} +![](images/git-intro-slide06.png) +::: So you've been working in a branch and have made a few commits on it and your boss emails again asking you to update the model in some way. If you weren't using a tool like `Git`, you might panic at this point because you've rewritten much of your analysis to use a different method but your boss wants change to the old method. -![](images/git-intro-slide08.png) +:::{.column-body-outset} +![](images/git-intro-slide07.png) +::: But with `Git` and branches, we can continue developing our main analysis at the same time as we are working on any experimental branches. Branches are great for experiments but also great for organizing your work generally. -![](images/git-intro-slide09.png) +:::{.column-body-outset} +![](images/git-intro-slide08.png) +::: After all that hard work on the machine learning experiment, you and your colleague could decide to scrap it. It's perfectly fine to leave branches around and switch back to the main line of development but we can also delete them to tidy up. -![](images/git-intro-slide10.png) +:::{.column-body-outset} +![](images/git-intro-slide09.png) +::: If, instead, you and your colleague had decided you liked the machine learning experiment, you could also merge the branch with your main development line. Merging branches is analogous to accepting a change in Word's Track Changes feature but way more powerful and useful. -![](images/git-intro-slide11.png) +:::{.column-body-outset} +![](images/git-intro-slide10.png) +::: A key takeaway here is that `Git` can drastically increase your confidence and willingness to make changes to your code and help you avoid problems down the road. Analysis rarely follows a linear path and we need a tool that respects this. +:::{.column-body-outset} ![](images/git-intro-slide11.png) +::: Finally, imagine that, years later, your colleague asks you to make sure the model you reported in a paper you published together was actually the one you used. Another really powerful feature of `Git` is tags which allow us to record a particular state of our analysis with a meaningful name. In this case, we are lucky because we tagged the version of our code we used to run the analysis. 
Even if we continued to develop beyond commit 5 (above) after we submitted our manuscript, we can always go back and run the analysis as it was in the past. -### Summary +#### With `Git` we can enhance our workflow: {.unnumbered} + +- Eliminate the need for cryptic filenames and comments to track our work. +- Provide detailed descriptions of our changes through commits, making it easier to understand the reasons behind code modifications. +- Work on multiple branches simultaneously, allowing for parallel development, and optionally merge them together. +- Use commits to access and even execute older versions of our code. +- Assign meaningful tags to specific versions of our code. +- Additionally, `Git` offers a powerful distributed feature. Multiple individuals can work on the same analysis concurrently on their own computers, with the ability to merge everyone's changes together. + +### What *exactly* are `Git` and GitHub? + +#### `Git`: {.unnumbered} + +- an open-source distributed version control software +- designed to manage the versioning and tracking of source code files and project history +- operates locally on your computer, allowing you to create repositories, track changes, and collaborate with others +- provides features such as committing changes, branching and merging code, reverting to previous versions, and managing project history +- works directly with the files on your computer and does not require a network connection to perform most operations +- primarily used through the command-line interface (CLI, e.g. Terminal), but also has various GUI tools available (e.g. RStudio IDE) + +::: {.column-margin} +![](images/git-intro.png) +::: + +#### GitHub: {.unnumbered} +- online platform and service built around `Git` +- provides a centralized hosting platform for Git repositories +- allows us to store, manage, and collaborate on their `Git` repositories in the cloud +- offers additional features on top of `Git`, such as a web-based interface, issue tracking, project management tools, pull requests, code review, and collaboration features +- enables easy sharing of code with others, facilitating collaboration and contribution to open source projects +- provides a social aspect, allowing users to follow projects, star repositories, and discover new code + + +::: {.column-margin} +![](images/github-intro.png) +::: + +### The `Git` Life cycle -With `Git`, we can: +As a `Git` user, you'll need to understand the basic concepts associated with versioned sets of changes, and how they are stored and moved across repositories. Any given `Git` repository can be cloned so that it exists both locally, and remotely. But each of these cloned repositories is simply a copy of all of the files and change history for those files, stored in `Git`'s particular format. For our purposes, we can consider a `Git` repository as a folder with a bunch of additional version-related metadata. -- Avoid using cryptic filenames and comments to keep track of our work -- Describe our changes with as much information as we like so it's easier to understand why our code changed (commits) -- Work on multiple, simultaneous development (branches) of our code at the same time and, optionally, merge them together -- Go back in time to look at (and even run) older versions of our code -- Tag specific versions of our code as meaningful (tags) +In a local `Git`-enabled folder, the folder contains a workspace containing the current version of all files in the repository. 
These working files are linked to a hidden folder containing the 'Local repository', which contains all of the other changes made to the files, along with the version metadata. -And, as we'll see below, `Git` has one extra superpower available to us: It's distributed. Multiple people can work on the same analysis at the same time on their own computer and everyone's changes can eventually merged together. +So, when working with files using `Git`, you can use `Git` commands to indicate specifically which changes to the local working files should be staged for versioning (using the `git add` command), and when to record those changes as a version in the local repository (using the command `git commit`). -## Version control and Collaboration using `Git` and GitHub +The remaining concepts are involved in synchronizing the changes in your local repository with changes in a remote repository. The `git push` command is used to send local changes up to a remote repository (possibly on GitHub), and the `git pull` command is used to fetch changes from a remote repository and merge them into the local repository. -First, just what are `Git` and GitHub? +[A basic git workflow represented as two islands, one with "local repo" and "working directory", and another with "remote repo." Bunnies move file boxes from the working directory to the staging area, then with Commit move them to the local repo. Bunnies in rowboats move changes from the local repo to the remote repo (labeled "PUSH") and from the remote repo to the working directory (labeled "PULL").]{.aside} -- **`Git`**: version control software used to track files in a folder (a repository) - - `Git` creates the versioned history of a repository -- **GitHub**: web site that allows users to store their `Git` repositories and share them with others + -![](images/vc-local-github.png) +[![Artwork by Allison Horst](images/allison-horst-git-workflow.png)](https://twitter.com/allison_horst) -## Let's Look at a GitHub Repository +### Let's Look at a GitHub Repository This screen shows the copy of a repository stored on GitHub, with its list of files, when the files and directories were last modified, and some information on who made the most recent changes. -![](images/ss3sim-github.png) +:::{.column-page} +![](images/ss3sim-github.png) +::: If we drill into the "commits" for the repository, we can see the history of changes made to all of the files. Looks like `kellijohnson` was working on the project and fixing errors in December: +:::{.column-page} ![](images/ss3sim-commits.png) +::: And finally, if we drill into one of the changes made on December 20, we can see exactly what was changed in each file: +:::{.column-page} ![](images/ss3sim-diff.png) +::: Tracking these changes, how they relate to released versions of software and files is exactly what `Git` and GitHub are good for. And we will show how they can really be effective for tracking versions of scientific code, figures, and manuscripts to accomplish a reproducible workflow. -### The `Git` Life cycle +### `Git` Vocabulary & Commands -As a `Git` user, you'll need to understand the basic concepts associated with versioned sets of changes, and how they are stored and moved across repositories. Any given `Git` repository can be cloned so that it exists both locally, and remotely. But each of these cloned repositories is simply a copy of all of the files and change history for those files, stored in `Git`'s particular format. 
For our purposes, we can consider a `Git` repository as a folder with a bunch of additional version-related metadata. +We know the world of `Git` and GitHub can be daunting. Use these tables as references while you use `Git` and GitHub, and we encourage you to build upon this list as you become more comfortable with these tools. -In a local `Git`-enabled folder, the folder contains a workspace containing the current version of all files in the repository. These working files are linked to a hidden folder containing the 'Local repository', which contains all of the other changes made to the files, along with the version metadata. +This table contains essential terms and commands that complement intro to `Git` skills. They will get you far on personal and individual projects. -So, when working with files using `Git`, you can use `Git` commands to indicate specifically which changes to the local working files should be staged for versioning (using the `git add` command), and when to record those changes as a version in the local repository (using the command `git commit`). +| Term | `Git` Command(s) | Definition | +|--------------------|--------------------------|----------------------------------------------------------------------------------------------------| +| Add | `git add [file]` | Stages or adds file changes to the next commit. `git add .` will stage or add all files. | +| Commit | `git commit` | Records changes to the repository with a descriptive message. | +| Commit Message | `git commit -m` | A descriptive message explaining the changes made in a commit. The message must be within quotes (e.g. "This is my commit message."). | +| Fetch | `git fetch` | Retrieves changes from a remote repository but does not merge them. | +| Pull | `git pull` | Retrieves and merges changes from a remote repository to the current branch. | +| Push | `git push` | Sends local commits to a remote repository. | +| Stage | - | The process of preparing and selecting changes to be included in the next commit. | +| Status | `git status` | Shows the current status of the repository, including changes and branch information. | -The remaining concepts are involved in synchronizing the changes in your local repository with changes in a remote repository. The `git push` command is used to send local changes up to a remote repository (possibly on GitHub), and the `git pull` command is used to fetch changes from a remote repository and merge them into the local repository. +: Essential `Git` Commands -![](images/git-flowchart.png) -`Git` commands to use in the terminal: +This table includes more advanced `Git` terms and commands that are commonly used in both individual and collaborative projects. -- `git clone`: to copy a whole remote repository to local -- `git add` (stage): notify `Git` to track particular changes -- `git commit`: store those changes as a version -- `git pull`: merge changes from a remote repository to our local repository -- `git push`: copy changes from our local repository to a remote repository -- `git status`: determine the state of all files in the local repository -- `git log`: print the history of changes in a repository +| Term | `Git` Command(s) | Definition | +|--------------------|--------------------------|----------------------------------------------------------------------------------------------------| +| Branch | `git branch` | Lists existing branches or creates a new branch. 
| +| Checkout | `git checkout [branch]` | Switches to a different branch or restores files from a specific commit. | +| Clone | `git clone [repository]` | Creates a local copy of a remote repository. | +| Diff | `git diff` | Shows differences between files, commits, or branches. | +| Fork | - | Creates a personal copy of a repository under your GitHub account for independent development. | +| Log | `git log` | Displays the commit history of the repository. | +| Merge | `git merge [branch]` | Integrates changes from one branch into another branch. | +| Merge Conflict | - | Occurs when Git cannot automatically merge changes from different branches, requiring manual resolution. | +| Pull Request (PR) | - | A request to merge changes from a branch into another branch, typically in a collaborative project. | +| Rebase | `git rebase` | Integrates changes from one branch onto another by modifying commit history. | +| Remote | `git remote` | Manages remote repositories linked to the local repository. | +| Repository | `git init` | A directory where Git tracks and manages files and their versions. | +| Stash | `git stash` | Temporarily saves changes that are not ready to be committed. | +| Tag | `git tag` | Assigns a label or tag to a specific commit. | -Those seven commands are the majority of what you need to successfully use `Git`. But this is all super abstract, so let's explore with some real examples. +: Advanced `Git` Commands + +`Git` has a rich set of commands and features, and there are many more terms beyond either table. ## Exercise 1: Create a remote repository on GitHub @@ -146,11 +237,15 @@ Those seven commands are the majority of what you need to successfully use `Git` If you were successful, it should look something like this: +:::{.column-body-outset} ![](images/new-repo-github.png) +::: You've now created your first repository! It has a couple of files that GitHub created for you, like the `README.md` file, and the `LICENSE` file, and the `.gitignore` file. +:::{.column-body-outset} ![](images/github-test-repo.png) +::: For simple changes to text files, you can make edits right in the GitHub web interface. @@ -160,43 +255,76 @@ For simple changes to text files, you can make edits right in the GitHub web int Navigate to the `README.md` file in the file listing, and edit it by clicking on the pencil icon. This is a regular Markdown file, so you can just add markdown text. Add a new level 2 header called "Purpose" and add some bullet points describing the purpose of the repo. When done, add a commit message, and hit the "Commit changes" button. ::: +:::{.column-page} ![](images/github-test-edit.png) +::: Congratulations, you've now authored your first versioned commit! If you navigate back to the GitHub page for the repository, you'll see your commit listed there, as well as the rendered `README.md` file. +:::{.column-page} ![](images/github-test-displayed.png) +::: Let's point out a few things about this window. It represents a view of the repository that you created, showing all of the files in the repository so far. For each file, it shows when the file was last modified, and the commit message that was used to last change each file. This is why it is important to write good, descriptive commit messages. In addition, the header above the file listing shows the most recent commit, along with its commit message, and its SHA identifier. That SHA identifier is the key to this set of versioned changes. 
If you click on the SHA identifier (6c18e0a), it will display the set of changes made in that particular commit. -In the next section we'll use the GitHub URL for the GitHub repository you created to `clone` the repository onto your local machine so that you can edit the files in RStudio. To do so, start by copying the GitHub URL, which represents the repository location: +::: {.callout-caution icon=false} +## What should I write in my commit message? + +Writing effective `Git` commit messages is essential for creating a meaningful and helpful version history in your repository. It is crucial to avoid skipping commit messages or resorting to generic phrases like "Updates." When it comes to following best practices, there are several guidelines to enhance the readability and maintainability of the codebase. + +Here are some guidelines for writing effective `Git` commit messages: + +1. **Be descriptive and concise**: Provide a clear and concise summary of the changes made in the commit. Aim to convey the purpose and impact of the commit in a few words. + +2. **Use imperative tense**: Write commit messages in the imperative tense, as if giving a command. For example, use "Add feature" instead of "Added feature" or "Adding feature." This convention aligns with other `Git` commands and makes the messages more actionable. + +3. **Separate subject and body**: Start with a subject line, followed by a blank line, and then provide a more detailed explanation in the body if necessary. The subject line should be a short, one-line summary, while the body can provide additional context, motivation, or details about the changes. + +4. **Limit the subject line length**: Keep the subject line within 50 characters or less. This ensures that the commit messages are easily scannable and fit well in tools like `Git` logs. + +5. **Capitalize and punctuate properly**: Begin the subject line with a capital letter and use proper punctuation. This adds clarity and consistency to the commit messages. + +6. **Focus on the "what" and "why"**: Explain what changes were made and why they were made. Understanding the motivation behind a commit helps future researchers and collaborators (including you!) comprehend its purpose. + +7. **Use present tense for subject, past tense for body**: Write the subject line in present tense as it represents the current state of the codebase. Use past tense in the body to describe what has been done. + +8. **Reference relevant issues**: If the commit is related to a specific issue or task, include a reference to it. For example, you can mention the issue number or use keywords like "Fixes," "Closes," or "Resolves" followed by the issue number. +::: -![](images/github-test-clone-url.png){width="70%" fig-align="center"} ## Exercise 2: `clone` your repository and use `Git` locally in RStudio -RStudio knows how to work with files under version control with `Git`, but only if you are working within an RStudio project folder. In this next section, +In this exercise, we'll use the GitHub `URL` for the GitHub repository you created to `clone` the repository onto your local machine so that you can edit the files in RStudio. -we will `clone` the repository that you created on GitHub into a local repository as an RStudio project. 
Here's what we're going to do +Start by copying the GitHub `URL`, which represents the repository location: -![](images/github-workflows-owner.png) +:::{.column-body-outset} +![](images/github-test-clone-url.png){width="70%" fig-align="center"} +::: -We refer to the remote copy of the repository that is on GitHub as the origin repository (the one that we cloned from), and the copy on our local computer as the local copy. +RStudio knows how to work with files under version control with `Git`, but only if you are working within an R project folder. -RStudio knows how to work with files under version control with `Git`, but only if you are working within an RStudio project folder. In this next section, we will `clone` the repository that you created on GitHub into a local repository as an RStudio project. Here's what we're going to do: +Next, let's `clone` the repository created on GitHub so we have it accessible as an R project in RStudio. + +::: {.callout-important title="An important distinction"} +We refer to the remote copy of the repository that is on GitHub as the origin repository (the one that we cloned from), and the copy on our local computer as the local repository. +::: ::: callout-tip ## Setup -- In the File menu, select "New Project" -- In the dialog that pops up, select the "Version Control" option, and paste the GitHub URL that you copied into the field for the remote repository Repository URL -- While you can name the local copy of the repository anything, it's typical to use the same name as the GitHub repository to maintain the correspondence +- In the File menu, select "New Project" +- In the dialog that pops up, select the "Version Control" option, and paste the GitHub URL that you copied into the field for the remote repository Repository URL +- While you can name the local copy of the repository anything, it's typical to use the same name as the GitHub repository to maintain the correspondence ![](images/rstudio-clone-repo.png){width="90%" fig-align="center"} ::: Once you hit "Create Project", a new RStudio window will open with all of the files from the remote repository copied locally. Depending on how your version of RStudio is configured, the location and size of the panes may differ, but they should all be present, including a `Git` tab and the normal Files tab listing the files that had been created in the remote repository. +:::{.column-body-outset} ![](images/github-rstudio-test.png) +::: You'll note that there is one new file `halina_test.Rproj`, and three files that we created earlier on GitHub (`.gitignore`, `LICENSE`, and `README.md`). @@ -210,7 +338,7 @@ Inspect the history. For now, let's click on the History button in the `Git` tab ## Challenge 1. Let's make a change to the `README.md` file, this time from RStudio, then commit the `README.md` change -2. Add a new section to your `README.md` called "Creator" using a level 2 header, and under it include some information about yourself. **Bonus:** Add some contact information and link your email using Markdown syntax +2. Add a new section to your `README.md` called "Creator" using a level 2 header, and under it include some information about yourself. **Bonus:** Add some contact information and link your email using Markdown syntax ::: Once you save, you'll immediately see the `README.md` file show up in the `Git` tab, marked as a modification. You can select the file in the `Git` tab, and click Diff to see the differences that you saved (but which are not yet committed to your local repository). 
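+If you would rather see what the `Git` tab is doing under the hood, the Terminal equivalent is roughly the sketch below; the commit message is just an example for the "Creator" change:
+
+``` bash
+# List files with uncommitted changes
+git status
+
+# Review the unstaged edits to README.md
+git diff README.md
+
+# Stage the file and record the change with a descriptive message
+git add README.md
+git commit -m "Add Creator section to README"
+```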
@@ -245,31 +373,26 @@ These 2 commits are the two we just made, and have not yet been pushed to GitHub **Push these changes to GitHub.** Now that everything has been changed as desired locally, you can push the changes to GitHub using the Push button. This will prompt you for your GitHub username and password, and upload the changes, leaving your repository in a totally clean and synchronized state. When finished, looking at the history shows all four commits, including the two that were done on GitHub and the two that were done locally on RStudio. +:::{.column-body-outset} ![](images/rstudio-history-3.png) +::: And note that the labels indicate that both the local repository (`HEAD`) and the remote repository (`origin/HEAD`) are pointing at the same version in the history. So, if we go look at the commit history on GitHub, all the commits will be shown there as well. +:::{.column-page} ![](images/github-history.png) +::: ::: callout-note ## Last thing, some `Git` configuration -When `Git` released version 2.27, a new feature they incorporated allows users to specify how to pull, essentially, otherwise a warning will appear. To suppress this warning we need to configure our `Git` with this line of code: +When `Git` released version 2.27, a new feature they incorporated allows users to specify how to pull (essentially), otherwise a warning will appear. To suppress this warning we need to configure our `Git` with this line of code: -```{.bash} +``` bash git config pull.rebase false ``` -`pull.rebase false` is a default strategy for pulling where it will try to auto-merge the files if possible, and if it can’t it will show a merge conflict -::: - - -::: callout-important -## What should I write in my commit message? - -Clearly, good documentation of what you've done is critical to making the version history of your repository meaningful and helpful. It's tempting to skip the commit message altogether, or to add some stock blurb like "Updates". It's better to use messages that will be helpful to your future self in deducing not just what you did, but why you did it. Also, commit messages are best understood if they follow the active verb convention. For example, you can see that my commit messages all started with a past tense verb, and then explained what was changed. - -While some of the changes we illustrated here were simple and so easily explained in a short phrase, for more complex changes, it's best to provide a more complete message. The convention, however, is to always have a short, terse first sentence, followed by a more verbose explanation of the details and rationale for the change. This keeps the high level details readable in the version log. I can't count the number of times I've looked at the commit log from 2, 3, or 10 years prior and been so grateful for the diligence of my past self and collaborators. +`pull.rebase false` is a default strategy for pulling where it will try to auto-merge the files if possible, and if it can't it will show a merge conflict ::: ## Exercise 3: Setting up `Git` on an existing project @@ -308,7 +431,9 @@ Here is what your page should look like: This will open your empty repository with a page that conveniently gives you exactly the instructions you need. In our case, we are going to "push an existing repository from the command line." +:::{.column-page} ![](images/setup-empty-repo.png) +::: Click the clipboard icon to copy the code for the middle option of the three on this page. 
It should have three lines and look like this: @@ -351,12 +476,14 @@ There's a lot we haven't covered in this brief tutorial. There are some great an - Branching and merging - Pull requests versus direct contributions for collaboration - Using `.gitignore` to protect sensitive data -- GitHub Issues and why they are useful +- GitHub Issues - how to use them for project management and collaboration and much, much more. -## Additional `Git` resources +## `Git` resources +- [Pro Git Book](https://git-scm.com/book/en/v2) +- [Happy Git and GitHub for the useR](https://happygitwithr.com/) - [GitHub Documentation](https://docs.github.com/en/get-started/quickstart/set-up-git) - [Learn `Git` Branching](https://learngitbranching.js.org/) is an interactive tool to learn `Git` on the command line - [Software Carpentry Version Control with `Git`](https://swcarpentry.github.io/git-novice/) diff --git a/materials/sections/git-setup.qmd b/materials/sections/git-setup.qmd index d98f9cfb..ef0072ad 100644 --- a/materials/sections/git-setup.qmd +++ b/materials/sections/git-setup.qmd @@ -5,49 +5,58 @@ ## Set up global options in `Git` -Before using `Git`, you need to tell it who you are, also known as setting the **global options**. The only way to do this is through the command line. Newer versions of RStudio have a nice feature where you can open a terminal window in your RStudio session. Do this by selecting Tools \> Terminal \> New Terminal. +Before using `Git`, you need to tell it who you are, also known as setting the **global options**. To do this, we will be setting the global options in the Terminal. + +::: {.callout-caution icon=false} +## What's the Terminal? + +Technically, the Terminal is an interface for the shell, a computer program. To put that simply, we use the Terminal to tell a computer what to do. This is different from the Console in RStudio, which interprets R code and returns a value. +::: + +To get started, let's open a new Terminal window in RStudio. Do this by clicking Tools \> Terminal \> New Terminal. A Terminal tab should now be open where your Console usually is. -To set the global options, type the following into the command prompt, with your exact GitHub username, and press enter: +::: {.callout-caution icon=false} +## Don't be afraid to dip your toes in the Terminal +Most of our `git` operations will be done in RStudio, but there are some situations where you must work in the Terminal and use command line. It may be daunting to code in the Terminal, but as your comfort increases over time, you might find you prefer it. Either way, it's beneficial to learn *enough* command line and to feel comfortable in the Terminal. +::: + +Let's start by adding your user name to the global options. Type the following into the command prompt, with your **exact** GitHub username, and press enter: ``` bash -git config --global user.name "hdolinh" +git config --global user.name "my_user_name" ``` ::: column-margin Note that **if it ran successfully, it will look like nothing happened**. We will check at the end to make sure it worked. ::: -Next, enter the following line, with the email address you used when you created your account on github.com: +Next, enter the following line, with the email address you used when you created your account on [github.com](https://github.com/): ``` bash -git config --global user.email "dolinh@nceas.ucsb.edu" +git config --global user.email "my_email@nceas.ucsb.edu" ``` -::: column-margin -Note that these lines need to be run **one at a time**. 
-::: - ::: callout-important ## Case and spelling matters! -When you add your username and email to the global options you **must** use the exact same spelling and case that you use on GitHub otherwise, `Git` won't be able to sync to your account to use. +When you add your username and email to the global options you **must** use the exact same spelling and case that you used on GitHub otherwise, `Git` won't be able to sync to your account. ::: -Next, we will set our credentials to not time out for a very long time. This is related to the way that our server operating system handles credentials - not doing this will make your Personal Access Token (which we will set up soon) expire immediately on the system, even though it is actually valid for a month. +Next, we will set our credentials to not time out for a very long time. This is related to how our server operating system handles credentials - not doing this will make your Personal Access Token (PAT, which we will set up in the next section) expire immediately on the system, even though it is actually valid for at least a month. ``` bash git config --global credential.helper 'cache --timeout=10000000' ``` -Next, we will set the default branch name to `main` for any new repositories that are created moving forward. Why are we doing this? Previously, the default branch name was `master` and this racist terminology for git branches motivates us to update our default branch to `main` instead. +Next, we will set the default branch name to `main` for any new repositories that are created moving forward. Why are we doing this? Previously, the default branch name was `master` and this racist terminology for `git` branches motivates us to update our default branch to `main` instead. ``` bash git config --global init.defaultBranch main ``` -Finally, check to make sure everything looks correct by entering this command, which will return the options that you have set. +Finally, check to make sure everything looks correct by entering this command, which will return the global options you have set. ``` bash git config --global --list @@ -55,7 +64,9 @@ git config --global --list ## GitHub Authentication -GitHub recently deprecated password authentication for accessing repositories, so we need to set up a secure way to authenticate. The book [Happy Git and GitHub for the useR](https://happygitwithr.com/index.html) has a wealth of information related to working with `Git` in R, and these instructions are based off of [Chapter 9 Personal access token for HTTPS](https://happygitwithr.com/https-pat.html). +GitHub recently deprecated password authentication for accessing repositories, so we need to set up a secure way to authenticate. + +The book [Happy Git and GitHub for the useR](https://happygitwithr.com/index.html) has a wealth of information related to working with `Git` in R, and these instructions are based off of [Chapter 9 Personal access token for HTTPS](https://happygitwithr.com/https-pat.html). We will be using a **Personal Access Token (PAT)** in this course. For better security and long term use, we recommend taking the extra steps to set up SSH keys (check out [Chapter 10 Set up Keys for SSH](https://happygitwithr.com/ssh-keys.html)). @@ -70,4 +81,4 @@ We will be using a **Personal Access Token (PAT)** in this course. For better se 6. Last thing, run `usethis::git_sitrep()` in the Console to check your `Git` configuration and that you've successful stored your PAT. ::: -Congrats! 
Now that you've set up your authentication you should be able to work with GitHub in RStudio now. +Congrats! Now you've setup your authentication you should be able to work with GitHub in RStudio now. diff --git a/materials/sections/intro-tidy-data.qmd b/materials/sections/intro-tidy-data.qmd index 5664cc48..ffe47432 100644 --- a/materials/sections/intro-tidy-data.qmd +++ b/materials/sections/intro-tidy-data.qmd @@ -1,172 +1,339 @@ ---- -bibliography: references.bib ---- ## Learning Objectives {.unnumbered} -- Understand basics of relational data models aka tidy data -- Learn how to design and create effective data tables +Learn how to design and create effective data tables by: -## Introduction +- applying tidy and normalized data principles, +- following best practices to format data tables' content, +- relating tables following relational data models principles, and +- understanding how to perform table joins. -In this lesson we are going to learn what relational data models are, and how they can be used to manage and analyze data efficiently. Relational data models are what relational databases use to organize tables. However, you don't have to be using a relational database (like mySQL, MariaDB, Oracle, or Microsoft Access) to enjoy the benefits of using a relational data model. Additionally, your data don't have to be large or complex for you to benefit. Here are a few of the benefits of using a relational data model: -- Powerful search and filtering -- Handle large, complex datasets -- Enforce data integrity -- Decrease errors from redundant updates +## Tidy Data -### Simple guidelines for data management +### Values, variables, observations, and entities -A great paper called *Some Simple Guidelines for Effective Data Management* [@borer2009] lays out exactly that - guidelines that make your data management, and your reproducible research, more effective. The first six guidelines are straightforward, but worth mentioning here: +Before we dive into tidy data, we need to get acquainted with our building blocks. +A dataset is a collection of **values**, with each value belonging to an observation and a variable. -- Use a scripted program (like R!) -- Non-proprietary file formats are preferred (eg: csv, txt) -- Keep a raw version of data -- Use descriptive file and variable names (without spaces!) -- Include a header line in your tabular data files -- Use plain ASCII text +- An **observation** groups all the values measured for an individual **entity**. For example, an observation about a plant could include the species name, date of collection and altitude of the place where the plant was found. The plants found would be the entity. -The next three are a little more complex, but all are characteristics of the relational data model: +- A **variable** groups all the values that measure the same attribute. In the previous example, the variables would be the date of collection, altitude and species name. -- Design tables to add rows, not columns -- Each column should contain only one type of information -- Record a single piece of data only once; separate information collected at different scales into different tables. +::: callout-note +“Variable” is a general term that covers multiple types of attributes. For example, when we are collecting data to investigate a causal relationship, variables include both *explanatory variables* (also called independent variables) and *response variables* (also called dependent variables). 
We could also have a classifier variable that assigns a unique identifier to each observation. -## Recognizing Untidy Data +When we want to document our values, we must think of the best way to organize them so they clearly relate to the variables they represent and the observations they belong to. +::: -Before we learn how to create a relational data model, let's look at how to recognize data that does not conform to the model. -### Data Organization +### What is tidy data? -This is a screenshot of an actual dataset that came across NCEAS. We have all seen spreadsheets that look like this - and it is fairly obvious that whatever this is, it isn't very tidy. Let's dive deeper in to exactly **why** we wouldn't consider it tidy. +Tidy data is a standardized way of organizing data tables that allows us to manage and analyze data efficiently, because it makes it straightforward to understand the corresponding variable and observation of each value. +The **tidy data principles** are: -![](images/excel-org-01.png) +1. Every column is a variable. +2. Every row is an observation. +3. Every cell is a single value. -### Multiple Tables +The following is an example of tidy data - it’s easy to see the three tidy data principles apply. -Your human brain can see from the way this sheet is laid out that it has three tables within it. Although it is easy for us to see and interpret this, it is extremely difficult to get a computer to see it this way, which will create headaches down the road should you try to read in this information to R or another programming language. -![](images/excel-org-02.png) +![](images/tidy-data-images/tidy_data/tidy_data.png) -### Inconsistent Observations +![](images/tidy-data-images/tidy_data/tidy_variables.png) -Rows correspond to **observations**. If you look across a single row, and you notice that there are clearly multiple observations in one row, the data are likely not tidy. -![](images/excel-org-03.png) +![](images/tidy-data-images/tidy_data/tidy_observations.png) -### Inconsistent Variables +![](images/tidy-data-images/tidy_data/tidy_values.png) -Columns correspond to variables. If you look down a column, and see that multiple variables exist in the table, the data are not tidy. A good test for this can be to see if you think the column consists of only one unit type. +### Recognizing untidy data +Anything that does not follow the three tidy data principles is **untidy data**. -![](images/excel-org-04.png) +There are *many* ways in which data can become untidy, some can be noticed right away, while others are more subtle. In this section we will look at some examples of common untidy data situations. -### Marginal Sums and Statistics +#### Example 1 +The following is a screenshot of an actual dataset that came across NCEAS. We have all seen spreadsheets that look like this - and it is fairly obvious that whatever this is, it isn't very tidy. Let's dive deeper into why we consider it untidy data. -Marginal sums and statistics also are not considered tidy, and they are not the same type of observation as the other rows. Instead, they are a combination of observations. +![](images/tidy-data-images/tidy_data/excel-org-01.png) -![](images/excel-org-05.png) +##### Multiple tables +To begin with, notice there are actually three smaller tables within this table. Although for our human brain it is easy to see and interpret this, it is extremely difficult to get a computer to see it this way. 
-## Good Enough Data Modeling +![](images/tidy-data-images/tidy_data/excel-org-02.png) -### Denormalized Data +Having multiple tables within the same table will create headaches down the road should you try to read in this information using R or another programming language. +**Having multiple tables immediately breaks the tidy data principles**, as we will see next. -When data are "denormalized" it means that observations about different entities are combined. -![](images/table-denorm.png) +##### Inconsistent columns -In the above example, each row has measurements about both the site at which observations occurred, as well as observations of two individual plants of possibly different species found at that site. This is **not normalized** data. +In tidy data, **each column corresponds to a single variable**. +If you look down a column, and see that multiple variables exist in the table, the data is not tidy. +A good test for this can be to see if you think the column consists of only one unit type. -People often refer to this as **wide** format, because the observations are spread across a wide number of columns. Note that, should one encounter a new species in the survey, we would have to add new columns to the table. This is difficult to analyze, understand, and maintain. +![](images/tidy-data-images/tidy_data/excel-org-04.png) -### Tabluar Data +##### Inconsistent rows -**Observations**. A better way to model data is to organize the observations about each type of entity in its own table. This results in: +The second principle of tidy data is: **every column must be a single observation**. If you look across a single row, and you notice that there are clearly multiple observations in one row, the data are likely not tidy. -- Separate tables for each type of entity measured -- Each row represents a single observed entity -- Observations (rows) are all unique -- This is **normalized** data (aka **tidy** data) +![](images/tidy-data-images/tidy_data/excel-org-03.png) -**Variables**. In addition, for normalized data, we expect the variables to be organized such that: +##### Marginal sums and statistics -- All values in a column are of the same type -- All columns pertain to the same observed entity (e.g., row) -- Each column represents either an identifying variable or a measured variable +**Marginal sums and statistics are not considered tidy**. They break principle one, “Every column is a variable”, because a marginal statistic does not represent the same variable as the values it is summarizing. They also break principle two, “Every row is an observation”, because they represent a combination of observations, rather than a single one. -::: callout-note -## Challenge +![](images/tidy-data-images/tidy_data/excel-org-05.png) -Try to answer the following questions: +#### Example 2 -- What are the observed entities in the example above? -- What are the measured variables associated with those observations? +Consider the following table. +It’s a single one this time! +It shows data about species observed at a specific site and date. 
The column headers refer to the following: -**Answer** ![](images/table-denorm-entity-var.png) -::: +- *id*: id of row +- *date*: date when a species was observed +- *site*: site where a species was observed +- *name*: site’s name +- *altitude*: site’s altitude +- *sp1code*, *sp2code*: species code for two plants observed +- *sp1height*, *sp2height*: height of the plants observed -If we use these questions to tidy our data, we should end up with: +Take a moment to see why this is not tidy data. -- One table for each entity observed -- One column for each measured variable -- Additional columns for identifying variables (such as site ID) +![](images/tidy-data-images/tidy_data/untidy_1.png) -Here is what our tidy data look like: +##### Multiple Observations +Remember that an observation is all the values measured for an individual **entity**. -![](images/tables-norm.png){width="80%" fig-align="center"} +If our entity is a single observed plant, then the values we measured are date and site of observation, the altitude, and the species code and height. +This table breaks the second tidy data principles: Every row is an observation. -Note that this normalized version of the data meets the three guidelines set by [@borer2009]: -- Design tables to add rows, not columns -- Each column should contain only one type of information -- Record a single piece of data only once; separate information collected at different scales into different tables. +![](images/tidy-data-images/tidy_data/untidy_2.png) -## Using Normalized Data +People often refer to this as “*wide* format”, because the observations are spread across a wide number of columns. Note that, should one encounter a new species in the survey, we would have to add new columns to the table. This is difficult to analyze, understand, and maintain. To solve this problem, we can create a single column for species code and a single column for species height as in the following table. -Normalizing data by separating it into multiple tables often makes researchers really uncomfortable. This is understandable! The person who designed this study collected all of these measurements for a reason - so that they could analyze the measurements together. Now that our site and species information are in separate tables, how would we use site elevation as a predictor variable for species composition, for example? The answer is keys - and they are the cornerstone of relational data models. +![](images/tidy-data-images/tidy_data/tidy_not_normal.png) -When one has normalized data, we often use unique identifiers to reference particular observations, which allows us to link across tables. Two types of identifiers are common within relational data: -::: column-margin -**Note** Is a primary key necessary to have in a dataset? -::: -- **Primary Key**: unique identifier for each observed entity, one per row -- **Foreign Key**: reference to a primary key in another table (linkage) +## Data Normalization -::: callout-note -## Challenge +### What is data normalization? +**Data normalization** is the process of creating **normalized data**, which are datasets free from data redundancy to simplify query, analysis, storing, and maintenance. In normalized data we organize data so that : -In our normalized tables below, identify the following: +- Each table follows the tidy data principles +- We have separate tables for each type of entity measured +- Observations (rows) are all unique +- Each column represents either an identifying variable or a measured variable -1. 
The primary key for each table -2. Any foreign keys that exist +In **denormalized data** observations about different entities are combined. A good indication that a data table is denormalized and needs normalization is seeing the same column values repeated across multiple rows. -![](images/tables-norm.png){width="80%" fig-align="center"} -::: +### Example + +In the previous data table the row values for the last three columns are repeated. + +![](images/tidy-data-images/data_normalization/denormalized.png) + +This means the data is denormalized and it happens because each row has measurements about multiple entities: + +- 1st entity: individual plants found at that site, and +- 2nd entity: sites at which the plants were observed. + +![](images/tidy-data-images/data_normalization/two_entities.png) + +If we use this information to normalize our data, we should end up with: + +- one tidy table for each entity observed, and +- additional columns for identifying variables (such as site ID). + +Here’s how our normalized data would look like: + +![](images/tidy-data-images/data_normalization/normalized.png){fig-align="center" width=70%} + +Notice that each table also satisfies the tidy data principles. + +Normalizing data by separating it into multiple tables often makes researchers really uncomfortable. This is understandable! +The person who designed this study collected all of these measurements for a reason - so that they could analyze the measurements together. +Now that our site and plant information are in separate tables, how would we use site temperature as a predictor variable for species composition, for example? +We will go over a solution in the next section. + +## Relational Data Models + +### What are relational data models? +A **relational data model** is a way of encoding links between multiple tables in a database. A database organized following a relational data model is a **relational database**. A few of the advantages of using a relational data model are: + +- Enabling powerful search and filtering +- Ability to handle large, complex data sets +- Enforcing data integrity +- Decreasing errors from redundant updates + +Relational data models are used by relational databases (like mySQL, MariaDB, Oracle, or Microsoft Access) to organize tables. However, you don't have to be using a relational database or handling large and complex data to enjoy the benefits of using a relational data model. + + +### Primary and foreign keys +The main way in which relational data models encode relationships between different tables is by using keys. Keys are variables whose values uniquely identify observations. For tidy data, where variables and columns are equivalent, a column is a key if it has a different value in each row. This allows us to use keys as unique identifiers that reference particular observations and create links across tables. + +Two types of keys are common within relational data models: + +- **Primary Key**: chosen key for a table, uniquely identifies each observation in the table, +- **Foreign Key**: reference to a primary key in another table, used to create links between tables. + +### Example +On our previously normalized data for plants and sites, let’s choose primary keys for these tables and then identify any foreign keys. + +**Primary keys** + +First, notice that the columns ‘date’, ‘site’ and ‘sp_code’ cannot be primary keys because they have repeated values across rows. 
The columns ‘sp_height’ and ‘id’ both have different values in each row, so both are candidates for primary keys. However, the decimal values of ‘sp_height’ don’t make it as useful to use it to reference observations. So we chose ‘id’ as the primary key for this table. + +For the sites table, all three columns could be keys. We chose ‘site’ as the primary key because it is the most succinct and it also allows us to link the sites table with the plants table. + +**Foreign keys** + +The 'site' column is the *primary key* of that table because it uniquely identifies each row of the table as a unique observation of a site. In the first table, however, the 'site' column is a *foreign key* that references the primary key from the second table. This linkage tells us that the first height measurement for the DAPU observation occurred at the site with the name Taku. + +![](images/tidy-data-images/relational_data_models/primary_foreign_keys.png){fig-align="center" width=70%} + +### Surrogate, natural, and compound keys + +In the sites data table from the previous example, we noticed that ‘site’ and ‘name’ are variables whose values uniquely identify the rows. In other words, ‘site’ and ‘name’ are keys. However, ‘site’ and ‘name’ are very different keys since the values in ‘site’ are “made up”, while ‘name’ has values that are used in the external world, unrelated to the table. This leads us to the following key types: + +- **Surrogate Key**: a key whose values do not exist in the real world, +- **Natural Key**: a key whose values exist in the real world. + +A surrogate key is often simpler, and can be a better choice than a natural key to become the primary key of a data table. + +![](images/tidy-data-images/relational_data_models/surrogate_natural_keys.png){fig-align="center" width=70%} + +Finally, it can also be the case that a variable is not a key, but by combining it with a second variable we get that the combined values uniquely identify the rows. This is called a + +- **Compound Key**: a key that is made up of more than one variable. + +For example, the ‘site’ and ‘sp_code’ columns in the plants table cannot be keys on their own, since each has repeated values. However, when we look at their combined values (1-DAPU, 1-DAMA, 2-DAMA, 2-DAPU) we see each row has a unique value. So ‘site’ and ‘sp_code’ together form a compound key. -The primary key of the top table is `id`. The primary key of the bottom table is `site`. +![](images/tidy-data-images/relational_data_models/compound_key.png) -The `site` column is the **primary key** of that table because it uniquely identifies each row of the table as a unique observation of a site. In the first table, however, the `site` column is a **foreign key** that references the primary key from the second table. This linkage tells us that the first height measurement for the `DAPU` observation occurred at the site with the name `Taku`. +There are different advantages and disadvantages to choosing surrogate, natural, or compound keys as primary keys. You can read more about this in this article. -![](images/tables-keys.png){width="80%" fig-align="center"} +### Entity-Relationship models + +An **Entity-Relationship model (E-R model)**, also known as an E-R diagram, is a way to draw a compact diagram that reflects the structure and relationships of the tables in a relational database. These can be particularly useful for big databases that have many tables and complex relationships between them. 
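Key candidates can also be checked programmatically rather than by eye. A minimal sketch with `dplyr`, assuming data frames named `plants` and `sites` with the column names used in the example tables above:

```{r}
#| eval: false

library(dplyr)

# A query that returns zero rows means the value (or combination of values)
# never repeats, so the column(s) can serve as a key
sites %>% count(site) %>% filter(n > 1)            # zero rows: 'site' is a valid key
plants %>% count(site) %>% filter(n > 1)           # rows returned: 'site' alone is not a key
plants %>% count(site, sp_code) %>% filter(n > 1)  # zero rows: valid compound key
```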
+ +We will explain the steps to drawing a simplified E-R model with our previous plants and sites tables. + + +*Step 1: Identify the entities in the relational database and add each one in a box.* +In our case, entities are [plants] and [sites], since we are gathering observations about both of these. + +![](images/tidy-data-images/relational_data_models/ER_diagram_1.png){fig-align="center" width=50%} + +*Step 2: Add variables for each entity and identify keys.* +Add the variables as a list inside each box. Then, identify the primary and foreign keys in each of the boxes. To visualize this, we have indicated the primary key of each entity in red and any foreign keys in blue. + +![](images/tidy-data-images/relational_data_models/ER_diagram_2.png){fig-align="center" width=50%} + +*Step 3: Add relationships between entities.* + +- Draw a line between the boxes of any two entities that have a relationship. + +- Identify which box has the primary key of the other as a foreign key. Let’s call the box that has the foreign key [box1] and the other box [box2]. +Using the previous diagram we can see that “site” is the primary key of [sites] and appears as a foreign key in [plants]. So [plants] is [box1] and [sites] is [box2]. + +- Add a word describing how [box1] is related to [box2] above the line connecting the two boxes. So, for example, we need to describe how [plants] is related to [sites]. The relation is “a plant is located in a site”, so we write “located” above the line indicating the relationship between [plants] and [sites]. + +![](images/tidy-data-images/relational_data_models/ER_diagram_3.png){fig-align="center" width=50%} + +*Step 4: Add cardinality to every relationship in the diagram.* +At this step we want to quantify how many items in an entity are related to another entity. This is easiest if we reuse the description we found in the previous step. For example, “a plant is located in one site”. Then we add the symbol for “one” at the end of the line going from [plants] to [sites]. + +![](images/tidy-data-images/relational_data_models/ER_diagram_4.png){fig-align="center" width=50%} + +To finish, we also indicate how many plants are related to a single site. Since “a site has many plants”, we add the symbol for “many” at the end of the line going from [sites] to [plants] + +![](images/tidy-data-images/relational_data_models/ER_diagram_5.png){fig-align="center" width=50%} + +That’s it! The symbols we used at the end of the lines are called **ERD “crow’s foot”**. You can see all the existing ones together with an example in the next diagram. + +![](images/tidy-data-images/relational_data_models/ERD_Relationship_Symbols_Quick_Reference.png) + +::: callout-note +If you need to produce a publishable E-R model such as the one above, Mermaid is a great option. Read more about how to use this tool to create diagrams here . +::: ## Merging Data -Frequently, analysis of data will require merging these separately managed tables back together. There are multiple ways to join the observations in two tables, based on how the rows of one table are merged with the rows of the other. +Frequently, analysis of data will require merging these separately managed tables back together. There are multiple ways to join the observations in two tables, based on how the rows of one table are merged with the rows of the other. Regardless of the join we will perform, we need to start by identifying the primary key in each table and how these appear as foreign keys in other tables. 
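The join types described next can all be run with `dplyr`'s join functions. As a preview, here is a sketch using miniature versions of the plants and sites tables (the values are made up for illustration, not taken from the full dataset):

```{r}
#| eval: false

library(dplyr)

plants <- data.frame(id = 1:4,
                     site = c(1, 1, 2, 2),
                     sp_code = c("DAPU", "DAMA", "DAMA", "DAPU"),
                     sp_height = c(9.8, 10.1, 11.3, 10.5))

sites <- data.frame(site = c(1, 2),
                    name = c("Taku", "Slough"),  # illustrative site names
                    altitude = c(12, 47))        # illustrative altitudes

inner_join(plants, sites, by = "site")  # only rows with a match in both tables
left_join(plants, sites, by = "site")   # all plants, plus matching site information
```

The sections below describe what each join type keeps from the two tables.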
+
+When conceptualizing merges, one can think of two tables, one on the *left* and one on the *right*.
+
+![](images/tidy-data-images/merging_data/join-diagrams-separate.png)
+
+### Inner Join
+An *INNER JOIN* is when you merge the subset of rows that have matches in both the left table and the right table.
+
+![](images/tidy-data-images/merging_data/join-diagrams-inner.png)
+
+### Left Join
+A *LEFT JOIN* takes all of the rows from the left table, and merges on the data from matching rows in the right table. Rows in the left table whose keys have no match in the right table are still kept, with missing values (`NA`) filled in from the right table.
+
+![](images/tidy-data-images/merging_data/join-diagrams-left.png)
+
+### Right Join
+A *RIGHT JOIN* is the same as a left join, except that all of the rows from the right table are included with matching data from the left, or a missing value. Notice that a left and a right join can produce the same result, depending on the positions of the tables.
+
+![](images/tidy-data-images/merging_data/join-diagrams-right.png)
+
+
+### Full Outer Join
+Finally, a *FULL OUTER JOIN* includes all data from all rows in both tables, and includes missing values wherever necessary.
+
+![](images/tidy-data-images/merging_data/join-diagrams-full.png)
+
+Sometimes people represent joins as Venn diagrams, showing which parts of the left and right tables are included in the results for each join. This representation is useful; however, it misses part of the story about where the missing values in each result come from.
+
+![Image source: R for Data Science, Wickham & Grolemund.](images/tidy-data-images/merging_data/join-venn.png)
+
+We suggest reading the Relational Data chapter in the "R for Data Science" book for more examples and best practices about joins.
+
+## Best Practices Summary
+This is a summary of what we have covered, and some extra advice!
+
+The **tidy data principles** are:
+
+1. Every column is a variable.
+2. Every row is an observation.
+3. Every cell is a single value.
+
+In **normalized data** we organize data so that:
+
+- We have separate tables for each type of entity measured
+- Observations (rows) are all unique
+- Each column represents either an identifying variable or a measured variable
+- Each table follows the tidy data principles
+
+Creating **relational data models** by assigning **primary and foreign keys** to each table allows us to maintain relationships between separate normalized tables.
+Choose the primary key for each table based on your understanding of the data and take efficiency into account. Once you choose a column as the primary key, make sure that all the values in that column are there!
-When conceptualizing merges, one can think of two tables, one on the **left** and one on the **right**. The most common (and often useful) join is when you merge the subset of rows that have matches in both the left table and the right table: this is called an **INNER JOIN**. Other types of join are possible as well. A **LEFT JOIN** takes all of the rows from the left table, and merges on the data from matching rows in the right table. Keys that don't match from the left table are still provided with a missing value (`NA`) from the right table. A **RIGHT JOIN** is the same, except that all of the rows from the right table are included with matching data from the left, or a missing value. Finally, a **FULL OUTER JOIN** includes all data from all rows in both tables, and includes missing values wherever necessary.
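One quick way to confirm that a chosen primary key column really is complete and unique, sketched here for a data frame named `sites` whose primary key is the `site` column:

```{r}
#| eval: false

# A usable primary key has no missing values and no duplicates
anyNA(sites$site)            # should be FALSE
any(duplicated(sites$site))  # should be FALSE
```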
+For a big relational database, an **Entity-Relationship model** can be an effective way to explain how different tables and their keys are related to each other. If we need to merge tables we can do it using different types of **joins**. -![](images/join-diagrams.png){width="80%" fig-align="center"} +## More on Data Management +Tidy data is one very important step to data management best practices. However there is more to consider. Here we provide some **extra advice** from a great paper called 'Some Simple Guidelines for Effective Data Management'. -Sometimes people represent these as Venn diagrams showing which parts of the left and right tables are included in the results for each join. These however, miss part of the story related to where the missing value come from in each result. +- Design tables to add rows, not columns +- Use a scripted program (like R!) +- Non-proprietary file formats are preferred (eg: csv, txt) +- Keep a raw version of data +- Use descriptive files and variable names (without spaces!) +- Include a header line in your tabular data files +- Use plain ASCII text -![](images/sql-joins.png){width="80%" fig-align="center"} +In the Cleaning & Wrangling chapter we will cover more best practices for cleaning irregular and missing data and how to implement them using R. -In the figure above, the blue regions show the set of rows that are included in the result. For the INNER JOIN, the rows returned are all rows in A that have a matching row in B. +## Activity +We will work on an in-person, offline activity to practice identifying tidy data, normalizing data, draing E-R models, and performing joins. The data tables we will use have been adapted from the following dataset for teaching purposes: -## Additonal Tidy Data Resources +Warren, P.S., A. Kinzig, C.A. Martin, and L. Machabee. 2021. Ecological and social Interactions in urban parks: bird surveys in local parks in the central Arizona-Phoenix metropolitan area ver 10. Environmental Data Initiative. https://doi.org/10.6073/pasta/f6f004bc7112ce266fde2b80fad19ff4 (Accessed 2023-06-28). -- [White et al. 2013. Nine simple ways to make it easier to (re)use your data. Ideas in Ecology and Evolution 6.](https://doi.org/10.4033/iee.2013.6b.6.f) -- [Software Carpentry SQL Tutorial](https://swcarpentry.github.io/sql-novice-survey/) -- [Tidy Data](http://vita.had.co.nz/papers/tidy-data.pdf) diff --git a/materials/sections/logic-modeling.qmd b/materials/sections/logic-modeling.qmd new file mode 100644 index 00000000..67e79b3c --- /dev/null +++ b/materials/sections/logic-modeling.qmd @@ -0,0 +1,148 @@ +## Learning Objectives {.unnumbered} + +- Provide an overview of Logic Models +- Apply the principles of Logic Models to synthesis development +- Refine synthesis group challenges + +## Logic Models + +Logic models are a planning tool that are designed to support program development by depicting the flow of resources and processes leading to a desired result. They are also used for outcomes-based evaluation of a program and are often requested as part of an evaluation planning process by funders or stakeholders. + +A simplified logic models comprise three main parts: Inputs, Outputs and Outcomes. + +![](images/LM1.png) + +Inputs reflect ***what is invested***, outputs are ***what is done*** and outcomes are the ***results of the program***. + +In a more detailed logic model, outputs and outcomes are further broken down. Outputs are often represented as 'Activities' and 'Participants'. 
By including participation (or participants), the logic model is explicitly considering the intended audience, or stakeholders, impacted by the program. Engagement of this audience is an output. In the case of outcomes, these can be split into short, medium and long-term outcomes. Sometimes this last category may be labeled 'Impact' + +![](images/LM2.png) + +Defining the inputs, outputs and outcomes early in a planning process enables teams to visualize the workflow from activity to results and can help mitigate potential challenges. Logic models can be thought of as having an 'if this then that' structure where inputs -\> outputs -\> outcomes. + +![](images/LM3.png) + +In the example below we have constructed a simple logic model for a hypothetical project where training materials are being developed for a group of educators to implement at their respective institutions. + +![](images/LM4.png) + +Linkages are not always sequential and can be within categories, bi-directional and/or include feedback loops. Detailing this complexity of relationships, or theory of action, can be time consuming but is a valuable part of the thought process for project planning. In exploring all relationships, logic modeling also allows for assessing program feasibility. + +![](images/LM5.png) + +The above graphics include two sections within Outputs - Activities and Participants - and this is quite common. There is variation in logic model templates, including versions with a third type of output - "Products'. Sometimes description of these products is contained within the Activities section - for example, 'develop curricula', 'produce a report' - however calling these out explicitly is beneficial for teams focused on product development. + +Program development (and logic modeling) occurs in response to a given 'Situation' or need, and exploring this is the first step in modeling. The situation defines the objective, or problem, that the program is designed to solve hence some logic models may omit the left-hand situation column but be framed with Problem and Solution statements. Finally, comprehensive logic modeling takes into consideration assumptions that are made with respect to the resources available, the people involved, or the way the program will work and also recognizes that there are external factors that can impact the program's success. + +![](images/LM6.png) + +In summary: + +Logic models support program development and evaluation and comprise three primary steps in the workflow: + +- **Inputs:** Resources, contributions, and investments required for a program; +- **Outputs:** Activities conducted, participants reached, and products produced; and +- **Outcomes:** Results or expected changes arising from the program structured as short-, medium- and long-term. + +## Logic models for synthesis development + +Logic models are one tool for program development and have sufficient flexibility for a variety of situations, including planning for a research collaboration. While some logic model categories may feel less relevant (can we scale up to a long-term outcome from a published synthesis?), the process of articulating the research objective, proposed outcome, associated resources and activities has value. Below are examples of questions that a typical logic model (LM) will ask, and how these might be reframed for a research collaboration (RC). + +**Objective/Problem Statement** + +LM: What is the problem? Why is this a problem? Who does this impact? + +RC: What is the current state of knowledge? 
What gaps exist in understanding? Why is more information / synthesis important?
+
+**Inputs**
+
+LM: What resources are needed for the program? Personnel, money, time, equipment, partnerships ..
+
+RC: What is needed to undertake the synthesis research? For personnel, think in terms of the roles that are needed - data manager, statistician, writer, editor etc. Consider the time frame. DATA - what data are needed and what already exists?
+
+**Outputs - Activities**
+
+LM: What will be done? Development, design, workshops, conferences, counseling, outreach..
+
+RC: What activities are needed to conduct the research? This could be high level or it could be broken down into details such as the types of statistical approaches.
+
+**Outputs - Participants**
+
+LM: Who will we reach? Clients, Participants, Customers..
+
+RC: Who is the target audience? Who will be impacted by this work? Who is positioned to leverage this work?
+
+**Outputs - Products**
+
+LM: What will you create? Publications, websites, media communications ...
+
+RC: What research products are planned / expected? Consider this in relation to the intended audience. Is a peer-reviewed publication, report or white paper most appropriate? How will derived data be handled? Will documentation, workflows, or code be published?
+
+**Short-term Outcomes**
+
+LM: What short-term outcomes are anticipated among participants? These can include changes in awareness, knowledge, skills, attitudes, opinions and intent.
+
+RC: Will this work represent a significant contribution to current understanding?
+
+**Medium-term Outcomes**
+
+LM: What medium-term outcomes are predicted among participants? These might include changes in behaviors, decision-making and actions.
+
+RC: Will this work promote increased research activity or open new avenues of inquiry?
+
+**Long-term Outcomes**
+
+LM: What long-term benefits, or impacts, are expected? Changes in social, economic, civic, and environmental conditions?
+
+RC: Will this work result in local, regional or national policy change? What will be the long-term impact of increased investment in the ecosystem?
+
+::: {.callout-tip title="Breakout: Synthesis planning with logic models"}
+Breakout groups will focus on refining ideas for synthesis topics using the logic modeling tools described in this section. The goal for this session is to develop one or more high-level logic models that:
+
+1. Summarize the synthesis challenge
+2. Define the inputs needed to approach the synthesis
+3. Define the outputs, including activities and products that would be needed to address the issue
+4. Define the short-term outcomes and longer-term impacts of the work
+
+Often it is helpful to start with a brainstorming activity to list activities and products that might be used to address the synthesis challenge, then connect those in terms of outcomes and impacts, and then circle back to the resource and data inputs needed to feed the logic model. Thinking of the whole model as a workflow can help conceptualize the dependencies among steps.
+ +Tools: + +- Powerpoint [logic model template](https://docs.google.com/presentation/d/1MBUy35QIAkA5FNCdRzWJM2aPaFXfUmGl/edit?usp=sharing&ouid=113328660159760492310&rtpof=true&sd=true) +- [Mermaid flowcharts](https://mermaid.js.org/syntax/flowchart.html) embedded in [Quarto documents](https://quarto.org/docs/authoring/diagrams.html#mermaid) + +```{mermaid} +flowchart LR + INPUTS --> ACTIVITIES --> OUTPUTS --> OUTCOMES/IMPACTS + + Scenario{{Accelerate synthesis via data science training}} + + R1[Instructor] & R2[Classroom space] & R3[Projector] --> B{Data Science Workshop} + B --> C(Workshop Curriculum) + B --> D(Presentations and Practice) + + C & D --> E[/Improved Delta management/] & F[/Increased analytic efficiency/] +``` + +Source + +```` mermaid +```{mermaid} +flowchart LR + INPUTS --> ACTIVITIES --> OUTPUTS --> OUTCOMES/IMPACTS + + Scenario{{Accelerate synthesis via data science training}} + + R1[Instructor] & R2[Classroom space] & R3[Projector] --> B{Data Science Workshop} + B --> C(Workshop Curriculum) + B --> D(Presentations and Practice) + + C & D --> E[/Improved Delta management/] & F[/Increased analytic efficiency/] +``` +```` +::: + +## Resources + +- [Logic model template](https://deltacouncil.sharepoint.com/:p:/r/sites/Extranet-Science/Shared%20Documents/NCEAS-DSP%20working%20group/2023/2023-06-week-1/logic_model_template.pptx?d=w32341d669685448d96b70e61e7e5e187&csf=1&web=1&e=2ojEmj) (ppt) on Sharepoint + - Same [Logic model template](https://docs.google.com/presentation/d/1MBUy35QIAkA5FNCdRzWJM2aPaFXfUmGl/edit?usp=sharing&ouid=113328660159760492310&rtpof=true&sd=true) on Google Drive diff --git a/materials/sections/programmatic-data-access.qmd b/materials/sections/programmatic-data-access.qmd new file mode 100644 index 00000000..3667da13 --- /dev/null +++ b/materials/sections/programmatic-data-access.qmd @@ -0,0 +1,284 @@ +## Learning Objectives {.unnumbered} + +- Best practices for reproducible data access +- Accessing data on the web and with `pins` +- How content identifiers differ from DOIs +- How content identifiers make research more reproducible +- Ways to register and resolve content identifiers for unpublished data +- How content identifiers can resolve to published data sources + +## Reproducible Data Access + +```{r cid-load-libs} +#| echo: false +#| warning: false +#| message: false + +library(readr) +library(pins) +library(contentid) +library(stringr) +library(dplyr) +library(solrium) +library(dataone) + +``` + + +## Barriers to data access + +Traditional ways of working with data -- as files on a file system -- limit the +reproducibility of code to local compute environments. A typical R analysis file will load one or many data files from the local disk with code like this: + +```{r, contentid-file-load} +#| eval: false + +delta_catch <- readr::read_csv('/Users/jkresearcher/Projects/2018/Delta_Analysis/delta_catch.csv') +delta_taxa <- readr::read_csv('../../Delta_2021/delta_taxa.csv') +delta_effort <- readr::read_csv('delta_effort.csv') +delta_sites <- readr::read_csv('data/delta_sites.csv') +``` + +Which of those file paths are the most portable? And which will run unmodified on both the original computer that they were written on, and on colleagues' computers? In reality, none of them, in that they require that a specific data file be present in a specific location for the code to work properly, and these assumptions are rarely met and hard to maintain. 
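If you do load data from the local disk, one way to make relative paths less fragile is to build them from the project root, for example with the `here` package (not used elsewhere in this lesson; a sketch, assuming the file lives in a `data/` folder of the project):

```{r}
#| eval: false

library(here)

# here() builds the path from the project root, so the same line works on any
# machine that has a copy of the project
delta_sites <- readr::read_csv(here("data", "delta_sites.csv"))
```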
In practice, the relative paths are reasonably robust, as long as the code and data are shipped together, such as in a GitHub repository. Hardcoded paths like these are often spread deeply through the scripts that researchers write, and can become a surprise when they are encountered during execution.
+
+## Web URLs for data access
+
+The Web partly solves this problem, because it allows code to access data that is located somewhere on the Internet with a web URI. For example, loading data from a web site can be much more portable than loading the equivalent data from a local computer.
+
+```{r, contentid-load-web}
+#| eval: true
+
+delta_sites_edi <- 'https://portal.edirepository.org/nis/dataviewer?packageid=edi.233.2&entityid=6a82451e84be1fe82c9821f30ffc2d7d'
+
+delta_sites <- readr::read_csv(delta_sites_edi, show_col_types = FALSE)
+
+head(delta_sites)
+
+```
+
+In theory, that code will work from anyone's computer with an internet connection. But code that downloads data each and every time it is run is not particularly efficient, and will be prohibitive for all but the smallest datasets. A simple solution to this issue is to cache a local copy of the dataset, and only retrieve the original from the web when we don't have a local copy. In this way, people running code or a script will download the data the first time their code is run, but use a local copy from then on. While this can be accomplished with some simple conditional logic in R, the pattern has been simplified using the `pins` package:
+
+```{r, contentid-pins}
+#| eval: true
+
+delta_sites_pin <- pins::pin(delta_sites_edi)
+
+delta_sites <- readr::read_csv(delta_sites_pin, show_col_types = FALSE)
+head(delta_sites)
+```
+
+You'll note that this code takes longer the first time it is run, as the data file is downloaded only the first time. While this works well over the short term, abundant evidence shows that web URIs have a short lifespan. Most URIs are defunct within a few years (e.g., see [McCown et al. 2005](http://arxiv.org/abs/cs/0511077)). Only the most carefully curated web sites maintain the viability of their links for longer. And maintaining them for decade-long periods requires a focus on archival principles and dedicated staff to ensure that files and the URLs at which they are published remain accessible. This is precisely the role of archival data repositories like the [Arctic Data Center](https://arcticdata.io), the [KNB Data Repository](https://knb.ecoinformatics.org), and the [Environmental Data Initiative (EDI)](https://portal.edirepository.org).
+
+## DOIs and data access
+
+Finally, no discussion of data access and persistence would be complete without discussing the use of [Digital Object Identifiers (DOIs)](https://datacite.org). DOIs have become the dominant means to create persistent links to academic articles, publications, and datasets. As **authority-based** identifiers, they work when an authority assigns a DOI *name* to a published work, and then ensures that the DOI name always redirects to the current web location of the resource. This is a lot of work, and there are no guarantees that the authorities will keep the links up-to-date. Journals, societies, and data repositories actively maintain the redirection between a DOI such as [doi:10.6073/pasta/b0b15aef7f3b52d2c5adc10004c05a6f](https://doi.org/10.6073/pasta/b0b15aef7f3b52d2c5adc10004c05a6f) and its current location on the EDI Repository.
DOIs are commonly assigned to published datasets, and include the bibliographic metadata needed to properly cite and access the dataset. + +The challenge with DOIs as they are typically implemented is that they are usually assigned to a [`Dataset`](https://www.w3.org/TR/vocab-dcat-3/#Class:Dataset), which is a collection of digital objects that are composed to form the whole Dataset and that can be accessed individually or through an API. Typically, the metadata attached to DOIs does not include an enumeration of those digital objects or a clear mechanism to get to the actual data -- rather, the DOI redirects to a dataset landing page that provides a human readable summary of the dataset, and often various types of links to find and eventually download the data. Despite advances in metadata interoperability from [DCAT](https://www.w3.org/TR/vocab-dcat-3/#Class:Dataset) and [schema.org/Dataset](https://schema.org/Dataset), there is currently no reliable way to universally go from a known DOI for a dataset to the list of current locations of all of the digital objects that compose that dataset. And yet, this is exactly what we need for portable and persistent data access. In addition, we frequently work with data that doesn't have a DOI yet as we are creating derived data products for analysis locally before they are published. In conclusion, DOIs are a great approach to uniquely citing a dataset, but they do not provde a way for code to download specific, versioned digital objects from a dataset in a portable way that is persistent over many years. + +Thus, we want data access to be: + +- **Portable** -- works for anyone, even if they don't already have the data +- **Persistent** -- over long time periods +- **Versioned** -- the specific version of data used is guaranteed to be returned +- **Traceable** -- references to the provenance of data processing can be made +- **Transparent** -- it is clear from the script what data were used +- **Citable** -- it is clear how to properly cite the associated Dataset for attribution + +## Content-based identifiers + +A powerful approach to solving these problems is by using **content-based** identifiers, rather than authority-based identifiers like DOIs. A content-based identifier, or `contentid` for short, can be calculated from the content in a data file itself, and is unique (within constraints) to that content. This is accomplished by using a "hash" function, which calculates a relatively short, fixed-length, and unique value for any given input. Hash functions form the basis of secure cryptography for secure messaging, and so there are many tools available for conveniently hashing data inputs. In our use case, we can use commonly available cryptographic hash functions (such as `SHA-256` and `SHA-1`) to calculate a unique identifier for any given file. This gives us a unique identifier for the file which can be calculated by anyone with a copy of the file, and which can be registered as metadata in repositories that hold those files. + +![](images/contentid-hashes.png) + +Once we have a content identifier for an object, we can cache the file locally (just like we did with `pins`), and we can query repositories to see if they contain a copy of that file. Unlike authority-based identifiers, anyone who possesses a copy of a specific version of a data file can calculate the content-identifier for it, enabling us to build systems to find and access those data files across the repository landscape, and really across any web-accessible location. 
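As a concrete illustration, the identifier for any file can be computed locally with `contentid::content_id()`, with no registration or network access involved (the file path below is hypothetical):

```{r}
#| eval: false

library(contentid)

# The identifier depends only on the bytes in the file, so anyone with an
# identical copy computes exactly the same value,
# something like "hash://sha256/<64 hexadecimal characters>"
content_id("data/delta_sites.csv")
```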
This has all of the power of cacheing and pinning web resources that we demonstrated before, but has the advantage that all holders of the content will use an identical identifier, avoiding broken links. And because content-identifiers can be determined locally before files are published on the web, we can use them in our scripts for data files that have yet to be published and yet know that they will work for others once the files have been published in a repository. + +![](images/contentid-registries.png) + +## Persistent and portable data access for improving reproducibility + +We'll be working with the following IEP dataset that is stored on EDI: + +> Interagency Ecological Program (IEP), B. Schreier, B. Davis, and N. Ikemiyagi. 2019. Interagency Ecological Program: Fish catch and water quality data from the Sacramento River floodplain and tidal slough, collected by the Yolo Bypass Fish Monitoring Program, 1998-2018. ver 2. Environmental Data Initiative. https://doi.org/10.6073/pasta/b0b15aef7f3b52d2c5adc10004c05a6f (Accessed 2021-10-30). + +You can [view this IEP dataset on DataONE](https://search.dataone.org/view/https%3A%2F%2Fpasta.lternet.edu%2Fpackage%2Fmetadata%2Feml%2Fedi%2F233%2F2): + +![](images/contentid-dataset.png) +It also is visible from the [EDI dataset landing page](https://portal.edirepository.org/nis/mapbrowse?packageid=edi.233.2): + +![](images/contentid-edi-landing-page.png) + +It contains several data files, each of which is at a specific web URI, including: + +- Fish catch and water quality +- Fish taxonomy +- Trap Effort +- Site locations + +Here are the URLs for these 4 data files as registered on DataONE: + +```{r, contentid-dataone-uris} +#| echo: true + +delta_catch_url <- "https://cn.dataone.org/cn/v2/resolve/https%3A%2F%2Fpasta.lternet.edu%2Fpackage%2Fdata%2Feml%2Fedi%2F233%2F2%2F015e494911cf35c90089ced5a3127334" +delta_taxa_url <- "https://cn.dataone.org/cn/v2/resolve/https%3A%2F%2Fpasta.lternet.edu%2Fpackage%2Fdata%2Feml%2Fedi%2F233%2F2%2F0532048e856d4bd07deea11583b893dd" +delta_effort_url <- "https://cn.dataone.org/cn/v2/resolve/https%3A%2F%2Fpasta.lternet.edu%2Fpackage%2Fdata%2Feml%2Fedi%2F233%2F2%2Face1ef25f940866865d24109b7250955" +delta_sites_url <- "https://cn.dataone.org/cn/v2/resolve/https%3A%2F%2Fpasta.lternet.edu%2Fpackage%2Fdata%2Feml%2Fedi%2F233%2F2%2F6a82451e84be1fe82c9821f30ffc2d7d" + +``` + +And here are the URLS of these four data files as presented on EDI: + +```{r, contentid-edi-uris} +#| echo: true + +delta_catch_edi <- 'https://portal.edirepository.org/nis/dataviewer?packageid=edi.233.2&entityid=015e494911cf35c90089ced5a3127334' +delta_taxa_edi <- 'https://portal.edirepository.org/nis/dataviewer?packageid=edi.233.2&entityid=0532048e856d4bd07deea11583b893dd' +delta_effort_edi <- 'https://portal.edirepository.org/nis/dataviewer?packageid=edi.233.2&entityid=ace1ef25f940866865d24109b7250955' +delta_sites_edi <- 'https://portal.edirepository.org/nis/dataviewer?packageid=edi.233.2&entityid=6a82451e84be1fe82c9821f30ffc2d7d' +``` + +So, these data files are available from two different web locations. Do they produce the same data? Are the files identical? The `contentid` package can answer those questions for us. + +## Storing a content identifier from a URI + +Let's use the `contentid` package for portable access to data. First, using a web URI, store the content identifier in your local content registry to cache it on your machine. 
The `contentid::store()` function retrieves the data from the URL, calculates a hash value for the content, and stores both in a local registry on your machine. This is very similar to the `pins::pin` function, but it uses the content identifier to point to the data. + +```{r, contentid_store} +#| eval: true + +delta_catch_id <- store(delta_catch_url) +delta_taxa_id <- store(delta_taxa_url) +delta_effort_id <- store(delta_effort_url) +delta_sites_id <- store(delta_sites_url) + +print(c(delta_catch_id=delta_catch_id, + delta_taxa_id=delta_taxa_id, + delta_effort_id=delta_effort_id, + delta_sites_id=delta_sites_id)) +``` + +## Loading data from a content identifier + +Once you have the content identifier for a data file of interest (e.g., `delta_catch_id` in this case), you can call `contentid::resolve()` to find the locations where that data is stored. Because you already have it stored locally, it returns the file path to the file on your local registry, which you can then use to load the data into a data frame or process the data as needed. + +```{r, contentid-resolve} +#| warning: false + +delta_catch_file <- contentid::resolve(delta_catch_id, store = TRUE) +delta_catch <- readr::read_csv(delta_catch_file, show_col_types=FALSE) +head(delta_catch) + +# And two more examples +delta_taxa_file <- contentid::resolve(delta_taxa_id, store = TRUE) +delta_taxa <- readr::read_csv(delta_taxa_file, show_col_types=FALSE) + +delta_sites_file <- contentid::resolve(delta_sites_id, store = TRUE) +delta_sites <- readr::read_csv(delta_sites_file, show_col_types = FALSE) +``` + +This approach is **portable**, as anyone can run it without having the data local beforehand. This is because `resolve(id)` will store the data locally if someone does not already have a copy of the data in their local cache. This works by consulting a number of well-know registries to discover the location of the files, including [DataONE](https://dataone.org), [Hash Archive](https://hash-archive.cboettig.info), [Zenodo](https://zenodo.org), and [Software Heritage](https://software-heritage.org). + +This approach is **persistent**, because it pulls data from these persistent archives, and can take advantage of archive redundancy to locate a file even if it has moved to a new URL location (assuming that new location has been registered). + +This approach is **reproducible**, as the exact version of the data will be used every time (even if someone changes the data at the original web URI, which would require a new content identifier). + +This approach is **traceable** because there is a reference in the code to the specific data used based on in its content identifier, and the only way to change which data are used is to change the `checksum` that is being referenced to a new version. + +## Storing and using local data identifiers + +Because not all data are already published, it is also helpful to being working with content identifiers before the data are made public on the web. This is easily accomplished by storing a file in the local registry, and then using its content identifier during analysis. 
+ +```{r, register_local} + +# Store a local file +vostok_co2 <- system.file("extdata", "vostok.icecore.co2", package = "contentid") +vostok_id <- store(vostok_co2) +vostok <- contentid::resolve(vostok_id, store=TRUE) +co2 <- read.table(vostok, col.names = c("depth", "age_ice", "age_air", "co2"), skip = 21) +head(co2) +``` + +From this point forward, we can view the repository sources that have a copy of the data associated with this content identifier: + + +```{r show_sources} +contentid::sources(vostok_id) +``` + +Later, when the data file is published to a DataONE repository, or Zenodo or other supported repositories, the script will work for other people trying to access it via `contentid::resolve()`. + +## Citing content identifiers + +Although content identifiers are excellent from a reproducibility perspective, they are unfortunately not directly linked to DOIs that are commonly used for citing data. DOIs are the current standard for citing data, and carry the citation metadata for data packages (such as author, title, publication year, etc.). But the `contentid` package currently lacks a mechanism to determine the citation for a file that is used in a script. Fortunately. because data networks like [DataONE](https://dataone.org) maintain the association between each content-identifier that is registered there with the assigned DOI for the Dataset that the object is part of, retrieve the citation for a given content identifier from the DataONE network. + +```{r, contentid_get_citation} +#| echo: false +#| eval: true + +# Function to return the text of a citation for a given contentid +lookup_citation <- function(contentid) { + + # Set up for SOLR queries + solr <- SolrClient$new(host = "cn.dataone.org", path = "/cn/v2/query/solr/", scheme = "https", port=443) + + # Use query_sources to determine if a copy is on DataONE or amenable repository (based on the URL pattern) + d1_locations <- contentid::sources(contentid, cols=c("identifier", "source", "date", "status", "sha1", "sha256", "md5")) %>% + filter(grepl('cn.dataone.org|v2/object|v2/resolve', source)) + + if (nrow(d1_locations) > 0) { + # Query that network API to determine which Datasets the content identifier is associated with + # If more than one is found, reduce the list to the most recent version of each Dataset + # (eliminating duplicate older versions, in favor of citing the most recent) + + # Look up the metadata for this object, including which metadata documents describe it in DataONE + pids <- d1_locations$source %>% basename() %>% unique() + subquery_pids <- stringr::str_replace_all(stringr::str_flatten(pids, collapse=" OR "), ":", "\\\\:") + subquery <- paste0("id:(", subquery_pids, ") AND -obsoletedBy:*") + fields <- 'identifier,checksum,checksumAlgorithm,datasource,isDocumentedBy,resourceMap' + metadata <- solr$search(params = list(q=subquery, rows=100, fl=fields)) + + # Retrieve the bibliographic metadata for each of those datasets by searching on the PID for each + documented_by <- stringr::str_split(metadata$isDocumentedBy, ",") + subquery_ids <- stringr::str_replace_all(stringr::str_flatten(documented_by[[1]], collapse=" OR "), ":", "\\\\:") + subquery <- paste0("id:(", subquery_ids, ")") + fields <- paste(sep=",", 'origin,identifier,formatId,checksum,checksumAlgorithm,title,datePublished,pubDate', + 'datasource,obsoletes, obsoletedBy, isDocumentedBy,resourceMap') + datasets <- solr$search(params = list(q=subquery, rows=100, fl=fields)) + + # Determine the repository name and URI + library(dataone) + repos <- 
listNodes(CNode()) + repo_list <- repos[sapply(repos, function(repo) { repo@identifier==datasets$datasource[[1]]}, simplify = TRUE ) ] + + # Return a list of citations, one for each matching Dataset (possibly providing different formats + # for returning the citation information (text string, bibtex, CSL-formatted string)) + # Currently this only returns the first result as a POC, needs work + citation_text <- paste0(datasets$origin[[1]], ". ", format(as.Date(datasets$pubDate[[1]]), "%Y"), ". ", + datasets$title[[1]], ". ", repo_list[[1]]@name, ". ", datasets$identifier[[1]], " ", + paste0("https://search.dataone.org/view/", datasets$identifier[[1]])) + return(citation_text) + } else { + return(list()) + } +} + +``` + +For example, if the data is on DataONE, we could write a function to lookup the citation: + +```{r, contentid-lookup-citation} +cite_string <- lookup_citation(vostok_id) +``` + +And this can even be displayed inline in the markdown text. For example, this chapter used data from: + +> `r cite_string` + +## Challenges and future work + +The `contentid` package is a work in progress. One challenge is that content identifiers are **opaque**, and not particularly transparent. For many researchers, seeing the long hash value in a script will not be very meaningful. In addition, the `contentid` package needs mechanisms to transparently indicate what a content identifier refers to. Again, we have this information available dynamically through the metadata collated in networks like [DataONE](https://dataone.org). A useful extension to `contentid` would be to provide functions for displaying detailed metadata about a content identifier when it is available, as well as a mechanism to provide more human-readable aliases for the identifier. + +Another major challenge is that most repositories don't register content identifiers with registries that support search and discovery. So, use of these identifiers can be challenging while we build a critical mass of repositories that provide seamless access to their data via content identifiers. + +These types of ideas for extension are how the R ecosystem advances. Anyone who sees a better way to extend packages like `contentid` can do so, or even create their own packages to explore new approaches to reproducible data access. + diff --git a/materials/sections/provenance-reproducibility-datapaper.qmd b/materials/sections/provenance-reproducibility-datapaper.qmd index c18fcc35..fe3404d2 100644 --- a/materials/sections/provenance-reproducibility-datapaper.qmd +++ b/materials/sections/provenance-reproducibility-datapaper.qmd @@ -1,7 +1,5 @@ ## Learning Objectives {.unnumbered} -In this lesson, we will: - - Discuss the concept of reproducible workflows including computational reproducibility and provenance metadata - Learn how to use R to package your work by building a reproducible paper in RMarkdown - Introduce tools and techniques for reproducibility supported by the NCEAS and DataONE @@ -123,6 +121,7 @@ Research compendium makes it easy for researchers to do their work but also for - [R](https://www.r-project.org/) - [RMarkdown](https://rmarkdown.rstudio.com/) +- [Quarto](https://quarto.org/) - [git](https://git-scm.com/) and [GitHub](https://github.com) Fortunately for us, Ben Marwick (and others) have written an R package called [rrtools](https://github.com/benmarwick/rrtools) that helps us create a research compendium from scratch. 
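As a preview of the workflow covered in the rest of this section, the basic `rrtools` setup calls look roughly like the sketch below (the compendium name `mypaper` is just an example):

```{r}
#| eval: false

# install rrtools from GitHub if needed
# remotes::install_github("benmarwick/rrtools")

rrtools::use_compendium("mypaper")  # create the compendium as an R package skeleton
rrtools::use_analysis()             # add analysis/ with paper/, data/, and figures/ folders
rrtools::use_dockerfile()           # optionally add a Dockerfile (discussed below)
```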
@@ -249,7 +248,7 @@ Things we can do with our research compendium: - Write out any figures in `./analysis/figures` -You can then write all of your R code in your RMarkdown, and generate your manuscript all in the format needed for your journal (using it's .csl file, stored in the paper directory). +You can then write all of your R code in your RMarkdown/Quarto, and generate your manuscript all in the format needed for your journal (using it's .csl file, stored in the paper directory). @@ -285,7 +284,7 @@ You can then write all of your R code in your RMarkdown, and generate your manus **In practice** -- Once you have your research compendium, you can called `rrtools::use_dockerfile()` +- Once you have your research compendium, you can called `rrtools::use_dockerfile()`. If needed, re-install `rrtools` directly from GitHub `remotes::install_github("benmarwick/rrtools")` - This, first creates a Dockerfile that loads a standard image for using R with the tidyverse, @@ -293,17 +292,17 @@ You can then write all of your R code in your RMarkdown, and generate your manus - If we look at the Dockerfile (example below), it calls to `renv::restore()`, as described above. -- The last line of the docker file renders our RMarkdown reproducible paper! +- The last line of the docker file renders our Quarto/RMarkdown reproducible paper! ``` # get the base image, the rocker/verse has R, RStudio and pandoc -FROM rocker/verse:4.1.0 +FROM rocker/verse:4.2.2 # required -MAINTAINER Jeanette Clark +MAINTAINER Your Name -COPY . /mypaper +COPY . / # go into the repo directory RUN . /etc/environment \ @@ -313,11 +312,12 @@ RUN . /etc/environment \ && sudo apt-get install libudunits2-dev -y \ # build this compendium package && R -e "install.packages('remotes', repos = c(CRAN = 'https://cloud.r-project.org'))" \ - && R -e "install.packages(c('renv', 'rmarkdown'))" \ + && R -e "remotes::install_github(c('rstudio/renv', 'quarto-dev/quarto-r'))" \ # install pkgs we need && R -e "renv::restore()" \ - # render the manuscript into a pdf, - && R -e "rmarkdown::render('mypaper/analysis/paper/paper.Rmd')" + # render the manuscript into a docx, you'll need to edit this if you've + # customised the location and name of your main qmd file + && R -e "quarto::quarto_render('//analysis/paper/paper.qmd')" ``` - After running `rrtools::use_dockerfile()`, the package also sets up GitHub Actions for you. @@ -349,14 +349,9 @@ By combining data, code and the compute environment, tales allow researchers to: - Achieve computational reproducibility - “Set the default to reproducible.” -They also empower users to verify and extend results with different data, methods, and environments. You can browse existing tales, run and interact with published tales and create new tales via the [Whole Tale Dashboard](https://dashboard.wholetale.org/browse). - -![](images/Tale-browser.png) - -By integrating with DataONE and Dataverse, Whole Tale includes over 90 major research repositories from which a user can select datasets to make those datasets the starting point of an interactive data exploration and analysis inside of one of the Whole Tale environments. DataONE, is adding functionality to work with data in the Whole Tale environment directly from the dataset landing page. 
-**Full circle reproducibility can be achieved by publishing data, code AND the environment.** +**Full circle reproducibility can be achieved by publishing data, code AND the computational environment.** ### Resources diff --git a/materials/sections/r-creating-functions.qmd b/materials/sections/r-creating-functions.qmd new file mode 100644 index 00000000..1a819065 --- /dev/null +++ b/materials/sections/r-creating-functions.qmd @@ -0,0 +1,222 @@ +## Learning Objectives {.unnumbered} + +- Learn why we should write code in small functions +- Write code for one or more functions +- Document functions to improve understanding and code communication + +## Creating R Functions + +Many people write R code as a single, continuous stream of commands, often drawn from the R Console itself and simply pasted into a script. While any script brings benefits over non-scripted solutions, there are advantages to breaking code into small, reusable modules. This is the role of a `function` in R. In this lesson, we will review the advantages of coding with functions, practice by creating some functions and show how to call them, and then do some exercises to build other simple functions. + +```{r} +#| label: load-libs-funcs +#| message: false +#| warning: false +#| echo: false + +library(DT) +``` + +### Why functions? + +In a word: + +- DRY: Don't Repeat Yourself + +By creating small functions that only one logical task and do it well, we quickly gain: + +- Improved understanding +- Reuse via decomposing tasks into bite-sized chunks +- Improved error testing + +#### Temperature conversion {.unnumbered} + +Imagine you have a bunch of data measured in Fahrenheit and you want to convert that for analytical purposes to Celsius. You might have an R script that does this for you. + +```{r} +#| label: temp-test-data + +airtemps <- c(212, 30.3, 78, 32) +celsius1 <- (airtemps[1]-32)*5/9 +celsius2 <- (airtemps[2]-32)*5/9 +celsius3 <- (airtemps[3]-32)*5/9 +``` + +Note the duplicated code, where the same formula is repeated three times. This code would be both more compact and more reliable if we didn't repeat ourselves. + +#### Creating a function {.unnumbered} + +Functions in R are a mechanism to process some input and return a value. Similarly to other variables, functions can be assigned to a variable so that they can be used throughout code by reference. To create a function in R, you use the `function` function (so meta!) and assign its result to a variable. Let's create a function that calculates celsius temperature outputs from fahrenheit temperature inputs. + +```{r} +#| label: f2c-function + +fahr_to_celsius <- function(fahr) { + celsius <- (fahr-32)*5/9 + return(celsius) +} +``` + +By running this code, we have created a function and stored it in R's global environment. The `fahr` argument to the `function` function indicates that the function we are creating takes a single parameter (the temperature in fahrenheit), and the `return` statement indicates that the function should return the value in the `celsius` variable that was calculated inside the function. Let's use it, and check if we got the same value as before: + +```{r} +#| label: demo-f2c-function + +celsius4 <- fahr_to_celsius(airtemps[1]) +celsius4 +celsius1 == celsius4 +``` + +Excellent. So now we have a conversion function we can use. 
Note that, because most operations in R can take multiple types as inputs, we can also pass the original vector of `airtemps`, and calculate all of the results at once: + +```{r} +#| label: f2c-function-vector + +celsius <- fahr_to_celsius(airtemps) +celsius +``` + +This takes a vector of temperatures in fahrenheit, and returns a vector of temperatures in celsius. + +#### Challenge {.unnumbered .exercise} + +Now, create a function named `celsius_to_fahr` that does the reverse: it takes temperature data in celsius as input, and returns the data converted to fahrenheit. Then use that function to convert the `celsius` vector back into a vector of fahrenheit values, and compare it to the original `airtemps` vector to ensure that your answers are correct. Hint: the formula for C to F conversions is `celsius*9/5 + 32`. + +```{r} +#| label: func-exercise-1 +#| echo: true +#| eval: false + +# Your code goes here +``` + +Did you encounter any issues with rounding or precision? + +::: {.callout-note collapse=true} + +#### Solution, but don't peek! {.unnumbered} + +Don't peek until you write your own... + +```{r} +#| label: f2c-func-solution + +# Your code goes here +celsius_to_fahr <- function(celsius) { + fahr <- celsius*9/5 + 32 + return(fahr) +} + +result <- celsius_to_fahr(celsius) +airtemps == result +``` + +::: + + +### Documenting R functions + +Functions need documentation so that we can communicate what they do, and why. The `roxygen2` package provides a simple means to document your functions so that you can explain what the function does, the assumptions about the input values, a description of the value that is returned, and the rationale for decisions made about implementation. + +Documentation in roxygen2 is placed immediately before the function definition, and is indicated by a special comment line that always starts with the characters `#'`. Here's a documented version of a function: + +```{r} +#| label: f2c-func-docs + +#' Convert temperature data from Fahrenheit to Celsius +#' +#' @param fahr Temperature data in degrees Fahrenheit to be converted +#' @return temperature value in degrees Celsius +#' @keywords conversion +#' @export +#' @examples +#' fahr_to_celsius(32) +#' fahr_to_celsius(c(32, 212, 72)) +fahr_to_celsius <- function(fahr) { + celsius <- (fahr-32)*5/9 + return(celsius) +} +``` + +Note the use of the `@param` keyword to define the expectations of input data, and the `@return` keyword for defining the value that is returned from the function. The `@examples` keyword is useful as a reminder of how to use the function. Finally, the `@export` keyword indicates that, if this function were added to a package, then the function should be available to other code and packages to utilize. + +### Summary + +- Functions are useful to reduce redundancy, reuse code, and reduce errors +- Build functions with the `function` function +- Document functions with `roxygen2` comments + +### Examples: Minimizing work with functions + +Functions can of course be as simple or complex as needed. They can be very effective in repeatedly performing calculations, or for bundling a group of commands that are used on many different input data sources. For example, we might create a simple function that takes fahrenheit temperatures as input, and calculates both celsius and Kelvin temperatures. All three values are then returned in a list, making it very easy to create a comparison table among the three scales.
+ +```{r} +#| label: func-return-list + +convert_temps <- function(fahr) { + celsius <- (fahr-32)*5/9 + kelvin <- celsius + 273.15 + return(list(fahr=fahr, celsius=celsius, kelvin=kelvin)) +} + +temps_df <- data.frame(convert_temps(seq(-100,100,10))) +``` + +```{r} +#| label: list-return-table +#| echo: false + +datatable(temps_df) +``` + +Once we have a dataset like that, we might want to plot it. One thing that we do repeatedly is set a consistent set of display elements for creating graphs and plots. By using a function to create a custom `ggplot` theme, we can keep our plot formatting consistent while leaving key parts of it flexible. For example, in the `custom_theme` function, we provide a `base_size` argument that defaults to using a font size of 9 points. Because it has a default set, it can safely be omitted. But if it is provided, then that value is used to set the base font size for the plot. + +```{r} +#| label: func-custom-theme + +custom_theme <- function(base_size = 9) { + ggplot2::theme( + text = ggplot2::element_text(family = 'Helvetica', color = 'gray30', size = base_size), + plot.title = ggplot2::element_text(size = ggplot2::rel(1.25), hjust = 0.5, face = 'bold'), + panel.background = ggplot2::element_blank(), + panel.border = ggplot2::element_blank(), + panel.grid.minor = ggplot2::element_blank(), + panel.grid.major = ggplot2::element_line(colour = 'grey90', linewidth = .25), + legend.position = 'right', + legend.key = ggplot2::element_rect(colour = NA, fill = NA), + axis.ticks = ggplot2::element_blank(), + axis.line = ggplot2::element_blank() + ) +} + +library(ggplot2) + +ggplot(temps_df, mapping=aes(x=fahr, y=celsius, color=kelvin)) + + geom_point() + + custom_theme(10) + +``` + +In this case, we set the font size to 10, and plotted the air temperatures. The `custom_theme` function can be used anywhere that one needs to consistently format a plot. + +But we can go further. One can wrap the entire call to ggplot in a function, enabling one to create many plots of the same type with a consistent structure. For example, we can create a `scatterplot` function that takes a data frame as input, along with a point_size for the points on the plot, and a font_size for the text. + +```{r} +#| label: func-plot-custom + +scatterplot <- function(df, point_size = 2, font_size=9) { + ggplot(df, mapping=aes(x=fahr, y=celsius, color=kelvin)) + + geom_point(size=point_size) + + custom_theme(font_size) +} +``` + +Calling that lets us, in a single line of code, create a highly customized plot while maintaining flexibility via the arguments passed in to the function. Let's set the point size to 3 and font to 16 to make the plot more legible. + +```{r} +#| label: func-call-sp + +scatterplot(temps_df, point_size=3, font_size = 16) +``` + +Once these functions are set up, all of the plots built with them can be reformatted by changing the settings in just the functions, whether they were used to create 1, 10, or 100 plots. diff --git a/materials/sections/r-creating-packages.qmd b/materials/sections/r-creating-packages.qmd new file mode 100644 index 00000000..ed39222c --- /dev/null +++ b/materials/sections/r-creating-packages.qmd @@ -0,0 +1,329 @@ + +## Learning Objectives + +In this lesson, you will learn: + +- The advantages of using R packages for organizing code +- Simple techniques for creating R packages +- Approaches to documenting code in packages + +## Why packages? + +Most R users are familiar with loading and utilizing packages in their work.
And they know how rich CRAN is in providing packages for many conceivable needs. Most people have never created a package for their own work, and most think the process is too complicated. Really it's pretty straightforward and super useful in your personal work. Creating packages serves two main use cases: + +- Mechanism to redistribute reusable code (even if just for yourself) +- Mechanism to reproducibly document analysis and models and their results + +Even if you don't plan on writing a package with such broad appeal as, say, `ggplot2` or `dplyr`, you still might consider creating a package to contain: + +- Useful utility functions you write, i.e., a [Personal Package](https://hilaryparker.com/2013/04/03/personal-r-packages/). Having a place to put these functions makes it much easier to find and use them later. +- A set of shared routines for your lab or research group, making it easier to remain consistent within your team and also to save time. +- The analysis accompanying a thesis or manuscript, making it all that much easier for others to reproduce your results. + +The `usethis`, `devtools` and `roxygen2` packages make creating and maintaining a package a straightforward experience. + +## Install and load packages + +```{r} +#| eval: false + +library(devtools) +library(usethis) +library(roxygen2) +``` + +## Create a basic package + +Thanks to the great [usethis](https://github.com/r-lib/usethis) package, it only takes one function call to create the skeleton of an R package using `create_package()`, which eliminates pretty much all reasons for procrastination. To create a package called +`mytools`, all you do is: + +```{r, eval=FALSE} +#| eval: false + +usethis::create_package("~/mytools") +``` + + ✔ Setting active project to '/Users/jones/development/mytools' + ✔ Creating 'R/' + ✔ Creating 'man/' + ✔ Writing 'DESCRIPTION' + ✔ Writing 'NAMESPACE' + ✔ Writing 'mytools.Rproj' + ✔ Adding '.Rproj.user' to '.gitignore' + ✔ Adding '^mytools\\.Rproj$', '^\\.Rproj\\.user$' to '.Rbuildignore' + ✔ Opening new project 'mytools' in RStudio + +Note that this will open a new project (`mytools`) and a new session in RStudio server. + +The `create_package` function created a top-level directory structure, including a number of critical files under the [standard R package structure](http://cran.r-project.org/doc/manuals/r-release/R-exts.html#Package-structure). The most important of these is the `DESCRIPTION` file, which provides metadata about your package. Edit the `DESCRIPTION` file to provide reasonable values for each of the fields, +including your own contact information. + +Information about choosing a LICENSE is provided in the [Extending R](http://cran.r-project.org/doc/manuals/r-release/R-exts.html#Licensing) documentation.
+The DESCRIPTION file expects the license to be chosen from a predefined list, but +you can use the various `usethis` utility functions for setting a specific license file, such +as the `MIT` license or the `Apache 2` license: + +```{r} +#| eval: false + +usethis::use_apache_license() + +``` + + ✔ Setting License field in DESCRIPTION to 'Apache License (>= 2.0)' + ✔ Writing 'LICENSE.md' + ✔ Adding '^LICENSE\\.md$' to '.Rbuildignore' + +Once your license has been chosen, and you've edited your DESCRIPTION file with your contact information, a title, and a description, it will look like this: + +```{r, eval=FALSE} +#| eval: false + +Package: mytools +Title: Utility Functions Created by Matt Jones +Version: 0.1 +Authors@R: "Matthew Jones [aut, cre]" +Description: Package mytools contains a suite of utility functions useful whenever I need stuff to get done. +Depends: R (>= 3.5.0) +License: Apache License (>= 2.0) +LazyData: true +``` + + +## Add your code + +The skeleton package created contains a directory `R` which should contain your source files. Add your functions and classes in files to this directory, attempting to choose names that don't conflict with existing packages. For example, you might add a file `custom_theme.R` that contains a function `custom_theme()` that you might want to reuse. The `usethis::use_r()` function will help set up your files in the right places. For example, running: + +```{r eval=FALSE} +#| eval: false + +usethis::use_r("custom_theme") +``` + + ● Modify 'R/custom_theme.R' + +creates the file `R/custom_theme.R`, which you can then modify to add the implementation of the following function from the functions lesson: + +```{r} +#| eval: false + +custom_theme <- function(base_size = 9) { + ggplot2::theme( + axis.ticks = ggplot2::element_blank(), + text = ggplot2::element_text(family = 'Helvetica', color = 'gray30', size = base_size), + plot.title = ggplot2::element_text(size = ggplot2::rel(1.25), hjust = 0.5, face = 'bold'), + panel.background = ggplot2::element_blank(), + legend.position = 'right', + panel.border = ggplot2::element_blank(), + panel.grid.minor = ggplot2::element_blank(), + panel.grid.major = ggplot2::element_line(colour = 'grey90', linewidth = .25), + legend.key = ggplot2::element_rect(colour = NA, fill = NA), + axis.line = ggplot2::element_blank() + ) +} + +``` + +If your R code depends on functions from another package, then you must declare it +in the `Imports` list in the `DESCRIPTION` file for your package. In our example +above, we depend on the `ggplot2` package, and so we need to list it as a dependency. +Once again, `usethis` provides a handy helper method: + +```{r} +#| eval: false + +usethis::use_package("ggplot2") +``` + + ✔ Adding 'ggplot2' to Imports field in DESCRIPTION + ● Refer to functions with `ggplot2::fun()` + +## Add documentation + +You should provide documentation for each of your functions and classes. This is done in the `roxygen2` approach of providing embedded comments in the source code files, which are in turn converted into manual pages and other R documentation artifacts. Be sure to define the overall purpose of the function, and each of its parameters. + +```{r} +#' Set a custom ggplot theme. +#' +#' This function sets ggplot theme elements that I like, with the ability to change +#' the base size of the text.
+#' +#' @param base_size Base size of plot text +#' +#' @keywords plotting +#' +#' @export +#' +#' @examples +#' library(ggplot2) +#' +#' ggplot(iris, aes(Sepal.Length, Sepal.Width)) + +#' geom_point() + +#' custom_theme(base_size = 10) +#' +custom_theme <- function(base_size = 9) { + ggplot2::theme( + axis.ticks = ggplot2::element_blank(), + text = ggplot2::element_text(family = 'Helvetica', color = 'gray30', size = base_size), + plot.title = ggplot2::element_text(size = ggplot2::rel(1.25), hjust = 0.5, face = 'bold'), + panel.background = ggplot2::element_blank(), + legend.position = 'right', + panel.border = ggplot2::element_blank(), + panel.grid.minor = ggplot2::element_blank(), + panel.grid.major = ggplot2::element_line(colour = 'grey90', linewidth = .25), + legend.key = ggplot2::element_rect(colour = NA, fill = NA), + axis.line = ggplot2::element_blank() + ) +} + +``` + +Once your files are documented, you can then process the documentation using the `document()` function to generate the appropriate .Rd files that your package needs. + +```{r} +#| eval: false + +devtools::document() +``` + + Updating mytools documentation + Updating roxygen version in /Users/jones/development/mytools/DESCRIPTION + Writing NAMESPACE + Loading mytools + Writing NAMESPACE + Writing custom_theme.Rd + +That's really it. You now have a package that you can `check()` and `install()` and `release()`. See below for these helper utilities. + +## Test your package + +You can test your code using the `testthat` testing framework. The `usethis::use_testthat()` +function will set up your package for testing, and then you can use the `use_test()` function +to set up individual test files. For example, in the functions lesson we created some tests for our `fahr_to_celsius` function, but ran them line by line in the console. + +First, let's add that function to our package. Run the `use_r` function in the console: + +```{r} +#| eval: false + +usethis::use_r("fahr_to_celsius") +``` + +Then copy the function and documentation into the R script that opens and save the file. + +```{r} +#' Convert temperature data from Fahrenheit to Celsius +#' +#' @param fahr Temperature data in degrees Fahrenheit to be converted +#' @return temperature value in degrees Celsius +#' @keywords conversion +#' @export +#' @examples +#' fahr_to_celsius(32) +#' fahr_to_celsius(c(32, 212, 72)) +fahr_to_celsius <- function(fahr) { + celsius <- (fahr-32)*5/9 + return(celsius) +} +``` + +Now, set up your package for testing: + +```{r} +#| eval: false + +usethis::use_testthat() +``` + ✔ Adding 'testthat' to Suggests field in DESCRIPTION + ✔ Creating 'tests/testthat/' + ✔ Writing 'tests/testthat.R' + + +Then write a test for `fahr_to_celsius`: + +```{r} +#| eval: false + +usethis::use_test("fahr_to_celsius") +``` + ✔ Writing 'tests/testthat/test-fahr_to_celsius.R' + ● Modify 'tests/testthat/test-fahr_to_celsius.R' + +You can now add tests to `test-fahr_to_celsius.R`, and you can run all of the +tests using `devtools::test()`.
For example, if you add a test to the `test-fahr_to_celsius.R` file: + +```{r, eval=FALSE} +#| eval: false + +test_that("fahr_to_celsius works", { + expect_equal(fahr_to_celsius(32), 0) + expect_equal(fahr_to_celsius(212), 100) +}) + +``` + +Then you can run the tests to be sure all of your functions are working using `devtools::test()`: + +```{r, eval=FALSE} +#| eval: false + +devtools::test() +``` + + Loading mytools + Testing mytools + ✔ | OK F W S | Context + ✔ | 2 | test-fahr_to_celsius [0.1 s] + + ══ Results ═══════════════════════════════════════════════════════════════════════════════════ + Duration: 0.1 s + + OK: 2 + Failed: 0 + Warnings: 0 + Skipped: 0 + +Yay, all tests passed! + +## Checking and installing your package + +Now that your package is built, you can check it for consistency and completeness using `check()`, and then you can install it locally using `install()`, which needs to be run from the parent directory of your package. + +```{r, eval=FALSE} +#| eval: false + +devtools::check() +devtools::install() +``` + +Your package is now available for use in your local environment. + +## Sharing and releasing your package + +- **GitHub**: The simplest way to share your package with others is to upload it to a [GitHub repository](https://github.com), which allows others to install your package using the `install_github('github_username/mytools')` function from `devtools`. + +- **CRAN**: If your package might be broadly useful, also consider releasing it to CRAN, using the `release()` method from `devtools`. Releasing a package to CRAN requires a significant amount of work to ensure it follows the standards set by the R community, but it is entirely tractable and a valuable contribution to the science community. If you are considering releasing a package more broadly, you may find that the supportive community at [ROpenSci](https://ropensci.org) provides incredible help and valuable feedback through their onboarding process. + + +- **R-Universe**: A newer approach is to link your package release to [R-Universe](https://r-universe.dev), which is an effective way to make it easy to test and maintain packages so that many people can install them using the familiar `install.packages()` function in R. In R-Universe, people and organizations can [create their own *universe* of packages](https://github.com/r-universe-org/help#how-to-setup-your-personal-universe), which represents a collection of packages that appear as a CRAN-compatible repository in R. For example, in DataONE we maintain the DataONE R-Universe (https://dataoneorg.r-universe.dev), which lists the packages we actively maintain as an organization. So, any R-user that wants to install these packages can do so by adding our universe to their list of repositories, and then installing packages as normal. For example, to install the `codyn` package, one could use: + +```{r} +#| eval: false + +install.packages('codyn', repos = c('https://dataoneorg.r-universe.dev', 'https://cloud.r-project.org')) +``` + + +## Challenge {- .exercise} + +Add the other temperature conversion functions with full documentation to your package, write tests to ensure the functions work properly, and then +`document()`, `check()`, and `install()` the new version of the package. Don't forget to update the version number before you install!
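As a hint for that last step (a sketch, assuming you are using `usethis`), you can bump the version recorded in `DESCRIPTION` without editing the file by hand:

```{r}
#| eval: false

# Increment the minor version in DESCRIPTION (e.g., 0.1 -> 0.2)
usethis::use_version("minor")
```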
+ +## More reading + +- Hadley Wickham's awesome book: [R Packages](http://r-pkgs.had.co.nz/) +- Thomas Westlake's blog [Writing an R package from scratch](https://r-mageddon.netlify.com/post/writing-an-r-package-from-scratch/) +- How to set up your [personal R-Universe](https://github.com/r-universe-org/help#how-to-setup-your-personal-universe) + + + diff --git a/materials/sections/r-intro-quarto.qmd b/materials/sections/r-intro-quarto.qmd new file mode 100644 index 00000000..b902c382 --- /dev/null +++ b/materials/sections/r-intro-quarto.qmd @@ -0,0 +1,510 @@ +## Learning Objectives {.unnumbered} + +- Introduce literate analysis using Quarto (an extension of RMarkdown's features) +- Learn markdown syntax and run R code using Quarto +- Build and render an example analysis + +## Introduction + +## Literate Programming + +All too often, computational methods are written in such a way as to be borderline incomprehensible even to the person who originally wrote the code! The reason for this is obvious: computers interpret information very differently than people do. In 1984, Donald Knuth proposed a reversal of the programming paradigm by introducing the concept of *Literate Programming* ([Knuth 1984](http://www.literateprogramming.com/knuthweb.pdf)). + +> *"Instead of imagining that our main task is to instruct a computer what to do, let us concentrate rather on explaining to human beings what we want a computer to do."* + +If our aim is to make scientific research more transparent, the appeal of this paradigm reversal is immediately apparent. By switching to a literate analysis model, **you help enable human understanding of what the computer is doing**. As Knuth describes, in the literate analysis model, the author is an "*essayist*" who chooses variable names carefully, explains what they mean, and introduces concepts in the analysis in a way that facilitates understanding. + +Quarto and RMarkdown are excellent ways to generate a literate analysis and a reproducible workflow. These types of files combine R, the programming language, and **markdown, a set of text formatting directives**. + +In an R script, the language assumes that you are writing R code, unless you specify that you are writing prose (using a comment, designated by `#`). The paradigm shift of literate analysis comes in the switch to RMarkdown or Quarto, where instead of assuming you are writing code, they assume that you are writing prose unless you specify that you are writing code. This, along with the formatting provided by markdown, encourages the "essayist" to write understandable prose to accompany the code that explains to the human beings reading the document what the author told the computer to do. This is in contrast to writing just R code, where the author tells the computer what to do, with maybe a smattering of terse comments explaining the code to a reader. + + +Before we dive in deeper, let's look at an example of what a rendered literate analysis can look like using a real example. [Here is an example](https://nceas.github.io/sasap-training/materials/reproducible_research_in_r_fairbanks/example-brood-table-analysis.html) of an analysis workflow written using RMarkdown. Note that if this analysis were written in Quarto, the rendered version would be similar, except for formatting and layout (e.g., the default font in Quarto is different).
+ +There are a few things to notice about this document, which assembles a set of similar data sources on salmon brood tables with different formatting into a single data source. + +- It introduces the data sources using in-line images, links, interactive tables, and interactive maps. +- An example of data formatting from one source using R is shown. +- The document executes a set of formatting scripts in a directory to generate a single merged file. +- Some simple quality checks are performed (and their output shown) on the merged data. +- Simple analysis and plots are shown. + +In addition to achieving literate analysis, this document also represents a **reproducible analysis**. Because the entire merging and quality control of the data is done using the R code in the Quarto file, if a new data source and formatting script are added, the document can be run all at once with a single click to re-generate the quality control, plots, and analysis of the updated data. + +::: callout-important +## A note on reproducibility + +Reproducible analyses allow you to automate how the figures and statistics in your analysis are generated. This process also helps your collaborators, your readers and your future self to follow the code trail that leads back to the original data, increasing the transparency of your science. + +Literate analysis helps reduce mistakes from copying and pasting across software, keeps results and models in sync, and allows you to provide interested readers with more information about the different approaches and analyses you tried before coming up with the final results. +::: + +## RMarkdown and Quarto + +You can identify a Quarto file with the `.qmd` extension. On the other hand, an RMarkdown file has a `.Rmd` extension. Both have similar structures and both combine prose with code. Quarto provides rich support for languages other than R, such as Python, Observable, and Julia. It also excels in formatting and layout, allowing users to customize the look of the rendered documents in detail. On the other hand, RMarkdown is compatible with some languages that Quarto is not, for example bash. Quarto and RMarkdown are amazing tools to use for collaborative research. During this course we will spend some time learning and using the basics of Quarto and provide some comparisons to RMarkdown. + +![](images/quarto-rmd-logo.png){fig-alt="Bui & Csik, 2023, SORTEE"} + +Now, let's take a look at the structure of each of these files. They both look for the most part the same, with minor differences. + +::: column-page +![](images/quarto-rmd-structure.png){fig-alt="Bui & Csik, 2023, SORTEE"} + +Finally, let's compare each of these files when knitted/rendered. + +![](images/quarto-rmd-rendered.png){fig-alt="Bui & Csik, 2023, SORTEE"} +::: + +Again, we see similar outcomes, with minor differences mainly in formatting (font, style of showing code chunks, etc.) + +Both types of documents have three main components: + +- YAML metadata to guide the document's build process +- Code chunks to run +- Prose (Text to display) + +Today we are going to use Quarto to run some analysis on data. We are specifically going to focus on the code chunk and text components. We will discuss more about how the YAML works in a Quarto document later in the course. + +::: callout-important +## The YAML + +The YAML is the document's metadata, which sets guidelines for how you want the output of your document to look.
It is located at the top of your file, delineated by three dashes (`---`) at the top and at the bottom of it. It can be used to specify: + +- Characteristics of your documents such as title, author, and date of creation. + +- Arguments to pass to the build process to control the format of the output. + +- Additional information such as the bibliography file (and formatting of the references) + +- Specific parameters for your report (e.g., use just a subset of the data). +::: + +## A Quarto Document + +Let's open a Quarto file following the instructions below. + +::: callout-tip +## Setup + +- Open a new Quarto file using the following prompts: File \> New File \> Quarto Document +- A popup window will appear. +- Give your file a new title, e.g. "Introduction to Quarto". +- Leave the output format as HTML and Engine set to Knitr. +- Then click the "Create" button. +::: + +The first thing to notice is that by opening a file, we see that the fourth pane of RStudio pops up. This is our Quarto document, which is essentially a text editor. We also see in the upper left side that we are looking at the document under the "Visual editor". This is probably a familiar way of looking at a text document. To introduce the **markdown** syntax, we're going to move to the source editor and then come back to the visual editor. In the upper left corner, click on Source. See how the formatting changed? In the Source editor we are looking at the same text, but in markdown syntax. The visual editor, on the other hand, allows us to see how markdown is rendered, and therefore how it is going to look in our output document. + +Let's have a look at this file --- As we saw in the examples above, it looks a little different than an R script. It's not blank; there is some initial text already provided for you. Let's identify the three main components we introduced before. We have the YAML at the top, in between the two sets of dashed lines. Then we also see white and grey sections. The gray sections are R code chunks and the white sections are plain text. + +Let's go ahead and render this file by clicking the "Render" button, next to the blue arrow at the top of the Quarto file. When you first click this button, RStudio will prompt you to save this file. Save it in the top level of your home directory on the server, and name it something that you will remember (like `quarto-intro.qmd`). + +::: column-page +![](images/quarto-side-by-side.png) +::: + +What do you notice between the two? + +First, the render process produced a second file (an HTML file) that popped up in a second window in the browser. You'll also see this file in your directory with the same name as your qmd, but with the .html extension. In its simplest form, Quarto files come in pairs (same as RMarkdown files): the Quarto document, and its rendered version. In this case, we are rendering the file into HTML. You can also render to PDF, Word, and other formats. + +Notice how the grey **R code chunks** are surrounded by 3 back-ticks and `{r LABEL}`. The first chunk, in this case `1+1`, is evaluated and returns the output number (2). Notice the line in the second chunk that says `#| echo: false`? This is a code chunk option that indicates not to print the code. In the rendered version, we can see the outcome of `2*2` but not the executed code that created the outcome. + +The table below shows some of the options available for customizing outputs ([Quarto.org](https://quarto.org/docs/computations/execution-options.html)).
+ + | Option | Description | |------------------------|-----------------------------------------------| | `#| eval:` | Evaluate the code chunk (if `false`, just echoes the code into the output). | | `#| echo:` | Include the source code in output | | `#| warning:` | Include warnings in the output. | | `#| error:` | Include errors in the output. | | `#| include:` | Catch all for preventing any output (code or results) from being included (e.g.`include: false` suppresses all output from the code block). | + +: Code chunk options + +Note that you can also combine these options by adding more than one to a code chunk. + +::: callout-important +One important difference between Quarto documents and RMarkdown documents is that in Quarto, chunk options are written in a special comment format (`#|`) at the top of code chunks rather than within the curly brackets next to \`\`\``{r}` at the beginning of the chunk. For example: + +#### Quarto code options syntax {.unnumbered} + +![](images/quarto-code-options.png){width="60%"} + +#### RMarkdown code options syntax {.unnumbered} + +![](images/rmd-code-options.png){width="60%"} +::: + +It is important to emphasize one more time that in a Quarto (and RMarkdown) document, the gray areas of the document are *code*, in this case R code because that is what is indicated by the \`\`\``{r}` syntax at the start of this gray area. And the white areas of a qmd are in markdown language. + +## Markdown Syntax + +Let's start by talking about markdown. **Markdown is a formatting language for plain text**, and there are only around 15 rules to know. + +Notice the syntax in the document we just rendered: + +- **Headers** get rendered at multiple levels: `#`, `##` +- **Bold**: `**word**` + +There are some good [cheatsheets](https://github.com/adam-p/markdown-here/wiki/Markdown-Here-Cheatsheet) to get you started, and here is one built into RStudio: Go to Help \> Markdown Quick Reference. + +::: callout-important +**The hash symbol \# is used differently in markdown and in R** + +- In an R script or inside an R code chunk, a hash indicates a comment that will not be evaluated. You can use as many as you want: `#` is equivalent to `######`. It's just a matter of style. +- In markdown, a hash indicates a level of a header. And the number you use matters: `#` is a "level one header", meaning the biggest font and the top of the hierarchy. `###` is a level three header, and will show up nested below the `#` and `##` headers. + +![](images/rmarkdown_headers.png) +::: + +::: callout-note +## Exercise + +1. In markdown, write some italic text, make a numbered list, and add a few sub-headers. Use the Markdown Quick Reference (in the menu bar: Help \> Markdown Quick Reference). +2. Re-render your html file and observe your edits. +::: + +## The Visual Editor + +Quarto has a "what you see is what you mean" (WYSIWYM) editor or Visual editor, which can be a nice way to write markdown without remembering all of the markdown rules. Since there aren't many rules for markdown, we recommend just learning them, especially since markdown is used in many, many other contexts besides Quarto and RMarkdown. For example, formatting GitHub comments and README files. + +To access the editor, click the Visual button in the upper left hand corner of your editor pane. You'll notice that your document is now formatted as you type, and you can change elements of the formatting using the row of icons in the top of the editor pane.
Although we don't really recommend doing all of your markdown composition in the Visual editor, there are two features of this editor that we believe are **immensely** helpful: adding citations and adding tables. + +### Adding citations + +To add a citation, go to the visual editor and in the insert drop down, select "Citation." In the window that appears, there are several options in the left hand panel for the source of your citation. If you have a citation manager, such as Zotero, installed, this would be included in that list. For now, select "From DOI", and in the search bar enter a DOI of your choice (e.g.: 10.1038/s41467-020-17726-z), then select "Insert." + +![](images/markdown-citation.png) + +After selecting insert, a couple of things happen. First, the citation reference is inserted into your markdown text as `[@oke2020]`. Second, a file called references.bib containing the BibTeX format of the citation is created. Third, that file is added to the YAML header of your Quarto document (`bibliography: references.bib`). Adding another citation will automatically update your `references.bib` file. So easy! + +### Adding tables in markdown + +The second task that the visual editor is convenient for is generating tables. Markdown tables are a bit finicky and annoying to type, and there are a number of formatting options that are difficult to remember if you don't use them often. In the top icon bar, the "Table" drop down gives several options for inserting, editing, and formatting tables. Experiment with this menu to insert a small table. + +## Code Chunks in Quarto + +Every time we open a new Quarto document, we should start by deleting all of the template text (everything except for the YAML). Then we save the document into the most convenient folder of our project. Now we are ready to start our work. + +You can create a new chunk in your Quarto document in one of these ways: + +- Go to Code in the top menu bar, click "Insert Chunk" +- Type by hand `{r}` +- Use the keyboard shortcut + - Mac: `command` + `option` + `i` + - Windows: `Ctrl` + `Alt` + `i` + +::: callout-important +## About code chunks + +Each code chunk needs to have an opening syntax \`\`\``{r}` and a closing syntax \`\`\`. Everything in between these lines will be identified as R code. +::: + +If we want to write some R code, this is how it would look. + +```{r} +#| eval: false + +x <- 4 * 8 + +heights_ft <- c(5.2, 6.0, 5.7) + +coef <- 3.14 +``` + +Hitting return does not execute this command; remember, it's just a text file. To execute it, we need to get what we typed in the R chunk (the grey R code) down into the console. How do we do it? There are several ways (let's do each of them): + +1. Copy-paste this line into the console (generally not recommended as a primary method) +2. Select the line (or simply put the cursor there), and click "Run". This is available from: + a. the bar above the file (green arrow) + b. the menu bar: Code \> Run Selected Line(s) + c. keyboard shortcut: command-return +3. Click the green arrow at the right of the code chunk + +## Practice: Literate Analysis with ocean water samples + +Now that we have gone over the basics, let's go a little deeper by building a simple Quarto document that represents a literate analysis using real data. We are going to work with seawater chemistry data. We are going to download a file named `BGchem2008data.csv` from the Arctic Data Center repository. Please follow the steps below to download the data and then upload it to your RStudio Server `data` folder.
+ +::: callout-tip +## Setup + +- Navigate to the following dataset: +- Download the file `BGchem2008data.csv` +- Click the "Upload" button in your RStudio server file browser. +- In the dialog box, make sure the destination directory is the `data` directory in your R project, click "Choose File," and locate the `BGchem2008data.csv` file. Press "OK" to upload the file. +- Check your file was successfully uploaded by navigating into your `data` folder in the **Files** pane. +::: + +### Getting Started + +Experienced R users who have never used Quarto (or RMarkdown) often struggle a bit in the transition to developing analysis in Prose+Code format --- which makes sense! It is a switch of coding paradigm to a new way of thinking. + +Rather than starting an R chunk and putting all of your code in that single chunk, below we describe what we think is a better way. + +1. Open a document and block out the high-level sections you know you'll need to include using top level headers. +2. Add bullet points for some high level pseudo-code steps you know you'll need to take. +3. Start filling in under each bullet point the code that accomplishes each step. As you write your code, transform your bullet points into prose, and add new bullet points or sections as needed. + +For this mini-analysis, we will have the following sections and code steps: + +1. Introduction + a. About the data + b. Setup + c. Read in data +2. Analysis + a. Calculate summary statistics + b. Calculate mean Redfield ratio + c. Plot Redfield ratio +3. Conclusion + +::: callout-note +## Exercise + +Under "About the data", write a sentence saying where the data set came from, including a hyperlink to the data. Also mention when the data was downloaded. + +Hint: Navigate to Help \> Markdown Quick Reference to look up the hyperlink syntax. +::: + +### Read in the data + +Now that we have outlined our document, we can start writing code! To read the data into our environment, we will use a function from the `readr` package. + +To use a package in our analysis, we need to first make sure it is installed (you can install a package by running `install.packages("name-of-package")`). Once installed, you need to load it into your environment using `library(package_name)`. Even though we have installed it, we haven't yet told our R session to access it. Because there are so many packages (many with conflicting namespaces), R cannot automatically load every single package you have installed. Instead, you load only the ones you need for a particular analysis. **Loading the package is a key part of the *reproducible* aspect of our literate analysis**, so we will include it as an R chunk as part of our Setup. + +::: {.callout-caution icon="false"} +## Best Practice + +It is generally good practice to include all of your `library()` calls in a single, dedicated R chunk near the top of your document. This lets collaborators know what packages they might need to install before they start running your code. +::: + +The server should have already installed `readr`, so add a new R chunk below your Setup header that calls the `readr` library, and run it. It should look like this: + +```{r} +library(readr) +``` + +Now, under "Read in data", add a code chunk that uses the `read_csv()` function to read in your data file.
+ +```{r} +bg_chem <- read_csv("data/BGchem2008data.csv") +``` + +::: column-margin +**Why `read_csv()` over `read.csv()`?** + +We chose to show `read_csv()` from the `readr` package to introduce the concept of packages, to show you how to load packages, and `read_csv()` has several advantages over `read.csv()` from base R, including: + +- More reasonable function defaults (no `stringsAsFactors`!) +- Smarter column type parsing, especially for dates +- `read_csv()` is much faster than `read.csv()`, which is helpful for large files +::: + +Once you run this line in your document, you should see the `bg_chem` object populate in your environment pane. It also spits out lots of text explaining what types the function parsed each column into. This text is important, and should be examined, but we might not want it in our final document. + +::: callout-note +## Exercise + +How would you suppress the warnings (so they don't show in our output file) from a specific code chunk? + +**Hint:** Code chunk options +::: + +### Calculate Summary Statistics + +As our "analysis" we are going to calculate some very simple summary statistics and generate a single plot. Using water samples from the Arctic Ocean, we will examine the ratio of nitrogen to phosphate to see how closely the data match the Redfield ratio, which is the consistent 16:1 ratio of nitrogen to phosphorus atoms found in marine phytoplankton. + +Let's start by exploring the data we just read. Every time we read a new data set, it is important to familiarize yourself with it and make sure that the data looks as expected. Below are some useful functions for exploring your data. + +Let's start by creating a new R chunk and running the following functions. Because this is just an exploration and we do not want this chunk to be part of our report, we will indicate that by adding `#| eval: false` and `#| echo: false` at the top of the chunk. That way, the code in this chunk will not run and will not be displayed when we render the final document. + +```{r} +#| eval: false + +## Prints the column names of my data frame +colnames(bg_chem) + +## General structure of the data frame - shows class of each column +str(bg_chem) + +## First 6 lines of the data frame +head(bg_chem) + +## Summary of each column of data +summary(bg_chem) + +## Prints unique values in a column (in this case Date) +unique(bg_chem$Date) +``` + +To peek at our data frame, we can type `View(bg_chem)` in the console. This will open a tab with our data frame in a tabular format. + +Now that we know more about the data set we are working with, let's do some analyses. Under the appropriate bullet point in your analysis section, create a new R chunk, and use it to calculate the mean nitrate (NO3), nitrite (NO2), ammonium (NH4), and phosphorus (P) measured. + +Save these mean values as new variables with easily understandable names, and write a (brief) description of your operation using markdown above the chunk. Remember that the `$` (aka the subset operator) indicates which column of your data to look into. + +```{r} +nitrate <- mean(bg_chem$NO3) +nitrite <- mean(bg_chem$NO2) +amm <- mean(bg_chem$NH4) +phos <- mean(bg_chem$P) +``` + +In another chunk, use those variables to calculate the nitrogen: phosphate ratio (Redfield ratio). + +```{r} +ratio <- (nitrate + nitrite + amm)/phos +``` + +You can access this variable in your markdown text by using R in-line in your text.
The syntax to call R in-line (as opposed to as a chunk) is a single backtick \`, followed by the letter "r", then whatever your simple R command is --- here we will use `round(ratio)` to print the calculated ratio, and finally a closing backtick \`. This allows us to access the value stored in this variable in our explanatory text without resorting to the evaluate-copy-paste method so commonly used for this type of task. + +So, the text in you Quarto document should look like this: + +The Redfield ratio for this dataset is approximately: \`r `round(ratio)`\` + +And the rendered text like this: + +The Redfield ratio for this dataset is approximately `r round(ratio)`. + +Finally, create a simple plot using base R that plots the ratio of the individual measurements, as opposed to looking at mean ratio. + +```{r} +plot(bg_chem$P, bg_chem$NO2 + bg_chem$NO3 + bg_chem$NH4) +``` + +::: callout-note +## Exercise + +Decide whether or not you want the plotting code above to show up in your knitted document along with the plot, and implement your decision as a chunk option. + +Render your Quarto document (by pressing the Render button) and observe the results. +::: + +::: callout-important +## How do I decide when to make a new code chunk? + +Like many of life's great questions, there is no clear cut answer. A rule of thumb is to have one chunk per functional unit of analysis. This functional unit could be 50 lines of code or it could be 1 line, but typically it only does one "thing." This could be reading in data, making a plot, or defining a function. It could also mean calculating a series of related summary statistics (as we'll see below). Ultimately, the choice is one related to personal preference and style, but generally you should ensure that code is divided up such that it is easily explainable in a literate analysis as the code is run. +::: + +## Quarto file paths and environement + +As we discussed during our setup session, in computing, a path specifies the unique location of a file on the filesystem. A path can come in one of two forms: absolute or relative. + +- **Absolute paths** start at the very top of your file system, and work their way down the directory tree to the file. +- **Relative paths** start at an arbitrary point in the file system. In R, this point is set by your working directory. + +Quarto has a special way of handling relative paths that can be very handy. When working in an Quarto document, **R will set all paths relative to the location of the Quarto file**. This way, you don't have to worry about setting a working directory, or changing your colleagues absolute path structure with the correct user name, etc. If your Quarto document is stored near where the data it analyses are stored (good practice, generally), setting paths becomes much easier! + +If you saved your `BGchem2008data.csv` data file in the same location as your qmd, you can just write `read_csv("BGchem2008data.csv")` to read it in. Checkout the help page by typing `?read_csv()` in the console. This tells you that for this function the first argument should be a pointer to the file. Rstudio has some nice helpers to help you navigate paths. If you open quotes and press `tab` with your cursor between the quotes, a popup menu will appear showing you some options. + +### Practice: Quarto and Environments + +Let's walk through an exercise with the document we just created to demonstrate how Quarto handles environments. We will be deliberately inducing some errors here for demonstration purposes. 
+ +First, follow these steps: + +::: callout-tip +## Setup + +- Restart your R session (Session \> Restart R) +- Run the last chunk in your Quarto document by pressing the play button on the chunk +::: + +Perhaps not surprisingly, we get an error: + +``` +Error in plot(bg_chem$P, bg_chem$NO2 + bg_chem$NO3 + bg_chem$NH4) : + object 'bg_chem' not found +``` + +This is because we have not run the chunk of code that reads in the `bg_chem` data. The R part of Quarto works just like a regular R script. You have to execute the code, and the order that you run it in matters. It is relatively easy to get mixed up in a large Quarto document --- running chunks out of order, or forgetting to run chunks. + +To resolve this, follow the next step: + +::: callout-tip +## Setup continued + +- Select from the "Run" menu (top right of the editor pane) "Run All." +- Observe the `bg_chem` variable in your environment +::: + +This is a great way to reset and re-run code when things seem to have gone sideways. It is great practice to do periodically since it helps ensure you are writing code that actually runs and it's reproducible. + +::: callout-tip +## For the next exercise: + +- Clean your environment by clicking the broom in the environment pane +- Restart your R session (Session \> Restart R) +- Press "Render" to run all of the code in your document +- Observe the state of your environment pane + +Assuming your document rendered and produced an html page, your code ran. Yet, the environment pane is empty. What happened? +::: + +The Render button is rather special --- it doesn't just run all of the code in your document. It actually spins up a fresh R environment separate from the one you have been working in, runs all of the code in your document, generates the output, and then closes the environment. This is one of the best ways Quarto (or RMarkdown) helps ensure you have built a reproducible workflow. If, while you were developing your code, you ran a line in the console as opposed to adding it to your Quarto document, the code you develop while working actively in your environment will still work. However, when you knit your document, the environment RStudio spins up doesn't know anything about that working environment you were in. Thus, your code may error because it doesn't have that extra piece of information. Commonly, `library()` calls are the source of this kind of frustration when the author runs it in the console, but forgets to add it to the script. + +To further clarify the point on environments, perform the following steps: + +::: callout-tip +## Setup continued + +- Select from the "Run" menu (top right of editor pane) "Run All" +- Observe all of the variables in your environment +::: + +::: callout-important +## What about all my R Scripts? + +Some pieces of R code are better suited for R scripts than Quarto or RMarkdown. A function you wrote yourself that you use in many different analyses is probably better to define in an R script than repeated across many Quarto or RMarkdown documents. Some analyses have mundane or repetitive tasks that don't need to be explained very much. For example, in the document shown in the beginning of this lesson, 15 different excel files needed to be reformatted in slightly different, mundane ways, like renaming columns and removing header text. Instead of including these tasks in the primary Quarto document, the authors chose to write one R script per file and stored them all in a directory. 
Then, took the contents of one script and included it in the literate analysis, using it as an example to explain what the scripts did, and then used the `source()` function to run them all from within the Quarto document. + +So, just because you know Quarto now, doesn't mean you won't be using R scripts anymore. Both `.R` and `.qmd` have their roles to play in analysis. With practice, it will become more clear what works well in Quarto or RMarkdown, and what belongs in a regular R script. +::: + +## Additional Quarto Resources + +- Posit (the organization that developed Quarto) has great documentation, check out [Quarto.org](https://quarto.org/) +- R for Data Science (2e) (Wickham et al, 2023), this is an awesome book for all R related things. Chapter [29 and 30](https://r4ds.hadley.nz/quarto.html) are specific to Quarto. +- [Quarto Gallery:](https://quarto.org/docs/gallery/) Example of different outputs created using Quarto +- [Hello Quarto: share, collaborate, teach, reimagine](https://openscapes.org/blog/2022-08-10-quarto-keynote/). A talk by Julia Stewart Lowndes and Mine Cetinkaya-Runde. + + + +## Troubleshooting: My RMarkdown Won't Knit to PDF + +If you get an error when trying to knit to PDF that says your computer +doesn't have a LaTeX installation, one of two things is likely +happening: + +- Your computer doesn't have LaTeX installed +- You have an installation of LaTeX but RStudio cannot find it (it is + not on the path) + +If you already use LaTeX (like to write papers), you fall in the second +category. Solving this requires directing RStudio to your installation - +and isn't covered here. + +If you fall in the first category - you are sure you don't have LaTeX +installed - can use the R package `tinytex` to easily get an +installation recognized by RStudio, as long as you have administrative +rights to your computer. + +To install `tinytex` run: + +```{r} +#| eval: false +install.packages("tinytex") +tinytex::install_tinytex() +``` + +If you get an error that looks like destination /usr/local/bin not +writable, you need to give yourself permission to write to this +directory (again, only possible if you have administrative rights). To +do this, run this command in the terminal: + +``` bash +sudo chown -R `whoami`:admin /usr/local/bin +``` + +and then try the above install instructions again. Learn more about +`tinytex` from [Yihui Xie's online book +*TinyTeX*](https://yihui.org/tinytex/). +```` diff --git a/materials/sections/r-intro-rmarkdown.qmd b/materials/sections/r-intro-rmarkdown.qmd index f13ada4d..ad86295e 100644 --- a/materials/sections/r-intro-rmarkdown.qmd +++ b/materials/sections/r-intro-rmarkdown.qmd @@ -344,7 +344,10 @@ phos <- mean(bg_chem$P) In another chunk, use those variables to calculate the nitrogen: phosphate ratio (Redfield ratio). ```{r} + ratio <- (nitrate + nitrite + amm)/phos + + ``` You can access this variable in your Markdown text by using R in-line in your text. The syntax to call R in-line (as opposed to as a chunk) is a single backtick \`, followed by the letter "r", then whatever your simple R command is --- here we will use `round(ratio)` to print the calculated ratio, and finally a closing backtick \`. This allows us to access the value stored in this variable in our explanatory text without resorting to the evaluate-copy-paste method so commonly used for this type of task. 
@@ -356,7 +359,8 @@
The Redfield ratio for this dataset is approximately: \`r `round(ratio)`\`

And the rendered text looks like this:

-The Redfield ratio for this dataset is approximately `r round(ratio)`.
+The Redfield ratio for this dataset is approximately `r round(ratio)`.

Finally, create a simple plot using base R that plots the ratio of the individual measurements, as opposed to looking at the mean ratio.

diff --git a/materials/sections/r-practice-clean-wrangle.qmd b/materials/sections/r-practice-clean-wrangle.qmd
new file mode 100644
index 00000000..595a28b1
--- /dev/null
+++ b/materials/sections/r-practice-clean-wrangle.qmd
@@ -0,0 +1,254 @@

## Learning Objectives {.unnumbered}

- Practice using common cleaning and wrangling functions
- Practice joining two data frames
- Practice the git and GitHub workflow

## About the data {.unnumbered}

These exercises use data on abundance, size, and trap counts (fishing pressure) of California spiny lobster (*Panulirus interruptus*), collected along the mainland coast of the Santa Barbara Channel by Santa Barbara Coastal LTER researchers [@lter2022].

## Setup

::: callout-tip
## GitHub & R setup

1. Create a new repository on GitHub. Use the following settings:

    a. Add a brief description for your new repository. For example: R practice session cleaning and wrangling data during Delta Science Program Synthesis and Training session 1.

    b. Keep the repository public.

    c. Initialize the repository with a `README` file and an R `.gitignore` template.

2. Clone the repository to a new project in RStudio.

3. Create a new Quarto file in RStudio. Follow these steps:

    a. Add a title to the new Quarto file, for example: "Exercise: Explore, Clean, and Wrangle Data". Add your name to the `author` field. Press "Create".

    b. Delete the default text in the new Quarto file.

    c. Using level 2 headers, create an outline for this exercise. Include headers for the following sections: About the data, Setup, Read and explore data, Exercise 1, Exercise 2, ..., Exercise 6.

    d. Save this file with a meaningful name, e.g. `exercise-clean-wrangle.qmd`.

4. After saving the file, `stage`, `commit`, write a commit message, `pull`, and `push` this file to the remote repository (on GitHub).
:::

::: callout-tip
## Read in data

1. For this exercise we are going to use the SBC LTER: Reef: Abundance, size and fishing effort for California Spiny Lobster (*Panulirus interruptus*) data. Navigate to this link and briefly explore the data package.

2. Under the "About the data" section in the `.qmd` file, write a short description of the data, including a link to the data and the access date.

3. Under the "Setup" section, load the following libraries in a new code chunk.

```{r}
#| code-fold: false
#| message: false
library(readr)
library(dplyr)
library(ggplot2)
library(tidyr)
```

4. Read in the data.

    a. Create a new code chunk under the "Read and explore data" header.

    b. Navigate to the data package site and copy the URL to access the *Time-series of lobster abundance and size* data. To copy the URL: hover over the Download button --> right click --> "Copy Link Address".

    c. Read in the data from the URL using the `read_csv` function and store it as `lobster_abundance`.

    d. Read in the *Time-series of lobster trap buoy counts* data as `lobster_traps` by repeating steps (b) and (c).
```{r}
#| code-fold: false
#| message: false

# Read in data
lobster_abundance <- read_csv("https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-sbc.77.8&entityid=f32823fba432f58f66c06b589b7efac6")

lobster_traps <- read_csv("https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-sbc.77.8&entityid=66dd61c75bda17c23a3bce458c56ed84")
```

5. Look at each data frame. Take a minute to explore their data structure, find out which data types are in each data frame, or use a function to get a high-level summary of the data.

6. Use the `Git` workflow: `Stage > Commit > Pull > Push`.
:::

## Convert missing values using `mutate()` and `na_if()`

::: callout-note
### Exercise 1: `lobster_abundance`

The variable `SIZE_MM` uses -99999 as the code for missing values (see the metadata). This has the potential to cause conflicts with our analyses. Modify the data following these steps:

1. Verify that the `SIZE_MM` variable contains -99999 values using `unique()`.
2. Convert every -99999 value to an `NA` value using `mutate()` and `na_if()`. Look up the help page to see how to use `na_if()`.
3. Check your output data using `unique()`.
:::

```{r}
# Recode the -99999 missing-value code to NA in the size column
lobster_abundance <- lobster_abundance %>%
  mutate(SIZE_MM = na_if(SIZE_MM, -99999))
```

## `filter()` practice

::: callout-note
### Exercise 2: `lobster_abundance`

Create a subset with the data for the lobsters at Arroyo Quemado (`AQUE`) that have a carapace length greater than 70 mm.
:::

```{r}
# Lobsters at Arroyo Quemado with carapace length greater than 70 mm
aque_70mm <- lobster_abundance %>%
  filter(SITE == "AQUE" & SIZE_MM > 70)
```

::: callout-note
### Exercise 3: `lobster_traps`

Create a subset with the trap information for all sites where abundance data is not `NA`. Note that you first have to identify which sites these are.

HINT: use `%in%`.
:::

```{r}
## Create a vector with the unique sites in lobster_abundance
abundance_sites <- unique(lobster_abundance$SITE)

## Filter for the sites in the vector above
traps_subset <- lobster_traps %>%
  filter(SITE %in% abundance_sites)
```

::: callout-important
## Save your work and use `Git`

Don't forget the `Git` workflow! After you've completed the exercises or reached a significant stopping point, use the workflow: `Stage > Commit > Pull > Push`.
:::

## Calculate totals by site and year

::: callout-note
### Exercise 4: `lobster_abundance` and `traps_subset`

For each data frame, calculate the total count and the total traps by site and year (i.e. the totals for every combination of site and year). Store these summary statistics in separate data frames.

HINT: use `group_by()` and `summarize()`.

Do you notice anything that doesn't look right in the resulting data frames? We'll get to it in Exercise 7.
:::

```{r}
#| warning: false

# Total lobster counts by site and year
total_abundance <- lobster_abundance %>%
  group_by(SITE, YEAR) %>%
  summarize(total_lobsters = sum(COUNT, na.rm = TRUE))

# Total traps by site and year
total_traps <- traps_subset %>%
  group_by(SITE, YEAR) %>%
  summarize(total_traps = sum(TRAPS, na.rm = TRUE))
```

## Joining two data frames

::: callout-note
### Exercise 5: `total_abundance` and `total_traps`

Use one of the `*_join()` functions to get an output data frame with the following columns: `SITE`, `YEAR`, `total_lobsters`, `total_traps`.

Discuss with your neighbor how the output data frame varies when you do a `left_join()` or a `full_join()`. What happens when you do an `inner_join()`?
:::

```{r}
# Keep all rows from total_abundance
abundance_traps <- total_abundance %>%
  left_join(total_traps, by = c("SITE", "YEAR"))

## Or

# Keep all rows from both data frames
abundance_traps <- total_abundance %>%
  full_join(total_traps, by = c("SITE", "YEAR"))

## Or

# Keep only rows with a matching site and year in both data frames
abundance_traps <- total_abundance %>%
  inner_join(total_traps, by = c("SITE", "YEAR"))
```

## Adding a new column

::: callout-note
### Exercise 6

The sites `IVEE` and `NAPL` are marine protected areas (MPAs). Read the documentation for the `case_when()` function and use it to add this designation to your data set.

HINT: Notice you will have to create a new column with the MPA designation. What function have you previously used to create new columns?
:::

```{r}
# Label each site as MPA or not MPA
lobster_mpa <- abundance_traps %>%
  mutate(DESIGNATION = case_when(
    SITE %in% c("IVEE", "NAPL") ~ "MPA",
    SITE %in% c("AQUE", "CARP", "MOHK") ~ "not MPA"))
```

::: callout-important
## Save your work and use `Git`

Don't forget the `Git` workflow! After you've completed the exercises or reached a significant stopping point, use the workflow: `Stage > Commit > Pull > Push`.
:::

## Bonus

::: callout-note
### Exercise 7

What would you do to fix the issues with the values in the `total_traps` column? Find the root of the issue, modify the dataset to solve it, and discuss with your neighbor where in your script you would include this step.
:::

```{r}
# Replace the -99999 values with NAs at the beginning of the script,
# similar to what we did in Exercise 1 but for the lobster_traps data frame.
# Then re-run all the other steps.

lobster_traps <- lobster_traps %>%
  mutate(TRAPS = na_if(TRAPS, -99999))
```

diff --git a/materials/sections/rstudio-server-setup.qmd b/materials/sections/rstudio-server-setup.qmd
index 8c578f38..d99cac53 100644
--- a/materials/sections/rstudio-server-setup.qmd
+++ b/materials/sections/rstudio-server-setup.qmd
@@ -4,6 +4,32 @@
- Organize an R Project for effective project management
- Understand how to move in an R Project using paths and working directories

## Before we start

### Non-Verbal Feedback

We'll be using the Zoom "Non-Verbal Feedback" buttons throughout this session. We will ask you to put a green check by your name when you're all set and ready to move on, and a red X by your name if you're stuck or need assistance. These buttons can be found in the Reaction menu on the toolbar. When you're asked to answer using these buttons, please select one so that the instructor has the feedback they need to either continue the lesson or pause until everyone gets back on the same page.

![](images/non-verbal-feedback.png)

### Questions and Getting Help

When you need to ask a question, please do so in one of the following ways:

- Turn your mic on and ask. If you are uncomfortable interrupting the instructor, you may also raise your virtual hand (in the Reaction menu) and the session facilitator will ask the instructor to pause and call upon you.
- Ask your question in the chat

If you have an issue/error and get stuck, you can ask for help in the following ways:

- Turn your mic on and ask for help. See also the note above about using a virtual raised hand.
- Let one of the instructors know through the chat
- If prompted to do so, put a red X next to your name as your status in the participant window.
- If you have an issue that requires in-depth troubleshooting, please let us know and we will coordinate a time with you after this call.
## Logon to the RStudio Server

To prevent us from spending most of this lesson troubleshooting the myriad of issues that can arise when setting up the R, RStudio, and git environments, we have chosen to have everyone work on a remote server with all of the software you need already installed. We will be using a special kind of RStudio just for servers, called RStudio Server. If you have never worked on a remote server before, you can think of it as working on a different computer via the internet. Note that the server has no knowledge of the files on your local filesystem, but it is easy to transfer files from the server to your local computer, and vice versa, using the RStudio Server interface.

@@ -17,6 +43,42 @@ After you have successfully changed your password log in at: [https://included-c

[![](images/included-crab-login.png)](https://included-crab.nceas.ucsb.edu/)
:::

## Programming in R

![Artwork by Allison Horst](images/allison-horst-code-kitchen.png)

There is a vibrant community out there that is collectively developing increasingly powerful and easy-to-use open source programming tools. The changing landscape of programming is making learning how to code easier than it ever has been. Incorporating programming into analysis workflows not only makes science more efficient, but also more computationally reproducible. In this course, we will use the programming language R and the accompanying integrated development environment (IDE) RStudio. R is a great language to learn for data-oriented programming because it is widely adopted, user-friendly, and (most importantly) open source!

So what is the difference between R and RStudio? Here is an analogy to start us off. **If you were a chef, R is a knife**. You have food to prepare, and the knife is one of the tools that you'll use to accomplish your task.

And **if R is a knife, RStudio is the kitchen**. RStudio provides a place to do your work! It brings together the other tools, communication, and community that make your life as a chef easier. RStudio makes your life as a researcher easier by bringing together the other tools you need to do your work efficiently - like a file browser, data viewer, help pages, terminal, community, support, the list goes on. So it's not just the infrastructure (the user interface or IDE), although it is a great way to learn and to interact with your variables, your files, and git directly. It's also data science philosophy, R packages, community, and more. Although you can prepare food without a kitchen, and we could learn R without RStudio, that's not what we're going to do. We are going to take advantage of the great RStudio support, and learn R and RStudio together.

Something else to start us off: you are learning a new language here. It's an ongoing process, it takes time, you'll make mistakes, and it can be frustrating, but it will be overwhelmingly awesome in the long run. We all speak at least one language, and this is a similar process, really. No matter how fluent you are, you'll always be learning, trying things in new contexts, and learning words that mean the same as others, just like everybody else. And just like any form of communication, there will be miscommunication, which can be frustrating, but hands down we are all better off because of it.

While language is a familiar concept, programming languages sit in a different context from spoken languages, and you will come to understand this context with time.
For example: you have a concept that there is a first meal of the day, and there is a name for that: in English it's "breakfast." So if you're learning Spanish, you could expect there is a word for this concept of a first meal. (And you'd be right: "desayuno"). We will get you to expect that programming languages also have words (called functions in R) for concepts as well. You'll soon expect that there is a way to order values numerically. Or alphabetically. Or search for patterns in text. Or calculate the median. Or reorganize columns to rows. Or subset exactly what you want. We will get you to increase your expectations and learn to ask and find what you're looking for. + + +## RStudio IDE + +Let's take a tour of the RStudio interface. + +![](images/RStudio_IDE.png) + +Notice the default panes: + +- Console (entire left) +- Environment/History (tabbed in upper right) +- Files/Plots/Packages/Help (tabbed in lower right) + +::: {.callout-caution icon="false"} +### Quick Tip + +You can change the default location of the panes, among many other things, see [Customizing RStudio](https://support.rstudio.com/hc/en-us/articles/200549016-Customizing-RStudio). +::: + + + ## Create an R Project In this course, we are going to be using an R project to organize our work. An R project is tied to a directory on your local computer, and makes organizing your work and collaborating with others easier. @@ -29,7 +91,7 @@ In this course, we are going to be using an R project to organize our work. An R 1. In the "File" menu, select "New Project" 2. Click "New Directory" 3. Click "New Project" -4. Under “Directory name” type: `training_{USERNAME}` (i.e. `training_do-linh`) +4. Under “Directory name” type: `training_{USERNAME}` (i.e. `training_vargas`) 5. Leave "Create Project as subdirectory of:” set to `~` 6. Click "Create Project" @@ -39,7 +101,7 @@ RStudio should open your new project automatically after creating it. One way to ## Organizing an R Project -When starting a new research project, one of the first things I do is create an R Project for it (just like we have here!). The next step is to then populate that project with relevant directories. There are many tools out there that can do this automatically. Some examples are `rrtools` or `usethis::create_package()`. The goal is to organize your project so that it is a compendium of your research. This means that the project has all of the digital parts needed to replicate your analysis, like code, figures, the manuscript, and data access. +When starting a new research project, step 1 is to create an R Project for it (just like we have here!). The next step is to then populate that project with relevant directories. There are many tools out there that can do this automatically. Some examples are `rrtools` or `usethis::create_package()`. The goal is to organize your project so that it is a compendium of your research. This means that the project has all of the digital parts needed to replicate your analysis, like code, figures, the manuscript, and data access. Some common directories are: @@ -64,18 +126,26 @@ Now that we have your project created (and notice we know it’s an R Project be There are two types of paths in computing: **absolute paths** and **relative paths**. -- An **absolute path** always starts with the root of your file system and locates files from there. The absolute path to my project directory is: `/home/do-linh/training_do-linh` +- An **absolute path** always starts with the root of your file system and locates files from there. 
The absolute path to my project directory is: `/home/vargas-poulsen/training_vargas` - **Relative paths** start from some location in your file system that is below the root. Relative paths are combined with the path of that location to locate files on your system. R (and some other languages like MATLAB) refer to the location where the relative path starts as our **working directory**. -**RStudio projects automatically set the working directory to the directory of the project**. This means that you can reference files from within the project without worrying about where the project directory itself is. If I want to read in a file from the data directory within my project, I can simply type `read.csv("data/samples.csv")` as opposed to `read.csv("/home/do-linh/training_do-linh/data/samples.csv")`. +**RStudio projects automatically set the working directory to the directory of the project**. This means that you can reference files from within the project without worrying about where the project directory itself is. If I want to read in a file from the data directory within my project, the code to do this would be `read.csv("data/samples.csv")` (path relative to my R project) as opposed to `read.csv("/home/vargas-poulsen/training_vargas/data/samples.csv")` (absolute path of my home directory). + +This is not only convenient for you, but also when working collaboratively. For example if Matt makes a copy of my R project that I have published on GitHub, and I am using relative paths, he can run my code exactly as I have written it, without going back and changing `/home/vargas-poulsen/training_vargas/data/samples.csv` to `/home/jones/training_jones/data/samples.csv`. -This is not only convenient for you, but also when working collaboratively. We will talk more about this later, but if Matt makes a copy of my R project that I have published on GitHub, and I am using relative paths, he can run my code exactly as I have written it, without going back and changing `/home/do-linh/training_do-linh/data/samples.csv` to `/home/jones/training_jones/data/samples.csv`. Note that once you start working in projects you should basically never need to run the `setwd()` command. If you are in the habit of doing this, stop and take a look at where and why you do it. Could leveraging the working directory concept of R projects eliminate this need? Almost definitely! + +::: column-margin +`setwd()` sets your working directory to specified file path (aka directory). +::: + Similarly, think about how you work with absolute paths. Could you leverage the working directory of your R project to replace these with relative paths and make your code more portable? Probably! + + ## Setting up R and RStudio on your Computer {#rstudio-personal-setup} ### Check your R Version @@ -123,8 +193,3 @@ update.packages(ask=FALSE) ``` - - - - - diff --git a/materials/sections/slack-instructions.qmd b/materials/sections/slack-instructions.qmd new file mode 100644 index 00000000..3f33a7ae --- /dev/null +++ b/materials/sections/slack-instructions.qmd @@ -0,0 +1,11 @@ +## Slack + +Slack is an instant messaging tool that connects people with the information they need. At NCEAS, we are big fans of Slack. It eases communication within a team, avoiding back-and-forth emailing. It is a much more straightforward way of communicating with each other, making it easier to collaborate. + +You all should be part of the NCEAS Slack space (or at least have been invited to join). 
And all of you, plus all the instructors, are part of the #delta-synthesis channel.

![](images/delta-synthesis-slack.png)

This is a discussion channel where you can contact instructors and colleagues with questions and share information relevant to the Delta Science Program / NCEAS data training and synthesis collaboration. During our first week of training, we will create independent channels for each synthesis group to facilitate communication within teams.

diff --git a/materials/sections/synthesis-questions.qmd b/materials/sections/synthesis-questions.qmd
new file mode 100644
index 00000000..c24cd134
--- /dev/null
+++ b/materials/sections/synthesis-questions.qmd
@@ -0,0 +1,4 @@

## Synthesis Development

This is a hands-on facilitated session to guide the development of synthesis questions.

diff --git a/materials/session_01.qmd b/materials/session_01.qmd
index c2ad1336..863018e8 100644
--- a/materials/session_01.qmd
+++ b/materials/session_01.qmd
@@ -1,9 +1,8 @@
---
title: "RStudio Server Setup"
title-block-banner: true
-format:
-  html:
-    code-overflow: wrap
+
---

-{{< include /sections/rstudio-server-setup.qmd >}}
\ No newline at end of file
+{{< include /sections/rstudio-server-setup.qmd >}}
+
diff --git a/materials/session_03.qmd b/materials/session_03.qmd
index 4ffa25e9..3ecb63a2 100644
--- a/materials/session_03.qmd
+++ b/materials/session_03.qmd
@@ -1,9 +1,8 @@
---
-title: "Intro to R Programming"
+title: "Literate Analysis with Quarto"
title-block-banner: true
-execute:
-  eval: false
---
-
+{{< include /sections/intro-r-programming.qmd >}}
+
+
{{< include /sections/r-intro-quarto.qmd >}}
diff --git a/materials/session_04.qmd b/materials/session_04.qmd
index 03baa5de..4f6305f9 100644
--- a/materials/session_04.qmd
+++ b/materials/session_04.qmd
@@ -1,7 +1,7 @@
---
-title: "Introduction to RMarkdown"
+title: "LEGO® Reproducibility Activity"
from: markdown+emoji
title-block-banner: true
---

-{{< include /sections/r-intro-rmarkdown.qmd >}}
\ No newline at end of file
+{{< include /sections/activity-reproducibility-lego.qmd >}}
\ No newline at end of file
diff --git a/materials/session_05.qmd b/materials/session_05.qmd
index b356eadd..c43290a9 100644
--- a/materials/session_05.qmd
+++ b/materials/session_05.qmd
@@ -1,6 +1,8 @@
---
-title: "FAIR and CARE Principles"
+title: "Data Management Lens for Publishing and Accessing Data"
title-block-banner: true
---

-{{< include /sections/fair-care-principles.qmd >}}
\ No newline at end of file
+
+
+{{< include /sections/accessing-and-publishing-data.qmd >}}
diff --git a/materials/session_06.qmd b/materials/session_06.qmd
index 38d3af59..0dd8f9eb 100644
--- a/materials/session_06.qmd
+++ b/materials/session_06.qmd
@@ -1,6 +1,6 @@
---
-title: "Intro to `Git` and GitHub"
+title: "Logic Model"
title-block-banner: true
---

-{{< include /sections/git-github-intro.qmd >}}
\ No newline at end of file
+{{< include /sections/logic-modeling.qmd >}}
\ No newline at end of file
diff --git a/materials/session_07.qmd b/materials/session_07.qmd
index 304429fe..982b8298 100644
--- a/materials/session_07.qmd
+++ b/materials/session_07.qmd
@@ -1,12 +1,10 @@
---
-title: "Cleaning & Wrangling Data"
-title-block-banner: true
+title: "Introduction to Git and GitHub"
+title-block-banner: true
format:
  html:
    code-overflow: wrap
    code-link: true
---

-{{< include /sections/clean-wrangle-data.qmd >}}
-
-
+{{< include /sections/git-github-intro.qmd >}}
diff --git a/materials/session_08.qmd b/materials/session_08.qmd
index 2152cc2a..b7aed988 100644
---
a/materials/session_08.qmd +++ b/materials/session_08.qmd @@ -1,6 +1,6 @@ --- -title: "Intro to Tidy Data" +title: "Reproducible Data Access" title-block-banner: true --- -{{< include /sections/intro-tidy-data.qmd >}} \ No newline at end of file +{{< include /sections/programmatic-data-access.qmd >}} diff --git a/materials/session_09.qmd b/materials/session_09.qmd index 2f3dcca1..f1406c2e 100644 --- a/materials/session_09.qmd +++ b/materials/session_09.qmd @@ -1,5 +1,5 @@ --- -title: "R Practice: Tidy Data & Joins" +title: "Data Modeling Essentials" title-block-banner: true execute: eval: false @@ -11,4 +11,6 @@ format: code-overflow: wrap --- -{{< include /sections/r-practice-tidy-data-joins.qmd >}} \ No newline at end of file + + +{{< include /sections/intro-tidy-data.qmd >}} \ No newline at end of file diff --git a/materials/session_10.qmd b/materials/session_10.qmd index afe2227b..61983f57 100644 --- a/materials/session_10.qmd +++ b/materials/session_10.qmd @@ -1,6 +1,6 @@ --- -title: "Collaborating using `Git` and GitHub & Merge Conflicts" +title: "Hands On Synthesis" title-block-banner: true --- -{{< include /sections/git-collab-merge-conflicts.qmd >}} + diff --git a/materials/session_11.qmd b/materials/session_11.qmd index a2df4a6e..5cd96880 100644 --- a/materials/session_11.qmd +++ b/materials/session_11.qmd @@ -1,6 +1,7 @@ --- -title: "Publishing to the Web using GitHub Pages" +title: "Cleaning and Wrangling Data" title-block-banner: true --- -{{< include /sections/git-github-publishing-analysis.qmd >}} \ No newline at end of file +{{< include /sections/clean-wrangle-data.qmd >}} + diff --git a/materials/session_12.qmd b/materials/session_12.qmd index a91f0add..a750f4b6 100644 --- a/materials/session_12.qmd +++ b/materials/session_12.qmd @@ -1,10 +1,18 @@ --- -title: "Intro to Data visualization" +title: "Practice: Cleaning and Wrangling Data" title-block-banner: true +execute: + eval: false format: - html: + html: + code-link: true + code-fold: true + code-summary: "Answer" code-overflow: wrap + --- -{{< include /sections/visualization-ggplot-leaflet.qmd >}} + +{{< include /sections/r-practice-clean-wrangle.qmd >}} + diff --git a/materials/session_13.qmd b/materials/session_13.qmd index 4009f5bc..2b966cb0 100644 --- a/materials/session_13.qmd +++ b/materials/session_13.qmd @@ -1,5 +1,5 @@ --- -title: "R Practice: Collaborating on, Wrangling & Visualizing Data" +title: "Hands on Synthesis" title-block-banner: true execute: eval: false @@ -11,4 +11,4 @@ format: code-overflow: wrap --- -{{< include /sections/r-practice-clean-wrangle-visualize.qmd >}} +{{< include /sections/synthesis-questions.qmd >}} diff --git a/materials/session_14.qmd b/materials/session_14.qmd index bd36870f..4805c2d2 100644 --- a/materials/session_14.qmd +++ b/materials/session_14.qmd @@ -1,6 +1,6 @@ --- -title: "Writing Data Management Plans, Metadata Best Practices & Publishing Data" +title: "Creating Functions in R" title-block-banner: true --- -{{< include /sections/combined-dmp-metadata-publishing.qmd >}} +{{< include /sections/r-creating-functions.qmd >}} diff --git a/materials/session_15.qmd b/materials/session_15.qmd index 2995d4f0..7382a8b6 100644 --- a/materials/session_15.qmd +++ b/materials/session_15.qmd @@ -1,5 +1,5 @@ --- -title: "Using `sf` for Spatial Data & Intro to Making Maps" +title: "Creating R Packages" title-block-banner: true format: html: @@ -7,4 +7,4 @@ format: code-overflow: wrap --- -{{< include /sections/geospatial-vector-analysis.qmd >}} \ No newline at end of file +{{< include 
/sections/r-creating-packages.qmd >}} \ No newline at end of file diff --git a/materials/session_16.qmd b/materials/session_16.qmd index 85e1c729..ace853ab 100644 --- a/materials/session_16.qmd +++ b/materials/session_16.qmd @@ -1,17 +1,9 @@ --- -title: "R Practice: Wrangling Spatial Data & Making Maps" +title: "FAIR and CARE principles" title-block-banner: true -execute: - eval: false -format: - html: - code-link: true - code-fold: true - code-summary: "Answer" - code-overflow: wrap --- -{{< include /sections/r-practice-sf-wrangle-maps.qmd >}} +{{< include /sections/fair-care-principles.qmd >}} diff --git a/materials/session_17.qmd b/materials/session_17.qmd index f0a96c2f..98bdce0b 100644 --- a/materials/session_17.qmd +++ b/materials/session_17.qmd @@ -1,8 +1,6 @@ --- -title: "Git Workflows: Pull Requests, Branches & Forks" +title: "Collaborating using Git & GitHub and Merge Conflicts" title-block-banner: true --- -{{< include /sections/git-workflows.qmd >}} - - +{{< include /sections/git-collab-merge-conflicts.qmd >}} diff --git a/materials/session_18.qmd b/materials/session_18.qmd index 8c3c3bb0..5b884f04 100644 --- a/materials/session_18.qmd +++ b/materials/session_18.qmd @@ -1,8 +1,8 @@ --- -title: "Reproducibility & Provenance" +title: "Hands on Synthesis" title-block-banner: true --- -{{< include /sections/provenance-reproducibility-datapaper.qmd >}} + diff --git a/materials/session_20.qmd b/materials/session_20.qmd new file mode 100644 index 00000000..1624b822 --- /dev/null +++ b/materials/session_20.qmd @@ -0,0 +1,6 @@ +--- +title: "Team Communication" +title-block-banner: true +--- + +{{< include /sections/slack-instructions.qmd >}} \ No newline at end of file