Graphic design updates #317

Merged: 18 commits, Dec 20, 2023
13 changes: 13 additions & 0 deletions build_pdf.sh
@@ -1,2 +1,15 @@
# Script to generate PDF book

+# Back up the original source/index.md
+cp source/index.md index_backup.md
+
+# The PDF book doesn't need the welcome page, and I couldn't find a way to stop jupyter-book from including it,
+# so this script manually removes the welcome page before building the PDF. This is a bit painful, but it works...
+sed -n -i "/graphic/q;p" source/index.md
+echo "# Data Science: A First Introduction" >> source/index.md
+
+chmod -R o+w source/
+docker run --rm -v $(pwd):/home/jovyan ubcdsci/py-intro-to-ds:20231112004031dd2207 /bin/bash -c "export BOOK_BUILD_TYPE='PDF'; jupyter-book build source --builder pdflatex"
+
+# Restore the backed-up full source/index.md
+mv index_backup.md source/index.md
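The `sed` invocation in the script is terse. A minimal sketch of the same trick on a throwaway file (GNU sed assumed; the file name and contents here are made up): with `-n`, `p` prints each line until the first line matching the pattern, then `q` quits before printing it, truncating the file at the marker line.

```shell
# Create a throwaway stand-in for source/index.md
printf 'intro\nwelcome graphic\nrest of welcome page\n' > demo_index.md

# Keep only the lines before the first line matching /graphic/
sed -n -i "/graphic/q;p" demo_index.md

cat demo_index.md   # only "intro" remains
```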
2 changes: 1 addition & 1 deletion source/_config.yml
@@ -1,6 +1,6 @@
# Book settings
title: "Data Science: A First Introduction (Python Edition)"
-author: "The DSCI100 Development Team"
+author: "Tiffany Timbers, Trevor Campbell, Melissa Lee, Joel Ostblom, and Lindsey Heagy"
copyright: "2022" # Copyright year to be placed in the footer
logo: "" # A path to the book logo
# Patterns to skip when building the book. Can be glob-style (e.g. "*skip.ipynb")
2 changes: 1 addition & 1 deletion source/_toc.yml
@@ -4,7 +4,7 @@ parts:
- caption: Front Matter
chapters:
- file: preface-text.md
-#- file: foreword.md
+- file: foreword-text.md
- file: acknowledgements.md
- file: authors.md
- caption: Chapters
14 changes: 7 additions & 7 deletions source/classification2.md
@@ -89,7 +89,7 @@ when predicting whether a patient's tumor is malignant or benign!

+++

-```{figure} img/classification2/training_test.jpeg
+```{figure} img/classification2/training_test.png
:name: fig:06-training-test

Splitting the data into training and testing sets.
@@ -1301,7 +1301,7 @@ Here we are using the shortcut `point=True` to layer a point and line chart.
accuracy_vs_k = alt.Chart(accuracies_grid).mark_line(point=True).encode(
x=alt.X("n_neighbors").title("Neighbors"),
y=alt.Y("mean_test_score")
-.scale(domain=(0.85, 0.90))
+.scale(zero=False)
.title("Accuracy estimate")
)

@@ -1388,7 +1388,7 @@ large_accuracies_grid = pd.DataFrame(large_cancer_tune_grid.cv_results_)
large_accuracy_vs_k = alt.Chart(large_accuracies_grid).mark_line(point=True).encode(
x=alt.X("param_kneighborsclassifier__n_neighbors").title("Neighbors"),
y=alt.Y("mean_test_score")
-.scale(domain=(0.60, 0.90))
+.scale(zero=False)
.title("Accuracy estimate")
)

@@ -1664,7 +1664,7 @@ estimate its accuracy. The overall process is summarized in

+++

-```{figure} img/classification2/train-test-overview.jpeg
+```{figure} img/classification2/train-test-overview.png
:name: fig:06-overview

Overview of K-NN classification.
@@ -1836,7 +1836,7 @@ plt_irrelevant_accuracies = (
y=alt.Y(
"accs",
title="Model Accuracy Estimate",
-scale=alt.Scale(domain=(0.80, 0.95)),
+scale=alt.Scale(zero=False),
),
)
)
@@ -1899,7 +1899,7 @@ plt_irrelevant_nghbrs_fixed = (
x=alt.X("ks", title="Number of Irrelevant Predictors"),
y=alt.Y(
"Accuracy",
-scale=alt.Scale(domain=(0.75, 0.95)),
+scale=alt.Scale(zero=False),
),
color=alt.Color("Type"),
)
@@ -2140,7 +2140,7 @@ fwd_sel_accuracies_plot = (
y=alt.Y(
"accuracy",
title="Estimated Accuracy",
-scale=alt.Scale(domain=(0.89, 0.935)),
+scale=alt.Scale(zero=False),
),
)
)
13 changes: 7 additions & 6 deletions source/clustering.md
@@ -352,7 +352,7 @@ toy_example_clus1_center = alt.layer(
x=alt.X("flipper_length_standardized"),
y=alt.Y("bill_length_standardized")
),
-alt.Chart(clus).mark_circle(color='coral', size=500, opacity=1).encode(
+alt.Chart(clus).mark_circle(color='steelblue', size=300, opacity=1, stroke='black').encode(
x=alt.X("mean(flipper_length_standardized)")
.scale(zero=False, padding=20)
.title("Flipper Length (standardized)"),
@@ -373,7 +373,7 @@ in {numref}`toy-example-clus1-center`
:figwidth: 700px
:name: toy-example-clus1-center

-Cluster 0 from the `penguins_standardized` data set example. Observations are in blue, with the cluster center highlighted in orange.
+Cluster 0 from the `penguins_standardized` data set example. Observations are small blue points, with the cluster center highlighted as a large blue point with a black outline.
:::

```{code-cell} ipython3
@@ -417,7 +417,7 @@ These distances are denoted by lines in {numref}`toy-example-clus1-dists` for th
:figwidth: 700px
:name: toy-example-clus1-dists

-Cluster 0 from the `penguins_standardized` data set example. Observations are in blue, with the cluster center highlighted in orange. The distances from the observations to the cluster center are represented as black lines.
+Cluster 0 from the `penguins_standardized` data set example. Observations are small blue points, with the cluster center highlighted as a large blue point with a black outline. The distances from the observations to the cluster center are represented as black lines.
:::

```{code-cell} ipython3
@@ -440,14 +440,15 @@ toy_example_all_clus_dists = alt.layer(
alt.Y("bill_length_standardized"),
alt.Color('cluster:N')
),
-alt.Chart(penguins_clustered).mark_circle(color='coral', size=200, opacity=1).encode(
+alt.Chart(penguins_clustered).mark_circle(size=200, opacity=1, stroke="black").encode(
alt.X("mean(flipper_length_standardized)")
.scale(zero=False)
.title("Flipper Length (standardized)"),
alt.Y("mean(bill_length_standardized)")
.scale(zero=False)
.title("Bill Length (standardized)"),
-alt.Detail('cluster:N')
+alt.Detail('cluster:N'),
+alt.Color('cluster:N')
)
)
glue('toy-example-all-clus-dists', toy_example_all_clus_dists, display=True)
@@ -468,7 +469,7 @@ These distances are denoted by black lines in
:figwidth: 700px
:name: toy-example-all-clus-dists

-All clusters from the `penguins_standardized` data set example. Observations are in blue, orange, and red with the cluster center highlighted in orange. The distances from the observations to each of the respective cluster centers are represented as black lines.
+All clusters from the `penguins_standardized` data set example. Observations are small orange, blue, and yellow points with cluster centers denoted by larger points with a black outline. The distances from the observations to each of the respective cluster centers are represented as black lines.
:::

Since K-means uses the straight-line distance to measure the quality of a clustering,
13 changes: 7 additions & 6 deletions source/foreword-text.md
100755 → 100644
@@ -13,13 +13,13 @@ kernelspec:
name: python3
---

-# Foreword -- TBD
+# Foreword

*Roger D. Peng*

*Johns Hopkins Bloomberg School of Public Health*

-*2022-01-04*
+*2023-11-30*

The field of data science has expanded and grown significantly in recent years,
attracting excitement and interest from many different directions. The demand for introductory
@@ -44,9 +44,10 @@ is and what the implications are for the activities in which members of the fiel

The first important concept addressed by this book is tidy data, which is a format for
tabular data formally introduced to the statistical community in a 2014 paper by Hadley
-Wickham. The tidy data organization strategy has proven a powerful abstract concept for
-conducting data analysis, in large part because of the vast toolchain implemented in the
-Tidyverse collection of R packages. The second key concept is the development of workflows
+Wickham. Although originally popularized within the R programming language community
+via the Tidyverse package collection, the tidy data format is a language-independent concept
+that facilitates the application of powerful generalized data cleaning and wrangling tools.
+The second key concept is the development of workflows
for reproducible and auditable data analyses. Modern data analyses have only grown in
complexity due to the availability of data and the ease with which we can implement complex
data analysis procedures. Furthermore, these data analyses are often part of
@@ -61,7 +62,7 @@ collaboration is a core element of data science.
This book takes these core concepts and focuses on how one can apply them to *do* data
science in a rigorous manner. Students who learn from this book will be well-versed in
the techniques and principles behind producing reliable evidence from data. This book is
-centered around the use of the R programming language within the tidy data framework,
+centered around the implementation of the tidy data framework within the Python programming language,
and as such employs the most recent advances in data analysis coding. The use of Jupyter
notebooks for exercises immediately places the student in an environment that encourages
auditability and reproducibility of analyses. The integration of git and GitHub into the
1,707 changes: 1,707 additions & 0 deletions source/img/classification2/ML-paradigm-test.ai


Binary file modified source/img/classification2/ML-paradigm-test.png
100755 → 100644
3,059 changes: 3,059 additions & 0 deletions source/img/classification2/cv.ai


Binary file modified source/img/classification2/cv.png
100755 → 100644