Skip to content

Commit

Permalink
vertical line, mark rule, color consistency improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
trevorcampbell committed Nov 21, 2023
1 parent 1cedf29 commit 21170d6
Show file tree
Hide file tree
Showing 4 changed files with 78 additions and 55 deletions.
21 changes: 11 additions & 10 deletions source/inference.md
Original file line number Diff line number Diff line change
Expand Up @@ -694,7 +694,7 @@ reliable—is there any way to improve the estimate? One way to improve a
point estimate is to take a *larger* sample. To illustrate what effect this
has, we will take many samples of size 20, 50, 100, and 500, and plot the
sampling distribution of the sample mean. We indicate the mean of the sampling
distribution with an orange vertical line.
distribution with a vertical line.

```{code-cell} ipython3
:tags: [remove-input]
Expand All @@ -721,10 +721,10 @@ glue(
alt.X("price", bin=alt.Bin(maxbins=30)),
alt.Y("count()")
),
base.mark_rule(color="#f58518", size=3).encode(
base.mark_rule(color="black", size=1.5, strokeDash=[6]).encode(
x="mean(price)"
),
base.mark_text(align="left", color="#f58518", size=12, fontWeight="bold", dx=10).transform_aggregate(
base.mark_text(align="left", color="black", size=12, fontWeight="bold", dx=10).transform_aggregate(
mean_price="mean(price)",
).transform_calculate(
label="'Mean = ' + round(datum.mean_price * 10) / 10"
Expand Down Expand Up @@ -755,7 +755,7 @@ glue(
:name: fig:11-example-means7
:figclass: caption-hack
Comparison of sampling distributions, with mean highlighted as a vertical orange line.
Comparison of sampling distributions, with mean highlighted as a vertical line.
```

+++
Expand Down Expand Up @@ -1154,17 +1154,17 @@ sampling_distribution.encoding.x["bin"]["extent"] = (90, 250)
alt.vconcat(
alt.layer(
sampling_distribution,
alt.Chart(sample_estimates).mark_rule(color="#f58518", size=2).encode(x="mean(mean_price)"),
alt.Chart(sample_estimates).mark_text(color="#f58518", size=12, align="left", dx=16, fontWeight="bold").encode(
alt.Chart(sample_estimates).mark_rule(color="black", size=1.5, strokeDash=[6]).encode(x="mean(mean_price)"),
alt.Chart(sample_estimates).mark_text(color="black", size=12, align="left", dx=16, fontWeight="bold").encode(
x="mean(mean_price)",
y=alt.value(7),
text=alt.value(f"Mean = {sampling_distribution['data']['mean_price'].mean().round(1)}")
)
).properties(title="Sampling distribution", height=150),
alt.layer(
boot_est_dist,
alt.Chart(boot20000_means).mark_rule(color="#f58518", size=2).encode(x="mean(mean_price)"),
alt.Chart(boot20000_means).mark_text(color="#f58518", size=12, align="left", dx=18, fontWeight="bold").encode(
alt.Chart(boot20000_means).mark_rule(color="black", size=1.5, strokeDash=[6]).encode(x="mean(mean_price)"),
alt.Chart(boot20000_means).mark_text(color="black", size=12, align="left", dx=18, fontWeight="bold").encode(
x="mean(mean_price)",
y=alt.value(7),
text=alt.value(f"Mean = {boot_est_dist['data']['mean_price'].mean().round(1)}")
Expand Down Expand Up @@ -1275,14 +1275,15 @@ the middle 95\% of the sample mean prices in the bootstrap distribution. We can
visualize the interval on our distribution in {numref}`fig:11-bootstrapping9`.

```{code-cell} ipython3
:tags: [remove-input]
# Create the annotation for the 2.5th percentile
rule_025 = alt.Chart().mark_rule(color="#f58518", size=3, strokeDash=[5]).encode(
rule_025 = alt.Chart().mark_rule(color="black", size=1.5, strokeDash=[6]).encode(
x=alt.datum(ci_bounds[0.025])
).properties(
width=500
)
text_025 = rule_025.mark_text(
color="#f58518",
color="black",
size=12,
fontWeight="bold",
dy=-160
Expand Down
14 changes: 7 additions & 7 deletions source/regression1.md
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,7 @@ the sale price?
```{code-cell} ipython3
:tags: [remove-output]
small_plot = alt.Chart(small_sacramento).mark_circle().encode(
small_plot = alt.Chart(small_sacramento).mark_circle(opacity=1).encode(
x=alt.X("sqft")
.scale(zero=False)
.title("House size (square feet)"),
Expand All @@ -268,7 +268,7 @@ small_plot = alt.Chart(small_sacramento).mark_circle().encode(
# add an overlay to the base plot
line_df = pd.DataFrame({"x": [2000]})
rule = alt.Chart(line_df).mark_rule(strokeDash=[2, 4]).encode(x="x")
rule = alt.Chart(line_df).mark_rule(strokeDash=[6], size=1.5, color="black").encode(x="x")
small_plot + rule
```
Expand Down Expand Up @@ -315,7 +315,7 @@ for i in range(5):
"sqft": [nearest_neighbors.iloc[i, 4], 2000],
"price": [nearest_neighbors.iloc[i, 6]] * 2
})
h_lines.append(alt.Chart(h_line_df).mark_line(color="orange").encode(x="sqft", y="price"))
h_lines.append(alt.Chart(h_line_df).mark_line(color="black").encode(x="sqft", y="price"))
nn_plot = alt.layer(*h_lines, small_plot, rule)
```
Expand Down Expand Up @@ -352,7 +352,7 @@ prediction
nn_plot_pred = nn_plot + alt.Chart(
pd.DataFrame({"sqft": [2000], "price": [prediction]})
).mark_circle(size=40).encode(x="sqft", y="price", color=alt.value("red"))
).mark_circle(size=80, opacity=1, color="#d62728").encode(x="sqft", y="price")
```

```{code-cell} ipython3
Expand Down Expand Up @@ -493,15 +493,15 @@ sacr_new_preds_hid = pd.concat(
sacr_new_preds_melted_df = sacr_new_preds_hid.melt(id_vars=["sqft"])
errors_plot = (
small_plot
+ alt.Chart(sacr_full_preds_hid).mark_line().encode(x="sqft", y="predicted")
+ alt.Chart(sacr_full_preds_hid).mark_line(color="#ff7f0e").encode(x="sqft", y="predicted")
+ alt.Chart(sacr_new_preds_hid)
.mark_circle(opacity=1)
.encode(x="sqft", y="price")
)
v_lines = []
for i in pts["sqft"]:
line_df = sacr_new_preds_melted_df.query("sqft == @i")
v_lines.append(alt.Chart(line_df).mark_line(color="red").encode(x="sqft", y="value"))
v_lines.append(alt.Chart(line_df).mark_line(color="black").encode(x="sqft", y="value"))
errors_plot = alt.layer(*v_lines, errors_plot)
errors_plot
Expand All @@ -516,7 +516,7 @@ glue("fig:07-verticalerrors", errors_plot, display=False)
:::{glue:figure} fig:07-verticalerrors
:name: fig:07-verticalerrors

Scatter plot of price (USD) versus house size (square feet) with example predictions (blue line) and the error in those predictions compared with true response values for three selected observations (vertical red lines).
Scatter plot of price (USD) versus house size (square feet) with example predictions (orange line) and the error in those predictions compared with true response values for three selected observations (vertical lines).
:::

+++
Expand Down
94 changes: 58 additions & 36 deletions source/regression2.md
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ small_sacramento = sacramento.sample(n=30)
small_plot = (
alt.Chart(small_sacramento)
.mark_circle()
.mark_circle(opacity=1)
.encode(
x=alt.X("sqft")
.scale(zero=False)
Expand All @@ -129,7 +129,50 @@ small_plot = (
)
)
small_plot += small_plot.transform_regression("sqft", "price").mark_line()
# create df_lines with one fake/empty line (for starting at 2nd color later)
df_lines = {"x": [500, 500], "y": [100000, 100000], "number": ["-1", "-1"]}
# set the domains (range of x values) of lines
min_x = small_sacramento["sqft"].min()
max_x = small_sacramento["sqft"].max()
# add the line of best fit
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
lm.fit(small_sacramento[["sqft"]], small_sacramento[["price"]])
pred_min = float(lm.predict(pd.DataFrame({"sqft": [min_x]})))
pred_max = float(lm.predict(pd.DataFrame({"sqft": [max_x]})))
df_lines["x"].extend([min_x, max_x])
df_lines["y"].extend([pred_min, pred_max])
df_lines["number"].extend(["0", "0"])
# add other similar looking lines
intercept_l = [-64542.23, -6900, -64542.23]
slope_l = [190, 175, 160]
for i in range(len(slope_l)):
df_lines["x"].extend([min_x, max_x])
df_lines["y"].extend([
intercept_l[i] + slope_l[i] * min_x,
intercept_l[i] + slope_l[i] * max_x,
])
df_lines["number"].extend([f"{i+1}", f"{i+1}"])
df_lines = pd.DataFrame(df_lines)
# plot the bogus line to skip the same color as the scatter
small_plot += alt.Chart(
df_lines[df_lines["number"] == "-1"]
).mark_line().encode(
x="x", y="y", color=alt.Color("number", legend=None)
)
# plot the real line with 2nd color
small_plot += alt.Chart(
df_lines[df_lines["number"] == "0"]
).mark_line().encode(
x="x", y="y", color=alt.Color("number", legend=None)
)
small_plot
```
Expand Down Expand Up @@ -189,11 +232,11 @@ prediction = float(lm.predict(pd.DataFrame({"sqft": [2000]})))
# the vertical dotted line
line_df = pd.DataFrame({"x": [2000]})
rule = alt.Chart(line_df).mark_rule(strokeDash=[2, 4]).encode(x="x")
rule = alt.Chart(line_df).mark_rule(strokeDash=[6], size=1.5).encode(x="x")
# the red point
point_df = pd.DataFrame({"x": [2000], "y": [prediction]})
point = alt.Chart(point_df).mark_circle(color="red", size=100).encode(x="x", y="y")
point = alt.Chart(point_df).mark_circle(color="red", size=80, opacity=1).encode(x="x", y="y")
# overlay all plots
small_plot_2000_pred = (
Expand All @@ -204,7 +247,7 @@ small_plot_2000_pred = (
+ alt.Chart(
pd.DataFrame(
{
"x": [2350],
"x": [2450],
"y": [prediction - 41000],
"prediction": ["$" + "{0:,.0f}".format(prediction)],
}
Expand Down Expand Up @@ -242,32 +285,11 @@ Some plausible examples are shown in {numref}`fig:08-several-lines`.
```{code-cell} ipython3
:tags: [remove-cell]
intercept_l = [-64542.23, -6900, -64542.23]
slope_l = [190, 175, 160]
line_color_l = ["green", "purple", "red"]
# set the domains (range of x values) of lines
min_x = small_sacramento["sqft"].min()
max_x = small_sacramento["sqft"].max()
several_lines_plot = small_plot.copy()
for i in range(len(slope_l)):
several_lines_plot += (
alt.Chart(
pd.DataFrame(
{
"x": [min_x, max_x],
"y": [
intercept_l[i] + slope_l[i] * min_x,
intercept_l[i] + slope_l[i] * max_x,
],
}
)
)
.mark_line(color=line_color_l[i])
.encode(x="x", y="y")
)
several_lines_plot += alt.Chart(
df_lines[df_lines["number"] != "0"]
).mark_line().encode(x="x", y="y", color=alt.Color("number",legend=None))
several_lines_plot
```
Expand All @@ -292,7 +314,7 @@ Scatter plot of sale price versus size with many possible lines that could be dr
Simple linear regression chooses the straight line of best fit by choosing
the line that minimizes the **average squared vertical distance** between itself and
each of the observed data points in the training data. {numref}`fig:08-verticalDistToMin` illustrates
these vertical distances as red lines. Finally, to assess the predictive
these vertical distances as lines. Finally, to assess the predictive
accuracy of a simple linear regression model,
we use RMSPE—the same measure of predictive performance we used with K-NN regression.

Expand All @@ -313,7 +335,7 @@ v_lines = []
for i in range(len(small_sacramento)):
sqft_val = small_sacramento.iloc[i]["sqft"]
line_df = small_sacramento_pred.query("sqft == @sqft_val")
v_lines.append(alt.Chart(line_df).mark_line(color="red").encode(x="sqft", y="value"))
v_lines.append(alt.Chart(line_df).mark_line(color="black").encode(x="sqft", y="value"))
error_plot = alt.layer(*v_lines, small_plot).configure_circle(opacity=1)
error_plot
Expand All @@ -328,7 +350,7 @@ glue("fig:08-verticalDistToMin", error_plot)
:::{glue:figure} fig:08-verticalDistToMin
:name: fig:08-verticalDistToMin

Scatter plot of sale price versus size with red lines denoting the vertical distances between the predicted values and the observed data points.
Scatter plot of sale price versus size with lines denoting the vertical distances between the predicted values and the observed data points.
:::

+++
Expand Down Expand Up @@ -482,7 +504,7 @@ so that we can qualitatively assess if the model seems to fit the data well.
sqft_prediction_grid = sacramento[["sqft"]].agg(["min", "max"])
sqft_prediction_grid["predicted"] = lm.predict(sqft_prediction_grid)
all_points = alt.Chart(sacramento).mark_circle(opacity=0.4).encode(
all_points = alt.Chart(sacramento).mark_circle().encode(
x=alt.X("sqft")
.scale(zero=False)
.title("House size (square feet)"),
Expand Down Expand Up @@ -966,7 +988,7 @@ lm_plot_outlier += lm_plot_outlier.transform_regression("sqft", "price").mark_li
outlier_pt = (
alt.Chart(sacramento_outlier)
.mark_circle(color="red", size=100)
.mark_circle(color="#d62728", size=100)
.encode(x="sqft", y="price")
)
Expand All @@ -987,7 +1009,7 @@ outlier_line = (
)
)
.transform_regression("sqft", "price")
.mark_line(color="red")
.mark_line(color="#d62728")
)
lm_plot_outlier += outlier_pt + outlier_line
Expand Down Expand Up @@ -1051,7 +1073,7 @@ outlier_line = (
)
)
.transform_regression("sqft", "price")
.mark_line(color="red")
.mark_line(color="#d62728")
)
lm_plot_outlier_large += outlier_pt + outlier_line
Expand Down
4 changes: 2 additions & 2 deletions source/viz.md
Original file line number Diff line number Diff line change
Expand Up @@ -1474,7 +1474,7 @@ so we are including it here already.
```

```{code-cell} ipython3
v_line = alt.Chart(morley_df).mark_rule(strokeDash=[5], size=2).encode(
v_line = alt.Chart(morley_df).mark_rule(strokeDash=[6], size=1.5).encode(
x=alt.datum(792.458)
)
Expand Down Expand Up @@ -1672,7 +1672,7 @@ morley_hist_rel = alt.Chart(morley_df).mark_bar().encode(
)
# Recreating v_line to indicate that the speed of light is at 0% relative error
v_line = alt.Chart(morley_df).mark_rule(strokeDash=[5], size=2).encode(
v_line = alt.Chart(morley_df).mark_rule(strokeDash=[6], size=1.5).encode(
x=alt.datum(0)
)
Expand Down

0 comments on commit 21170d6

Please sign in to comment.