diff --git a/DESCRIPTION b/DESCRIPTION index 7fafc38..d73ae42 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -2,7 +2,7 @@ Package: WVPlots Type: Package Title: Common Plots for Analysis Version: 1.2.1 -Date: 2019-10-04 +Date: 2019-10-09 Authors@R: c( person("John", "Mount", email = "jmount@win-vector.com", role = c("aut", "cre")), person("Nina", "Zumel", email = "nzumel@win-vector.com", role = c("aut")), diff --git a/NEWS.md b/NEWS.md index b76a7ad..d70f355 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,7 +1,8 @@ -# WVPlots 1.2.1 2019-10-04 +# WVPlots 1.2.1 2019-10-09 * Fix PlotDistCountBinomial. + * More plots, adjust plots. # WVPlots 1.2.0 2019-10-03 diff --git a/docs/articles/WVPlots_concept.html b/docs/articles/WVPlots_concept.html index 59def5c..6e87a03 100644 --- a/docs/articles/WVPlots_concept.html +++ b/docs/articles/WVPlots_concept.html @@ -85,7 +85,7 @@
vignettes/WVPlots_concept.Rmd
WVPlots_concept.Rmd
vignettes/WVPlots_examples.Rmd
WVPlots_examples.Rmd
Plots precision and recall as functions of different classifier thresholds.
+ +PRTPlot()
can also plot sensitivity, specificity, and false positive rate as a function of threshold. One application for this is to “unroll” an ROC Plot to explicitly match thresholds to given achievable combinations of false positive rate (AKA (1 - specificity)) and sensitivity (AKA recall or true positive rate). Compare the below graph with the ROC plot for frm
, above.
set.seed(34903490)
-
-# discrete variable: letters of the alphabet
-# frequencies of letters in English
-# source: http://en.algoritmy.net/article/40379/Letter-frequency-English
-letterFreqs = c(8.167, 1.492, 2.782, 4.253, 12.702, 2.228,
- 2.015, 6.094, 6.966, 0.153, 0.772, 4.025, 2.406, 6.749, 7.507, 1.929,
- 0.095, 5.987, 6.327, 9.056, 2.758, 0.978, 2.360, 0.150, 1.974, 0.074)
-letterFreqs = letterFreqs/100
-letterFrame = data.frame(letter = letters, freq=letterFreqs)
-
-# now let's generate letters according to their letter frequencies
-N = 1000
-randomDraws = data.frame(draw=1:N, letter=sample(letterFrame$letter, size=N, replace=TRUE, prob=letterFrame$freq))
-
-WVPlots::ClevelandDotPlot(randomDraws, "letter", title = "Example Cleveland-style dot plot")
WVPlots::ClevelandDotPlot(randomDraws, "letter", limit_n = 10, title = "Top 10 most frequent letters")
WVPlots::ClevelandDotPlot(randomDraws, "letter", sort=0, title="Example Cleveland-style dot plot, unsorted")
WVPlots::ClevelandDotPlot(randomDraws, "letter", sort=1, stem=FALSE, title="Example with increasing sort order + coord_flip, no stem") + ggplot2::coord_flip()
ClevelandDotPlot
also accepts an integral x variable. You probably want sort = 0
in this case.
set.seed(34903490)
-N = 1000
-ncar_vec = 0:5
-prob = c(1.5, 3, 3.5, 2, 1, 0.75); prob = prob/sum(prob)
-
-df = data.frame(num_cars = sample(ncar_vec, size = N, replace = TRUE, prob=prob))
-WVPlots::ClevelandDotPlot(df, "num_cars", sort = 0, title = "Distribution of household vehicle ownership")
set.seed(34903490)
+
+# discrete variable: letters of the alphabet
+# frequencies of letters in English
+# source: http://en.algoritmy.net/article/40379/Letter-frequency-English
+letterFreqs = c(8.167, 1.492, 2.782, 4.253, 12.702, 2.228,
+ 2.015, 6.094, 6.966, 0.153, 0.772, 4.025, 2.406, 6.749, 7.507, 1.929,
+ 0.095, 5.987, 6.327, 9.056, 2.758, 0.978, 2.360, 0.150, 1.974, 0.074)
+letterFreqs = letterFreqs/100
+letterFrame = data.frame(letter = letters, freq=letterFreqs)
+
+# now let's generate letters according to their letter frequencies
+N = 1000
+randomDraws = data.frame(draw=1:N, letter=sample(letterFrame$letter, size=N, replace=TRUE, prob=letterFrame$freq))
+
+WVPlots::ClevelandDotPlot(randomDraws, "letter", title = "Example Cleveland-style dot plot")
WVPlots::ClevelandDotPlot(randomDraws, "letter", limit_n = 10, title = "Top 10 most frequent letters")
WVPlots::ClevelandDotPlot(randomDraws, "letter", sort=0, title="Example Cleveland-style dot plot, unsorted")
WVPlots::ClevelandDotPlot(randomDraws, "letter", sort=1, stem=FALSE, title="Example with increasing sort order + coord_flip, no stem") + ggplot2::coord_flip()
ClevelandDotPlot
also accepts an integral x variable. You probably want sort = 0
in this case.
Plot a bar chart of row counts conditioned on the categorical variable condvar
, faceted on a second categorical variable, refinevar
. Each faceted plot also shows a “shadow plot” of the totals conditioned on condvar
alone.
This plot enables comparisons of sub-population totals across both condvar
and refinevar
simultaneously.
set.seed(354534)
-N = 100
-
-# rough proportions of eye colors
-eprobs = c(0.37, 0.36, 0.16, 0.11)
-
-eye_color = sample(c("Brown", "Blue", "Hazel", "Green"), size = N, replace = TRUE, prob = eprobs)
-sex = sample(c("Male", "Female"), size = N, replace = TRUE)
-
-# A data frame of eye color by sex
-dframe = data.frame(eye_color = eye_color, sex = sex)
-
-WVPlots::ShadowPlot(dframe, "eye_color", "sex", title = "Shadow plot of eye colors by sex")
set.seed(354534)
+N = 100
+
+# rough proportions of eye colors
+eprobs = c(0.37, 0.36, 0.16, 0.11)
+
+eye_color = sample(c("Brown", "Blue", "Hazel", "Green"), size = N, replace = TRUE, prob = eprobs)
+sex = sample(c("Male", "Female"), size = N, replace = TRUE)
+
+# A data frame of eye color by sex
+dframe = data.frame(eye_color = eye_color, sex = sex)
+
+WVPlots::ShadowPlot(dframe, "eye_color", "sex", title = "Shadow plot of eye colors by sex")
Plot a histogram of a continuous variable xvar
, faceted on a categorical conditioning variable, condvar
. Each faceted plot also shows a “shadow plot” of the unconditioned histogram for comparison.
set.seed(354534)
-N = 100
-
-dframe = data.frame(x = rnorm(N), gp = "region 2", stringsAsFactors = FALSE)
-dframe$gp = with(dframe, ifelse(x < -0.5, "region 1",
- ifelse(x > 0.5, "region 3", gp)))
-
-WVPlots::ShadowHist(dframe, "x", "gp", title = "X values by region")
ShadowHist
uses the Brewer Dark2 palette by default. You can pass in another Brewer palette to change the color scheme. If you prefer all the histograms to be the same color, set monochrome=TRUE
.
set.seed(354534)
+N = 100
+
+dframe = data.frame(x = rnorm(N), gp = "region 2", stringsAsFactors = FALSE)
+dframe$gp = with(dframe, ifelse(x < -0.5, "region 1",
+ ifelse(x > 0.5, "region 3", gp)))
+
+WVPlots::ShadowHist(dframe, "x", "gp", title = "X values by region")
To use a non-Brewer palette, such as viridis, or a manual color map, set palette=NULL
. Here’s an example of setting the color palette manually.
colormap = c("#1F968BFF", "#29AF7FFF", "#55C667FF")
-
-WVPlots::ShadowHist(dframe, "x", "gp", title = "X values by region", palette=NULL) +
- ggplot2::scale_fill_manual(values=colormap)
ShadowHist
uses the Brewer Dark2 palette by default. You can pass in another Brewer palette to change the color scheme. If you prefer all the histograms to be the same color, set monochrome=TRUE
.
To use a non-Brewer palette, such as viridis, or a manual color map, set palette=NULL
. Here’s an example of setting the color palette manually.
classes = c("a", "b", "c")
-means = c(2, 4, 3)
-names(means) = classes
-label = sample(classes, size=1000, replace=TRUE)
-meas = means[label] + rnorm(1000)
-frm2 = data.frame(label=label,
- meas = meas)
-
-WVPlots::ScatterBoxPlot(frm2, "label", "meas", pt_alpha=0.2, title="Example Scatter/Box plot")
classes = c("a", "b", "c")
+means = c(2, 4, 3)
+names(means) = classes
+label = sample(classes, size=1000, replace=TRUE)
+meas = means[label] + rnorm(1000)
+frm2 = data.frame(label=label,
+ meas = meas)
+
+WVPlots::ScatterBoxPlot(frm2, "label", "meas", pt_alpha=0.2, title="Example Scatter/Box plot")
Compare to a binomial with the same success rate as the observed data
+set.seed(13951)
+trial_size = 20 # one trial is 20 flips
+ntrial = 100 # run 100 trials
+true_frate = 0.4 # true heads probability
+fdata = data.frame(n_heads = rbinom(ntrial, trial_size, true_frate))
+
+title = paste("Distribution of head counts, trial size =", trial_size)
+# compare to empirical p
+WVPlots::PlotDistCountBinomial(fdata, "n_heads", trial_size, title)
Compare to a binomial with a specified success rate
+ + +set.seed(349521)
+N = 100 # number of cohorts
+psucc = 0.15 # true success rate in population
+group_size = round(runif(N, min=25, 50)) # sizes of observed sample groups
+nsucc = rbinom(N, group_size, psucc) # successes in each group
+hdata = data.frame(n_success=nsucc, group_size=group_size)
+
+# observed rate of successes in each group
+hdata$rate_success = with(hdata, n_success/group_size)
+
+title = "Observed prevalence of success in population"
+
+WVPlots::PlotDistHistBeta(hdata, "rate_success", title)
y = c(1,2,3,4,5,10,15,18,20,25)
-x = seq_len(length(y))
-df = data.frame(x=x,y=y)
-
-WVPlots::ConditionalSmoothedScatterPlot(df, "x", "y", NULL, title="centered smooth, one group")
WVPlots::ConditionalSmoothedScatterPlot(df, "x", "y", NULL, title="left smooth, one group", align="left")
WVPlots::ConditionalSmoothedScatterPlot(df, "x", "y", NULL, title="right smooth, one group", align="right")
n = length(x)
-df = rbind(data.frame(x=x, y=y+rnorm(n), gp="times 1"),
- data.frame(x=x, y=0.5*y + rnorm(n), gp="times 1/2"),
- data.frame(x=x, y=2*y + rnorm(n), gp="times 2"))
-
-WVPlots::ConditionalSmoothedScatterPlot(df, "x", "y", "gp", title="centered smooth, multigroup")
WVPlots::ConditionalSmoothedScatterPlot(df, "x", "y", "gp", title="left smooth, multigroup", align="left")
WVPlots::ConditionalSmoothedScatterPlot(df, "x", "y", "gp", title="right smooth, multigroup", align="right")
y = c(1,2,3,4,5,10,15,18,20,25)
+x = seq_len(length(y))
+df = data.frame(x=x,y=y)
+
+WVPlots::ConditionalSmoothedScatterPlot(df, "x", "y", NULL, title="centered smooth, one group")
WVPlots::ConditionalSmoothedScatterPlot(df, "x", "y", NULL, title="left smooth, one group", align="left")
WVPlots::ConditionalSmoothedScatterPlot(df, "x", "y", NULL, title="right smooth, one group", align="right")
n = length(x)
+df = rbind(data.frame(x=x, y=y+rnorm(n), gp="times 1"),
+ data.frame(x=x, y=0.5*y + rnorm(n), gp="times 1/2"),
+ data.frame(x=x, y=2*y + rnorm(n), gp="times 2"))
+
+WVPlots::ConditionalSmoothedScatterPlot(df, "x", "y", "gp", title="centered smooth, multigroup")
set.seed(52523)
-d = data.frame(meas=rnorm(100))
-threshold = -1.5
-WVPlots::ShadedDensity(d, "meas", threshold,
- title="Example shaded density plot, left tail")
WVPlots::ShadedDensity(d, "meas", -threshold, tail="right",
- title="Example shaded density plot, right tail")
NEWS.md
Plot Precision-Recall or Enrichment-Recall as a function of threshold.
+Plot classifier performance metrics as a function of threshold.
PRTPlot(frame, xvar, truthVar, truthTarget, title, ..., +PRTPlot(frame, predVar, truthVar, truthTarget, title, ..., plotvars = c("precision", "recall"), thresholdrange = c(-Inf, Inf), linecolor = "black")@@ -141,12 +141,12 @@Arg
data frame to get values from
- xvar -+ name of the independent (input or model) column in frame
predVar +name of the column of predicted scores
truthVar -+ name of the dependent (output or result to be modeled) column in frame
name of the column of actual outcomes in frame
truthTarget @@ -162,7 +162,7 @@Arg
plotvars -+ variables to plot, must be at least one of "precision", "recall" and "enrichment". Defaults to c("precision", "recall")
variables to plot, must be at least one of the measures listed below. Defaults to c("precision", "recall")
thresholdrange @@ -182,20 +182,34 @@Details the average rate of positives. Plotting precision-recall or enrichment-recall as a function of classifier score helps identify a score threshold that achieves an acceptable tradeoff between precision and recall, or enrichment and recall. +
In addition to precision/recall,
+PRTPlot
can plot a number of other metrics:+
+ +- +
precision: fraction of predicted positives that are true positives
- +
recall: fraction of true positives that were predicted to be true
- +
enrichment: ratio of classifier precision to prevalence of positive class
- +
sensitivity: the same as recall (also known as the true positive rate)
- +
specificity: fraction of true negatives to all negatives (or 1 - false_positive_rate)
- +
false_positive_rate: fraction of negatives predicted to be true over all negatives
For example, plotting sensitivity/false_positive_rate as functions of threshold will "unroll" an ROC Plot.
+Plots are in a single column, in the order specified by
plotvars
.See also
- +Examples
-set.seed(34903490) -x = rnorm(50) -y = 0.5*x^2 + 2*x + rnorm(length(x)) -frm = data.frame(x=x,y=y,yC=y>=as.numeric(quantile(y,probs=0.8))) -frm$absY <- abs(frm$y) -frm$posY = frm$y > 0 -frm$costX = 1 -WVPlots::PRTPlot(frm, "x", "yC", TRUE, title="Example Precision-Recall threshold plot")+df <- iris +df$isVersicolor <- with(df, Species=='versicolor') +model = glm(isVersicolor ~ Petal.Length + Petal.Width + Sepal.Length + Sepal.Width, + data=df, family=binomial) +df$pred = predict(model, newdata=df, type="response") + +WVPlots::PRTPlot(df, "pred", "isVersicolor", TRUE, title="Example Precision-Recall threshold plot")+WVPlots::PRTPlot(df, "pred", "isVersicolor", TRUE, + plotvars = c("sensitivity", "specificity", "false_positive_rate"), + title="Sensitivity/specificity/FPR as functions of threshold")PlotDistDensityNormal(frm, xvar, title, ..., adjust = 0.5, - curve_color = "black", normal_color = "blue", mean_color = "blue", - sd_color = "darkgray")+ curve_color = "lightgray", normal_color = "blue", + mean_color = "blue", sd_color = "darkgray")Arguments
diff --git a/docs/reference/PlotDistHistBeta.html b/docs/reference/PlotDistHistBeta.html index 4b92ef4..3d9138a 100644 --- a/docs/reference/PlotDistHistBeta.html +++ b/docs/reference/PlotDistHistBeta.html @@ -194,7 +194,7 @@
Examp hdata = data.frame(n_gray=ngray, herd_size=herd_size) # observed rate of gray horses in each herd -hdata$rate_gray = with(hdata, ngray/herd_size) +hdata$rate_gray = with(hdata, n_gray/herd_size) title = "Observed prevalence of gray horses in population" diff --git a/docs/reference/ROCPlot.html b/docs/reference/ROCPlot.html index a5f3ccb..d701f20 100644 --- a/docs/reference/ROCPlot.html +++ b/docs/reference/ROCPlot.html @@ -198,7 +198,7 @@
Details and how the ROC plot relates to the precision/recall plot.
See also
- +Examples
diff --git a/man/PlotDistDensityNormal.Rd b/man/PlotDistDensityNormal.Rd index d5825d2..56d83c2 100644 --- a/man/PlotDistDensityNormal.Rd +++ b/man/PlotDistDensityNormal.Rd @@ -5,8 +5,8 @@ \title{Plot an empirical density with the matching normal distribution} \usage{ PlotDistDensityNormal(frm, xvar, title, ..., adjust = 0.5, - curve_color = "black", normal_color = "blue", mean_color = "blue", - sd_color = "darkgray") + curve_color = "lightgray", normal_color = "blue", + mean_color = "blue", sd_color = "darkgray") } \arguments{ \item{frm}{data frame to get values from}