Skip to content

Commit

Permalink
Merge pull request #1745 from ds26gte/more-stat
Browse files Browse the repository at this point in the history
More stat
  • Loading branch information
ds26gte authored Jul 5, 2024
2 parents eeef8a7 + d36171f commit 448cfdf
Show file tree
Hide file tree
Showing 4 changed files with 194 additions and 30 deletions.
8 changes: 7 additions & 1 deletion src/arr/trove/matrices.arr
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ provide:
make-matrix,
build-matrix,
vector-to-matrix,
list-to-vector,
list-to-matrix,
list-to-row-matrix,
list-to-col-matrix,
Expand Down Expand Up @@ -351,7 +352,7 @@ sharing:

method to-vector(self) -> Vector:
doc: "Returns the one-row/one-column matrix as a vector"
if ((self.cols == 1) <> (self.rows == 1)):
if ((self.cols == 1) or (self.rows == 1)):
vector-of-raw-array(self.elts)
else:
raise("Cannot convert non-vector matrix to vector")
Expand Down Expand Up @@ -974,6 +975,11 @@ where:
[mk-mtx(1, 3): 1, 2, 3]
end

fun list-to-vector(lst :: List<Number>):
doc: "Converts the given list of numbers into a vector"
vector(raw-array-from-list(lst))
end

fun list-to-matrix(rows :: NonZeroNat, cols :: NonZeroNat, lst :: List<Number>) block:
doc: "Converts the given list of numbers into a matrix of the given size"
when not(lst.length() == (rows * cols)):
Expand Down
160 changes: 134 additions & 26 deletions src/arr/trove/statistics.arr
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,19 @@ provide:
mode-smallest,
mode-largest,
mode-any,
variance,
stdev,
variance-sample,
stdev-sample,
linear-regression,
multiple-regression,
r-squared,
z-test,
t-test,
t-test-paired,
t-test-pooled,
t-test-independent,
chi-square,
group-and-count,
type *
end
Expand All @@ -21,6 +30,7 @@ include lists
import error as E
import math as math
import string-dict as SD
import matrices as MX

fun mean(l :: List<Number>) -> Number:
doc: "Find the average of a list of numbers"
Expand Down Expand Up @@ -180,53 +190,76 @@ fun mode-any(l :: List<Number>) -> Number:
ms.get(num-random(ms.length()))
end
end
fun stdev(l :: List) -> Number:
doc: ```returns the standard deviation of the list

fun variance(l :: List) -> Number:
doc: ```returns the variance of the list
of numbers, or raises an error if the list is empty```
reg-mean = mean(l)
sq-diff = l.map(lam(k): num-expt((k - reg-mean), 2) end)
sq-mean = mean(sq-diff)
num-sqrt(sq-mean)
sq-mean
end

fun stdev-sample(l :: List) -> Number:
doc: ```returns the standard deviation of the list
fun stdev(l :: List) -> Number:
doc: ```returns the standard deviation of the list
of numbers, or raises an error if the list is empty```
num-sqrt(variance(l))
end

fun variance-sample(l :: List) -> Number:
doc: ```returns the variance of the list
of numbers, or raises an error if the list is empty```
len = l.length()
reg-mean = mean(l)
sq-diff = l.map(lam(k): num-expt((k - reg-mean), 2) end)
sq-mean = math.sum(sq-diff) / (len - 1)
num-sqrt(sq-mean)
sq-mean
end

fun linear-regression(x :: List<Number>, y :: List<Number>) -> (Number -> Number):
doc: "returns a linear predictor function calculated with ordinary least squares regression"
if x.length() <> y.length():
fun stdev-sample(l :: List) -> Number:
doc: ```returns the standard deviation of the list
of numbers, or raises an error if the list is empty```
num-sqrt(variance-sample(l))
end

# please see: https://online.stat.psu.edu/stat462/

fun multiple-regression(x_s_s :: List<List<Number>>, y_s :: List<Number>) -> (List<Number> -> Number):
doc: "returns a predictor function given a list of list of independent inputs and the correspoinding list of outputs"
intercepts = MX.col-matrix.make(raw-array-of(1, y_s.length()))
mx_X = intercepts.augment(MX.lists-to-matrix(x_s_s))
mx_Y = MX.col-matrix.make(raw-array-from-list(y_s))
B = MX.mtx-to-vector(MX.mtx-least-squares-solve(mx_X, mx_Y))
B_n = B.length() - 1
fun B_pred_fn(x_s :: List<Number>) -> Number:
x_s_n = x_s.length()
if B_n <> x_s_n:
raise(E.message-exception("multiple-regression: the regression expected " + tostring(B_n) + " inputs, but received " + tostring(x_s_n) + " instead"))
else:
B.dot(MX.list-to-vector(x_s.push(1)))
end
end
B_pred_fn
end

fun linear-regression(x-s :: List<Number>, y-s :: List<Number>) -> (Number -> Number):
doc: "returns a linear predictor function given a list of single inputs and the corresponding list of outputs"
x-s-n = x-s.length()
if x-s-n <> y-s.length():
raise(E.message-exception("linear-regression: input lists must have equal lengths"))
else if x.length() < 2:
else if x-s-n < 2:
raise(E.message-exception("linear-regression: input lists must have at least 2 elements each"))
else:
shadow y = map(num-to-roughnum, y)
shadow x = map(num-to-roughnum, x)
xpt-xy = math.sum(map2(lam(xi, yi): xi * yi end, x, y))
xpt-x-xpt-y = (math.sum(x) * math.sum(y)) / x.length()
covariance = xpt-xy - xpt-x-xpt-y
v1 = math.sum(map(lam(n): n * n end, x))
v2 = (math.sum(x) * math.sum(x)) / x.length()
variance = v1 - v2
beta = covariance / variance
alpha = mean(y) - (beta * mean(x))

fun predictor(in :: Number) -> Number:
(beta * in) + alpha
predictor1 = multiple-regression(x-s.map(lam(x1 :: Number): [list: x1] end), y-s)
fun predictor2(x2 :: Number) -> Number:
predictor1([list: x2])
end

predictor
predictor2
end
end

fun r-squared(x :: List<Number>, y :: List<Number>, f :: (Number -> Number)) -> Number:
doc: "given a list of inputs, a list of outputs, and a model predictor, finds the r-squared of the model"
shadow x = map(num-to-roughnum, x)
shadow y = map(num-to-roughnum, y)
y-mean = mean(y)
Expand All @@ -241,3 +274,78 @@ fun r-squared(x :: List<Number>, y :: List<Number>, f :: (Number -> Number)) ->
end
end

fun z-test(sample-list :: List, population-sd :: Number, population-mean :: Number) -> Number:
doc: "given a sample and the population SD and mean, find the z-score of the sample's mean"
sample-size = sample-list.length()
sample-mean = mean(sample-list)
sd-of-mean = population-sd / num-sqrt(sample-size)
(sample-mean - population-mean) / sd-of-mean
end

fun t-test(sample-list :: List, population-mean :: Number) -> Number:
doc: "given a sample and the population mean, find the t-score of the sample's mean"
sample-size = sample-list.length()
sample-mean = mean(sample-list)
estimated-population-variance = variance-sample(sample-list)
variance-of-mean = estimated-population-variance / sample-size
(sample-mean - population-mean) / num-sqrt(variance-of-mean)
end

# please see:
# https://en.wikipedia.org/wiki/Student's_t-test
# https://www.investopedia.com/terms/t/t-test.asp
# (this has a typo for the pooled t-test, corrected here)

fun t-test-paired(l1 :: List, l2 :: List) -> Number:
doc: "given two paired samples, find the t-score of the difference of their means"
n = l1.length()
n2 = l2.length()
if n <> n2:
raise(E.message-exception("t-test-paired: input lists must have equal lengths"))
else if n == 0:
raise(E.message-exception("t-test-paired: input lists must have at least one element"))
else:
diffs = map2(lam(x1, x2): x1 - x2 end, l1, l2)
diffs-mean = mean(diffs)
s-hat = stdev-sample(diffs)
diffs-mean / (s-hat / num-sqrt(n))
end
end

fun t-test-pooled(sample-list-1 :: List, sample-list-2 :: List) -> Number:
doc: "given two independent samples of different sizes or variances, find the t-score of the difference of their means"
n1 = sample-list-1.length()
n2 = sample-list-2.length()
if (n1 == 0) or (n2 == 0):
raise(E.message-exception("t-test-pooled: input lists must have at least one element"))
else:
m1 = mean(sample-list-1)
m2 = mean(sample-list-2)
v1 = variance-sample(sample-list-1)
v2 = variance-sample(sample-list-2)
v = (((n1 - 1) * v1) + ((n2 - 1) * v2)) / ((n1 + n2) - 2)
(m1 - m2) / num-sqrt((v / n1) + (v / n2))
end
end

fun t-test-independent(sample-list-1 :: List, sample-list-2 :: List) -> Number:
doc: "given two independent samples of similar size or variance, find the t-score of the difference of their means"
n1 = sample-list-1.length()
n2 = sample-list-2.length()
if (n1 == 0) or (n2 == 0):
raise(E.message-exception("t-test-independent: input lists must have at least one element"))
else:
m1 = mean(sample-list-1)
m2 = mean(sample-list-2)
v1 = variance-sample(sample-list-1)
v2 = variance-sample(sample-list-2)
(m1 - m2) / num-sqrt((v1 / n1) + (v2 / n2))
end
end

fun chi-square(observed-values :: List, predicted-values :: List) -> Number:
doc: "given a list of observed and predicted values, returns the chi-square statistic"
for fold2(sum from 0, obs from observed-values, pred from predicted-values):
sum + (num-sqr(obs - pred) / pred)
end
end
4 changes: 4 additions & 0 deletions tests/pyret/tests/test-matrices.arr
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ check "Vector Operations":

M.vec-normalize([vector: 1, 2, 3]) is%(vector-within(0.001))
[vector: (1 / num-sqrt(14)), (2 / num-sqrt(14)), (3 / num-sqrt(14))]

M.list-to-vector([list: 1, 2, 3]) is [vector: 1, 2, 3]
end

check "Basic Matrix Accessors":
Expand Down Expand Up @@ -95,6 +97,8 @@ check "Basic Matrix Accessors":
M.mtx-row(mtx1, 0).to-vector() is [vector: 1, 2, 3]
M.mtx-col(mtx1, 1).to-vector() is [vector: 2, 5]

[row-matrix: 1, 2, 3].to-vector() is [vector: 1, 2, 3]
[col-matrix: 1, 2, 3].to-vector() is [vector: 1, 2, 3]
end

check "Submatrices":
Expand Down
52 changes: 49 additions & 3 deletions tests/pyret/tests/test-statistics.arr
Original file line number Diff line number Diff line change
Expand Up @@ -49,18 +49,45 @@ check "numeric helpers":
modes([list: -1, 2, -1, 2, -1]) is [list: -1]
modes([list: 1, 1, 2, 2, 3, 3, 3]) is [list: 3]

variance([list:]) raises "empty"
variance([list: 5]) is 0
variance([list: 3, 4, 5, 6, 7]) is%(within(0.01)) 2.0
variance([list: 1, 1, 1, 1]) is-roughly ~0

stdev([list:]) raises "empty"
stdev([list: 5]) is 0

stdev([list: 3, 4, 5, 6, 7]) is%(within(0.01)) 1.41
stdev([list: 1, 1, 1, 1]) is-roughly ~0
stdev([list:]) raises "empty"


variance-sample([list: 3, 4, 5, 6, 7]) is%(within(0.01)) (10 / 4)
variance-sample([list: 3]) raises "division by zero"
variance-sample([list: 1, 1, 1, 1]) is-roughly ~0
variance-sample([list:]) raises "empty"

stdev-sample([list: 3, 4, 5, 6, 7]) is%(within(0.01)) num-sqrt(10 / 4)
stdev-sample([list: 3]) raises "division by zero"
stdev-sample([list: 1, 1, 1, 1]) is-roughly ~0
stdev-sample([list:]) raises "empty"

z-test([list: 1, 2, 3], 1.58, 2.58) is-roughly ~-0.635816119

t-test([list: 1, 2, 3], 2.58) is-roughly ~-1.004589468

t-test-paired([list: 1], [list: 2, 3]) raises "lists must have equal lengths"
t-test-paired([list:], [list:]) raises "lists must have at least one element"
t-test-paired([list: 1, 2, 3], [list: 4, 6, 8]) is-roughly ~-6.928203230

t-test-pooled([list:], [list: 1, 2, 3]) raises "lists must have at least one element"
t-test-pooled([list: 1, 2, 3], [list: 4, 5, 6]) is%(within(0.01)) -3.674234614
t-test-pooled([list: 1, 2, 3], [list: 4, 5, 6, 7]) is-roughly ~-3.872983346

t-test-independent([list:], [list: 1, 2, 3]) raises "lists must have at least one element"
t-test-independent([list: 1, 2, 3], [list: 4, 5, 6]) is-roughly ~-3.674234614
t-test-independent([list: 1, 2, 3], [list: 4, 5, 6, 7]) is-roughly ~-4.041451884


chi-square([list: 1, 2, 3, 4], [list: 1, 2, 3, 4]) is 0
chi-square([list: 1, 2, 3, 4], [list: 0.9, 1.8, 3.5, 4.7]) is-roughly ~0.2090172239
end

check "polymorphic modes":
Expand Down Expand Up @@ -103,3 +130,22 @@ check "linear regression":

end

check "multiple regression":
# multiple-regression function for single independent variable
x-s-s = [list: [list: 4], [list: 4.5], [list: 5], [list: 5.5], [list: 6], [list: 6.5], [list: 7]]
y-s = [list: 33, 42, 45, 51, 53, 61, 62]
pf1 = multiple-regression(x-s-s, y-s)
pf1([list: 8]) is-roughly 73.3214
pf1([list: 8, 9]) raises "received"
#
# check it matches linear-regression function on the same single variable
x-s = [list: 4, 4.5, 5, 5.5, 6, 6.5, 7]
pf2 = linear-regression(x-s, y-s)
pf2(8) is-roughly 73.3214
#
# multiple-regression with two independent variables
x-s-s-i = [list: [list: 4, 3], [list: 4.5, 2], [list: 5, 1.2], [list: 5.5, 4.5], [list: 6, 3.3], [list: 6.5, 10], [list: 7, 0]]
y-s-i = [list: 33, 42, 45, 51, 53, 61, 62]
pf-i = multiple-regression(x-s-s-i, y-s-i)
pf-i([list: 8, 9]) is-roughly 74.52888
end

0 comments on commit 448cfdf

Please sign in to comment.