diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index e95d429..ae2f449 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,67 +1,4 @@ -stages: - - test - - teardown - +--- include: - - https://gitlab.invenia.ca/infrastructure/gitlab-ci-helper/raw/master/templates/hidden-jobs.yml - - https://gitlab.invenia.ca/infrastructure/gitlab-ci-helper/raw/master/templates/teardown.yml - -"1.0 (Mac)": - tags: - - mac - - shell-ci - extends: .test_shell_1_0 - -"1.0 (Linux, 64-bit)": - tags: - - linux - - 64-bit - - docker-ci - extends: .test_docker_1_0 - -"1.0 (Linux, 32-bit)": - tags: - - linux - - 32-bit - - shell-ci - extends: .test_shell_1_0 - -"1.1 (Mac)": - tags: - - mac - - shell-ci - extends: .test_shell_1_1 - -"1.1 (Linux, 64-bit)": - tags: - - linux - - 64-bit - - docker-ci - extends: .test_docker_1_1 - -"1.1 (Linux, 32-bit)": - tags: - - linux - - 32-bit - - shell-ci - extends: .test_shell_1_1 - -"Nightly (Mac)": - tags: - - mac - - shell-ci - extends: .test_shell_nightly - -"Nightly (Linux, 64-bit)": - tags: - - linux - - 64-bit - - docker-ci - extends: .test_docker_nightly - -"Nightly (Linux, 32-bit)": - tags: - - linux - - 32-bit - - shell-ci - extends: .test_shell_nightly + - project: infrastructure/gitlab-ci-helper + file: /templates/julia.yml diff --git a/Project.toml b/Project.toml index 6bc86d1..e329ee7 100644 --- a/Project.toml +++ b/Project.toml @@ -3,7 +3,16 @@ uuid = "e6388cff-ecff-480c-9b53-83211bf7812a" authors = ["Invenia Technical Computing Corporation"] version = "0.1.0" +[deps] +Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" +NamedDims = "356022a1-0364-5f58-8944-0da4b18d706f" +StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + [compat] +Distributions = "0.16, 0.22" +NamedDims = "0.1, 0.2" +StatsBase = "0.32" julia = "1" [extras] diff --git a/README.md b/README.md index 78f5ae3..d9eff85 100644 --- a/README.md +++ b/README.md @@ -4,10 +4,13 @@ [![Build 
Status](https://gitlab.invenia.ca/invenia/Models.jl/badges/master/build.svg)](https://gitlab.invenia.ca/invenia/Models.jl/commits/master) [![Coverage](https://gitlab.invenia.ca/invenia/Models.jl/badges/master/coverage.svg)](https://gitlab.invenia.ca/invenia/Models.jl/commits/master) -This package defines the `Model` type and a common API for constructing a generic model, including +## Why does this package exist? -* Model Fitting (`fit`, `predict`) -* Model Traits (`output_type`, `estimate_type`) -* Test utils for testing downstream interfaces (`FakeModel`) +[Models.jl](https://gitlab.invenia.ca/invenia/research/Models.jl) defines the `Template` and `Model` types as well as a common API for constructing a generic model in downstream packages, including: + +* Calling `fit` on a `Template`. +* Calling `predict` on a `Model`. +* Assigning traits such as `EstimateTrait` and `OutputTrait`. +* Testing interfaces and downstream dependencies with `TestUtils`. For common examples of the interface being implemented see [BaselineModels.jl](https://gitlab.invenia.ca/invenia/research/BaselineModels.jl).
diff --git a/docs/Project.toml b/docs/Project.toml index dfa65cd..b60aceb 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -1,2 +1,3 @@ [deps] Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +Models = "e6388cff-ecff-480c-9b53-83211bf7812a" diff --git a/docs/make.jl b/docs/make.jl index f8b8435..61dd1ef 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -1,19 +1,24 @@ -using Documenter, Models +using Documenter +using Models +using Models.TestUtils makedocs(; modules=[Models], - format=Documenter.HTML(), + format=Documenter.HTML(; + prettyurls=false, + assets=[ + "assets/invenia.css", + ], + ), pages=[ - "Home" => "index.md", + "Index" => "index.md", + "API" => "api.md", + "Design" => "design.md", + "TestUtils" => "testutils.md", ], repo="https://gitlab.invenia.ca/invenia/Models.jl/blob/{commit}{path}#L{line}", sitename="Models.jl", authors="Invenia Technical Computing Corporation", - assets=[ - "assets/invenia.css", - "assets/logo.png", - ], strict=true, - html_prettyurls=false, - checkdocs=:none, + checkdocs=:exports, ) diff --git a/docs/src/api.md b/docs/src/api.md new file mode 100644 index 0000000..52e3cc8 --- /dev/null +++ b/docs/src/api.md @@ -0,0 +1,25 @@ +# API + +## Abstract Types +```@docs +Template +Model +``` + +## Common API +```@docs +fit +predict +estimate_type +output_type +``` + +## Traits +```@docs +EstimateTrait +PointEstimate +DistributionEstimate +OutputTrait +SingleOutput +MultiOutput +``` diff --git a/docs/src/design.md b/docs/src/design.md new file mode 100644 index 0000000..bfbccc4 --- /dev/null +++ b/docs/src/design.md @@ -0,0 +1,82 @@ +## Design Documentation + +This page details the key features of the design of BaselineModels. + +BaselineModels exists to solve the issue highlighted by following quote: + +> ML researchers tend to develop general purpose solutions as self-contained packages. +> A wide variety of these are available as open-source packages ... 
+> Using generic packages often results in a glue-code system design pattern, in which a massive amount of supporting code is written to get data into and out of general-purpose packages. +> Glue-code is costly in the long term because it tends to freeze a system to the peculiarities of a specific package; testing alternatives may become prohibitively expensive.... +> **An important strategy for combating glue-code is to wrap black-box packages into common API’s.** +> This allows supporting infrastructure to be more reusable and reduces the cost of changing packages. + +-- [Sculley et al 2015](https://papers.nips.cc/paper/5656-hidden-technical-debt-in-machine-learning-systems) + +BaselineModels provides a common API for mostly preexisting models to allow them to all be used in the same way. +As such, the most important thing is that it itself has a common API. +Here are some facts about that API: + +### Models and Templates + +A **model** is an object that can be used to make predictions via calling `predict`. +A **template** is an object that can create a *model* by being `fit` to some data. + +All information about how to perform `fit`, such as hyper-parameters, is stored inside the *template*. +This is different from some other APIs which might for example pass those as keyword arguments to `fit`. +The template based API is superior to these as it means `fit` is always the same. +One does not have to carry both a model type, and a varying collection of keyword arguments, which would get complicated when composing wrapper models. 
+ + +### `fit` and `predict` + +```julia +model = StatsBase.fit( + template, + outputs::AbstractMatrix, # always Variates x Observations + inputs::AbstractMatrix, # always Features x Observations + weights=uweights(Float32, size(outputs, 2)) +)::Model +``` + +```julia +outputs = StatsBase.predict( + model, + inputs::AbstractMatrix # always Features x Observations +)::AbstractMatrix # always Variates x Observations +``` + +`fit` takes in a *template* and some *data* and returns a `Model` that has been fit to the data. +`predict` takes a `Model` (that has been `fit` from a *template*) and produces a predicted output. + +Important facts about `fit` and `predict`: + - `outputs` and `inputs` always have observations as the second dimension -- even if it is [`SingleOutput`](@ref) (that just means that it will be a `1 x num_obs` output). (See [Docs on Julia being column-major](https://docs.julialang.org/en/v1/manual/performance-tips/#Access-arrays-in-memory-order,-along-columns-1)) + - The functions must accept any `AbstractMatrix` for the `inputs` and `outputs` (`fit` only). If the underlying implementation needs a plain dense `Matrix` then `fit`/`predict` should perform the conversion. + - `fit` always accepts a `weights` argument. If the underlying model does not support weighted fitting, then `fit` should throw an error if the weights that are passed in are not all equal. + - `fit`/`predict` take no keyword arguments, or any other arguments except the ones shown. + +### Traits + +This package largely avoids using complicated abstract types, or relying on a model having a particular abstract type. +Instead we use [traits](https://invenia.github.io/blog/2019/11/06/julialang-features-part-2/) to determine model behavior. + +Here are the current model traits in use and their possible values: + - `estimate_type` - determines what kinds of estimates the model outputs. + - `PointEstimate`: Predicts point-estimates of the most likely values.
+ - `DistributionEstimate`: Estimates distributions over possible values. + - `output_type` - determines how many output variates a model can learn + - `SingleOutput`: Fits and predicts on a single output only. + - `MultiOutput`: Fits and predicts on multiple outputs at a time. + +The traits always agree between the model and the template. +Every model and template should define all the listed traits. + +This package uses traits implemented such that the trait function returns an `abstract type` (rather than an instance). +That means to check a trait one uses: +```julia +if estimate_type(model) <: DistributionEstimate +``` +and to dispatch on a trait one uses: +```julia +foo(::Type{<:DistributionEstimate}, ...) +``` diff --git a/docs/src/index.md b/docs/src/index.md index a890632..0773a1e 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -1,8 +1,17 @@ -# Models.jl +# Models -```@index -``` +## Why does this package exist? + +[Models.jl](https://gitlab.invenia.ca/invenia/research/Models.jl) defines the [`Template`](@ref) and [`Model`](@ref) types as well as a common API for constructing a generic model in downstream packages, including: + +* Calling [`fit`](@ref) on a [`Template`](@ref). +* Calling [`predict`](@ref) on a [`Model`](@ref). +* Assigning traits such as [`EstimateTrait`](@ref) and [`OutputTrait`](@ref). +* Testing interfaces and downstream dependencies with [`TestUtils`](@ref). + +For common examples of the interface being implemented see [BaselineModels.jl](https://gitlab.invenia.ca/invenia/research/BaselineModels.jl).
-```@autodocs -Modules = [Models] +## Contents +```@contents +Pages = ["api.md", "testutils.md"] ``` diff --git a/docs/src/testutils.md b/docs/src/testutils.md new file mode 100644 index 0000000..dd0caa8 --- /dev/null +++ b/docs/src/testutils.md @@ -0,0 +1,9 @@ +# TestUtils + +Provides test fakes, [`FakeTemplate`](@ref) and [`FakeModel`](@ref), that are useful for +testing downstream dependencies, and [`test_interface`](@ref) for testing the Model's API +has been correctly implemented. + +```@autodocs +Modules = [Models.TestUtils] +``` diff --git a/src/Models.jl b/src/Models.jl index b1cb147..82ff847 100644 --- a/src/Models.jl +++ b/src/Models.jl @@ -1,5 +1,49 @@ module Models -greet() = print("Hello World!") +import StatsBase: fit, predict + +export Model, Template +export fit, predict, estimate_type, output_type +export EstimateTrait, PointEstimate, DistributionEstimate +export OutputTrait, SingleOutput, MultiOutput + +""" + Template + +A Template is an untrained [`Model`](@ref) that can be [`fit`](@ref) to data. +Defined as well are the traits: +- [`output_type`](@ref): SingleOutput or MultiOutput +- [`estimate_type`](@ref): PointEstimate or DistributionEstimate +""" +abstract type Template end + +""" + Model + +A Model is a trained [`Template`](@ref) with which one can [`predict`](@ref) on inputs. +Defined as well are the traits: +- [`output_type`](@ref): SingleOutput or MultiOutput +- [`estimate_type`](@ref): PointEstimate or DistributionEstimate +""" +abstract type Model end + +""" + fit(::Template, output, input) -> Model + +Fit the `Template` to the `output` and `input` data and return a trained `Model`. +""" +function fit end + +""" + predict(::Model, input) + +Predict targets for the provided `input` and `Model`. + +Returns a predictive distribution or point estimates depending on the `Model`. 
+""" +function predict end + +include("traits.jl") +include("test_utils.jl") end # module diff --git a/src/test_utils.jl b/src/test_utils.jl new file mode 100644 index 0000000..eced3cc --- /dev/null +++ b/src/test_utils.jl @@ -0,0 +1,216 @@ +""" + Models.TestUtils + +Provides test fakes, [`FakeTemplate`](@ref) and [`FakeModel`](@ref), that are useful for +testing downstream dependencies, and [`test_interface`](@ref) for testing the Models API has +been correctly implemented. +""" +module TestUtils +using Distributions: Normal, MultivariateNormal +using Models +using NamedDims +using StatsBase +using Test + +export FakeModel, FakeTemplate +export test_interface + +""" + FakeTemplate{E <: EstimateTrait, O <: OutputTrait} <: Template + +This template is a [test double](https://en.wikipedia.org/wiki/Test_double) for testing +purposes. It should be defined (before fitting) with a `predictor`, which can be changed by +mutating the field. + +## Fields +- `predictor::Function`: predicts the outputs of the FakeModel. + It is `(num_variates, inputs) -> outputs`, where the `num_variates` will be memorized + during `fit`. + +## Methods +- `fit` does not learn anything it just creates an instance of the model +- `predict` applies the `predictor` to the inputs +""" +mutable struct FakeTemplate{E<:EstimateTrait, O<:OutputTrait} <: Template + predictor::Function +end + +""" + FakeTemplate{PointEstimate, SingleOutput}() + +A [`Template`](@ref) whose [`Model`](@ref) will predict 0 for each observation. 
+""" +function FakeTemplate{PointEstimate, SingleOutput}() + FakeTemplate{PointEstimate, SingleOutput}() do num_variates, inputs + @assert(num_variates == 1, "$num_variates != 1") + inputs = NamedDimsArray{(:features, :observations)}(inputs) + return NamedDimsArray{(:variates, :observations)}( + zeros(1, size(inputs, :observations)) + ) + end +end + +""" + FakeTemplate{PointEstimate, MultiOutput}() + +A [`Template`](@ref) whose [`Model`](@ref) will predict a vector of 0s for each observation. +The predictions have one column per input observation and as many variates as the `outputs` passed to `fit`. +""" +function FakeTemplate{PointEstimate, MultiOutput}() + FakeTemplate{PointEstimate, MultiOutput}() do num_variates, inputs + inputs = NamedDimsArray{(:features, :observations)}(inputs) + return NamedDimsArray{(:variates, :observations)}( + zeros(num_variates, size(inputs, :observations)) + ) + end +end + +""" + FakeTemplate{DistributionEstimate, SingleOutput}() + +A [`Template`](@ref) whose [`Model`](@ref) will predict a univariate normal posterior +distribution (with zero mean and unit standard deviation) for each observation. +""" +function FakeTemplate{DistributionEstimate, SingleOutput}() + FakeTemplate{DistributionEstimate, SingleOutput}() do num_variates, inputs + @assert(num_variates == 1, "$num_variates != 1") + inputs = NamedDimsArray{(:features, :observations)}(inputs) + return Normal.(zeros(size(inputs, :observations))) + end +end + +""" + FakeTemplate{DistributionEstimate, MultiOutput}() + +A [`Template`](@ref) whose [`Model`](@ref) will predict a multivariate normal posterior +distribution (with zero-vector mean and identity covariance matrix) for each observation. +""" +function FakeTemplate{DistributionEstimate, MultiOutput}() + FakeTemplate{DistributionEstimate, MultiOutput}() do num_variates, inputs + std_dev = ones(num_variates) + return [MultivariateNormal(std_dev) for _ in 1:size(inputs, 2)] + end +end + +""" + FakeModel + +A fake Model for testing purposes. See [`FakeTemplate`](@ref) for details.
+""" +mutable struct FakeModel{E<:EstimateTrait, O<:OutputTrait} <: Model + predictor::Function + num_variates::Int +end + +# Qualified with `Models.` so these extend the package's trait functions; unqualified definitions would create new, unrelated `TestUtils` functions. +Models.estimate_type(::Type{<:FakeModel{E, O}}) where {E, O} = E +Models.output_type(::Type{<:FakeModel{E, O}}) where {E, O} = O +Models.estimate_type(::Type{<:FakeTemplate{E, O}}) where {E, O} = E +Models.output_type(::Type{<:FakeTemplate{E, O}}) where {E, O} = O + +function StatsBase.fit( + template::FakeTemplate{E, O}, + outputs, + inputs, + weights=uweights(Float32, size(outputs, 2)) +) where {E, O} + outputs = NamedDimsArray{(:variates, :observations)}(outputs) + num_variates = size(outputs, :variates) # memorised so `predict` can size its output + return FakeModel{E, O}(template.predictor, num_variates) +end + +StatsBase.predict(m::FakeModel, inputs) = m.predictor(m.num_variates, inputs) + +""" + test_interface(temp::Template; inputs=rand(5, 5), outputs=rand(n, 5)) + +Test that subtypes of [`Template`](@ref) and [`Model`](@ref) implement the expected API. +Can be used as an initial test to verify the API has been correctly implemented. + +Returns the predictions of the `Model`. By default `n` is 1 for `SingleOutput` templates and 2 or 3 for `MultiOutput` ones. +""" +function test_interface(temp::Template; kwargs...) + return test_interface(temp, estimate_type(temp), output_type(temp); kwargs...)
+end + +function test_interface( + temp::Template, ::Type{PointEstimate}, ::Type{SingleOutput}; + inputs=rand(5, 5), outputs=rand(1, 5), +) + predictions = test_common(temp, inputs, outputs) + @test predictions isa NamedDimsArray{(:variates, :observations), <:Real, 2} + @test size(predictions) == size(outputs) + @test size(predictions, 1) == 1 + return predictions # hand back the predictions rather than the last `@test` result +end + +function test_interface( + temp::Template, ::Type{PointEstimate}, ::Type{MultiOutput}; + inputs=rand(5, 5), outputs=rand(2, 5), +) + predictions = test_common(temp, inputs, outputs) + @test predictions isa NamedDimsArray{(:variates, :observations), <:Real, 2} + @test size(predictions) == size(outputs) + return predictions # hand back the predictions rather than the last `@test` result +end + +function test_interface( + temp::Template, ::Type{DistributionEstimate}, ::Type{SingleOutput}; + inputs=rand(5, 5), outputs=rand(1, 5), +) + predictions = test_common(temp, inputs, outputs) + @test predictions isa Vector{<:Normal{<:Real}} + @test length(predictions) == size(outputs, 2) + @test all(length.(predictions) .== size(outputs, 1)) + return predictions # hand back the predictions rather than the last `@test` result +end + +function test_interface( + temp::Template, ::Type{DistributionEstimate}, ::Type{MultiOutput}; + inputs=rand(5, 5), outputs=rand(3, 5) +) + predictions = test_common(temp, inputs, outputs) + @test predictions isa Vector{<:MultivariateNormal{<:Real}} + @test length(predictions) == size(outputs, 2) + @test all(length.(predictions) .== size(outputs, 1)) + return predictions # hand back the predictions rather than the last `@test` result +end + +function test_common(temp, inputs, outputs) + model = fit(temp, outputs, inputs) + @test temp isa Template + @test model isa Model + @testset "type names" begin + template_type_name = string(nameof(typeof(temp))) + template_base_name_match = match(r"(.*)Template$", template_type_name) # `$` anchors the match to a true suffix + @test template_base_name_match !== nothing # must have Template suffix + + model_type_name = string(nameof(typeof(model))) + model_base_name_match = match(r"(.*)Model$", model_type_name) # `$` anchors the match to a true suffix + @test model_base_name_match !== nothing # must have Model suffix + + # base_name must agree + @test model_base_name_match[1] ==
template_base_name_match[1] + end + + @testset "test fit/predict errors" begin + @test_throws MethodError predict(temp, inputs) + @test_throws MethodError fit(model, outputs, inputs) + end + + @testset "test weights can also be passed" begin + weights = uweights(Float32, size(outputs, 2)) + @test fit(temp, outputs, inputs, weights) isa Model # assert on the weighted fit instead of discarding it + end + + @testset "traits" begin + @test estimate_type(temp) == estimate_type(model) + @test output_type(temp) == output_type(model) + end + + predictions = predict(model, inputs) + + return predictions +end + +end diff --git a/src/traits.jl b/src/traits.jl new file mode 100644 index 0000000..bf09fba --- /dev/null +++ b/src/traits.jl @@ -0,0 +1,60 @@ +# Estimate Type Trait - specifies if the model outputs a point or distribution estimate +""" + EstimateTrait + +The `EstimateTrait` specifies if the model outputs a point or distribution estimate, denoted +by [`PointEstimate`](@ref) or [`DistributionEstimate`](@ref), respectively. +""" +abstract type EstimateTrait end + +""" + PointEstimate <: EstimateTrait + +Specifies that the [`Model`](@ref) returns real-valued response variables. +""" +abstract type PointEstimate <: EstimateTrait end + +""" + DistributionEstimate <: EstimateTrait + +Specifies that the [`Model`](@ref) returns a posterior distribution over the response variables. +""" +abstract type DistributionEstimate <: EstimateTrait end + +""" + estimate_type(::T) where T = estimate_type(T) + +Return the [`EstimateTrait`](@ref) of the [`Model`](@ref) or [`Template`](@ref). +""" +estimate_type(::T) where T = estimate_type(T) +estimate_type(T::Type) = throw(MethodError(estimate_type, (T,))) # to prevent recursion + +""" + OutputTrait + +The `OutputTrait` specifies if the model supports single or multiple response variables, +denoted by [`SingleOutput`](@ref) or [`MultiOutput`](@ref), respectively.
+""" +abstract type OutputTrait end + +""" + SingleOutput <: OutputTrait + +Specifies that the [`Model`](@ref) returns a single, univariate response variable. +""" +abstract type SingleOutput <: OutputTrait end + +""" + MultiOutput <: OutputTrait + +Specifies that the [`Model`](@ref) returns a multivariate response variable. +""" +abstract type MultiOutput <: OutputTrait end + +""" + output_type(::T) where T = output_type(T) + +Return the [`OutputTrait`](@ref) of the [`Model`](@ref) or [`Template`](@ref). +""" +output_type(::T) where T = output_type(T) +output_type(T::Type) = throw(MethodError(output_type, (T,))) # to prevent recursion diff --git a/test/runtests.jl b/test/runtests.jl index 607d752..d1de5ed 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,6 +1,10 @@ +using Distributions: Normal, MvNormal using Models +using Models.TestUtils +using NamedDims: NamedDimsArray using Test @testset "Models.jl" begin - # Write your own tests here. + include("traits.jl") + include("test_utils.jl") end diff --git a/test/test_utils.jl b/test/test_utils.jl new file mode 100644 index 0000000..90efa0e --- /dev/null +++ b/test/test_utils.jl @@ -0,0 +1,23 @@ +@testset "test_utils.jl" begin + + @testset "FakeTemplate{PointEstimate, SingleOutput}" begin + temp = FakeTemplate{PointEstimate, SingleOutput}() + test_interface(temp) + end + + @testset "FakeTemplate{PointEstimate, MultiOutput}" begin + temp = FakeTemplate{PointEstimate, MultiOutput}() + test_interface(temp) + end + + @testset "FakeTemplate{DistributionEstimate, SingleOutput}" begin + temp = FakeTemplate{DistributionEstimate, SingleOutput}() + test_interface(temp) + end + + @testset "FakeTemplate{DistributionEstimate, MultiOutput}" begin + temp = FakeTemplate{DistributionEstimate, MultiOutput}() + test_interface(temp) + end + +end diff --git a/test/traits.jl b/test/traits.jl new file mode 100644 index 0000000..c052c28 --- /dev/null +++ b/test/traits.jl @@ -0,0 +1,33 @@ +@testset "traits.jl" begin + + struct
DummyTemplate <: Template end + struct DummyModel <: Model end + + estimates = (PointEstimate, DistributionEstimate) + outputs = (SingleOutput, MultiOutput) + + @testset "$est, $out" for (est, out) in Iterators.product(estimates, outputs) + + @testset "Errors if traits are not defined" begin + @test_throws MethodError estimate_type(DummyTemplate) + @test_throws MethodError output_type(DummyTemplate) + + @test_throws MethodError estimate_type(DummyModel) + @test_throws MethodError output_type(DummyModel) + end + + @testset "Traits are defined" begin + estimate_type(m::Type{<:DummyTemplate}) = est + estimate_type(m::Type{<:DummyModel}) = est + + output_type(m::Type{<:DummyTemplate}) = out + output_type(m::Type{<:DummyModel}) = out + + @test estimate_type(DummyTemplate) == estimate_type(DummyModel) == est + @test output_type(DummyTemplate) == output_type(DummyModel) == out + end + + end + + +end