Merge branch 'gm/models' into 'master'

Initial functions See merge request invenia/research/Models.jl!2
invenia · Mar 20, 2020 · fda8a69 · fda8a69
2 parents c67bcd3 + 0f3a61f
commit fda8a69
Show file tree

Hide file tree

Showing 15 changed files with 546 additions and 86 deletions.
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
@@ -1,67 +1,4 @@
-stages:
-  - test
-  - teardown
-
+---
 include:
-  - https://gitlab.invenia.ca/infrastructure/gitlab-ci-helper/raw/master/templates/hidden-jobs.yml
-  - https://gitlab.invenia.ca/infrastructure/gitlab-ci-helper/raw/master/templates/teardown.yml
-
-"1.0 (Mac)":
-  tags:
-    - mac
-    - shell-ci
-  extends: .test_shell_1_0
-
-"1.0 (Linux, 64-bit)":
-  tags:
-    - linux
-    - 64-bit
-    - docker-ci
-  extends: .test_docker_1_0
-
-"1.0 (Linux, 32-bit)":
-  tags:
-    - linux
-    - 32-bit
-    - shell-ci
-  extends: .test_shell_1_0
-
-"1.1 (Mac)":
-  tags:
-    - mac
-    - shell-ci
-  extends: .test_shell_1_1
-
-"1.1 (Linux, 64-bit)":
-  tags:
-    - linux
-    - 64-bit
-    - docker-ci
-  extends: .test_docker_1_1
-
-"1.1 (Linux, 32-bit)":
-  tags:
-    - linux
-    - 32-bit
-    - shell-ci
-  extends: .test_shell_1_1
-
-"Nightly (Mac)":
-  tags:
-    - mac
-    - shell-ci
-  extends: .test_shell_nightly
-
-"Nightly (Linux, 64-bit)":
-  tags:
-    - linux
-    - 64-bit
-    - docker-ci
-  extends: .test_docker_nightly
-
-"Nightly (Linux, 32-bit)":
-  tags:
-    - linux
-    - 32-bit
-    - shell-ci
-  extends: .test_shell_nightly
+  - project: infrastructure/gitlab-ci-helper
+    file: /templates/julia.yml
diff --git a/Project.toml b/Project.toml
@@ -3,7 +3,16 @@ uuid = "e6388cff-ecff-480c-9b53-83211bf7812a"
 authors = ["Invenia Technical Computing Corporation"]
 version = "0.1.0"
 
+[deps]
+Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
+NamedDims = "356022a1-0364-5f58-8944-0da4b18d706f"
+StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+
 [compat]
+Distributions = "0.16, 0.22"
+NamedDims = "0.1, 0.2"
+StatsBase = "0.32"
 julia = "1"
 
 [extras]

diff --git a/README.md b/README.md
@@ -4,10 +4,13 @@
 [![Build Status](https://gitlab.invenia.ca/invenia/Models.jl/badges/master/build.svg)](https://gitlab.invenia.ca/invenia/Models.jl/commits/master)
 [![Coverage](https://gitlab.invenia.ca/invenia/Models.jl/badges/master/coverage.svg)](https://gitlab.invenia.ca/invenia/Models.jl/commits/master)
 
-This package defines the `Model` type and a common API for constructing a generic model, including
+## Why does this package exist?
 
-* Model Fitting (`fit`, `predict`)
-* Model Traits (`output_type`, `estimate_type`)
-* Test utils for testing downstream interfaces (`FakeModel`)
+[Models.jl](https://gitlab.invenia.ca/invenia/research/Models.jl) defines the [`Template`](@ref) and [`Model`](@ref) types as well as a common API for constructing a generic model in downstream packages, including:
+
+* Calling [`fit`](@ref) on a [`Template`](@ref).
+* Calling [`predict`](@ref) on a [`Model`](@ref).
+* Assigning traits such as [`EstimateTrait`](@ref) and [`OutputTrait`](@ref).
+* Testing interfaces and downstream dependencies with [`TestUtils`](@ref).
 
 For common examples of the interface being implemented see [BaselineModels.jl](https://gitlab.invenia.ca/invenia/research/BaselineModels.jl).
diff --git a/docs/Project.toml b/docs/Project.toml
@@ -1,2 +1,3 @@
 [deps]
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
+Models = "e6388cff-ecff-480c-9b53-83211bf7812a"
diff --git a/docs/make.jl b/docs/make.jl
@@ -1,19 +1,24 @@
-using Documenter, Models
+using Documenter
+using Models
+using Models.TestUtils
 
 makedocs(;
     modules=[Models],
-    format=Documenter.HTML(),
+    format=Documenter.HTML(;
+        prettyurls=false,
+        assets=[
+            "assets/invenia.css",
+        ],
+    ),
     pages=[
-        "Home" => "index.md",
+        "Index" => "index.md",
+        "API" => "api.md",
+        "Design" => "design.md",
+        "TestUtils" => "testutils.md",
     ],
     repo="https://gitlab.invenia.ca/invenia/Models.jl/blob/{commit}{path}#L{line}",
     sitename="Models.jl",
     authors="Invenia Technical Computing Corporation",
-    assets=[
-        "assets/invenia.css",
-        "assets/logo.png",
-    ],
     strict=true,
-    html_prettyurls=false,
-    checkdocs=:none,
+    checkdocs=:exports,
 )
diff --git a/docs/src/api.md b/docs/src/api.md
@@ -0,0 +1,25 @@
+# API
+
+## Abstract Types
+```@docs
+Template
+Model
+```
+
+## Common API
+```@docs
+fit
+predict
+estimate_type
+output_type
+```
+
+## Traits
+```@docs
+EstimateTrait
+PointEstimate
+DistributionEstimate
+OutputTrait
+SingleOutput
+MultiOutput
+```
diff --git a/docs/src/design.md b/docs/src/design.md
@@ -0,0 +1,82 @@
+## Design Documentation
+
+This page details the key features of the design of Models.jl.
+
+Models.jl exists to solve the issue highlighted by the following quote:
+
+> ML researchers tend to develop general purpose solutions as self-contained packages.
+> A wide variety of these are available as open-source packages ...
+> Using generic packages often results in a glue-code system design pattern, in which a massive amount of supporting code is written to get data into and out of general-purpose packages.
+> Glue-code is costly in the long term because it tends to freeze a system to the peculiarities of a specific package; testing alternatives may become prohibitively expensive....
+> **An important strategy for combating glue-code is to wrap black-box packages into common API’s.**
+> This allows supporting infrastructure to be more reusable and reduces the cost of changing packages.
+
+-- [Sculley et al 2015](https://papers.nips.cc/paper/5656-hidden-technical-debt-in-machine-learning-systems)
+
+Models.jl provides a common API for mostly preexisting models to allow them to all be used in the same way.
+As such, the most important thing is that it itself has a common API.
+Here are some facts about that API:
+
+### Models and Templates
+
+A **model** is an object that can be used to make predictions via calling `predict`.
+A **template** is an object that can create a *model* by being `fit` to some data.
+
+All information about how to perform `fit`, such as hyper-parameters, is stored inside the *template*.
+This is different from some other APIs which might for example pass those as keyword arguments to `fit`.
+The template based API is superior to these as it means `fit` is always the same.
+One does not have to carry both a model type, and a varying collection of keyword arguments, which would get complicated when composing wrapper models.
+
+
+### `fit` and `predict`
+
+```julia
+model = StatsBase.fit(
+    template,
+    outputs::AbstractMatrix,  # always Variates x Observations
+    inputs::AbstractMatrix,   # always Features x Observations
+    weights=uweights(Float32, size(outputs, 2))
+)::Model
+```
+
+```julia
+outputs = StatsBase.predict(
+    model,
+    inputs::AbstractMatrix  # always Features x Observations
+)::AbstractMatrix  # always Variates x Observations
+```
+
+`fit` takes in a *template* and some *data* and returns a `Model` that has been fit to the data.
+`predict` takes a `Model` (that has been `fit` from a *template*) and produces a predicted output.
+
+Important facts about `fit` and `predict`:
+ - `outputs` and `inputs` always have observations as the second dimension -- even if it is [`SingleOutput`](@ref) (that just means that it will be a `1 x num_obs` output). (See [Docs on Julia being column-major](https://docs.julialang.org/en/v1/manual/performance-tips/#Access-arrays-in-memory-order,-along-columns-1))
+ - The functions must accept any `AbstractMatrix` for the `inputs` and `outputs` (`fit` only). If the underlying implementation needs a plain dense `Matrix` then `fit`/`predict` should perform the conversion.
+ - `fit` always accepts a `weights` argument. If the underlying model does not support weighted fitting, then `fit` should throw an error if the weights that are passed in are not all equal.
+ - `fit`/`predict` take no keyword arguments, or any other arguments except the ones shown.
+
+### Traits
+
+This package largely avoids using complicated abstract types, or relying on a model having a particular abstract type.
+Instead we use [traits](https://invenia.github.io/blog/2019/11/06/julialang-features-part-2/) to determine model behavior.
+
+Here are the current model traits in use and their possible values:
+ - `estimate_type` - determines what kinds of estimates the model outputs.
+   - `PointEstimate`: Predicts point-estimates of the most likely values.
+   - `DistributionEstimate`: Estimates distributions over possible values.
+ - `output_type` - determines how many output variates a model can learn.
+   - `SingleOutput`: Fits and predicts on a single output only.
+   - `MultiOutput`: Fits and predicts on multiple outputs at a time.
+
+The traits always agree between the model and the template.
+Every model and template should define all the listed traits.
+
+This package uses traits implemented such that the trait function returns an `abstract type` (rather than an instance).
+That means to check a trait one uses:
+```julia
+if estimate_type(model) <: DistributionEstimate
+```
+and to dispatch on a trait one uses:
+```julia
+foo(::Type{<:DistributionEstimate}, ...)
+```
diff --git a/docs/src/index.md b/docs/src/index.md
@@ -1,8 +1,17 @@
-# Models.jl
+# Models
 
-```@index
-```
+## Why does this package exist?
+
+[Models.jl](https://gitlab.invenia.ca/invenia/research/Models.jl) defines the [`Template`](@ref) and [`Model`](@ref) types as well as a common API for constructing a generic model in downstream packages, including:
+
+* Calling [`fit`](@ref) on a [`Template`](@ref).
+* Calling [`predict`](@ref) on a [`Model`](@ref).
+* Assigning traits such as [`EstimateTrait`](@ref) and [`OutputTrait`](@ref).
+* Testing interfaces and downstream dependencies with [`TestUtils`](@ref).
+
+For common examples of the interface being implemented see [BaselineModels.jl](https://gitlab.invenia.ca/invenia/research/BaselineModels.jl).
 
-```@autodocs
-Modules = [Models]
+## Contents
+```@contents
+Pages = ["api.md", "testutils.md"]
 ```
diff --git a/docs/src/testutils.md b/docs/src/testutils.md
@@ -0,0 +1,9 @@
+# TestUtils
+
+Provides test fakes, [`FakeTemplate`](@ref) and [`FakeModel`](@ref), that are useful for
+testing downstream dependencies, and [`test_interface`](@ref) for testing that the Model's
+API has been correctly implemented.
+
+```@autodocs
+Modules = [Models.TestUtils]
+```
diff --git a/src/Models.jl b/src/Models.jl
@@ -1,5 +1,49 @@
 module Models
 
-greet() = print("Hello World!")
+import StatsBase: fit, predict
+
+export Model, Template
+export fit, predict, estimate_type, output_type
+export EstimateTrait, PointEstimate, DistributionEstimate
+export OutputTrait, SingleOutput, MultiOutput
+
+"""
+    Template
+
+A Template is an untrained [`Model`](@ref) that can be [`fit`](@ref) to data.
+The following traits are also defined:
+- [`output_type`](@ref): `SingleOutput` or `MultiOutput`
+- [`estimate_type`](@ref): `PointEstimate` or `DistributionEstimate`
+"""
+abstract type Template end
+
+"""
+    Model
+
+A Model is a trained [`Template`](@ref) with which one can [`predict`](@ref) on inputs.
+The following traits are also defined:
+- [`output_type`](@ref): `SingleOutput` or `MultiOutput`
+- [`estimate_type`](@ref): `PointEstimate` or `DistributionEstimate`
+"""
+abstract type Model end
+
+"""
+    fit(::Template, output, input) -> Model
+
+Fit the `Template` to the `output` and `input` data and return a trained `Model`.
+"""
+function fit end
+
+"""
+    predict(::Model, input)
+
+Predict targets for the provided `input` and `Model`.
+
+Returns a predictive distribution or point estimates depending on the `Model`.
+"""
+function predict end
+
+include("traits.jl")
+include("test_utils.jl")
 
 end # module