diff --git a/Bundle.ecl b/Bundle.ecl index e500c8e..90c1eb3 100644 --- a/Bundle.ecl +++ b/Bundle.ecl @@ -9,6 +9,6 @@ EXPORT Bundle := MODULE(Std.BundleBase) EXPORT License := 'http://www.apache.org/licenses/LICENSE-2.0'; EXPORT Copyright := 'Copyright (C) 2022 HPCC Systems®'; EXPORT DependsOn := ['ML_Core']; - EXPORT Version := '1.0'; - EXPORT PlatformVersion := '8.4.0'; + EXPORT Version := '2.0'; + EXPORT PlatformVersion := '8.10.6'; END; diff --git a/Causality.ecl b/Causality.ecl index d5c3a32..cd1384d 100644 --- a/Causality.ecl +++ b/Causality.ecl @@ -31,6 +31,8 @@ NumericField := cTypes.NumericField; * intervention on one or more variable. See Query for details. * - Metrics -- Evaluate various causal metrics on desgnated pairs * [source, destination] of variables. + * - DiscoverModel -- Utilize a range of causal discovery methods to discover causal relationships + * between variables. * * @param mod A causal model in DATASET(cModel) format. The dataset should * contain only a single record, defining the model. @@ -216,7 +218,33 @@ EXPORT Causality(DATASET(cModelTyp) mod, UNSIGNED PS) := MODULE RETURN metrics_S; END; + /** + * Analyze the data to estimate the causal relationships between variables. + * + * @param vars A set of variable names among which to discover relationships. If omitted, + * will use all variables in dataset. + * + * @param pwr The power to use for statistical queries. Range [1, 100]. The higher power, + * the more accuracy, but longer runtime. Power=1 suffices for linear relationships. + * Power > 10 is not recommended due to very long runtimes. Default = 1. + * @param sensitivity The sensitivity of dependence detection to use. Range 1.0 - 10.0. Default is 10 + * (Maximum Sensitivity). It can be useful to reduce sensitivity in real-world datasets, + * to restrict the number of relationships found. + * @param depth Determines how many simultaneous conditional variables will be evaluated. Default = 2. 
+ * Values above 3 may be problematic due to long run times, and possibly exceeding the sensitivity + * of the instruments. + * @return A DATASET(DiscResult) with a single record representing the results + * of the discovery. + * @see Types.DiscResult + */ + EXPORT DATASET(DiscResult) DiscoverModel(SET OF STRING vars=[], REAL pwr=powerDefault, REAL sensitivity=10, UNSIGNED depth=2) := FUNCTION + result := cModel.DiscoverModel(vars, pwr, sensitivity, depth, CM); + RETURN result; + END; + + /** + * This function is deprecated. Use DiscoverModel instead. * Analyze the data to estimate the causal relationships between variables. * * Produces information that is useful for understanding the variables' relationships, @@ -244,9 +272,4 @@ EXPORT Causality(DATASET(cModelTyp) mod, UNSIGNED PS) := MODULE RETURN rpt; END; - EXPORT DATASET(DiscResult) DiscoverModel(SET OF STRING vars, REAL pwr=powerDefault, REAL sensitivity=10, UNSIGNED depth=2) := FUNCTION - result := cModel.DiscoverModel(vars, pwr, sensitivity, depth, CM); - RETURN result; - END; - END; \ No newline at end of file diff --git a/README.md b/README.md index 2fea381..16a6bc7 100644 --- a/README.md +++ b/README.md @@ -43,6 +43,7 @@ ecl bundle install https://github.com/RogerDev/HPCC_Causality.git Each of the main modules provides documentation and examples of use. Documentation is provided within the module and examples are in the test folder. +For an in-depth tutorial, please see: https://hpccsystems.com/resources/hpcc-causality-tookit-version-2-0. ### Probability Queries The probability, causality, and Visualization modules share a common query format. Natural textual queries are used as they allow both simple and sophisticated queries to be composed without complex nested data structures. @@ -61,4 +62,6 @@ There are minor differences in the query semantics between the three use cases: - Probability supports two types of query: - Scalar Query -- Returns a single value. 
For example: P(A = 5), E(A) - Distribution Query -- Returns a 'distribution' structure. For example: P(A), P(A | B > 0). -- Causality provides a superset of probability queries, that may also contain a 'do()' clause, specifying one or more causal intervention. Causal queries require the specification of a 'causal model' in addtion to the query. +- Causality provides a superset of probability queries, that may also contain a 'do()' clause, specifying one or more causal interventions. Causal queries require the specification of a 'causal model' in addition to the query. + + diff --git a/Types.ecl b/Types.ecl index 8963f15..db71d87 100644 --- a/Types.ecl +++ b/Types.ecl @@ -24,12 +24,20 @@ EXPORT Types := MODULE * Natural Language Query. * * Supports probability queries in a simple format. + * @field id A unique id for this query. + * @field query A string representing the probability query e.g. 'P(A=1)'. */ EXPORT nlQuery := RECORD UNSIGNED id; STRING query; END; + /** + * Result of a causal query. Includes the original query in addition + * to the AnyField result. + * + * @field query The original query that produced this result. + */ EXPORT nlQueryRslt := RECORD(AnyField) STRING query; END; @@ -106,11 +114,21 @@ EXPORT Types := MODULE REAL P; END; + /** + * Child dataset of Distribution to hold the mapping between string values and + * their numeric equivalent + */ EXPORT StrValEntry := RECORD UNSIGNED numVal; STRING strVal; END; + /** + * Input to a causal metric query. + * @field id A unique id for this query + * @field cause The name of the causal variable for the query + * @field effect The name of the effect variable for the query + */ EXPORT MetricQuery := RECORD UNSIGNED id; STRING cause; @@ -323,6 +341,17 @@ EXPORT Types := MODULE DATASET(SetMembers) VarGraph; END; + /** + * Results of the DiscoverModel function + * Provides the discovered causal model as a list of edges [cause, effect], and + * associated metrics. 
+ * + * @field causeVar The name of the causal variable in the relationship + * @field effectVar The name of the effect variable in the relationship + * @field strength The strength of the dependence between the variables + * @field correlation The statistical correlation between the variables + * @field MDE The Maximum Direct Effect of the cause on the effect variable. + */ EXPORT DiscoveryResult := RECORD STRING causeVar; STRING effectVar; @@ -331,11 +360,54 @@ EXPORT Types := MODULE REAL MDE; END; +/** + * Child data type for DatasetSummary below. Describes a single variable + * in the dataset. + * + * @field name The name of the variable. + * @field isDiscrete True if the variable is discrete, otherwise False. + * @field isCategorical True if the variable is categorical, otherwise False. + * @field isTextual True if the variable is a text-based categorical, otherwise False. + * @field cardinality The number of unique values which the discrete variable takes in the dataset. + * @field numValues The numeric values the discrete variable takes in the dataset. + * @field textValues The textual values the textual categorical variable takes in the dataset + */ + EXPORT VarSummary := RECORD + STRING name; + BOOLEAN isDiscrete; + BOOLEAN isCategorical; + BOOLEAN isTextual; + UNSIGNED cardinality; + SET OF REAL numValues; + SET OF STRING textValues; + END; + /** + * Dataset Summary returned from Probability.Summary. + * + * Provides an overview of the dataset. + * @field numRecords The number of records in the dataset. + * @field varNames A set of the variable names in the dataset + * @field varDetails A dataset of VarSummary records describing each variable in the dataset. 
+ */ + EXPORT DatasetSummary := RECORD + UNSIGNED numRecords; + SET OF STRING varNames; + DATASET(VarSummary) varDetails; + END; + + + /** + * @internal + * Internal data type used by visualization + */ EXPORT ChartGrid := RECORD UNSIGNED id; DATASET(AnyField) gridItem; END; - + /** + * @internal + * Internal data type used by visualization + */ EXPORT ChartData := RECORD UNSIGNED id; STRING x_; @@ -346,7 +418,10 @@ EXPORT Types := MODULE REAL range2low := 0.0; REAL range2high := 0.0; END; - + /** + * @internal + * Internal data type used by visualization + */ EXPORT ChartInfo := RECORD STRING dataname; STRING qtype; @@ -362,19 +437,4 @@ EXPORT Types := MODULE REAL range2high; END; - EXPORT VarSummary := RECORD - STRING name; - BOOLEAN isDiscrete; - BOOLEAN isCategorical; - BOOLEAN isTextual; - UNSIGNED cardinality; - SET OF REAL numValues; - SET OF STRING textValues; - - END; - EXPORT DatasetSummary := RECORD - UNSIGNED numRecords; - SET OF STRING varNames; - DATASET(VarSummary) varDetails; - END; END;