Skip to content

Commit

Permalink
Final documentation and version change.
Browse files Browse the repository at this point in the history
Signed-off-by: Roger Dev <[email protected]>
  • Loading branch information
RogerDevLN committed Dec 22, 2023
1 parent fd65953 commit 1d1d3af
Show file tree
Hide file tree
Showing 4 changed files with 111 additions and 25 deletions.
4 changes: 2 additions & 2 deletions Bundle.ecl
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,6 @@ EXPORT Bundle := MODULE(Std.BundleBase)
EXPORT License := 'http://www.apache.org/licenses/LICENSE-2.0';
EXPORT Copyright := 'Copyright (C) 2022 HPCC Systems®';
EXPORT DependsOn := ['ML_Core'];
EXPORT Version := '1.0';
EXPORT PlatformVersion := '8.4.0';
EXPORT Version := '2.0';
EXPORT PlatformVersion := '8.10.6';
END;
33 changes: 28 additions & 5 deletions Causality.ecl
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ NumericField := cTypes.NumericField;
* intervention on one or more variable. See Query for details.
* - Metrics -- Evaluate various causal metrics on designated pairs
* [source, destination] of variables.
* - DiscoverModel -- Utilize a range of causal discovery methods to discover causal relationships
* between variables.
*
* @param mod A causal model in DATASET(cModel) format. The dataset should
* contain only a single record, defining the model.
Expand Down Expand Up @@ -216,7 +218,33 @@ EXPORT Causality(DATASET(cModelTyp) mod, UNSIGNED PS) := MODULE
RETURN metrics_S;
END;


/**
  * Analyze the data to estimate the causal relationships between variables.
  *
  * Thin wrapper: forwards every argument, together with this module's CM,
  * to cModel.DiscoverModel and returns its result unchanged.
  *
  * @param vars A set of variable names among which to discover relationships.
  *             If omitted, all variables in the dataset are used.
  * @param pwr The power to use for statistical queries. Range [1, 100]. Higher
  *            power gives more accuracy but a longer runtime. Power = 1 suffices
  *            for linear relationships. Power > 10 is not recommended due to very
  *            long runtimes. Default = 1.
  * @param sensitivity The sensitivity of dependence detection to use.
  *            Range [1.0, 10.0]. Default is 10 (maximum sensitivity). Reducing
  *            sensitivity can be useful on real-world datasets to restrict the
  *            number of relationships found.
  * @param depth Determines how many simultaneous conditional variables will be
  *            evaluated. Default = 2. Values above 3 may be problematic due to
  *            long run times, and possibly exceeding the sensitivity of the
  *            instruments.
  * @return A DATASET(DiscResult) with a single record representing the results
  *         of the discovery.
  * @see Types.DiscResult
  */
EXPORT DATASET(DiscResult) DiscoverModel(SET OF STRING vars=[], REAL pwr=powerDefault, REAL sensitivity=10, UNSIGNED depth=2) := FUNCTION
  // No local state is needed; hand the request straight to the model layer.
  RETURN cModel.DiscoverModel(vars, pwr, sensitivity, depth, CM);
END;

/**
* This function is Deprecated. Use DiscoverModel instead.
* Analyze the data to estimate the causal relationships between variables.
*
* Produces information that is useful for understanding the variables' relationships,
Expand Down Expand Up @@ -244,9 +272,4 @@ EXPORT Causality(DATASET(cModelTyp) mod, UNSIGNED PS) := MODULE
RETURN rpt;
END;

EXPORT DATASET(DiscResult) DiscoverModel(SET OF STRING vars, REAL pwr=powerDefault, REAL sensitivity=10, UNSIGNED depth=2) := FUNCTION
result := cModel.DiscoverModel(vars, pwr, sensitivity, depth, CM);
RETURN result;
END;

END;
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ ecl bundle install https://github.com/RogerDev/HPCC_Causality.git

Each of the main modules provides documentation and examples of use.
Documentation is provided within the module and examples are in the test folder.
For an in-depth tutorial, please see: https://hpccsystems.com/resources/hpcc-causality-tookit-version-2-0.

### Probability Queries
The Probability, Causality, and Visualization modules share a common query format. Natural textual queries are used because they allow both simple and sophisticated queries to be composed without complex nested data structures.
Expand All @@ -61,4 +62,6 @@ There are minor differences in the query semantics between the three use cases:
- Probability supports two types of query:
- Scalar Query -- Returns a single value. For example: P(A = 5), E(A)
- Distribution Query -- Returns a 'distribution' structure. For example: P(A), P(A | B > 0).
- Causality provides a superset of probability queries, that may also contain a 'do()' clause, specifying one or more causal intervention. Causal queries require the specification of a 'causal model' in addtion to the query.
- Causality provides a superset of probability queries that may also contain a 'do()' clause, specifying one or more causal interventions. Causal queries require the specification of a 'causal model' in addition to the query.


94 changes: 77 additions & 17 deletions Types.ecl
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,20 @@ EXPORT Types := MODULE
* Natural Language Query.
*
* Supports probability queries in a simple format.
* @field id A unique id for this query.
* @field query A string representing the probability query e.g. 'P(A=1)'.
*/
EXPORT nlQuery := RECORD
UNSIGNED id; // Unique id for this query
STRING query; // Text of the probability query, e.g. 'P(A=1)'
END;

/**
* Result of a causal query. Includes the original query in addition
* to the AnyField result.
*
* Inherits all fields of AnyField via RECORD(AnyField); AnyField itself is
* declared outside this excerpt.
*
* @field query The original query that produced this result.
*/
EXPORT nlQueryRslt := RECORD(AnyField)
STRING query;
END;
Expand Down Expand Up @@ -106,11 +114,21 @@ EXPORT Types := MODULE
REAL P;
END;

/**
* Child dataset of Distribution to hold the mapping between string values and
* their numeric equivalent.
*
* @field numVal The numeric equivalent of the string value.
* @field strVal The string value being mapped.
*/
EXPORT StrValEntry := RECORD
UNSIGNED numVal;
STRING strVal;
END;

/**
* Input to a causal metric query.
* @field id A unique id for this query
* @field cause The name of the causal variable for the query
* @field effect The name of the effect variable for the query
*/
EXPORT MetricQuery := RECORD
UNSIGNED id;
STRING cause;
Expand Down Expand Up @@ -323,6 +341,17 @@ EXPORT Types := MODULE
DATASET(SetMembers) VarGraph;
END;

/**
* Results of the DiscoverModel function.
* Provides the discovered causal model as a list of edges [cause, effect], and
* associated metrics.
*
* @field causeVar The name of the causal variable in the relationship
* @field effectVar The name of the effect variable in the relationship
* @field strength The strength of the dependence between the variables
* @field correlation The statistical correlation between the variables
* @field MDE The Maximum Direct Effect of the cause on the effect variable.
*/
EXPORT DiscoveryResult := RECORD
STRING causeVar;
STRING effectVar;
Expand All @@ -331,11 +360,54 @@ EXPORT Types := MODULE
REAL MDE;
END;

/**
* Child data type for DatasetSummary below. Describes a single variable
* in the dataset.
*
* @field name The name of the variable.
* @field isDiscrete True if the variable is discrete, otherwise False.
* @field isCategorical True if the variable is categorical, otherwise False.
* @field isTextual True if the variable is a text-based categorical, otherwise False.
* @field cardinality The number of unique values which the discrete variable takes in the dataset.
* @field numValues The numeric values the discrete variable takes in the dataset.
* @field textValues The textual values the textual categorical variable takes in the dataset.
*/
EXPORT VarSummary := RECORD
STRING name;
BOOLEAN isDiscrete;
BOOLEAN isCategorical;
BOOLEAN isTextual;
UNSIGNED cardinality;
SET OF REAL numValues;
SET OF STRING textValues;
END;
/**
* Dataset Summary returned from Probability.Summary.
*
* Provides an overview of the dataset.
* @field numRecords The number of records in the dataset.
* @field varNames A set of the variable names in the dataset.
* @field varDetails A child dataset of VarSummary records describing each variable in the dataset.
*/
EXPORT DatasetSummary := RECORD
UNSIGNED numRecords;
SET OF STRING varNames;
DATASET(VarSummary) varDetails;
END;


/**
* @internal
* Internal data type used by visualization.
*
* @field id An unsigned id for the entry (NOTE(review): presumably identifies
*           one grid row -- confirm against the visualization code).
* @field gridItem Child dataset of AnyField records held by this entry.
*/
EXPORT ChartGrid := RECORD
UNSIGNED id;
DATASET(AnyField) gridItem;
END;

/**
* @internal
* Internal data type used by visualization
*/
EXPORT ChartData := RECORD
UNSIGNED id;
STRING x_;
Expand All @@ -346,7 +418,10 @@ EXPORT Types := MODULE
REAL range2low := 0.0;
REAL range2high := 0.0;
END;

/**
* @internal
* Internal data type used by visualization
*/
EXPORT ChartInfo := RECORD
STRING dataname;
STRING qtype;
Expand All @@ -362,19 +437,4 @@ EXPORT Types := MODULE
REAL range2high;
END;

EXPORT VarSummary := RECORD
STRING name;
BOOLEAN isDiscrete;
BOOLEAN isCategorical;
BOOLEAN isTextual;
UNSIGNED cardinality;
SET OF REAL numValues;
SET OF STRING textValues;

END;
EXPORT DatasetSummary := RECORD
UNSIGNED numRecords;
SET OF STRING varNames;
DATASET(VarSummary) varDetails;
END;
END;

0 comments on commit 1d1d3af

Please sign in to comment.