Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-15991: Infogram pydocs updates #15992

Open
wants to merge 10 commits into
base: rel-3.46.0
Choose a base branch
from
119 changes: 118 additions & 1 deletion h2o-bindings/bin/custom/python/gen_infogram.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def _extract_x_from_model(self):

def plot(self, train=True, valid=False, xval=False, figsize=(10, 10), title="Infogram", legend_on=False, server=False):
"""
Plot the infogram. By default, it will plot the infogram calculated from training dataset.
Plot the infogram. By default, it will plot the infogram calculated from training dataset.
Note that the frame rel_cmi_frame contains the following columns:
- 0: predictor names
- 1: admissible
Expand Down Expand Up @@ -435,3 +435,120 @@ def train_subset_models(self, model_class, y, training_frame, test_frame, protec
feature set. Admissible models are also less susceptible to overfitting and train faster, while providing similar accuracy as models built using all available features.
"""
)
# Doctest-style usage examples keyed by H2OInfogram parameter name; each value
# is spliced verbatim into the docstring of the property with the same name.
#
# Every example follows the same recipe: import the Taiwan credit-card data,
# take the first frame returned by ``split_frame`` as the training set, build
# the predictor list, then train and plot an infogram with the documented
# parameter set on the estimator.
#
# NOTE: ``H2OFrame.split_frame`` returns a *list* of frames, so the result must
# be indexed (``[0]``) before ``.columns`` can be read from it.
# NOTE: ``algorithm_params`` is an estimator parameter, so it is passed to the
# ``H2OInfogram`` constructor rather than to ``train()``.
examples = dict(
    algorithm_params="""
>>> import h2o
>>> from h2o.estimators.infogram import H2OInfogram
>>> h2o.init()
>>> f = "https://erin-data.s3.amazonaws.com/admissible/data/taiwan_credit_card_uci.csv"
>>> col_types = {'SEX': "enum", 'MARRIAGE': "enum", 'default_payment_next_month': "enum"}
>>> df = h2o.import_file(path=f, col_types=col_types)
>>> train = df.split_frame(seed=1)[0]
>>> y = "default_payment_next_month"
>>> x = train.columns
>>> x.remove(y)
>>> gbm_params = {'ntrees':3}
>>> pcols = ["SEX", "MARRIAGE", "AGE"]
>>> ig = H2OInfogram(protected_columns=pcols, algorithm="gbm", algorithm_params=gbm_params)
>>> ig.train(y=y, x=x, training_frame=train)
>>> ig.plot()
""",
    data_fraction="""
>>> import h2o
>>> from h2o.estimators.infogram import H2OInfogram
>>> h2o.init()
>>> f = "https://erin-data.s3.amazonaws.com/admissible/data/taiwan_credit_card_uci.csv"
>>> col_types = {'SEX': "enum", 'MARRIAGE': "enum", 'default_payment_next_month': "enum"}
>>> df = h2o.import_file(path=f, col_types=col_types)
>>> train = df.split_frame(seed=1)[0]
>>> y = "default_payment_next_month"
>>> x = train.columns
>>> x.remove(y)
>>> pcols = ["SEX", "MARRIAGE", "AGE"]
>>> ig = H2OInfogram(protected_columns=pcols, data_fraction=0.7)
>>> ig.train(y=y, x=x, training_frame=train)
>>> ig.plot()
""",
    net_information_threshold="""
>>> import h2o
>>> from h2o.estimators.infogram import H2OInfogram
>>> h2o.init()
>>> f = "https://erin-data.s3.amazonaws.com/admissible/data/taiwan_credit_card_uci.csv"
>>> col_types = {'SEX': "enum", 'MARRIAGE': "enum", 'default_payment_next_month': "enum"}
>>> df = h2o.import_file(path=f, col_types=col_types)
>>> train = df.split_frame(seed=1)[0]
>>> y = "default_payment_next_month"
>>> x = train.columns
>>> x.remove(y)
>>> pcols = ["SEX", "MARRIAGE", "AGE"]
>>> ig = H2OInfogram(protected_columns=pcols, net_information_threshold=0.5)
>>> ig.train(y=y, x=x, training_frame=train)
>>> ig.plot()
""",
    relevance_index_threshold="""
>>> import h2o
>>> from h2o.estimators.infogram import H2OInfogram
>>> h2o.init()
>>> f = "https://erin-data.s3.amazonaws.com/admissible/data/taiwan_credit_card_uci.csv"
>>> col_types = {'SEX': "enum", 'MARRIAGE': "enum", 'default_payment_next_month': "enum"}
>>> df = h2o.import_file(path=f, col_types=col_types)
>>> train = df.split_frame(seed=1)[0]
>>> y = "default_payment_next_month"
>>> x = train.columns
>>> x.remove(y)
>>> pcols = ["SEX", "MARRIAGE", "AGE"]
>>> ig = H2OInfogram(protected_columns=pcols, relevance_index_threshold=0.5)
>>> ig.train(y=y, x=x, training_frame=train)
>>> ig.plot()
""",
    safety_index_threshold="""
>>> import h2o
>>> from h2o.estimators.infogram import H2OInfogram
>>> h2o.init()
>>> f = "https://erin-data.s3.amazonaws.com/admissible/data/taiwan_credit_card_uci.csv"
>>> col_types = {'SEX': "enum", 'MARRIAGE': "enum", 'default_payment_next_month': "enum"}
>>> df = h2o.import_file(path=f, col_types=col_types)
>>> train = df.split_frame(seed=1)[0]
>>> y = "default_payment_next_month"
>>> x = train.columns
>>> x.remove(y)
>>> pcols = ["SEX", "MARRIAGE", "AGE"]
>>> ig = H2OInfogram(protected_columns=pcols, safety_index_threshold=0.5)
>>> ig.train(y=y, x=x, training_frame=train)
>>> ig.plot()
""",
    top_n_features="""
>>> import h2o
>>> from h2o.estimators.infogram import H2OInfogram
>>> h2o.init()
>>> f = "https://erin-data.s3.amazonaws.com/admissible/data/taiwan_credit_card_uci.csv"
>>> col_types = {'SEX': "enum", 'MARRIAGE': "enum", 'default_payment_next_month': "enum"}
>>> df = h2o.import_file(path=f, col_types=col_types)
>>> train = df.split_frame(seed=1)[0]
>>> y = "default_payment_next_month"
>>> x = train.columns
>>> x.remove(y)
>>> pcols = ["SEX", "MARRIAGE", "AGE"]
>>> ig = H2OInfogram(protected_columns=pcols, top_n_features=30)
>>> ig.train(y=y, x=x, training_frame=train)
>>> ig.plot()
""",
    total_information_threshold="""
>>> import h2o
>>> from h2o.estimators.infogram import H2OInfogram
>>> h2o.init()
>>> f = "https://erin-data.s3.amazonaws.com/admissible/data/taiwan_credit_card_uci.csv"
>>> col_types = {'SEX': "enum", 'MARRIAGE': "enum", 'default_payment_next_month': "enum"}
>>> df = h2o.import_file(path=f, col_types=col_types)
>>> train = df.split_frame(seed=1)[0]
>>> y = "default_payment_next_month"
>>> x = train.columns
>>> x.remove(y)
>>> pcols = ["SEX", "MARRIAGE", "AGE"]
>>> ig = H2OInfogram(protected_columns=pcols, total_information_threshold=0.5)
>>> ig.train(y=y, x=x, training_frame=train)
>>> ig.plot()
""",
)


122 changes: 121 additions & 1 deletion h2o-py/h2o/estimators/infogram.py
Original file line number Diff line number Diff line change
Expand Up @@ -700,6 +700,24 @@ def algorithm_params(self):
Customized parameters for the machine learning algorithm specified in the algorithm parameter.

Type: ``dict``.

:examples:

>>> import h2o
>>> from h2o.estimators.infogram import H2OInfogram
>>> h2o.init()
>>> f = "https://erin-data.s3.amazonaws.com/admissible/data/taiwan_credit_card_uci.csv"
>>> col_types = {'SEX': "enum", 'MARRIAGE': "enum", 'default_payment_next_month': "enum"}
>>> df = h2o.import_file(path=f, col_types=col_types)
>>> train = df.split_frame(seed=1)[0]
>>> y = "default_payment_next_month"
>>> x = train.columns
>>> x.remove(y)
>>> gbm_params = {'ntrees':3}
>>> pcols = ["SEX", "MARRIAGE", "AGE"]
>>> ig = H2OInfogram(protected_columns=pcols, algorithm="gbm", algorithm_params=gbm_params)
>>> ig.train(y=y, x=x, training_frame=train)
>>> ig.plot()
shaunyogeshwaran marked this conversation as resolved.
Show resolved Hide resolved
"""
if self._parms.get("algorithm_params") != None:
algorithm_params_dict = ast.literal_eval(self._parms.get("algorithm_params"))
Expand Down Expand Up @@ -745,6 +763,23 @@ def total_information_threshold(self):
information is the x-axis of the Core Infogram. Default is -1 which gets set to 0.1.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Out of scope but I don't understand Default is -1 which gets set to 0.1.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also the first line is not correct since the default is -0.1.

A number between 0 and 1 representing a threshold for total information, defaulting to 0.1.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@wendycwong any ideas about what's going on with the default values here?

(I can update the schema and fix the first line issue)

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@hannah-tillman I understand it now.

It comes from here:

https://github.com/h2oai/h2o-3/blob/master/h2o-admissibleml/src/main/java/hex/Infogram/Infogram.java#L185-L187

@wendycwong any reason why we not set it directly here?

Nevertheless, its out of scope of this PR. @shaunyogeshwaran.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@valenad1 @hannah-tillman

The value of -1 or -0.1 is used to denote that the user has not set any value. If the user has not set any value, we will set it to a default value of 0.1. There is a reason that the code needs to know if the user set that value. I cannot remember what it is now.


Type: ``float``, defaults to ``-1.0``.

:examples:

>>> import h2o
>>> from h2o.estimators.infogram import H2OInfogram
>>> h2o.init()
>>> f = "https://erin-data.s3.amazonaws.com/admissible/data/taiwan_credit_card_uci.csv"
>>> col_types = {'SEX': "enum", 'MARRIAGE': "enum", 'default_payment_next_month': "enum"}
>>> df = h2o.import_file(path=f, col_types=col_types)
>>> train = df.split_frame(seed=1)[0]
>>> y = "default_payment_next_month"
>>> x = train.columns
>>> x.remove(y)
>>> pcols = ["SEX", "MARRIAGE", "AGE"]
>>> ig = H2OInfogram(protected_columns=pcols, total_information_threshold=0.5)
>>> ig.train(y=y, x=x, training_frame=train)
>>> ig.plot()
"""
return self._parms.get("total_information_threshold")

Expand All @@ -768,6 +803,23 @@ def net_information_threshold(self):
the y-axis of the Core Infogram. Default is -1 which gets set to 0.1.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same here. First line says number between 0-1,... Default -1 sets to 0.1..


Type: ``float``, defaults to ``-1.0``.

:examples:

>>> import h2o
>>> from h2o.estimators.infogram import H2OInfogram
>>> h2o.init()
>>> f = "https://erin-data.s3.amazonaws.com/admissible/data/taiwan_credit_card_uci.csv"
>>> col_types = {'SEX': "enum", 'MARRIAGE': "enum", 'default_payment_next_month': "enum"}
>>> df = h2o.import_file(path=f, col_types=col_types)
>>> train = df.split_frame(seed=1)[0]
>>> y = "default_payment_next_month"
>>> x = train.columns
>>> x.remove(y)
>>> pcols = ["SEX", "MARRIAGE", "AGE"]
>>> ig = H2OInfogram(protected_columns=pcols, net_information_threshold=0.5)
>>> ig.train(y=y, x=x, training_frame=train)
>>> ig.plot()
"""
return self._parms.get("net_information_threshold")

Expand All @@ -792,6 +844,23 @@ def relevance_index_threshold(self):
which gets set to 0.1.

Type: ``float``, defaults to ``-1.0``.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same here. First line says number between 0-1,... Default -1 sets to 0.1..

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think @valenad1 has a good point. We could just set those values to 0.1 as default instead of setting it to -1 and then set it to 0.1.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let me make a quick change in a new PR.


:examples:

>>> import h2o
>>> from h2o.estimators.infogram import H2OInfogram
>>> h2o.init()
>>> f = "https://erin-data.s3.amazonaws.com/admissible/data/taiwan_credit_card_uci.csv"
>>> col_types = {'SEX': "enum", 'MARRIAGE': "enum", 'default_payment_next_month': "enum"}
>>> df = h2o.import_file(path=f, col_types=col_types)
>>> train = df.split_frame(seed=1)[0]
>>> y = "default_payment_next_month"
>>> x = train.columns
>>> x.remove(y)
>>> pcols = ["SEX", "MARRIAGE", "AGE"]
>>> ig = H2OInfogram(protected_columns=pcols, relevance_index_threshold=0.5)
>>> ig.train(y=y, x=x, training_frame=train)
>>> ig.plot()
"""
return self._parms.get("relevance_index_threshold")

Expand All @@ -816,6 +885,23 @@ def safety_index_threshold(self):
gets set to 0.1.

Type: ``float``, defaults to ``-1.0``.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same here. First line says number between 0-1,... Default -1 sets to 0.1..


:examples:

>>> import h2o
>>> from h2o.estimators.infogram import H2OInfogram
>>> h2o.init()
>>> f = "https://erin-data.s3.amazonaws.com/admissible/data/taiwan_credit_card_uci.csv"
>>> col_types = {'SEX': "enum", 'MARRIAGE': "enum", 'default_payment_next_month': "enum"}
>>> df = h2o.import_file(path=f, col_types=col_types)
>>> train = df.split_frame(seed=1)[0]
>>> y = "default_payment_next_month"
>>> x = train.columns
>>> x.remove(y)
>>> pcols = ["SEX", "MARRIAGE", "AGE"]
>>> ig = H2OInfogram(protected_columns=pcols, safety_index_threshold=0.5)
>>> ig.train(y=y, x=x, training_frame=train)
>>> ig.plot()
"""
return self._parms.get("safety_index_threshold")

Expand All @@ -837,6 +923,23 @@ def data_fraction(self):
and less than or equal to 1.0 is acceptable.

Type: ``float``, defaults to ``1.0``.

:examples:

>>> import h2o
>>> from h2o.estimators.infogram import H2OInfogram
>>> h2o.init()
>>> f = "https://erin-data.s3.amazonaws.com/admissible/data/taiwan_credit_card_uci.csv"
>>> col_types = {'SEX': "enum", 'MARRIAGE': "enum", 'default_payment_next_month': "enum"}
>>> df = h2o.import_file(path=f, col_types=col_types)
>>> train = df.split_frame(seed=1)[0]
>>> y = "default_payment_next_month"
>>> x = train.columns
>>> x.remove(y)
>>> pcols = ["SEX", "MARRIAGE", "AGE"]
>>> ig = H2OInfogram(protected_columns=pcols, data_fraction=0.7)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this work the same as split_frame?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

>>> ig.train(y=y, x=x, training_frame=train)
>>> ig.plot()
"""
return self._parms.get("data_fraction")

Expand All @@ -852,6 +955,23 @@ def top_n_features(self):
importance, and the top N are evaluated. Defaults to 50.

Type: ``int``, defaults to ``50``.

:examples:

>>> import h2o
>>> from h2o.estimators.infogram import H2OInfogram
>>> h2o.init()
>>> f = "https://erin-data.s3.amazonaws.com/admissible/data/taiwan_credit_card_uci.csv"
>>> col_types = {'SEX': "enum", 'MARRIAGE': "enum", 'default_payment_next_month': "enum"}
>>> df = h2o.import_file(path=f, col_types=col_types)
>>> train = df.split_frame(seed=1)[0]
>>> y = "default_payment_next_month"
>>> x = train.columns
>>> x.remove(y)
>>> pcols = ["SEX", "MARRIAGE", "AGE"]
>>> ig = H2OInfogram(protected_columns=pcols, top_n_features=30)
>>> ig.train(y=y, x=x, training_frame=train)
>>> ig.plot()
"""
return self._parms.get("top_n_features")

Expand All @@ -874,7 +994,7 @@ def _extract_x_from_model(self):

def plot(self, train=True, valid=False, xval=False, figsize=(10, 10), title="Infogram", legend_on=False, server=False):
"""
Plot the infogram. By default, it will plot the infogram calculated from training dataset.
Plot the infogram. By default, it will plot the infogram calculated from training dataset.
Note that the frame rel_cmi_frame contains the following columns:
- 0: predictor names
- 1: admissible
Expand Down
Loading