-
-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #6 from WenjieDu/dev
Refactor the code and update the doc
- Loading branch information
Showing
9 changed files
with
252 additions
and
180 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
# ignore special files or folders | ||
*~ | ||
.idea | ||
.DS_Store | ||
|
||
# ignore all building results | ||
dist | ||
build | ||
docs/_build | ||
*.egg-info | ||
|
||
# ignore all testing/running results | ||
.run | ||
.coverage | ||
.pytest_cache | ||
*__pycache__* | ||
*testing_results* | ||
|
||
# ignore specific kinds of files like all PDFs | ||
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,5 @@ | ||
""" | ||
PyCorruptor package | ||
PyCorruptor package. | ||
""" | ||
|
||
# Created by Wenjie Du <[email protected]> | ||
|
@@ -24,12 +24,11 @@ | |
__version__ = "0.0.4" | ||
|
||
try: | ||
from pycorruptor.corrupt import ( | ||
from pycorruptor.mcar import mcar | ||
from pycorruptor.utils import ( | ||
cal_missing_rate, | ||
masked_fill, | ||
mcar, | ||
) | ||
|
||
except Exception as e: | ||
print(e) | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
""" | ||
Corrupt data by adding missing values to it with MAR (missing at random) pattern. | ||
""" | ||
|
||
# Created by Wenjie Du <[email protected]> | ||
# License: GPL-v3 | ||
|
||
|
||
def mar(X, rate, nan=0):
    """Create random missing values (MAR case).

    The MAR corruption is not implemented yet, so every call raises.

    Parameters
    ----------
    X : array,
        Data vector. If X has any missing values, they should be numpy.nan.

    rate : float, in (0,1),
        Artificially missing rate, i.e. the share of the observed values that is
        meant to be artificially masked as missing.

        Note that
        `rate` = (number of artificially missing values) / (number of non-NaN values in X),
        not (number of artificially missing values) / (total number of values in X).
        The given data may already contain missing values, so the latter definition
        would be confusing: if the original missing rate >= `rate`, the function
        would have nothing to do.

    nan : int/float, optional, default=0
        Value used to fill NaN values.

    Raises
    ------
    NotImplementedError
        Always, until the MAR case is implemented.
    """
    # TODO: Create missing values in MAR case
    message = "MAR case has not been implemented yet."
    raise NotImplementedError(message)
|
||
|
||
def _mar_numpy(X, rate, nan=0): | ||
pass | ||
|
||
|
||
def _mar_torch(X, rate, nan=0): | ||
pass |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,5 @@ | ||
""" | ||
Corrupt data by adding missing values to it with optional missing patterns (MCAR,MAR,MNAR). | ||
Corrupt data by adding missing values to it with MCAR (missing completely at random) pattern. | ||
""" | ||
|
||
# Created by Wenjie Du <[email protected]> | ||
|
@@ -13,89 +13,6 @@ | |
pass | ||
|
||
|
||
def cal_missing_rate(X):
    """Calculate the originally missing rate of the raw data.

    Parameters
    ----------
    X : array-like,
        Data array that may contain missing values (NaN). Lists are converted
        with numpy.asarray before counting.

    Returns
    -------
    originally_missing_rate, float,
        The originally missing rate of the raw data, i.e. the number of NaN
        entries divided by the total number of entries.

    Raises
    ------
    TypeError
        If X is not a list, numpy.ndarray or torch.Tensor.
    """
    if isinstance(X, list):
        X = np.asarray(X)

    if isinstance(X, np.ndarray):
        # X.size is used instead of np.product(X.shape): the np.product alias
        # is deprecated and was removed in NumPy 2.0.
        originally_missing_rate = float(np.sum(np.isnan(X)) / X.size)
    elif isinstance(X, torch.Tensor):
        # numel() is the tensor counterpart of ndarray.size; .item() unwraps
        # the 0-d tensor into a Python float.
        originally_missing_rate = (torch.sum(torch.isnan(X)) / X.numel()).item()
    else:
        raise TypeError(
            "X must be type of list/numpy.ndarray/torch.Tensor, " f"but got {type(X)}"
        )

    return originally_missing_rate
|
||
|
||
def masked_fill(X, mask, val):
    """Like torch.Tensor.masked_fill(), fill elements in given `X` with `val` where `mask` is True.

    Parameters
    ----------
    X : array-like,
        The data vector (list, numpy.ndarray or torch.Tensor).

    mask : array-like,
        The boolean mask. Must be of the same type and shape as `X`;
        it is cast to bool before use.

    val : float
        The value to fill in with.

    Returns
    -------
    array,
        `X` with the masked positions set to `val`. A numpy.ndarray or
        torch.Tensor input is modified in place and returned; a list input is
        first converted to a new numpy.ndarray, which is returned.

    Raises
    ------
    AssertionError
        If `X` and `mask` differ in type or in shape.
    TypeError
        If `X` is not a list, numpy.ndarray or torch.Tensor.
    """
    # The checks stay as asserts so callers keep seeing AssertionError.
    assert isinstance(X, type(mask)), (
        "Data types of X and mask must match, " f"but got {type(X)} and {type(mask)}"
    )

    # Convert lists BEFORE touching `.shape`: plain lists have no such
    # attribute, so checking the shape first made list inputs unusable.
    if isinstance(X, list):
        X = np.asarray(X)
        mask = np.asarray(mask)

    # Reject unsupported types before `.shape` is read, so they fail with the
    # intended TypeError rather than an AttributeError.
    if not isinstance(X, (np.ndarray, torch.Tensor)):
        raise TypeError(
            "X must be type of list/numpy.ndarray/torch.Tensor, " f"but got {type(X)}"
        )

    assert X.shape == mask.shape, (
        "Shapes of X and mask must match, "
        f"but X.shape={X.shape}, mask.shape={mask.shape}"
    )

    if isinstance(X, np.ndarray):
        mask = mask.astype(bool)
    else:
        mask = mask.type(torch.bool)
    X[mask] = val

    return X
|
||
|
||
def little_mcar_test(X):
    """Little's MCAR Test.

    Refer to :cite:`little1988TestMCAR`

    Not implemented yet: every call raises NotImplementedError.
    """
    # TODO: Little's MCAR test
    reason = "MCAR test has not been implemented yet."
    raise NotImplementedError(reason)
|
||
|
||
def mcar(X, rate, nan=0): | ||
"""Create completely random missing values (MCAR case). | ||
|
@@ -195,59 +112,10 @@ def _mcar_torch(X, rate, nan=0): | |
return X_intact, X, missing_mask, indicating_mask | ||
|
||
|
||
def mar(X, rate, nan=0):
    """Create random missing values (MAR case).

    The MAR corruption is not implemented yet, so every call raises.

    Parameters
    ----------
    X : array,
        Data vector. If X has any missing values, they should be numpy.nan.

    rate : float, in (0,1),
        Artificially missing rate, i.e. the share of the observed values that is
        meant to be artificially masked as missing.

        Note that
        `rate` = (number of artificially missing values) / (number of non-NaN values in X),
        not (number of artificially missing values) / (total number of values in X).
        The given data may already contain missing values, so the latter definition
        would be confusing: if the original missing rate >= `rate`, the function
        would have nothing to do.

    nan : int/float, optional, default=0
        Value used to fill NaN values.

    Raises
    ------
    NotImplementedError
        Always, until the MAR case is implemented.
    """
    # TODO: Create missing values in MAR case
    message = "MAR case has not been implemented yet."
    raise NotImplementedError(message)
|
||
|
||
def mnar(X, rate, nan=0): | ||
"""Create not-random missing values (MNAR case). | ||
Parameters | ||
---------- | ||
X : array, | ||
Data vector. If X has any missing values, they should be numpy.nan. | ||
rate : float, in (0,1), | ||
Artificially missing rate, rate of the observed values which will be artificially masked as missing. | ||
Note that, | ||
`rate` = (number of artificially missing values) / np.sum(~np.isnan(self.data)), | ||
not (number of artificially missing values) / np.product(self.data.shape), | ||
considering that the given data may already contain missing values, | ||
the latter definition may be confusing because, if the original missing rate >= `rate`, | ||
the function would do nothing, i.e. it would not serve its purpose. | ||
nan : int/float, optional, default=0 | ||
Value used to fill NaN values. | ||
Returns | ||
------- | ||
def little_mcar_test(X): | ||
"""Little's MCAR Test. | ||
Refer to :cite:`little1988TestMCAR` | ||
""" | ||
# TODO: Create missing values in MNAR case | ||
raise NotImplementedError("MNAR case has not been implemented yet.") | ||
# TODO: Little's MCAR test | ||
raise NotImplementedError("MCAR test has not been implemented yet.") |
Oops, something went wrong.