-
Notifications
You must be signed in to change notification settings - Fork 25
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Split component implementation and execution (#302)
This PR follows up on the PoC presented in #268.

---

Fixes #257

It splits the implementation and execution of components, which has some advantages:

- Pandas components can use `__init__` instead of `setup`, which is probably more familiar to users.
- Other components can use `__init__` as well, instead of receiving all arguments in their `transform` or equivalent method, which aligns the implementation of the different component types.
- Component implementation and execution should be easier to test separately.

I borrowed the executor terminology from KfP.

---

Fixes #203

Since I had to update all the components, I also switched some of them to subclass `PandasTransformComponent` instead of `DaskTransformComponent`.

---

These changes open up some opportunities for additional improvements, but I propose to tackle those in separate PRs, as this PR is already quite large due to all the changes to the components.

- [ ] #300
- [ ] #301
- Loading branch information
1 parent
7d43515
commit 6d2251e
Showing 29 changed files with 799 additions and 669 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,42 +1,45 @@ | ||
"""This component filters code based on a set of metadata associated with it.""" | ||
import logging | ||
|
||
import dask.dataframe as dd | ||
from fondant.component import DaskTransformComponent | ||
import pandas as pd | ||
from fondant.component import PandasTransformComponent | ||
from fondant.executor import PandasTransformExecutor | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class FilterLineLengthComponent(DaskTransformComponent): | ||
class FilterLineLengthComponent(PandasTransformComponent): | ||
""" | ||
This component filters code based on a set of metadata associated with it: | ||
average line length, maximum line length and alphanum fraction. | ||
""" | ||
|
||
def transform( | ||
self, | ||
*, | ||
dataframe: dd.DataFrame, | ||
def __init__(self, *_, | ||
avg_line_length_threshold: int, | ||
max_line_length_threshold: int, | ||
alphanum_fraction_threshold: float, | ||
) -> dd.DataFrame: | ||
) -> None: | ||
""" | ||
Args: | ||
dataframe: Dask dataframe | ||
avg_line_length_threshold: Threshold for average line length to filter on | ||
max_line_length_threshold: Threshold for max line length to filter on | ||
alphanum_fraction_threshold: Alphanum fraction to filter on | ||
Returns: | ||
Filtered dask dataframe. | ||
alphanum_fraction_threshold: Alphanum fraction to filter on. | ||
""" | ||
self.avg_line_length_threshold = avg_line_length_threshold | ||
self.max_line_length_threshold = max_line_length_threshold | ||
self.alphanum_fraction_threshold = alphanum_fraction_threshold | ||
|
||
def transform( | ||
self, | ||
dataframe: pd.DataFrame, | ||
) -> pd.DataFrame: | ||
return dataframe[ | ||
(dataframe["code_avg_line_length"] > avg_line_length_threshold) | ||
& (dataframe["code_max_line_length"] > max_line_length_threshold) | ||
& (dataframe["code_alphanum_fraction"] > alphanum_fraction_threshold) | ||
(dataframe["code_avg_line_length"] > self.avg_line_length_threshold) | ||
& (dataframe["code_max_line_length"] > self.max_line_length_threshold) | ||
& (dataframe["code_alphanum_fraction"] > self.alphanum_fraction_threshold) | ||
] | ||
|
||
|
||
if __name__ == "__main__": | ||
component = FilterLineLengthComponent.from_args() | ||
component.run() | ||
executor = PandasTransformExecutor.from_args() | ||
executor.execute(FilterLineLengthComponent) |
Oops, something went wrong.