Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

pipeline transform #602

Open
wants to merge 12 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion data-processing-lib/doc/overview.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ runtime interface expected to be implemented by each runtime ([python](python-r
* [DataAccessFactory](../python/src/data_processing/data_access/data_access_factory_base.py) - is
used to configure the input and output data files to be processed and creates
the `DataAccess` instance (see below) according to the CLI parameters.
* [TransformRuntimeConfiguration](../python/src/data_processing/runtime/runtime_configuration.py) - captures
* [TransformRuntimeConfiguration](../python/src/data_processing/transform/runtime_configuration.py) - captures
the `TransformConfiguration` and runtime-specific configuration.
* [DataAccess](../python/src/data_processing/data_access/data_access.py) - is
the interface defining data i/o methods and selection. Implementations for local
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from data_processing.runtime.execution_configuration import TransformExecutionConfiguration, runtime_cli_prefix
from data_processing.runtime.runtime_configuration import TransformRuntimeConfiguration
from data_processing.runtime.transform_launcher import AbstractTransformLauncher, multi_launcher
from data_processing.runtime.transform_file_processor import AbstractTransformFileProcessor
from data_processing.runtime.transform_launcher import AbstractTransformLauncher, multi_launcher
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,8 @@
# limitations under the License.
################################################################################

from data_processing.runtime import TransformRuntimeConfiguration
from data_processing.runtime.pure_python import DefaultPythonTransformRuntime
from data_processing.transform import TransformConfiguration
from data_processing.transform import TransformConfiguration, TransformRuntimeConfiguration


class PythonTransformRuntimeConfiguration(TransformRuntimeConfiguration):
Expand All @@ -26,12 +25,5 @@ def __init__(
:param transform_config - base configuration class
:param runtime_class: implementation of the transform runtime
"""
self.runtime_class = runtime_class
super().__init__(transform_config=transform_config)
super().__init__(transform_config=transform_config, runtime_class=runtime_class)

def create_transform_runtime(self) -> DefaultPythonTransformRuntime:
    """
    Build the transform runtime for this configuration.

    The parameters are read from self.transform_config.get_transform_params()
    (per the original note, these are the values captured during
    apply_input_params()) and handed to self.runtime_class as its single
    positional argument.
    :return: transform runtime object produced by self.runtime_class
    """
    transform_params = self.transform_config.get_transform_params()
    return self.runtime_class(transform_params)
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@
from typing import Any

from data_processing.data_access import DataAccessFactoryBase
from data_processing.transform import TransformStatistics
from data_processing.transform import TransformStatistics, BaseTransformRuntime


class DefaultPythonTransformRuntime:
class DefaultPythonTransformRuntime(BaseTransformRuntime):
"""
Transformer runtime used by processor to create Transform specific environment
"""
Expand All @@ -26,7 +26,7 @@ def __init__(self, params: dict[str, Any]):
Create/config this runtime.
:param params: parameters, often provided by the CLI arguments as defined by a TableTransformConfiguration.
"""
self.params = params
super().__init__(params)

def get_transform_config(
self, data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics, files: list[str]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def __init__(
self.data_access = data_access_factory.create_data_access()
# Add data access and statistics to the processor parameters
self.transform_params = transform_parameters
self.transform_params["data_access"] = self.data_access
self.transform_params["data_access_factory"] = data_access_factory

def process_file(self, f_name: str) -> None:
"""
Expand Down Expand Up @@ -205,7 +205,7 @@ def _submit_file(self, t_start: float, out_files: list[tuple[bytes, str]], stats
def _publish_stats(self, stats: dict[str, Any]) -> None:
"""
Publishing execution statistics
:param stats: Statistics
:param stats: dictionary
:return: None
"""
raise ValueError("must be implemented by subclass")
raise NotImplemented("must be implemented by subclass")
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import argparse

from data_processing.data_access import DataAccessFactory, DataAccessFactoryBase
from data_processing.runtime import TransformRuntimeConfiguration
from data_processing.transform import TransformRuntimeConfiguration
from data_processing.utils import ParamsUtils, get_logger


Expand All @@ -36,6 +36,7 @@ def __init__(
self.runtime_config = runtime_config
self.name = self.runtime_config.get_name()
self.data_access_factory = data_access_factory
self.execution_config = None

def _get_parser(self) -> argparse.ArgumentParser:
"""
Expand All @@ -56,9 +57,9 @@ def _get_arguments(self, parser: argparse.ArgumentParser) -> argparse.Namespace:
:return: list of arguments
"""
# add additional arguments
self.runtime_config.add_input_params(parser=parser)
self.data_access_factory.add_input_params(parser=parser)
self.execution_config.add_input_params(parser=parser)
self.runtime_config.add_input_params(parser=parser)
return parser.parse_args()

def _get_parameters(self, args: argparse.Namespace) -> bool:
Expand All @@ -67,11 +68,10 @@ def _get_parameters(self, args: argparse.Namespace) -> bool:
and does parameters validation
:return: True if validation passes or False, if not
"""
return (
self.runtime_config.apply_input_params(args=args)
return (self.runtime_config.apply_input_params(args=args)
and self.execution_config.apply_input_params(args=args)
and self.data_access_factory.apply_input_params(args=args)
)
)

def _submit_for_execution(self) -> int:
"""
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,14 @@
from .table_transform_test import AbstractTableTransformTest
from .binary_transform_test import AbstractBinaryTransformTest
from .noop_transform import (
from data_processing.test_support.transform.table_transform_test import AbstractTableTransformTest
from data_processing.test_support.transform.binary_transform_test import AbstractBinaryTransformTest
from data_processing.test_support.transform.noop_transform import (
NOOPTransform,
NOOPPythonTransformConfiguration,
)
from data_processing.test_support.transform.resize_transform import (
ResizeTransform,
ResizePythonTransformConfiguration,
)

from data_processing.test_support.transform.pipeline_transform import (
ResizeNOOPPythonTransformConfiguration,
)
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,7 @@
from typing import Any

import pyarrow as pa
from data_processing.runtime.pure_python import PythonTransformLauncher
from data_processing.runtime.pure_python.runtime_configuration import (
PythonTransformRuntimeConfiguration,
)
from data_processing.runtime.pure_python import PythonTransformLauncher, PythonTransformRuntimeConfiguration
from data_processing.transform import AbstractTableTransform, TransformConfiguration
from data_processing.utils import CLIArgumentProvider, get_logger

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# (C) Copyright IBM Corp. 2024.
# Licensed under the Apache License, Version 2.0 (the “License”);
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an “AS IS” BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################

from data_processing.runtime.pure_python import PythonTransformLauncher, PythonTransformRuntimeConfiguration
from data_processing.transform.pure_python import PythonPipelineTransform
from data_processing.transform import PipelineTransformConfiguration
from data_processing.utils import get_logger
from data_processing.test_support.transform import NOOPPythonTransformConfiguration, ResizePythonTransformConfiguration

logger = get_logger(__name__)


class ResizeNOOPPythonTransformConfiguration(PythonTransformRuntimeConfiguration):
    """
    Python runtime configuration for a pipeline transform built from the
    Resize and NOOP transform configurations, usable with the
    PythonTransformLauncher.
    """

    def __init__(self):
        """
        Wrap a PipelineTransformConfiguration whose "transforms" entry lists the
        Resize configuration followed by the NOOP configuration, with
        PythonPipelineTransform as the transform class.
        """
        # List order is preserved from the original: Resize first, then NOOP.
        pipeline_steps = [
            ResizePythonTransformConfiguration(),
            NOOPPythonTransformConfiguration(),
        ]
        pipeline_config = PipelineTransformConfiguration(
            config={"transforms": pipeline_steps},
            transform_class=PythonPipelineTransform,
        )
        super().__init__(transform_config=pipeline_config)


if __name__ == "__main__":
    # Script entry point: run the resize/noop pipeline through the pure-Python launcher.
    runtime_config = ResizeNOOPPythonTransformConfiguration()
    launcher = PythonTransformLauncher(runtime_config)
    logger.info("Launching resize/noop transform")
    launcher.launch()
Loading