Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor(utils/decorators): rewrite remove task decorator to use ast #43383

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 31 additions & 36 deletions airflow/utils/decorators.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,55 +17,50 @@
# under the License.
from __future__ import annotations

import ast
import sys
from collections import deque
from typing import Callable, TypeVar

T = TypeVar("T", bound=Callable)


class _TaskDecoratorRemover(ast.NodeTransformer):
def __init__(self, task_decorator_name):
self.decorators_to_remove = {
"setup",
"teardown",
"task.skip_if",
"task.run_if",
task_decorator_name,
}

def visit_FunctionDef(self, node):
node.decorator_list = [
decorator for decorator in node.decorator_list if not self._is_task_decorator(decorator)
]
return self.generic_visit(node)

def _is_task_decorator(self, decorator):
if isinstance(decorator, ast.Name):
return decorator.id in self.decorators_to_remove
elif isinstance(decorator, ast.Attribute):
return f"{decorator.value.id}.{decorator.attr}" in self.decorators_to_remove
elif isinstance(decorator, ast.Call):
return self._is_task_decorator(decorator.func)
return False


def remove_task_decorator(python_source: str, task_decorator_name: str) -> str:
"""
Remove @task or similar decorators as well as @setup and @teardown.

:param python_source: The python source code
:param task_decorator_name: the decorator name

TODO: Python 3.9+: Rewrite this to use ast.parse and ast.unparse
"""

def _remove_task_decorator(py_source, decorator_name):
# if no line starts with @decorator_name, we can early exit
for line in py_source.split("\n"):
if line.startswith(decorator_name):
break
else:
return python_source
split = python_source.split(decorator_name, 1)
before_decorator, after_decorator = split[0], split[1]
if after_decorator[0] == "(":
after_decorator = _balance_parens(after_decorator)
if after_decorator[0] == "\n":
after_decorator = after_decorator[1:]
return before_decorator + after_decorator

decorators = ["@setup", "@teardown", "@task.skip_if", "@task.run_if", task_decorator_name]
for decorator in decorators:
python_source = _remove_task_decorator(python_source, decorator)
return python_source


def _balance_parens(after_decorator):
num_paren = 1
after_decorator = deque(after_decorator)
after_decorator.popleft()
while num_paren:
current = after_decorator.popleft()
if current == "(":
num_paren = num_paren + 1
elif current == ")":
num_paren = num_paren - 1
return "".join(after_decorator)
tree = ast.parse(python_source)
remover = _TaskDecoratorRemover(task_decorator_name.strip("@"))
mutated_tree = remover.visit(tree)
return ast.unparse(mutated_tree)


class _autostacklevel_warn:
Expand Down
14 changes: 7 additions & 7 deletions tests/utils/test_decorators.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def test_task_decorator_using_source(decorator: TaskDecorator):
def f():
return ["some_task"]

assert parse_python_source(f, "decorator") == 'def f():\n return ["some_task"]\n'
assert parse_python_source(f, "decorator") == "def f():\n return ['some_task']"


@pytest.mark.parametrize("decorator", DECORATORS, indirect=["decorator"])
Expand All @@ -59,7 +59,7 @@ def test_skip_if(decorator: TaskDecorator):
def f():
return "hello world"

assert parse_python_source(f, "decorator") == 'def f():\n return "hello world"\n'
assert parse_python_source(f, "decorator") == "def f():\n return 'hello world'"


@pytest.mark.parametrize("decorator", DECORATORS, indirect=["decorator"])
Expand All @@ -69,7 +69,7 @@ def test_run_if(decorator: TaskDecorator):
def f():
return "hello world"

assert parse_python_source(f, "decorator") == 'def f():\n return "hello world"\n'
assert parse_python_source(f, "decorator") == "def f():\n return 'hello world'"


def test_skip_if_and_run_if():
Expand All @@ -79,7 +79,7 @@ def test_skip_if_and_run_if():
def f():
return "hello world"

assert parse_python_source(f) == 'def f():\n return "hello world"\n'
assert parse_python_source(f) == "def f():\n return 'hello world'"


def test_run_if_and_skip_if():
Expand All @@ -89,7 +89,7 @@ def test_run_if_and_skip_if():
def f():
return "hello world"

assert parse_python_source(f) == 'def f():\n return "hello world"\n'
assert parse_python_source(f) == "def f():\n return 'hello world'"


def test_skip_if_allow_decorator():
Expand All @@ -102,7 +102,7 @@ def non_task_decorator(func):
def f():
return "hello world"

assert parse_python_source(f) == '@non_task_decorator\ndef f():\n return "hello world"\n'
assert parse_python_source(f) == "@non_task_decorator\ndef f():\n return 'hello world'"


def test_run_if_allow_decorator():
Expand All @@ -115,7 +115,7 @@ def non_task_decorator(func):
def f():
return "hello world"

assert parse_python_source(f) == '@non_task_decorator\ndef f():\n return "hello world"\n'
assert parse_python_source(f) == "@non_task_decorator\ndef f():\n return 'hello world'"


def parse_python_source(task: Task, custom_operator_name: str | None = None) -> str:
Expand Down
16 changes: 8 additions & 8 deletions tests/utils/test_preexisting_python_virtualenv_decorator.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,20 +22,20 @@

class TestExternalPythonDecorator:
def test_remove_task_decorator(self):
py_source = "@task.external_python(use_dill=True)\ndef f():\nimport funcsigs"
py_source = "@task.external_python(use_dill=True)\ndef f(): ...\nimport funcsigs"
res = remove_task_decorator(python_source=py_source, task_decorator_name="@task.external_python")
assert res == "def f():\nimport funcsigs"
assert res == "def f():\n ...\nimport funcsigs"

def test_remove_decorator_no_parens(self):
py_source = "@task.external_python\ndef f():\nimport funcsigs"
py_source = "@task.external_python\ndef f(): ...\nimport funcsigs"
res = remove_task_decorator(python_source=py_source, task_decorator_name="@task.external_python")
assert res == "def f():\nimport funcsigs"
assert res == "def f():\n ...\nimport funcsigs"

def test_remove_decorator_nested(self):
py_source = "@foo\[email protected]_python\n@bar\ndef f():\nimport funcsigs"
py_source = "@foo\[email protected]_python\n@bar\ndef f(): ...\nimport funcsigs"
res = remove_task_decorator(python_source=py_source, task_decorator_name="@task.external_python")
assert res == "@foo\n@bar\ndef f():\nimport funcsigs"
assert res == "@foo\n@bar\ndef f():\n ...\nimport funcsigs"

py_source = "@foo\[email protected]_python()\n@bar\ndef f():\nimport funcsigs"
py_source = "@foo\[email protected]_python()\n@bar\ndef f(): ...\nimport funcsigs"
res = remove_task_decorator(python_source=py_source, task_decorator_name="@task.external_python")
assert res == "@foo\n@bar\ndef f():\nimport funcsigs"
assert res == "@foo\n@bar\ndef f():\n ...\nimport funcsigs"
21 changes: 8 additions & 13 deletions tests/utils/test_python_virtualenv.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,25 +116,20 @@ def test_should_create_virtualenv_with_extra_packages(self, mock_execute_in_subp
mock_execute_in_subprocess.assert_called_with(["/VENV/bin/pip", "install", "apache-beam[gcp]"])

def test_remove_task_decorator(self):
py_source = "@task.virtualenv(use_dill=True)\ndef f():\nimport funcsigs"
py_source = "@task.virtualenv(use_dill=True)\ndef f(): ...\nimport funcsigs"
res = remove_task_decorator(python_source=py_source, task_decorator_name="@task.virtualenv")
assert res == "def f():\nimport funcsigs"
assert res == "def f():\n ...\nimport funcsigs"

def test_remove_decorator_no_parens(self):
py_source = "@task.virtualenv\ndef f():\nimport funcsigs"
py_source = "@task.virtualenv\ndef f(): ...\nimport funcsigs"
res = remove_task_decorator(python_source=py_source, task_decorator_name="@task.virtualenv")
assert res == "def f():\nimport funcsigs"

def test_remove_decorator_including_comment(self):
py_source = "@task.virtualenv\ndef f():\n# @task.virtualenv\nimport funcsigs"
res = remove_task_decorator(python_source=py_source, task_decorator_name="@task.virtualenv")
assert res == "def f():\n# @task.virtualenv\nimport funcsigs"
Comment on lines -128 to -131
Copy link
Contributor Author

@josix josix Oct 25, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm unsure if it is fine to remove the comments after executing ast.unparse. If it is not an allowed behavior, we might not be able to adopt ast as a solution, since it would not preserve comments.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it would be better to use libcst https://pypi.org/project/libcst/ - which we alredy consider to use for our "upgrade check" tooling -> #41641 (comment)

this way we could preserve comments, line numbers etc. I think the important thing we might want to keep here is debuggability and particularly line number references, and with AST we will likely loose it - not only the comments.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have no expertise in this matter so I can not judge this.

In general I'd favor is we refactor to ensure pytests are staying the same. I am also not sure why the line feed was removed in the other tests. Is this a compatibility limitation or just a cleanup?

Copy link
Member

@potiuk potiuk Oct 27, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's the difference of AST vs. CST.

  • (AST) Abstract Syntax Tree - > is the tree of syntax of Python code that contains only the "meaningful" tokens - that represent a "runtime" python bytecode (with stripped out comments and other irrelevant source code tokens - such as punctuations, idents, parentheses etc.) - https://en.wikipedia.org/wiki/Abstract_syntax_tree

  • (CST) Concrete Syntax Tree -> is the tree of syntax of Python code that represent the source code - not Python runtime bytecode (CST includes EOL characters and comments, indents and parentheses and punctuation marks) - but also all other non-runtime tokens - https://en.wikipedia.org/wiki/Parse_tree

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(that's a bit simplified definition of course - but I think it describes the difference well).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we're doing code modification instead of checking only, we probably should go with libcst I think 🤔

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I happened to share this topic at PyCon Taiwan this year. If anyone is interested, this was the slide I used. The content around page 67 is the most relevant.

https://speakerdeck.com/leew/unleash-the-chaos-developing-a-linter-for-un-pythonic-code?slide=67

Copy link
Contributor Author

@josix josix Oct 29, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I think it's better to leave dag authors' code information as much as possible in the rendered script.py of virtualenv operator, ideally the unit tests should not be changed, let me try LibCST instead of AST here.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you need any example, I used it to check default_deferrable here https://github.com/apache/airflow/blob/main/scripts/ci/pre_commit/check_deferrable_default.py

assert res == "def f():\n ...\nimport funcsigs"

def test_remove_decorator_nested(self):
py_source = "@foo\[email protected]\n@bar\ndef f():\nimport funcsigs"
py_source = "@foo\[email protected]\n@bar\ndef f(): ...\nimport funcsigs"
res = remove_task_decorator(python_source=py_source, task_decorator_name="@task.virtualenv")
assert res == "@foo\n@bar\ndef f():\nimport funcsigs"
assert res == "@foo\n@bar\ndef f():\n ...\nimport funcsigs"

py_source = "@foo\[email protected]()\n@bar\ndef f():\nimport funcsigs"
py_source = "@foo\[email protected]()\n@bar\ndef f(): ...\nimport funcsigs"
res = remove_task_decorator(python_source=py_source, task_decorator_name="@task.virtualenv")
assert res == "@foo\n@bar\ndef f():\nimport funcsigs"
assert res == "@foo\n@bar\ndef f():\n ...\nimport funcsigs"