feat: Sped up graph building by implementing own solution

zenml-io · strickvl · Jul 16, 2024 · Jul 3, 2024 · Jul 5, 2024 · Jul 5, 2024
commit c46dfd20ec49a497111e17ad8015fa2d20ade7ad
diff --git a/bundled/tool/zenml_grapher.py b/bundled/tool/zenml_grapher.py
@@ -0,0 +1,97 @@
+#  Copyright (c) ZenML GmbH 2024. All Rights Reserved.
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at:
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+#  or implied. See the License for the specific language governing
+#  permissions and limitations under the License.
+"""This module contains a tool that mimics LineageGraph output for pipeline runs."""
+
+class Grapher:
+    """Quick and dirty implementation of ZenML/LineageGraph to reduce the number of API calls."""
+
+    def __init__(self, run):
+        """Stores the pipeline run and creates empty graph containers.
+
+        Args:
+            run: Pipeline run object; the other methods of this class read
+                its `metadata.steps` and `body` attributes.
+        """
+        self.run = run
+        self.nodes = []
+        # Maps artifact id -> True for artifacts already present in `nodes`.
+        self.artifacts = {}
+        # Must be a list: add_edge() appends to it and build_edges_from_steps()
+        # rebinds it to a list, so starting with a dict made add_edge() fail
+        # and to_dict() return a differently shaped value before a build.
+        self.edges = []
+
+    def build_nodes_from_steps(self) -> None:
+        """Rebuilds the node list from the steps of the stored run.
+
+        Clears `nodes` and `artifacts`, then appends one "step" node per run
+        step, each followed by nodes for that step's input and output
+        artifacts.
+        """
+        self.nodes = []
+        self.artifacts = {}
+
+        steps = self.run.metadata.steps
+        for step_name in steps:
+            step_run = steps[step_name]
+            step_id = str(step_run.id)
+            step_node = {
+                "id": step_id,
+                "type": "step",
+                "data": {
+                    "execution_id": step_id,
+                    "name": step_name,
+                    "status": step_run.body.status,
+                },
+            }
+            self.nodes.append(step_node)
+            # Inputs are handed over before outputs, matching node order.
+            for artifact_map in (step_run.body.inputs, step_run.body.outputs):
+                self.add_artifacts_from_list(artifact_map)
+
-    def build_nodes_from_steps(self) -> None:
-        """Builds internal node list from run steps"""
-        self.nodes = []
-        self.artifacts = {}
-
-        for step in self.run.metadata.steps:
-            step_data = self.run.metadata.steps[step]
-            self.nodes.append({
-                "id": str(step_data.id),
-                "type": "step",
-                "data": {
-                    "execution_id": str(step_data.id),
-                    "name": step,
-                    "status": step_data.body.status,
-                },
-            })
-            self.add_artifacts_from_list(step_data.body.inputs)
-            self.add_artifacts_from_list(step_data.body.outputs)
+    def build_nodes_from_steps(self) -> None:
+        """Builds internal node list from run steps.
+
+        The node list and the seen-artifact index are always reset first, so
+        calling this method more than once produces the same nodes instead of
+        appending duplicate step nodes.
+        """
+        # Reset unconditionally: guarding the reset on both containers being
+        # empty made it a no-op, so repeated calls duplicated every step node.
+        self.nodes = []
+        self.artifacts = {}
+
+        steps = self.run.metadata.steps
+        for step in steps:
+            step_data = steps[step]
+            step_id = str(step_data.id)
+            self.nodes.append({
+                "id": step_id,
+                "type": "step",
+                "data": {
+                    "execution_id": step_id,
+                    "name": step,
+                    "status": step_data.body.status,
+                },
+            })
+            # Artifact nodes are only added the first time an id is seen.
+            self.add_artifacts_from_list(step_data.body.inputs)
+            self.add_artifacts_from_list(step_data.body.outputs)
-    def build_nodes_from_steps(self) -> None:
-        """Builds internal node list from run steps"""
-        self.nodes = []
-        self.artifacts = {}
-
-        for step in self.run.metadata.steps:
-            step_data = self.run.metadata.steps[step]
-            self.nodes.append({
-                "id": str(step_data.id),
-                "type": "step",
-                "data": {
-                    "execution_id": str(step_data.id),
-                    "name": step,
-                    "status": step_data.body.status,
-                },
-            })
-            self.add_artifacts_from_list(step_data.body.inputs)
-            self.add_artifacts_from_list(step_data.body.outputs)
+    def build_nodes_from_steps(self) -> None:
+        """Builds internal node list from run steps.
+
+        The node list and the seen-artifact index are always reset first, so
+        calling this method more than once produces the same nodes instead of
+        appending duplicate step nodes.
+        """
+        # Reset unconditionally: guarding the reset on both containers being
+        # empty made it a no-op, so repeated calls duplicated every step node.
+        self.nodes = []
+        self.artifacts = {}
+
+        steps = self.run.metadata.steps
+        for step in steps:
+            step_data = steps[step]
+            step_id = str(step_data.id)
+            self.nodes.append({
+                "id": step_id,
+                "type": "step",
+                "data": {
+                    "execution_id": step_id,
+                    "name": step,
+                    "status": step_data.body.status,
+                },
+            })
+            # Artifact nodes are only added the first time an id is seen.
+            self.add_artifacts_from_list(step_data.body.inputs)
+            self.add_artifacts_from_list(step_data.body.outputs)
+
+    def add_artifacts_from_list(self, list) -> None:
+        """Appends an artifact node for each not-yet-seen artifact in the mapping.
+
+        Called by build_nodes_from_steps with a step's inputs or outputs. The
+        mapping key becomes the node name; an entry whose artifact id is
+        already recorded in `self.artifacts` is skipped.
+        """
+        for name in list:
+            entry = list[name]
+            artifact_id = str(entry.body.artifact.id)
+            if artifact_id not in self.artifacts:
+                # Mark as seen before building the node, as the node is keyed
+                # by this id.
+                self.artifacts[artifact_id] = True
+                node = {
+                    "type": "artifact",
+                    "id": artifact_id,
+                    "data": {
+                        "name": name,
+                        "artifact_type": entry.body.type,
+                        "execution_id": str(entry.id),
+                    },
+                }
+                self.nodes.append(node)
-    def add_artifacts_from_list(self, list) -> None:
-        """Used to add unique artifacts to the internal nodes list by build_nodes_from_steps"""
-        for artifact in list:
-            id = str(list[artifact].body.artifact.id)
-            if id in self.artifacts:
-                continue
-
-            self.artifacts[id] = True
-
-            self.nodes.append({
-                "type": "artifact",
-                "id": id,
-                "data": {
-                    "name": artifact,
-                    "artifact_type": list[artifact].body.type,
-                    "execution_id": str(list[artifact].id),
-                },
-            })
+    def add_artifacts_from_list(self, artifacts_list: "Dict[str, Artifact]") -> None:
+        """Used to add unique artifacts to the internal nodes list by build_nodes_from_steps.
+
+        Args:
+            artifacts_list: Mapping of artifact name to artifact entry; each
+                entry is read via `.id`, `.body.type` and `.body.artifact.id`.
+        """
+        # The annotation is quoted on purpose: this module shows no import of
+        # `Dict` or `Artifact`, and an unquoted annotation is evaluated when
+        # the method is defined, which would raise NameError on import.
+        for name in artifacts_list:
+            entry = artifacts_list[name]
+            artifact_id = str(entry.body.artifact.id)
+            if artifact_id in self.artifacts:
+                # Already emitted as a node for an earlier step.
+                continue
+
+            self.artifacts[artifact_id] = True
+
+            self.nodes.append({
+                "type": "artifact",
+                "id": artifact_id,
+                "data": {
+                    "name": name,
+                    "artifact_type": entry.body.type,
+                    "execution_id": str(entry.id),
+                },
+            })
-    def add_artifacts_from_list(self, list) -> None:
-        """Used to add unique artifacts to the internal nodes list by build_nodes_from_steps"""
-        for artifact in list:
-            id = str(list[artifact].body.artifact.id)
-            if id in self.artifacts:
-                continue
-
-            self.artifacts[id] = True
-
-            self.nodes.append({
-                "type": "artifact",
-                "id": id,
-                "data": {
-                    "name": artifact,
-                    "artifact_type": list[artifact].body.type,
-                    "execution_id": str(list[artifact].id),
-                },
-            })
+    def add_artifacts_from_list(self, artifacts_list: "Dict[str, Artifact]") -> None:
+        """Used to add unique artifacts to the internal nodes list by build_nodes_from_steps.
+
+        Args:
+            artifacts_list: Mapping of artifact name to artifact entry; each
+                entry is read via `.id`, `.body.type` and `.body.artifact.id`.
+        """
+        # The annotation is quoted on purpose: this module shows no import of
+        # `Dict` or `Artifact`, and an unquoted annotation is evaluated when
+        # the method is defined, which would raise NameError on import.
+        for name in artifacts_list:
+            entry = artifacts_list[name]
+            artifact_id = str(entry.body.artifact.id)
+            if artifact_id in self.artifacts:
+                # Already emitted as a node for an earlier step.
+                continue
+
+            self.artifacts[artifact_id] = True
+
+            self.nodes.append({
+                "type": "artifact",
+                "id": artifact_id,
+                "data": {
+                    "name": name,
+                    "artifact_type": entry.body.type,
+                    "execution_id": str(entry.id),
+                },
+            })
+
+
+    def build_edges_from_steps(self) -> None:
+        """Rebuilds the edge list from the steps of the stored run.
+
+        Every input artifact gets an edge pointing at the step that consumes
+        it, and every step gets an edge pointing at each artifact it outputs.
+        """
+        self.edges = []
+
+        steps = self.run.metadata.steps
+        for step_name in steps:
+            step_run = steps[step_name]
+            node_id = str(step_run.id)
+
+            inputs = step_run.body.inputs
+            for key in inputs:
+                # artifact -> step
+                self.add_edge(str(inputs[key].body.artifact.id), node_id)
+
+            outputs = step_run.body.outputs
+            for key in outputs:
+                # step -> artifact
+                self.add_edge(node_id, str(outputs[key].body.artifact.id))
+
-    def build_edges_from_steps(self) -> None:
-        """Builds internal edges list from run steps"""
-        self.edges = []
-
-        for step in self.run.metadata.steps:
-            step_data = self.run.metadata.steps[step]
-            step_id = str(step_data.id)
-
-            for artifact in step_data.body.inputs:
-                input_id = str(step_data.body.inputs[artifact].body.artifact.id)
-                self.add_edge(input_id, step_id)
-
-            for artifact in step_data.body.outputs:
-                output_id = str(step_data.body.outputs[artifact].body.artifact.id)
-                self.add_edge(step_id, output_id)
+    def build_edges_from_steps(self) -> None:
+        """Rebuilds the edge list from the steps of the stored run.
+
+        Every input artifact gets an edge pointing at the step that consumes
+        it, and every step gets an edge pointing at each artifact it outputs.
+        """
+        self.edges = []
+
+        steps = self.run.metadata.steps
+        for step_name in steps:
+            step_run = steps[step_name]
+            node_id = str(step_run.id)
+
+            inputs = step_run.body.inputs
+            for key in inputs:
+                # artifact -> step
+                self.add_edge(str(inputs[key].body.artifact.id), node_id)
+
+            outputs = step_run.body.outputs
+            for key in outputs:
+                # step -> artifact
+                self.add_edge(node_id, str(outputs[key].body.artifact.id))
+
+    def add_edge(self, v: str, w: str) -> None:
+        """Records a directed edge from `v` to `w` unless it already exists.
+
+        The edge id is "<v>_<w>"; if an edge with that id is already in the
+        internal list, the call leaves the list untouched.
+        """
+        edge_id = f"{v}_{w}"
+        for existing in self.edges:
+            if existing['id'] == edge_id:
+                return
+        new_edge = {"id": edge_id, "source": v, "target": w}
+        self.edges.append(new_edge)
-    def build_edges_from_steps(self) -> None:
-        """Builds internal edges list from run steps"""
-        self.edges = []
-
-        for step in self.run.metadata.steps:
-            step_data = self.run.metadata.steps[step]
-            step_id = str(step_data.id)
-
-            for artifact in step_data.body.inputs:
-                input_id = str(step_data.body.inputs[artifact].body.artifact.id)
-                self.add_edge(input_id, step_id)
-
-            for artifact in step_data.body.outputs:
-                output_id = str(step_data.body.outputs[artifact].body.artifact.id)
-                self.add_edge(step_id, output_id)
+    def build_edges_from_steps(self) -> None:
+        """Rebuilds the edge list from the steps of the stored run.
+
+        Every input artifact gets an edge pointing at the step that consumes
+        it, and every step gets an edge pointing at each artifact it outputs.
+        """
+        self.edges = []
+
+        steps = self.run.metadata.steps
+        for step_name in steps:
+            step_run = steps[step_name]
+            node_id = str(step_run.id)
+
+            inputs = step_run.body.inputs
+            for key in inputs:
+                # artifact -> step
+                self.add_edge(str(inputs[key].body.artifact.id), node_id)
+
+            outputs = step_run.body.outputs
+            for key in outputs:
+                # step -> artifact
+                self.add_edge(node_id, str(outputs[key].body.artifact.id))
+
+    def add_edge(self, v: str, w: str) -> None:
+        """Records a directed edge from `v` to `w` unless it already exists.
+
+        The edge id is "<v>_<w>"; if an edge with that id is already in the
+        internal list, the call leaves the list untouched.
+        """
+        edge_id = f"{v}_{w}"
+        for existing in self.edges:
+            if existing['id'] == edge_id:
+                return
+        new_edge = {"id": edge_id, "source": v, "target": w}
+        self.edges.append(new_edge)
+
+    def add_edge(self, v: str, w: str) -> None:
+        """Helper method to add an edge to the internal edges list.
+
+        Args:
+            v: Id of the source node.
+            w: Id of the target node.
+
+        An edge whose id ("<v>_<w>") is already present is not added again.
+        """
+        edge_id = f"{v}_{w}"
+        # Skip duplicates so the same source/target pair (e.g. one artifact
+        # passed to a step under two input names) never yields two edges
+        # sharing one id.
+        if any(edge["id"] == edge_id for edge in self.edges):
+            return
+        self.edges.append({
+            "id": edge_id,
+            "source": v,
+            "target": w,
+        })
-    def add_edge(self, v, w) -> None:
-        """Helper method to add an edge to the internal edges list"""
-        self.edges.append({
-            "id": f"{v}_{w}",
-            "source": v,
-            "target": w,
-        })
+    def add_edge(self, v: str, w: str) -> None:
+        """Helper method to add an edge to the internal edges list.
+
+        Args:
+            v: Id of the source node.
+            w: Id of the target node.
+
+        An edge whose id ("<v>_<w>") is already present is not added again.
+        """
+        edge_id = f"{v}_{w}"
+        # Skip duplicates so the same source/target pair (e.g. one artifact
+        # passed to a step under two input names) never yields two edges
+        # sharing one id.
+        if any(edge["id"] == edge_id for edge in self.edges):
+            return
+        self.edges.append({
+            "id": edge_id,
+            "source": v,
+            "target": w,
+        })
-    def add_edge(self, v, w) -> None:
-        """Helper method to add an edge to the internal edges list"""
-        self.edges.append({
-            "id": f"{v}_{w}",
-            "source": v,
-            "target": w,
-        })
+    def add_edge(self, v: str, w: str) -> None:
+        """Helper method to add an edge to the internal edges list.
+
+        Args:
+            v: Id of the source node.
+            w: Id of the target node.
+
+        An edge whose id ("<v>_<w>") is already present is not added again.
+        """
+        edge_id = f"{v}_{w}"
+        # Skip duplicates so the same source/target pair (e.g. one artifact
+        # passed to a step under two input names) never yields two edges
+        # sharing one id.
+        if any(edge["id"] == edge_id for edge in self.edges):
+            return
+        self.edges.append({
+            "id": edge_id,
+            "source": v,
+            "target": w,
+        })
+
+    def to_dict(self) -> dict:
+        """Packs the graph and run summary fields into a plain dictionary.
+
+        Returns:
+            Dict with the keys "nodes", "edges", "status", "name" and
+            "version"; the last three are read from the run body and its
+            pipeline.
+        """
+        run_body = self.run.body
+        pipeline = run_body.pipeline
+
+        graph = {"nodes": self.nodes, "edges": self.edges}
+        graph["status"] = run_body.status
+        graph["name"] = pipeline.name
+        graph["version"] = pipeline.body.version
+        return graph
diff --git a/bundled/tool/zenml_wrappers.py b/bundled/tool/zenml_wrappers.py
@@ -15,6 +15,7 @@
 import json
 import pathlib
 from typing import Any
+from zenml_grapher import Grapher
 
 
 class GlobalConfigWrapper:
@@ -260,11 +261,6 @@ def ValidationError(self):
     def ZenMLBaseException(self):
         """Returns the ZenML ZenMLBaseException class."""
         return self.lazy_import("zenml.exceptions", "ZenMLBaseException")
-
-    @property
-    def LineageGraph(self):
-        """Returns the ZenML LineageGraph class."""
-        return self.lazy_import("zenml.lineage_graph.lineage_graph", "LineageGraph")
 
     def fetch_pipeline_runs(self, args):
         """Fetches all ZenML pipeline runs.
@@ -377,36 +373,11 @@ def get_pipeline_run_graph(self, args) -> dict:
         """
         try:
             run_id = args[0]
-            run = self.client.get_pipeline_run(run_id, hydrate=False)
-            graph = self.LineageGraph()
-            graph.generate_run_nodes_and_edges(run)
-
-            dag_data = {
-                "nodes": [
-                    {
-                        "id": node.id,
-                        "type": node.type,
-                        "data": {
-                            "execution_id": node.data.execution_id,
-                            "name": node.data.name,
-                            "status": node.data.status if node.type == 'step' else None,
-                            "artifact_type": node.data.artifact_type if node.type == 'artifact' else None,
-                        }
-                    } for node in graph.nodes
-                ],
-                "edges": [
-                    {
-                        "id": edge.id,
-                        "source": edge.source,
-                        "target": edge.target
-                    } for edge in graph.edges
-                ],
-                "status": run.body.status,
-                "name": run.body.pipeline.name,
-                "version": run.body.pipeline.body.version,
-            }
-
-            return dag_data
+            run = self.client.get_pipeline_run(run_id, hydrate=True)
+            graph = Grapher(run)
+            graph.build_nodes_from_steps()
+            graph.build_edges_from_steps()
+            return graph.to_dict()
         except self.ZenMLBaseException as e:
             return {"error": f"Failed to retrieve pipeline run graph: {str(e)}"}