dev(ingest): use ruff instead of flake8 #12359

Merged · 7 commits · Jan 16, 2025
Changes from 3 commits
6 changes: 4 additions & 2 deletions metadata-ingestion/build.gradle
@@ -110,14 +110,16 @@ task lint(type: Exec, dependsOn: installDev) {
"source ${venv_name}/bin/activate && set -x && " +
"black --check --diff src/ tests/ examples/ && " +
"isort --check --diff src/ tests/ examples/ && " +
"flake8 --count --statistics src/ tests/ examples/ && " +
"ruff check src/ tests/ examples/ && " +
"mypy --show-traceback --show-error-codes src/ tests/ examples/"
}

task lintFix(type: Exec, dependsOn: installDev) {
commandLine 'bash', '-c',
"source ${venv_name}/bin/activate && set -x && " +
"black src/ tests/ examples/ && " +
"isort src/ tests/ examples/"
"isort src/ tests/ examples/ && " +
"ruff check --fix src/ tests/ examples/"
}

def pytest_default_env = "PYTHONDEVMODE=1"
2 changes: 1 addition & 1 deletion metadata-ingestion/developing.md
@@ -183,7 +183,7 @@ We use black, isort, flake8, and mypy to ensure consistent code style and quality
# Assumes: pip install -e '.[dev]' and venv is activated
black src/ tests/
isort src/ tests/
- flake8 src/ tests/
+ ruff check src/ tests/
mypy src/ tests/
```

46 changes: 44 additions & 2 deletions metadata-ingestion/pyproject.toml
@@ -9,7 +9,7 @@ extend-exclude = '''
^/tmp
'''
include = '\.pyi?$'
- target-version = ['py37', 'py38', 'py39', 'py310']
+ target-version = ['py38', 'py39', 'py310', 'py311']

[tool.isort]
combine_as_imports = true
@@ -26,6 +26,48 @@ extraPaths = ['tests']
exclude = ["src/datahub/metadata/"]
ignore_decorators = ["@click.*", "@validator", "@root_validator", "@pydantic.validator", "@pydantic.root_validator", "@pytest.fixture"]
ignore_names = ["*Source", "*Sink", "*Report"]
# min_confidence = 80
paths = ["src"]
sort_by_size = true

[tool.ruff]
# Same as Black.
line-length = 88
# Exclude directories matching these patterns.
exclude = [
".git",
"src/datahub/metadata",
"venv",
".tox",
"__pycache__",
]

[tool.ruff.lint]
select = [
"B",
"C90",
"E",
"F",
]
ignore = [
# Ignore line length violations (handled by Black)
"E501",
# Ignore whitespace before ':' (matches Black)
"E203",
# Allow usages of functools.lru_cache
"B019",
# Allow function call in argument defaults
"B008",
# TODO: Enable these later as mentioned in original config
"B006", # Mutable args
"B007", # Unused loop control variable
# The rules below were added while migrating from flake8 to ruff, to avoid making a lot of changes
"B904",
"B017",
]

[tool.ruff.lint.mccabe]
max-complexity = 20


[tool.ruff.lint.per-file-ignores]
"__init__.py" = ["F401"]
36 changes: 0 additions & 36 deletions metadata-ingestion/setup.cfg
@@ -1,39 +1,3 @@
[flake8]
max-complexity = 20
ignore =
# Ignore: line length issues, since black's formatter will take care of them.
E501,
# Ignore compound statements, since they're used for ellipsis by black
# See https://github.com/psf/black/issues/3887
E704,
# Ignore: 1 blank line required before class docstring.
D203,
# See https://stackoverflow.com/a/57074416.
W503,
# See https://github.com/psf/black/issues/315.
E203,
# Allow usages of functools.lru_cache.
B019,
# This rule flags the use of function calls in argument defaults.
# There's some good reasons to do this, so we're ok with it.
B008,
# TODO: However, we should enable B006 to catch issues with mutable args.
B006,
# TODO: Enable B007 - unused loop control variable.
B007
# TODO: Enable B902 - require self/cls naming.
# TODO: Enable B904 - use raise from in except clauses.
exclude =
.git,
src/datahub/metadata,
venv,
.tox,
__pycache__
per-file-ignores =
# imported but unused
__init__.py: F401, I250
ban-relative-imports = true

[mypy]
plugins =
./tests/test_helpers/sqlalchemy_mypy_plugin.py,
6 changes: 1 addition & 5 deletions metadata-ingestion/setup.py
@@ -590,12 +590,8 @@
}

lint_requirements = {
-    # This is pinned only to avoid spurious errors in CI.
-    # We should make an effort to keep it up to date.
     "black==23.3.0",
-    "flake8>=6.0.0",
-    "flake8-tidy-imports>=4.3.0",
-    "flake8-bugbear==23.3.12",
+    "ruff==0.9.1",
"isort>=5.7.0",
"mypy==1.10.1",
}
@@ -267,7 +267,6 @@ def _is_single_row_query_method(query: Any) -> bool:
"get_column_max",
"get_column_mean",
"get_column_stdev",
"get_column_stdev",
[Review thread]
Reviewer: ?
anshbansal (author, Jan 15, 2025): There were duplicates in this just above on line 269.

"get_column_nonnull_count",
"get_column_unique_count",
}
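For context, removing the repeated literal above is what bugbear's B033 (duplicate items in a set) catches; a hypothetical sketch, assuming the surrounding collection is a set literal:

```python
# B033: a duplicated element in a set literal is almost always a copy-paste slip;
# the set silently collapses it, so the second entry is dead text.
SINGLE_ROW_METHODS = {
    "get_column_max",
    "get_column_mean",
    "get_column_mean",  # duplicate -- flagged by ruff's B033
}
```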
10 changes: 5 additions & 5 deletions metadata-ingestion/src/datahub/ingestion/source/mode.py
@@ -893,11 +893,11 @@ def normalize_mode_query(self, query: str) -> str:
jinja_params[key] = parameters[key].get("default", "")

normalized_query = re.sub(
r"{% form %}(.*){% endform %}",
"",
query,
0,
re.MULTILINE | re.DOTALL,
pattern=r"{% form %}(.*){% endform %}",
repl="",
string=query,
count=0,
flags=re.MULTILINE | re.DOTALL,
)

# Wherever we don't resolve the jinja params, we replace it with NULL
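An aside on why the keyword arguments matter (illustrative, not from the PR): `re.sub` takes `count` before `flags`, so a flag passed positionally can silently be consumed as a count — the failure mode bugbear's B034 check is meant to prevent. A minimal sketch:

```python
import re

text = "a\nb\nc"

# Buggy: re.MULTILINE (the int 8) lands in the `count` parameter, so the flag
# is silently dropped and ^ only matches at the very start of the string.
buggy = re.sub(r"^[a-z]", "", text, re.MULTILINE)
print(repr(buggy))  # '\nb\nc'

# Fixed: keyword arguments make count and flags unambiguous.
fixed = re.sub(r"^[a-z]", "", text, count=0, flags=re.MULTILINE)
print(repr(fixed))  # '\n\n'
```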
@@ -96,7 +96,7 @@ def log_http_error(self, message: str) -> Any:
url: str = e.request.url if e.request else "URL not available"
self.reporter.warning(
title="Metadata API Timeout",
message=f"Metadata endpoints are not reachable. Check network connectivity to PowerBI Service.",
message="Metadata endpoints are not reachable. Check network connectivity to PowerBI Service.",
context=f"url={url}",
)

@@ -173,7 +173,7 @@ def _get_entity_users(
entity=entity_name,
entity_id=entity_id,
)
-        except:  # It will catch all type of exception
+        except Exception:
e = self.log_http_error(
message=f"Unable to fetch users for {entity_name}({entity_id})."
)
@@ -210,7 +210,7 @@ def get_reports(self, workspace: Workspace) -> List[Report]:
message="A cross-workspace reference that failed to be resolved. Please ensure that no global workspace is being filtered out due to the workspace_id_pattern.",
context=f"report-name: {report.name} and dataset-id: {report.dataset_id}",
)
-        except:
+        except Exception:
self.log_http_error(
message=f"Unable to fetch reports for workspace {workspace.name}"
)
@@ -260,7 +260,7 @@ def get_workspaces(self) -> List[Workspace]:

groups = self._get_resolver().get_groups(filter_=filter_)

-        except:
+        except Exception:
self.log_http_error(message="Unable to fetch list of workspaces")
# raise # we want this exception to bubble up

@@ -292,7 +292,7 @@ def get_modified_workspaces(self) -> List[str]:
modified_workspace_ids = self.__admin_api_resolver.get_modified_workspaces(
self.__config.modified_since
)
-        except:
+        except Exception:
self.log_http_error(message="Unable to fetch list of modified workspaces.")

return modified_workspace_ids
@@ -303,8 +303,8 @@ def _get_scan_result(self, workspace_ids: List[str]) -> Any:
scan_id = self.__admin_api_resolver.create_scan_job(
workspace_ids=workspace_ids
)
-        except:
-            e = self.log_http_error(message=f"Unable to fetch get scan result.")
+        except Exception:
+            e = self.log_http_error(message="Unable to fetch get scan result.")
if data_resolver.is_permission_error(cast(Exception, e)):
logger.warning(
"Dataset lineage can not be ingestion because this user does not have access to the PowerBI Admin "
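A short illustrative note (not from this diff) on the `except:` → `except Exception:` changes: a bare `except` — pycodestyle's E722, part of the selected "E" group — also catches `KeyboardInterrupt` and `SystemExit`, which can make a retry loop impossible to interrupt. A minimal sketch:

```python
def fetch_with_retry(max_attempts: int = 3) -> None:
    for _attempt in range(max_attempts):
        try:
            raise TimeoutError("transient API error")
        # except:          # E722 (bare except): would also swallow
        #     continue     # KeyboardInterrupt/SystemExit, so Ctrl-C can't stop it
        except Exception:  # the fixed form lets KeyboardInterrupt/SystemExit propagate
            continue


fetch_with_retry()
```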
@@ -384,7 +384,6 @@ def resolve_snowflake_modified_type(type_string: str) -> Any:
"varchar": StringType,
"char": StringType,
"varbinary": BytesType,
"json": RecordType,
[Review thread]
Reviewer: why'd we remove this?
anshbansal (author, Jan 15, 2025): Duplicate in this dictionary. See the last element.

"date": DateType,
"time": TimeType,
"timestamp": TimeType,
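The same class of fix as the earlier set: a repeated dict key literal means only the last value survives, so the first entry is dead code — the kind of thing ruff's pyflakes-derived rules report. A hypothetical sketch (names are illustrative):

```python
# Dict construction silently keeps only the last "json" entry; the first is dead code.
TYPE_MAP = {
    "json": "RecordType",
    "date": "DateType",
    "json": "VariantType",  # overrides the earlier "json" key -- flagged by ruff
}

print(TYPE_MAP["json"])  # VariantType
```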
5 changes: 3 additions & 2 deletions metadata-ingestion/src/datahub/utilities/memory_footprint.py
@@ -1,7 +1,7 @@
from collections import deque
from itertools import chain
from sys import getsizeof
-from typing import Any, Callable
+from typing import Any


def total_size(o: Any, handlers: Any = {}) -> int:
@@ -15,7 +15,8 @@ def total_size(o: Any, handlers: Any = {}) -> int:
Based on https://github.com/ActiveState/recipe-577504-compute-mem-footprint/blob/master/recipe.py
"""

-    dict_handler: Callable[[Any], chain[Any]] = lambda d: chain.from_iterable(d.items())
+    def dict_handler(d: dict) -> chain[Any]:
+        return chain.from_iterable(d.items())

all_handlers = {
tuple: iter,
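The lambda-to-`def` rewrite above is pycodestyle's E731 ("do not assign a lambda expression, use a def"), also in the selected "E" group. A minimal sketch of the pattern, assuming nothing from this codebase:

```python
from typing import Callable

# E731: a lambda bound to a name has no real __name__ for tracebacks
# and no natural place for a docstring.
square: Callable[[int], int] = lambda x: x * x


# Preferred: a def carries its own name, annotations, and docstring.
def square_fn(x: int) -> int:
    return x * x


print(square(4), square_fn(4))  # 16 16
```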
@@ -36,7 +36,7 @@ def my_logging_fn():
logger.warning("This is a warning message")
logger.error("this is an error with no stack trace")
try:
-        1 / 0
+        _ = 1 / 0
except ZeroDivisionError:
logger.exception("failed to divide by zero")

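A note on the `_ =` idiom (an assumption about intent, since the PR doesn't say): a bare expression statement whose value is discarded looks like dead code to a useless-expression check such as bugbear's B018; binding the result to `_` keeps the deliberate `ZeroDivisionError` while making the intent explicit. A sketch:

```python
def demo() -> None:
    try:
        # The division exists only for its side effect: raising ZeroDivisionError.
        # Binding to `_` signals that discarding the value is intentional.
        _ = 1 / 0
    except ZeroDivisionError:
        print("caught the expected error")


demo()
```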
53 changes: 27 additions & 26 deletions metadata-ingestion/tests/unit/utilities/test_lossy_collections.py
@@ -9,36 +9,36 @@

@pytest.mark.parametrize("length, sampling", [(10, False), (100, True)])
def test_lossylist_sampling(length, sampling):
-    l: LossyList[str] = LossyList()
+    l_dict: LossyList[str] = LossyList()
    for i in range(0, length):
-        l.append(f"{i} Hello World")
+        l_dict.append(f"{i} Hello World")

-    assert len(l) == length
-    assert l.sampled is sampling
+    assert len(l_dict) == length
+    assert l_dict.sampled is sampling
    if sampling:
-        assert f"... sampled of {length} total elements" in str(l)
+        assert f"... sampled of {length} total elements" in str(l_dict)
    else:
-        assert "sampled" not in str(l)
+        assert "sampled" not in str(l_dict)

-    list_version = [int(i.split(" ")[0]) for i in l]
+    list_version = [int(i.split(" ")[0]) for i in l_dict]
    print(list_version)
    assert sorted(list_version) == list_version


@pytest.mark.parametrize("length, sampling", [(10, False), (100, True)])
def test_lossyset_sampling(length, sampling):
-    l: LossySet[str] = LossySet()
+    l_dict: LossySet[str] = LossySet()
    for i in range(0, length):
-        l.add(f"{i} Hello World")
+        l_dict.add(f"{i} Hello World")

-    assert len(l) == min(10, length)
-    assert l.sampled is sampling
+    assert len(l_dict) == min(10, length)
+    assert l_dict.sampled is sampling
    if sampling:
-        assert f"... sampled with at most {length-10} elements missing" in str(l)
+        assert f"... sampled with at most {length-10} elements missing" in str(l_dict)
    else:
-        assert "sampled" not in str(l)
+        assert "sampled" not in str(l_dict)

-    list_version = [int(i.split(" ")[0]) for i in l]
+    list_version = [int(i.split(" ")[0]) for i in l_dict]
    set_version = set(list_version)

    assert len(list_version) == len(set_version)
@@ -49,35 +49,36 @@ def test_lossyset_sampling(length, sampling):
    "length, sampling, sub_length", [(4, False, 4), (10, False, 14), (100, True, 1000)]
)
def test_lossydict_sampling(length, sampling, sub_length):
-    l: LossyDict[int, LossyList[str]] = LossyDict()
+    l_dict: LossyDict[int, LossyList[str]] = LossyDict()
    elements_added = 0
    element_length_map = {}
    for i in range(0, length):
        list_length = random.choice(range(1, sub_length))
        element_length_map[i] = 0
        for _num_elements in range(0, list_length):
-            if not l.get(i):
+            if not l_dict.get(i):
                elements_added += 1
                # reset to 0 until we get it back
                element_length_map[i] = 0
            else:
-                element_length_map[i] = len(l[i])
+                element_length_map[i] = len(l_dict[i])

-            current_list = l.get(i, LossyList())
+            current_list = l_dict.get(i, LossyList())
            current_list.append(f"{i}:{round(time.time(),2)} Hello World")
-            l[i] = current_list
+            l_dict[i] = current_list
            element_length_map[i] += 1

-    assert len(l) == min(l.max_elements, length)
-    assert l.sampled is sampling
+    assert len(l_dict) == min(l_dict.max_elements, length)
+    assert l_dict.sampled is sampling
    if sampling:
-        assert re.search("sampled of at most .* entries.", str(l))
-        assert f"{l.max_elements} sampled of at most {elements_added} entries." in str(
-            l
+        assert re.search("sampled of at most .* entries.", str(l_dict))
+        assert (
+            f"{l_dict.max_elements} sampled of at most {elements_added} entries."
+            in str(l_dict)
        )
    else:
        # cheap way to determine that the dict isn't reporting sampled keys
-        assert not re.search("sampled of at most .* entries.", str(l))
+        assert not re.search("sampled of at most .* entries.", str(l_dict))

-    for k, v in l.items():
+    for k, v in l_dict.items():
        assert len(v) == element_length_map[k]
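Context for the `l` → `l_dict` renames throughout this test file (illustrative, not from the PR): pycodestyle's E741, included via the selected "E" group, rejects single-letter names that are easily misread. A minimal sketch:

```python
# E741: `l` is visually ambiguous with the digit 1 and uppercase I
l = [10, 100, 1000]

# Any unambiguous name satisfies the rule
samples = [10, 100, 1000]
print(samples)
```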