dev(ingest): use ruff instead of flake8 #12359

Merged · 7 commits · Jan 16, 2025
Changes from 3 commits
6 changes: 4 additions & 2 deletions metadata-ingestion/build.gradle
@@ -110,14 +110,16 @@ task lint(type: Exec, dependsOn: installDev) {
"source ${venv_name}/bin/activate && set -x && " +
"black --check --diff src/ tests/ examples/ && " +
"isort --check --diff src/ tests/ examples/ && " +
"flake8 --count --statistics src/ tests/ examples/ && " +
"ruff check src/ tests/ examples/ && " +
"mypy --show-traceback --show-error-codes src/ tests/ examples/"
}

task lintFix(type: Exec, dependsOn: installDev) {
commandLine 'bash', '-c',
"source ${venv_name}/bin/activate && set -x && " +
"black src/ tests/ examples/ && " +
"isort src/ tests/ examples/"
"isort src/ tests/ examples/ && " +
"ruff check --fix src/ tests/ examples/"
}

def pytest_default_env = "PYTHONDEVMODE=1"
2 changes: 1 addition & 1 deletion metadata-ingestion/developing.md
@@ -183,7 +183,7 @@ We use black, isort, flake8, and mypy to ensure consistent code style and quality
# Assumes: pip install -e '.[dev]' and venv is activated
black src/ tests/
isort src/ tests/
- flake8 src/ tests/
+ ruff check src/ tests/
mypy src/ tests/
```

46 changes: 44 additions & 2 deletions metadata-ingestion/pyproject.toml
@@ -9,7 +9,7 @@ extend-exclude = '''
^/tmp
'''
include = '\.pyi?$'
- target-version = ['py37', 'py38', 'py39', 'py310']
+ target-version = ['py38', 'py39', 'py310', 'py311']

[tool.isort]
combine_as_imports = true
@@ -26,6 +26,48 @@ extraPaths = ['tests']
exclude = ["src/datahub/metadata/"]
ignore_decorators = ["@click.*", "@validator", "@root_validator", "@pydantic.validator", "@pydantic.root_validator", "@pytest.fixture"]
ignore_names = ["*Source", "*Sink", "*Report"]
# min_confidence = 80
paths = ["src"]
sort_by_size = true

[tool.ruff]
# Same as Black.
line-length = 88
# Exclude directories matching these patterns.
exclude = [
".git",
"src/datahub/metadata",
"venv",
".tox",
"__pycache__",
]

[tool.ruff.lint]
select = [
"B",
"C90",
"E",
"F",
]
ignore = [
# Ignore line length violations (handled by Black)
"E501",
# Ignore whitespace before ':' (matches Black)
"E203",
# Allow usages of functools.lru_cache
"B019",
# Allow function call in argument defaults
"B008",
# TODO: Enable these later as mentioned in original config
"B006", # Mutable args
"B007", # Unused loop control variable
# The rules below were added while migrating from flake8 to ruff, to avoid making a lot of changes
"B904",
"B017",
]

[tool.ruff.lint.mccabe]
max-complexity = 20


[tool.ruff.lint.per-file-ignores]
"__init__.py" = ["F401"]
36 changes: 0 additions & 36 deletions metadata-ingestion/setup.cfg
@@ -1,39 +1,3 @@
[flake8]
max-complexity = 20
ignore =
# Ignore: line length issues, since black's formatter will take care of them.
E501,
# Ignore compound statements, since they're used for ellipsis by black
# See https://github.com/psf/black/issues/3887
E704,
# Ignore: 1 blank line required before class docstring.
D203,
# See https://stackoverflow.com/a/57074416.
W503,
# See https://github.com/psf/black/issues/315.
E203,
# Allow usages of functools.lru_cache.
B019,
# This rule flags the use of function calls in argument defaults.
# There's some good reasons to do this, so we're ok with it.
B008,
# TODO: However, we should enable B006 to catch issues with mutable args.
B006,
# TODO: Enable B007 - unused loop control variable.
B007
# TODO: Enable B902 - require self/cls naming.
# TODO: Enable B904 - use raise from in except clauses.
exclude =
.git,
src/datahub/metadata,
venv,
.tox,
__pycache__
per-file-ignores =
# imported but unused
__init__.py: F401, I250
ban-relative-imports = true

[mypy]
plugins =
./tests/test_helpers/sqlalchemy_mypy_plugin.py,
6 changes: 1 addition & 5 deletions metadata-ingestion/setup.py
@@ -590,12 +590,8 @@
}

lint_requirements = {
-    # This is pinned only to avoid spurious errors in CI.
-    # We should make an effort to keep it up to date.
     "black==23.3.0",
-    "flake8>=6.0.0",
-    "flake8-tidy-imports>=4.3.0",
-    "flake8-bugbear==23.3.12",
+    "ruff==0.9.1",
"isort>=5.7.0",
"mypy==1.10.1",
}
@@ -267,7 +267,6 @@ def _is_single_row_query_method(query: Any) -> bool:
"get_column_max",
"get_column_mean",
"get_column_stdev",
"get_column_stdev",
[Review thread]
Reviewer: ?
anshbansal (author, Jan 15, 2025): There were duplicates in this just above on line 269.

"get_column_nonnull_count",
"get_column_unique_count",
}
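For context, removing the repeated literal above is what bugbear's B033 (duplicate items in a set) catches; a hypothetical sketch, assuming the surrounding collection is a set literal:

```python
# B033: a duplicated element in a set literal is almost always a copy-paste slip;
# the set silently collapses it, so the second entry is dead text.
SINGLE_ROW_METHODS = {
    "get_column_max",
    "get_column_mean",
    "get_column_mean",  # duplicate -- flagged by ruff's B033
}
```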
10 changes: 5 additions & 5 deletions metadata-ingestion/src/datahub/ingestion/source/mode.py
@@ -893,11 +893,11 @@ def normalize_mode_query(self, query: str) -> str:
jinja_params[key] = parameters[key].get("default", "")

normalized_query = re.sub(
r"{% form %}(.*){% endform %}",
"",
query,
0,
re.MULTILINE | re.DOTALL,
pattern=r"{% form %}(.*){% endform %}",
repl="",
string=query,
count=0,
flags=re.MULTILINE | re.DOTALL,
)

# Wherever we don't resolve the jinja params, we replace it with NULL
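An aside on why the keyword arguments matter (illustrative, not from the PR): `re.sub` takes `count` before `flags`, so a flag passed positionally can silently be consumed as a count — the failure mode bugbear's B034 check is meant to prevent. A minimal sketch:

```python
import re

text = "a\nb\nc"

# Buggy: re.MULTILINE (the int 8) lands in the `count` parameter, so the flag
# is silently dropped and ^ only matches at the very start of the string.
buggy = re.sub(r"^[a-z]", "", text, re.MULTILINE)
print(repr(buggy))  # '\nb\nc'

# Fixed: keyword arguments make count and flags unambiguous.
fixed = re.sub(r"^[a-z]", "", text, count=0, flags=re.MULTILINE)
print(repr(fixed))  # '\n\n'
```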
@@ -96,7 +96,7 @@ def log_http_error(self, message: str) -> Any:
url: str = e.request.url if e.request else "URL not available"
self.reporter.warning(
title="Metadata API Timeout",
message=f"Metadata endpoints are not reachable. Check network connectivity to PowerBI Service.",
message="Metadata endpoints are not reachable. Check network connectivity to PowerBI Service.",
context=f"url={url}",
)

@@ -173,7 +173,7 @@ def _get_entity_users(
entity=entity_name,
entity_id=entity_id,
)
-        except:  # It will catch all type of exception
+        except Exception:
e = self.log_http_error(
message=f"Unable to fetch users for {entity_name}({entity_id})."
)
@@ -210,7 +210,7 @@ def get_reports(self, workspace: Workspace) -> List[Report]:
message="A cross-workspace reference that failed to be resolved. Please ensure that no global workspace is being filtered out due to the workspace_id_pattern.",
context=f"report-name: {report.name} and dataset-id: {report.dataset_id}",
)
-        except:
+        except Exception:
self.log_http_error(
message=f"Unable to fetch reports for workspace {workspace.name}"
)
@@ -260,7 +260,7 @@ def get_workspaces(self) -> List[Workspace]:

groups = self._get_resolver().get_groups(filter_=filter_)

-        except:
+        except Exception:
self.log_http_error(message="Unable to fetch list of workspaces")
# raise # we want this exception to bubble up

@@ -292,7 +292,7 @@ def get_modified_workspaces(self) -> List[str]:
modified_workspace_ids = self.__admin_api_resolver.get_modified_workspaces(
self.__config.modified_since
)
-        except:
+        except Exception:
self.log_http_error(message="Unable to fetch list of modified workspaces.")

return modified_workspace_ids
@@ -303,8 +303,8 @@ def _get_scan_result(self, workspace_ids: List[str]) -> Any:
scan_id = self.__admin_api_resolver.create_scan_job(
workspace_ids=workspace_ids
)
-        except:
-            e = self.log_http_error(message=f"Unable to fetch get scan result.")
+        except Exception:
+            e = self.log_http_error(message="Unable to fetch get scan result.")
if data_resolver.is_permission_error(cast(Exception, e)):
logger.warning(
"Dataset lineage can not be ingestion because this user does not have access to the PowerBI Admin "
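A short illustrative note (not from this diff) on the `except:` → `except Exception:` changes: a bare `except` — pycodestyle's E722, part of the selected "E" group — also catches `KeyboardInterrupt` and `SystemExit`, which can make a retry loop impossible to interrupt. A minimal sketch:

```python
def fetch_with_retry(max_attempts: int = 3) -> None:
    for _attempt in range(max_attempts):
        try:
            raise TimeoutError("transient API error")
        # except:          # E722 (bare except): would also swallow
        #     continue     # KeyboardInterrupt/SystemExit, so Ctrl-C can't stop it
        except Exception:  # the fixed form lets KeyboardInterrupt/SystemExit propagate
            continue


fetch_with_retry()
```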
@@ -384,7 +384,6 @@ def resolve_snowflake_modified_type(type_string: str) -> Any:
"varchar": StringType,
"char": StringType,
"varbinary": BytesType,
"json": RecordType,
[Review thread]
Reviewer: why'd we remove this?
anshbansal (author, Jan 15, 2025): Duplicate in this dictionary. See the last element.

"date": DateType,
"time": TimeType,
"timestamp": TimeType,
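The same class of fix as the earlier set: a repeated dict key literal means only the last value survives, so the first entry is dead code — the kind of thing ruff's pyflakes-derived rules report. A hypothetical sketch (names are illustrative):

```python
# Dict construction silently keeps only the last "json" entry; the first is dead code.
TYPE_MAP = {
    "json": "RecordType",
    "date": "DateType",
    "json": "VariantType",  # overrides the earlier "json" key -- flagged by ruff
}

print(TYPE_MAP["json"])  # VariantType
```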
5 changes: 3 additions & 2 deletions metadata-ingestion/src/datahub/utilities/memory_footprint.py
@@ -1,7 +1,7 @@
from collections import deque
from itertools import chain
from sys import getsizeof
-from typing import Any, Callable
+from typing import Any


def total_size(o: Any, handlers: Any = {}) -> int:
@@ -15,7 +15,8 @@ def total_size(o: Any, handlers: Any = {}) -> int:
Based on https://github.com/ActiveState/recipe-577504-compute-mem-footprint/blob/master/recipe.py
"""

-    dict_handler: Callable[[Any], chain[Any]] = lambda d: chain.from_iterable(d.items())
+    def dict_handler(d: dict) -> chain[Any]:
+        return chain.from_iterable(d.items())

all_handlers = {
tuple: iter,
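The lambda-to-`def` rewrite above is pycodestyle's E731 ("do not assign a lambda expression, use a def"), also in the selected "E" group. A minimal sketch of the pattern, assuming nothing from this codebase:

```python
from typing import Callable

# E731: a lambda bound to a name has no real __name__ for tracebacks
# and no natural place for a docstring.
square: Callable[[int], int] = lambda x: x * x


# Preferred: a def carries its own name, annotations, and docstring.
def square_fn(x: int) -> int:
    return x * x


print(square(4), square_fn(4))  # 16 16
```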
@@ -36,7 +36,7 @@ def my_logging_fn():
logger.warning("This is a warning message")
logger.error("this is an error with no stack trace")
try:
-        1 / 0
+        _ = 1 / 0
except ZeroDivisionError:
logger.exception("failed to divide by zero")

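A note on the `_ =` idiom (an assumption about intent, since the PR doesn't say): a bare expression statement whose value is discarded looks like dead code to a useless-expression check such as bugbear's B018; binding the result to `_` keeps the deliberate `ZeroDivisionError` while making the intent explicit. A sketch:

```python
def demo() -> None:
    try:
        # The division exists only for its side effect: raising ZeroDivisionError.
        # Binding to `_` signals that discarding the value is intentional.
        _ = 1 / 0
    except ZeroDivisionError:
        print("caught the expected error")


demo()
```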
53 changes: 27 additions & 26 deletions metadata-ingestion/tests/unit/utilities/test_lossy_collections.py
@@ -9,36 +9,36 @@

@pytest.mark.parametrize("length, sampling", [(10, False), (100, True)])
def test_lossylist_sampling(length, sampling):
-    l: LossyList[str] = LossyList()
+    l_dict: LossyList[str] = LossyList()
    for i in range(0, length):
-        l.append(f"{i} Hello World")
+        l_dict.append(f"{i} Hello World")

-    assert len(l) == length
-    assert l.sampled is sampling
+    assert len(l_dict) == length
+    assert l_dict.sampled is sampling
    if sampling:
-        assert f"... sampled of {length} total elements" in str(l)
+        assert f"... sampled of {length} total elements" in str(l_dict)
    else:
-        assert "sampled" not in str(l)
+        assert "sampled" not in str(l_dict)

-    list_version = [int(i.split(" ")[0]) for i in l]
+    list_version = [int(i.split(" ")[0]) for i in l_dict]
    print(list_version)
    assert sorted(list_version) == list_version


@pytest.mark.parametrize("length, sampling", [(10, False), (100, True)])
def test_lossyset_sampling(length, sampling):
-    l: LossySet[str] = LossySet()
+    l_dict: LossySet[str] = LossySet()
    for i in range(0, length):
-        l.add(f"{i} Hello World")
+        l_dict.add(f"{i} Hello World")

-    assert len(l) == min(10, length)
-    assert l.sampled is sampling
+    assert len(l_dict) == min(10, length)
+    assert l_dict.sampled is sampling
    if sampling:
-        assert f"... sampled with at most {length-10} elements missing" in str(l)
+        assert f"... sampled with at most {length-10} elements missing" in str(l_dict)
    else:
-        assert "sampled" not in str(l)
+        assert "sampled" not in str(l_dict)

-    list_version = [int(i.split(" ")[0]) for i in l]
+    list_version = [int(i.split(" ")[0]) for i in l_dict]
    set_version = set(list_version)

    assert len(list_version) == len(set_version)
@@ -49,35 +49,36 @@ def test_lossyset_sampling(length, sampling):
    "length, sampling, sub_length", [(4, False, 4), (10, False, 14), (100, True, 1000)]
)
def test_lossydict_sampling(length, sampling, sub_length):
-    l: LossyDict[int, LossyList[str]] = LossyDict()
+    l_dict: LossyDict[int, LossyList[str]] = LossyDict()
    elements_added = 0
    element_length_map = {}
    for i in range(0, length):
        list_length = random.choice(range(1, sub_length))
        element_length_map[i] = 0
        for _num_elements in range(0, list_length):
-            if not l.get(i):
+            if not l_dict.get(i):
                elements_added += 1
                # reset to 0 until we get it back
                element_length_map[i] = 0
            else:
-                element_length_map[i] = len(l[i])
+                element_length_map[i] = len(l_dict[i])

-            current_list = l.get(i, LossyList())
+            current_list = l_dict.get(i, LossyList())
            current_list.append(f"{i}:{round(time.time(),2)} Hello World")
-            l[i] = current_list
+            l_dict[i] = current_list
            element_length_map[i] += 1

-    assert len(l) == min(l.max_elements, length)
-    assert l.sampled is sampling
+    assert len(l_dict) == min(l_dict.max_elements, length)
+    assert l_dict.sampled is sampling
    if sampling:
-        assert re.search("sampled of at most .* entries.", str(l))
-        assert f"{l.max_elements} sampled of at most {elements_added} entries." in str(
-            l
+        assert re.search("sampled of at most .* entries.", str(l_dict))
+        assert (
+            f"{l_dict.max_elements} sampled of at most {elements_added} entries."
+            in str(l_dict)
        )
    else:
        # cheap way to determine that the dict isn't reporting sampled keys
-        assert not re.search("sampled of at most .* entries.", str(l))
+        assert not re.search("sampled of at most .* entries.", str(l_dict))

-    for k, v in l.items():
+    for k, v in l_dict.items():
        assert len(v) == element_length_map[k]
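Context for the `l` → `l_dict` renames throughout this test file (illustrative, not from the PR): pycodestyle's E741, included via the selected "E" group, rejects single-letter names that are easily misread. A minimal sketch:

```python
# E741: `l` is visually ambiguous with the digit 1 and uppercase I
l = [10, 100, 1000]

# Any unambiguous name satisfies the rule
samples = [10, 100, 1000]
print(samples)
```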