Enable more Ruff rules (#231)
My IDE was highlighting warnings that our Ruff configuration didn't cover, so I enabled some additional rules. About 90% of the violations were autofixed; the rest were straightforward and useful to fix by hand. The new rules even surfaced an issue in one of the tests.
RobbeSneyders authored Jun 23, 2023
1 parent d072434 commit 8169bc1
Showing 42 changed files with 428 additions and 321 deletions.
6 changes: 5 additions & 1 deletion .pre-commit-config.yaml
@@ -12,7 +12,11 @@ repos:
           fondant/.*|
           tests/.*|
         )$
-        args: [--fix, --exit-non-zero-on-fix]
+        args: [
+          "--target-version=py38",
+          "--fix",
+          "--exit-non-zero-on-fix",
+        ]


   - repo: https://github.com/PyCQA/bandit
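The new --target-version=py38 flag pins the Python version Ruff assumes when applying fixes, so autofixes never introduce syntax newer than the project supports. A minimal sketch of the effect, assuming pyupgrade-style (UP) rules are among those enabled (the commit's full rule list isn't visible in this excerpt):

from typing import Optional


def resize(width: Optional[int] = None) -> int:
    # With --target-version=py38, Ruff keeps Optional[int]; under a py310
    # target, rule UP007 would typically rewrite this to `int | None`.
    return width or 256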
8 changes: 4 additions & 4 deletions components/caption_images/src/main.py
@@ -40,7 +40,7 @@ def caption_image_batch(
     *,
     model: BlipForConditionalGeneration,
     processor: BlipProcessor,
-    max_new_tokens: int
+    max_new_tokens: int,
 ) -> pd.Series:
     """Caption a batch of images."""
     input_batch = torch.cat(image_batch.tolist())
@@ -67,7 +67,7 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
         images = dataframe["images"]["data"].apply(
             process_image,
             processor=self.processor,
-            device=self.device
+            device=self.device,
         )

         results: t.List[pd.Series] = []
@@ -78,8 +78,8 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
                     batch,
                     model=self.model,
                     processor=self.processor,
-                    max_new_tokens=self.max_new_tokens
-                ).T
+                    max_new_tokens=self.max_new_tokens,
+                ).T,
             )

         return pd.concat(results).to_frame(name=("captions", "text"))
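Most of the changes in this commit follow this one pattern: a trailing comma added after the last element of a multi-line call, signature, or literal. That matches flake8-commas' COM812 rule (an assumption; the selected rules aren't shown in this excerpt). The payoff is diff hygiene, as in this sketch:

def caption(
    image: bytes,
    max_new_tokens: int,  # trailing comma: appending a parameter later changes one line
) -> str:
    ...

Without the trailing comma, adding an argument also modifies the previous line, which shows up as a spurious change in every future diff.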
6 changes: 3 additions & 3 deletions components/download_images/src/main.py
@@ -49,7 +49,7 @@ def download_image(url, timeout, user_agent_token, disallowed_header_directives)
         f"+https://github.com/rom1504/img2dataset)"
     try:
         request = urllib.request.Request(
-            url, data=None, headers={"User-Agent": user_agent_string}
+            url, data=None, headers={"User-Agent": user_agent_string},
         )
         with urllib.request.urlopen(request, timeout=timeout) as r:
             if disallowed_header_directives and is_disallowed(
@@ -77,7 +77,7 @@ def download_image_with_retry(
 ):
     for _ in range(retries + 1):
         img_stream = download_image(
-            url, timeout, user_agent_token, disallowed_header_directives
+            url, timeout, user_agent_token, disallowed_header_directives,
         )
         if img_stream is not None:
             # resize the image
@@ -114,7 +114,7 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
         dataframe[[
             ("images", "data"),
             ("images", "width"),
-            ("images", "height")
+            ("images", "height"),
         ]] = dataframe.apply(
             lambda example: download_image_with_retry(
                 url=example["images"]["url"],
6 changes: 3 additions & 3 deletions components/embedding_based_laion_retrieval/src/main.py
@@ -21,7 +21,7 @@ def setup(
         *,
         num_images: int,
         aesthetic_score: int,
-        aesthetic_weight: float
+        aesthetic_weight: float,
     ) -> None:
         """
@@ -54,7 +54,7 @@ async def async_query():
            futures = [
                loop.run_in_executor(
                    executor,
-                   functools.partial(self.client.query, embedding_input=embedding.tolist())
+                   functools.partial(self.client.query, embedding_input=embedding.tolist()),
                )
                for embedding in dataframe["embeddings"]["data"]
            ]
@@ -64,7 +64,7 @@ async def async_query():
        loop.run_until_complete(async_query())

        results_df = pd.DataFrame(results)[["id", "url"]]
-       results_df.set_index("id", inplace=True)
+       results_df = results_df.set_index("id")
        results_df.columns = [["images"], ["url"]]

        return results_df
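The set_index change above swaps mutation via inplace=True for reassignment, consistent with pandas-vet's PD002 rule (assumed; the rule list isn't shown). inplace=True returns None and hides the data flow; reassignment keeps the transformation visible at the call site. A minimal sketch:

import pandas as pd

results_df = pd.DataFrame({"id": [1, 2], "url": ["a", "b"]})

# Flagged: mutates the frame as a side effect and returns None.
# results_df.set_index("id", inplace=True)

# Preferred: reassign, so the transformation is explicit.
results_df = results_df.set_index("id")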
10 changes: 4 additions & 6 deletions components/filter_comments/src/main.py
@@ -20,7 +20,7 @@ def transform(
         *,
         dataframe: dd.DataFrame,
         min_comments_ratio: float,
-        max_comments_ratio: float
+        max_comments_ratio: float,
     ) -> dd.DataFrame:
         """
         Args:
@@ -31,16 +31,14 @@ def transform(
         Filtered dask dataframe.
         """
         # Apply the function to the desired column and filter the DataFrame
-        filtered_df = dataframe[
+        return dataframe[
             dataframe["code_content"].map_partitions(
                 lambda example: example.map(get_comments_to_code_ratio).between(
-                    min_comments_ratio, max_comments_ratio
-                )
+                    min_comments_ratio, max_comments_ratio,
+                ),
             )
         ]
-
-        return filtered_df


 if __name__ == "__main__":
     component = FilterCommentsComponent.from_args()
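The filtered_df change is flake8-return territory: assigning to a name only to return it on the next line (RET504, assumed) adds indirection without adding information. A minimal sketch of the before and after:

def comments_ratio_ok(ratio: float, lo: float, hi: float) -> bool:
    # Before (flagged):
    #     result = lo <= ratio <= hi
    #     return result
    # After (autofixed): return the expression directly.
    return lo <= ratio <= hi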
12 changes: 5 additions & 7 deletions components/filter_comments/src/utils/text_extraction.py
@@ -69,9 +69,7 @@ def get_comments(source: str) -> str:
     for toknum, tokval, _, _, _ in g:
         if toknum == tokenize.COMMENT:
             comments.append((toknum, tokval))
-    result = tokenize.untokenize(comments).replace("#", "")
-
-    return result
+    return tokenize.untokenize(comments).replace("#", "")


 def get_docstrings(source: str) -> t.List[str]:
@@ -88,13 +86,13 @@ def get_docstrings(source: str) -> t.List[str]:
         source = source.read()

     docstrings = sorted(
-        parse_docstrings(source), key=lambda x: (NODE_TYPES.get(type(x[0])), x[1])
+        parse_docstrings(source), key=lambda x: (NODE_TYPES.get(type(x[0])), x[1]),
     )

     grouped = groupby(docstrings, key=lambda x: NODE_TYPES.get(type(x[0])))
     results = []
     for _, group in grouped:
-        for _, name, docstring in group:
+        for _, _name, docstring in group:
             if docstring:
                 results.append(docstring)
     return results
@@ -116,7 +114,7 @@ def get_text_python(source: str, extract_comments: bool = True) -> str:
     except Exception:
         docstrings = ""
         warnings.warn(
-            "code couldn't be parsed due to compilation failure, no docstring is extracted"
+            "code couldn't be parsed due to compilation failure, no docstring is extracted",
        )

     if extract_comments:
@@ -142,4 +140,4 @@ def get_comments_to_code_ratio(text: str) -> float:
     """
     comments = get_text_python(text)

-    return len(comments) / len(text)
\ No newline at end of file
+    return len(comments) / len(text)
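Two smaller fixes appear in this file: the unused loop variable name becomes _name, matching the underscore convention for deliberately unused bindings (flake8-bugbear's B007, assumed), and the seemingly identical removed/added return lines at the end add a newline at end of file, which the old version lacked. A sketch of the loop-variable convention:

pairs = [("module", "doc A"), ("class", "doc B")]

# The underscore prefix documents that the first element is deliberately unused.
for _name, docstring in pairs:
    print(docstring)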
6 changes: 2 additions & 4 deletions components/filter_line_length/src/main.py
@@ -20,7 +20,7 @@ def transform(
         dataframe: dd.DataFrame,
         avg_line_length_threshold: int,
         max_line_length_threshold: int,
-        alphanum_fraction_threshold: float
+        alphanum_fraction_threshold: float,
     ) -> dd.DataFrame:
         """
         Args:
@@ -31,14 +31,12 @@ def transform(
         Returns:
             Filtered dask dataframe.
         """
-        filtered_df = dataframe[
+        return dataframe[
             (dataframe["code_avg_line_length"] > avg_line_length_threshold)
             & (dataframe["code_max_line_length"] > max_line_length_threshold)
             & (dataframe["code_alphanum_fraction"] > alphanum_fraction_threshold)
         ]
-
-        return filtered_df


 if __name__ == "__main__":
     component = FilterLineLengthComponent.from_args()
6 changes: 3 additions & 3 deletions components/image_cropping/src/image_crop.py
@@ -47,7 +47,7 @@ def get_image_borders(image: Image.Image) -> t.Tuple:


 def remove_borders(
-    image_bytes: bytes, cropping_threshold: int = -30, padding: int = 10
+    image_bytes: bytes, cropping_threshold: int = -30, padding: int = 10,
 ) -> bytes:
     """This method removes borders by checking the overlap between
     a color and the original image. By subtracting these two
@@ -89,12 +89,12 @@ def remove_borders(
     if image_crop.size[0] > image_crop.size[1]:
         padding = int((image_crop.size[0] - image_crop.size[1]) / 2)
         image_crop = ImageOps.expand(
-            image_crop, border=(0, padding), fill=color_common
+            image_crop, border=(0, padding), fill=color_common,
         )
     else:
         padding = int((image_crop.size[1] - image_crop.size[0]) / 2)
         image_crop = ImageOps.expand(
-            image_crop, border=(padding, 0), fill=color_common
+            image_crop, border=(padding, 0), fill=color_common,
         )

     # serialize image to JPEG
2 changes: 1 addition & 1 deletion components/image_cropping/src/main.py
@@ -36,7 +36,7 @@ def transform(
         *,
         dataframe: dd.DataFrame,
         cropping_threshold: int = -30,
-        padding: int = 10
+        padding: int = 10,
     ) -> dd.DataFrame:
         """
         Args:
2 changes: 1 addition & 1 deletion components/image_embedding/src/main.py
@@ -75,7 +75,7 @@ def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame:
         images = dataframe["images"]["data"].apply(
             process_image,
             processor=self.processor,
-            device=self.device
+            device=self.device,
         )
         results: t.List[pd.Series] = []
         for batch in np.split(images, np.arange(self.batch_size, len(images), self.batch_size)):
4 changes: 2 additions & 2 deletions components/image_resolution_filtering/src/main.py
@@ -12,7 +12,7 @@ class ImageFilterComponent(DaskTransformComponent):
     """Component that filters images based on height and width."""

     def transform(
-        self, *, dataframe: dd.DataFrame, min_width: int, min_height: int
+        self, *, dataframe: dd.DataFrame, min_width: int, min_height: int,
     ) -> dd.DataFrame:
         """
         Args:
@@ -38,4 +38,4 @@ def transform(

 if __name__ == "__main__":
     component = ImageFilterComponent.from_args()
-    component.run()
\ No newline at end of file
+    component.run()
2 changes: 1 addition & 1 deletion components/load_from_hf_hub/src/main.py
@@ -35,7 +35,7 @@ def load(self,
         if image_column_names is not None:
             for image_column_name in image_column_names:
                 dask_df[image_column_name] = dask_df[image_column_name].map(
-                    lambda x: x["bytes"], meta=("bytes", bytes)
+                    lambda x: x["bytes"], meta=("bytes", bytes),
                 )

         # 3) Rename columns
4 changes: 2 additions & 2 deletions components/pii_redaction/src/main.py
@@ -40,7 +40,7 @@ def transform(

         # redact PII
         # we use random replacements by default
-        with open("replacements.json", "r") as f:
+        with open("replacements.json") as f:
             replacements = json.load(f)

         dataframe["code_content"] = dataframe.apply(
@@ -54,7 +54,7 @@ def transform(
             meta=(None, "str"),
         )
         dataframe = dataframe.drop(
-            ["code_secrets", "code_has_secrets", "code_number_secrets"], axis=1
+            ["code_secrets", "code_has_secrets", "code_number_secrets"], axis=1,
         )

         return dataframe
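The open() change drops the explicit "r", which is already the default mode; this matches pyupgrade's UP015 (assumed). A minimal sketch:

import json

# Equivalent to open("replacements.json", "r"): reading is the default mode.
with open("replacements.json") as f:
    replacements = json.load(f)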
4 changes: 2 additions & 2 deletions components/pii_redaction/src/pii_detection.py
@@ -26,12 +26,12 @@ def scan_pii(text, key_detector="other"):
     if key_detector == "regex":
         # use a regex to detect keys + emails + ips
         secrets = secrets + detect_email_addresses(
-            text, tag_types={"KEY", "EMAIL", "IP_ADDRESS"}
+            text, tag_types={"KEY", "EMAIL", "IP_ADDRESS"},
         )
     else:
         # detect emails and ip addresses with regexes
         secrets = secrets + detect_email_addresses(
-            text, tag_types={"EMAIL", "IP_ADDRESS"}
+            text, tag_types={"EMAIL", "IP_ADDRESS"},
         )
     # for keys use detect-secrets tool
     secrets = secrets + detect_keys(text)
10 changes: 4 additions & 6 deletions components/pii_redaction/src/pii_redaction.py
@@ -21,7 +21,6 @@
     ],
 }

-# providergs = ["google", "cloudfare", "alternate-dns", "quad9","open-dns", "comodo", "adguard"]
 POPULAR_DNS_SERVERS = [
     "8.8.8.8",
     "8.8.4.4",
@@ -113,7 +112,7 @@ def redact_pii_text(text, secrets, replacements, add_references=False):
     last_text = text
     for secret in secrets:
         # skip secret if it's an IP address for private networks or popular DNS servers
-        if secret["tag"] == "IP_ADDRESS":
+        if secret["tag"] == "IP_ADDRESS":  # ruff: noqa: SIM102
             # if secret value in popular DNS servers, skip it
             if is_private_ip(secret["value"]) or (
                 secret["value"] in POPULAR_DNS_SERVERS
@@ -146,10 +145,9 @@ def redact_pii_text(text, secrets, replacements, add_references=False):
     else:
         new_text = text
         references = ""
-    result = (
+    return (
         (new_text, references, modified) if add_references else (new_text, modified)
     )
-    return result


 def redact_pii(text, secrets, has_secrets, replacements):
@@ -160,5 +158,5 @@ def redact_pii(text, secrets, has_secrets, replacements):
     if has_secrets:
         new_text, _ = redact_pii_text(text, secrets, replacements)
         return new_text
-    else:
-        return text
+
+    return text
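Two patterns close out this file: the nested IP-address check keeps its nesting for readability and suppresses Ruff's collapsible-if warning (SIM102) with a noqa comment, and the final hunk removes an else after return (RET505, assumed). A minimal sketch of the latter:

def redact(text: str, has_secrets: bool) -> str:
    # An `else` after a returning branch only adds indentation.
    if has_secrets:
        return "***"

    return text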