Some optimization steps (plus bug fixes) #3
base: new_tag_strategy
Changes from 11 commits
First file in the diff, a new composite GitHub Action definition:

@@ -0,0 +1,38 @@
name: nf-core-lint
description: "Lint nf-core modules and workflows"
inputs:
  component_type:
    description: "The type of component to lint, e.g. 'module' or 'subworkflow'"
    required: true
  components:
    description: "List of components to lint"
    required: true
runs:
  using: "composite"
  steps:
    - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4

    - name: Set up Python
      uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5
      with:
        python-version: "3.11"

    - name: Install pip
      shell: bash
      run: python -m pip install --upgrade pip

    - uses: actions/setup-java@99b8673ff64fbf99d8d325f52d9a5bdedb8483e9 # v4
      with:
        distribution: "temurin"
        java-version: "17"

    - name: Setup Nextflow
      uses: nf-core/setup-nextflow@v2

    - name: Install nf-core tools development version
      shell: bash
      run: python -m pip install --upgrade --force-reinstall git+https://github.com/nf-core/tools.git@dev

    - name: Lint ${{inputs.component_type}} ${{ inputs.tags }}
      shell: bash
      run: nf-core ${{inputs.component_type}}s lint ${{ inputs.tags }}
Review comment: I don't think this file should be included in this PR?
Second file in the diff, the duplicate-md5 finder script:

@@ -1,11 +1,12 @@
 #!/usr/bin/env python

-from rich import print
-from rich.table import Table
-import click
 import glob
 import os

+import click
+import yaml
+from rich import print
+from rich.table import Table


 @click.command()
@@ -32,30 +33,25 @@ def find_duplicate_md5s(min_dups, search_dir):
     # Loop through all files in tests/ called test.yml
     for test_yml in glob.glob(search_dir, recursive=True):
         # Open file and parse YAML
-        with open(test_yml, "r") as fh:
+        with open(test_yml) as fh:
             test_config = yaml.safe_load(fh)
         # Loop through tests and check for duplicate md5s
         for test in test_config:
             for test_file in test.get("files", []):
                 if "md5sum" in test_file:
                     md5 = test_file["md5sum"]
-                    md5_filenames[md5] = md5_filenames.get(md5, []) + [
-                        os.path.basename(test_file.get("path"))
-                    ]
+                    md5_filenames[md5] = md5_filenames.get(md5, []) + [os.path.basename(test_file.get("path"))]
                     md5_output_fn_counts[md5] = md5_output_fn_counts.get(md5, 0) + 1
                     # Log the module that this md5 was in
                     modname = os.path.basename(os.path.dirname(test_yml))
                     # If tool/subtool show the whole thing
                     # Ugly code but trying to stat os-agnostic
-                    if os.path.basename(
-                        os.path.dirname(os.path.dirname(test_yml))
-                    ) not in ["modules", "config", "subworkflows"]:
-                        modname = "{}/{}".format(
-                            os.path.basename(
-                                os.path.dirname(os.path.dirname(test_yml))
-                            ),
-                            os.path.basename(os.path.dirname(test_yml)),
-                        )
+                    if os.path.basename(os.path.dirname(os.path.dirname(test_yml))) not in [
+                        "modules",
+                        "config",
+                        "subworkflows",
+                    ]:
+                        modname = f"{os.path.basename(os.path.dirname(os.path.dirname(test_yml)))}/{os.path.basename(os.path.dirname(test_yml))}"
Review comment on lines +49 to +54: There's got to be a better way here 😆
                     module_counts[md5] = module_counts.get(md5, []) + [modname]

     # Set up rich table
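As a possible answer to the "better way" comment above, the nested os.path.basename calls could lean on pathlib instead. This is only a sketch; the helper name is hypothetical and not part of the PR:

from pathlib import Path


def module_name(test_yml: str) -> str:
    # Directory that contains the test.yml, e.g. modules/bwa/mem
    parent = Path(test_yml).parent
    # For tool/subtool layouts, keep the tool directory in the name
    if parent.parent.name not in ("modules", "config", "subworkflows"):
        return f"{parent.parent.name}/{parent.name}"
    return parent.name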
Third file in the diff, the script that detects changed files and maps them to nf-test tests:

@@ -7,10 +7,9 @@
 import json
 import logging
 import re
-import yaml

 from itertools import chain
 from pathlib import Path

+import yaml
 from git import Repo
@@ -71,8 +70,8 @@ def parse_args() -> argparse.Namespace:
         "-t",
         "--types",
         nargs="+",
-        choices=["function", "process", "workflow", "pipeline"],
-        default=["function", "process", "workflow", "pipeline"],
+        choices=["function", "process", "workflow", "pipeline", "tag"],
+        default=["function", "process", "workflow", "pipeline", "tag"],

(mashehu marked this conversation as resolved.)

         help="Types of tests to include.",
     )
     return parser.parse_args()
@@ -88,7 +87,7 @@ def read_yaml_inverted(file_path: str) -> dict:
     Returns:
         dict: The contents of the YAML file as a dictionary inverted.
     """
-    with open(file_path, "r") as f:
+    with open(file_path) as f:
         data = yaml.safe_load(f)

     # Invert dictionary of lists into contents of lists are keys, values are the original keys
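The body of read_yaml_inverted sits mostly outside this hunk, but the inversion it describes is worth spelling out: list members become keys and the original keys become values. Illustrative values and implementation only, not taken from the script:

data = {"modules/bwa": ["bwa/mem", "bwa/index"]}
inverted = {item: key for key, items in data.items() for item in items}
print(inverted)  # {'bwa/mem': 'modules/bwa', 'bwa/index': 'modules/bwa'}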
@@ -114,20 +113,16 @@ def find_changed_files(
     """
     # create repo
     repo = Repo(".")
-    # identify commit on branch1
-    branch1_commit = repo.commit(branch1)
-    # identify commit on branch2
-    branch2_commit = repo.commit(branch2)
-    # compare two branches
-    diff_index = branch1_commit.diff(branch2_commit)
+    # Get the diff between two branches
+    diff = repo.git.diff(f"{branch1}..{branch2}", name_only=True)
Review comment: I don't think this is synonymous, this gets all changes between the branches rather than the changes introduced by the PR (I think?)

Review comment: I did some tests and it should work. I'm not a fan of string formatting for args in a function though.

Review comment: also returning strings separated by newlines is 🤮

Review comment: okay, can ignore it then, made for me more sense, because it's closer to the cli git (but not very pythony, I agree)

(adamrtalbot marked this conversation as resolved.)
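On the question of getting only the changes introduced by the PR, a merge-base diff reproduces GitHub's three-dot comparison while staying with GitPython objects instead of parsing newline-separated strings. A rough sketch under the assumption that branch1 is the base and branch2 the head; this is not the code in the PR:

from git import Repo

branch1, branch2 = "master", "feature-branch"  # placeholder branch names

repo = Repo(".")
# Diff from the common ancestor of the two branches to the head branch,
# i.e. only the changes the head branch introduces (three-dot behaviour)
merge_base = repo.merge_base(branch1, branch2)[0]
diff_index = merge_base.diff(repo.commit(branch2))
changed_paths = [item.b_path or item.a_path for item in diff_index]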
     # Start empty list of changed files
     changed_files = []

     # For every file that has changed between commits
-    for file in diff_index:
+    for file in diff.splitlines():

(adamrtalbot marked this conversation as resolved.)

         # Get pathlib.Path object
-        filepath = Path(file.a_path)
+        filepath = Path(file)

(adamrtalbot marked this conversation as resolved.)

         # If file does not match any in the ignore list, add containing directory to changed_files
         if not any(filepath.match(ignored_path) for ignored_path in ignore):
             changed_files.append(filepath)
@@ -136,9 +131,7 @@ def find_changed_files(
     return list(set(changed_files))


-def detect_include_files(
-    changed_files: list[Path], include_files: dict[str, str]
-) -> list[Path]:
+def detect_include_files(changed_files: list[Path], include_files: dict[str, str]) -> list[Path]:
     """
     Detects the include files based on the changed files.
@@ -197,10 +190,9 @@ def process_files(files: list[Path]) -> list[str]:
     """
     result = []
     for file in files:
-        with open(file, "r") as f:
+        with open(file) as f:
             is_pipeline_test = True
-            lines = f.readlines()
-            for line in lines:
+            for line in f:
                 line = line.strip()
                 if line.startswith(("workflow", "process", "function")):
                     words = line.split()
@@ -217,7 +209,7 @@ def process_files(files: list[Path]) -> list[str]:
 def convert_nf_test_files_to_test_types(
-    lines: list[str], types: list[str] = ["function", "process", "workflow", "pipeline"]
+    lines: list[str], types: list[str] = ["function", "process", "workflow", "pipeline", "tag"]

(mashehu marked this conversation as resolved.)

 ) -> dict[str, list[str]]:
     """
     Generate a dictionary of function, process and workflow lists from the lines.
@@ -229,16 +221,16 @@ def convert_nf_test_files_to_test_types(
     Returns:
         dict: Dictionary with function, process and workflow lists.
     """
     # Populate empty dict from types
     result: dict[str, list[str]] = {key: [] for key in types}

     for line in lines:
-        words = line.split()
-        if len(words) == 2 and re.match(r'^".*"$', words[1]):
-            keyword = words[0]
-            name = words[1].strip("'\"")  # Strip both single and double quotes
+        match = re.match(r"^(workflow|process|function|pipeline|tag)\s+(.*)$", line)
+        if match:
+            keyword = match.group(1)
+            name = match.group(2).strip("'\"")

(mashehu marked this conversation as resolved.)

             if keyword in types:
                 result[keyword].append(name)

     return result
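To make the regex change concrete, here is roughly what it extracts from typical nf-test tag lines; the input lines are invented for illustration, and the quote stripping mirrors the code above:

import re

pattern = r"^(workflow|process|function|pipeline|tag)\s+(.*)$"
for line in ['tag "modules_nfcore"', 'tag "samtools/sort"']:
    match = re.match(pattern, line)
    if match:
        print(match.group(1), match.group(2).strip("'\""))
# prints: tag modules_nfcore
#         tag samtools/sort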
@@ -260,28 +252,23 @@ def find_changed_dependencies(paths: list[Path], tags: list[str]) -> list[Path]:

     # find nf-test files with changed dependencies
     for nf_test_file in nf_test_files:
-        with open(nf_test_file, "r") as f:
+        with open(nf_test_file) as f:
             lines = f.readlines()
             # Get all tags from nf-test file
             # Make case insensitive with .casefold()
             tags_in_nf_test_file = [
                 tag.casefold().replace("/", "_")
-                for tag in convert_nf_test_files_to_test_types(lines, types=["tag"])[
-                    "tag"
-                ]
+                for tag in convert_nf_test_files_to_test_types(lines, types=["tag"])["tag"]
             ]
             # Check if tag in nf-test file appears in a tag.
             # Use .casefold() to be case insensitive
-            if any(
-                tag.casefold().replace("/", "_") in tags_in_nf_test_file for tag in tags
-            ):
+            if any(tag.casefold().replace("/", "_") in tags_in_nf_test_file for tag in tags):
                 result.append(nf_test_file)

     return result


 if __name__ == "__main__":

     # Utility stuff
     args = parse_args()
     logging.basicConfig(level=args.log_level)
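The tag matching above depends on both sides being normalised the same way: lower-cased with casefold() and "/" replaced by "_". A small illustration with made-up values:

tags_in_nf_test_file = ["modules_nfcore", "samtools_sort"]
changed_tags = ["SAMTOOLS/SORT"]
# "SAMTOOLS/SORT" normalises to "samtools_sort", which is in the list above
print(any(tag.casefold().replace("/", "_") in tags_in_nf_test_file for tag in changed_tags))  # True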
@@ -292,27 +279,20 @@ def find_changed_dependencies(paths: list[Path], tags: list[str]) -> list[Path]:
     # If an additional include YAML is added, we detect additional changed dirs to include
     if args.include:
         include_files = read_yaml_inverted(args.include)
-        changed_files = changed_files + detect_include_files(
-            changed_files, include_files
-        )
+        changed_files = changed_files + detect_include_files(changed_files, include_files)
Review comment: Mmm my black says no?

Review comment: ruff > black 😉

Review comment: Ruff seems to accept both, which I don't understand because it goes over 88 characters?
     nf_test_files = detect_nf_test_files(changed_files)
     lines = process_files(nf_test_files)
-    result = convert_nf_test_files_to_test_types(lines)
+    result = convert_nf_test_files_to_test_types(lines, args.types)  # Get only relevant results (specified by -t)

Review comment: PEP8 says 72 characters max, this is 110.
-    # Get only relevant results (specified by -t)
     # Unique using a set
-    target_results = list(
-        {item for sublist in map(result.get, args.types) for item in sublist}
-    )
+    target_results = list({item for sublist in result[args.types] for item in sublist})

     # Parse files to identify nf-tests with changed dependencies
     changed_dep_files = find_changed_dependencies([Path(".")], target_results)

     # Combine target nf-test files and nf-test files with changed dependencies
     # Go back one dir so we get the module or subworkflow path
-    all_nf_tests = [
-        str(test_path.parent.parent) for test_path in set(changed_dep_files + nf_test_files)
-    ]
+    all_nf_tests = [str(test_path.parent.parent) for test_path in set(changed_dep_files + nf_test_files)]

     # Print to stdout
     print(json.dumps(all_nf_tests))
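For context on the "go back one dir" comment, the script climbs two levels up from each nf-test file so that the printed JSON contains module or subworkflow directories. The path below is invented for illustration:

from pathlib import Path

test_path = Path("modules/nf-core/samtools/sort/tests/main.nf.test")
# main.nf.test -> tests -> modules/nf-core/samtools/sort
print(str(test_path.parent.parent))  # modules/nf-core/samtools/sort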
Review comment: Shall we just make this an independent action 🤔 Future stuff.

Review comment: even past future stuff 😁 nf-core/tools#725