# .github/workflows/fuzzy-ci.yml

# Display name of the workflow.
name: Fuzzy CI

# Triggered by pull requests against master. Label events are included because
# the jobs below react to the `fuzzy-diff-looks-good` label being set or removed.
on:
  pull_request:
    branches:
      - master
    types:
      - opened
      - synchronize
      - reopened
      - unlabeled
      - labeled
    # Pull requests that only touch these paths do not trigger the workflow.
    paths-ignore:
      - '**.md'
      - '**.txt'
      - '.git*'
      - 'doc/**'
      - 'emacs/**'
      - 'vim/**'
      - '**/emacs-lint.yml'
      - 'bench/**'
      - 'upstream/**'
      - 'tests/**'

env:
  # Artifact names are derived from the PR's commits so that every job
  # (including the `approve` job of a later run) computes the same names.
  BASE_BRANCH_ARTIFACT_NAME: base-branch-data-${{ github.event.pull_request.base.sha }}-pr${{ github.event.pull_request.number }}
  MERGE_BRANCH_ARTIFACT_NAME: merge-branch-data-${{ github.event.pull_request.base.sha }}-${{ github.event.pull_request.head.sha }}-pr${{ github.event.pull_request.number }}
  DIFF_ARTIFACT_NAME: diff-${{ github.event.pull_request.base.sha }}-${{ github.event.pull_request.head.sha }}

  # File names must be consistent across jobs as well.
  FULL_DIFF_FILE: 'full_responses.diff'
  DISTILLED_DIFF_FILE: 'distilled_data.diff'
  # FULL_DATA_FILE and DISTILLED_DATA_FILE must match the names of the files
  # that `merl-an behavior` generates.
  FULL_DATA_FILE: 'full_responses.json'
  DISTILLED_DATA_FILE: 'distilled_data.json'

  # Short-hands for GitHub API endpoints.
  GH_API_COMMENTS: ${{ github.event.pull_request.comments_url }}
  GH_API_LABELS: ${{ github.event.pull_request.issue_url }}/labels
  GH_API_ARTIFACTS: ${{ github.event.pull_request.base.repo.url }}/actions/artifacts

  # Short-hands for browser URLs.
  ACTIONS_RUNS_ENDPOINT: ${{ github.event.repository.html_url }}/actions/runs
  CURRENT_ACTION_URL: ${{ github.event.repository.html_url }}/actions/runs/${{ github.run_id }}

  # Irmin is the code base `ocamlmerlin` is exercised on. Both the Irmin
  # version and the merl-an version are fixed for reproducibility.
  IRMIN_VERSION: '3.9.0'
  # TODO: Release merl-an and install a released version instead of pinning a commit.
  MERL_AN_SHA: '6411f0d3847e8b7e66362bcb1f9345a5d3e851ca'

  # Compiler used on each of the two branches; it is also part of the key of
  # Irmin's build cache. Bump the matching entry whenever the compiler version
  # changes on one of the branches.
  merge_branch_COMPILER_VERSION: 'ocaml-base-compiler.4.14.1'
  base_branch_COMPILER_VERSION: 'ocaml-base-compiler.4.14.1'

jobs:
  # Runs once per matrix entry (the PR's merge commit and the PR's base commit):
  # builds and installs Merlin at that commit, runs `merl-an behavior` on a
  # pruned Irmin source tree and uploads the resulting data directory as an
  # artifact. The merge-branch entry additionally compiles and uploads the
  # diff tool used by the `diff` job.
  data:
    name: Generate data
    runs-on: ubuntu-22.04
    # `labeled` events are not listed here; `unlabeled` events only count when
    # the removed label is `fuzzy-diff-looks-good`.
    if: >
      github.event.action == 'opened' ||
      github.event.action == 'synchronize' ||
      github.event.action == 'reopened' ||
      (
        github.event.action == 'unlabeled' &&
        github.event.label.name == 'fuzzy-diff-looks-good'
      )
    env:
      data_dir: data
    strategy:
      matrix:
        commit: ["merge_branch", "base_branch"]
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Checking out ${{ matrix.commit }}
        env:
          base_branch_sha: ${{ github.event.pull_request.base.sha }}
          merge_branch_sha: ${{ github.sha }}
        run: |
          # After expression substitution this reads $merge_branch_sha or $base_branch_sha.
          sha=$${{ matrix.commit }}_sha
          echo "Check out $sha"
          git checkout $sha

      - name: Get desired compiler version
        id: compiler
        run: |
          # After expression substitution this reads the matching
          # <branch>_COMPILER_VERSION variable from the workflow-level env.
          v=$${{ matrix.commit }}_COMPILER_VERSION
          echo "version=$v" | tee -a $GITHUB_OUTPUT

      - name: Install OCaml
        uses: ocaml/setup-ocaml@v3
        with:
          ocaml-compiler: ${{ steps.compiler.outputs.version }}
          dune-cache: true

      - name: Install merlin dependencies
        run: |
          opam pin menhirLib 20201216 --no-action
          opam install . --deps-only --yes

      - name: Install merlin
        run: |
          # Running `subst` to have the current commit in the data produced by `merl-an`
          opam exec -- dune subst
          opam exec -- dune build -p merlin-lib,dot-merlin-reader,merlin
          opam exec -- dune install -p merlin-lib,dot-merlin-reader,merlin

      - name: Pull irmin and its deps from cache if possible
        uses: actions/cache@v4
        id: irmin-cache
        with:
          path: irmin/
          # The lock file lives under `.github/`; the path given to hashFiles is
          # relative to the workspace root, so the prefix is required for the
          # lock file's content to actually contribute to the key.
          key: os${{ runner.os }}+arch${{ runner.arch }}+${{ hashFiles('.github/fuzzy-ci-helpers/irmin.3.9.0.opam.locked') }}+${{ env.IRMIN_VERSION }}+${{ steps.compiler.outputs.version }}

      - name: Download Irmin tarball
        if: steps.irmin-cache.outputs.cache-hit != 'true'
        run: |
          wget https://github.com/mirage/irmin/releases/download/$IRMIN_VERSION/irmin-$IRMIN_VERSION.tbz

      - name: Create irmin dir
        if: steps.irmin-cache.outputs.cache-hit != 'true'
        run: mkdir -p irmin

      - name: Decompress Irmin tarball
        if: steps.irmin-cache.outputs.cache-hit != 'true'
        run: tar xvf irmin-$IRMIN_VERSION.tbz -C irmin --strip-components=1

      - name: Get Irmin's lock files
        if: steps.irmin-cache.outputs.cache-hit != 'true'
        run: |
          cp .github/fuzzy-ci-helpers/irmin.3.9.0.opam.locked irmin/irmin.opam.locked

      - name: Install opam monorepo
        if: steps.irmin-cache.outputs.cache-hit != 'true'
        run: opam install opam-monorepo --yes

      - name: Pull in Irmin's dependencies
        if: steps.irmin-cache.outputs.cache-hit != 'true'
        run: |
          # NOTE(review): this switches the surrounding Merlin checkout to the
          # merge commit, also for the base_branch matrix entry. Merlin is already
          # installed at this point; confirm whether the checkout is still needed.
          git checkout ${{ github.sha }}
          opam monorepo pull --lockfile=irmin.opam.locked --yes
        working-directory: irmin

      - name: Prune Irmin
        if: steps.irmin-cache.outputs.cache-hit != 'true'
        run: |
          # Keep only the sub-directories listed below; everything else is removed.
          rm -r examples/ bench/
          find test/ -mindepth 1 -maxdepth 1 -type d -not -name 'irmin-pack' -exec rm -r {} \;
          find src/ -mindepth 1 -maxdepth 1 -type d \
          -not -name 'irmin-pack' \
          -not -name 'irmin' \
          -not -name 'irmin-tezos' \
          -not -name ppx_irmin \
          -not -name irmin_test \
          -not -name irmin-test \
          -exec rm -r {} \;
        working-directory: irmin

      - name: Build Irmin
        run: |
          opam exec -- dune build @check
        working-directory: irmin

      - name: Pull merl-an from cache if possible
        uses: actions/cache@v4
        id: merl-an-cache
        with:
          path: /usr/local/bin/merl-an
          # `with:` inputs are not passed through a shell, so the pinned commit has
          # to be read via the `env` context; a literal `$MERL_AN_SHA` would make
          # the key constant and keep restoring a stale binary after a bump.
          key: os${{ runner.os }}+arch${{ runner.arch }}+merl-an-sha${{ env.MERL_AN_SHA }}

      - name: Install merl-an
        if: steps.merl-an-cache.outputs.cache-hit != 'true'
        run: opam pin -y merl-an https://github.com/pitag-ha/merl-an.git#$MERL_AN_SHA

      - name: Add merl-an to /usr/local/bin/
        if: steps.merl-an-cache.outputs.cache-hit != 'true'
        run: opam exec -- cp $GITHUB_WORKSPACE/_opam/bin/merl-an /usr/local/bin/merl-an

      - name: Create data set of Merlin responses
        run: |
          # Note: The parameters with most influence on the execution time are
          # `--sample-size`: Number of samples per file defined by `--project` (and per local query).
          # `--project`: List of dirs/files to create samples on. In the case of a dirs, all ml(i) files recursively in the dir are used.
          # `--queries`: The `ocamlmerlin` queries that are being run.
          opam exec -- merl-an behavior \
          --queries=type-enclosing,occurrences,locate,complete-prefix,errors \
          --sample-size=30 \
          --data=${{ env.data_dir }} \
          --merlin=ocamlmerlin \
          --project=irmin/src/irmin,irmin/src/irmin-pack,irmin/test/irmin-pack

      - name: Remove varying components from data
        run: |
          # TODO: This could be done on the `merl-an` side
          jq '.responses |= map(del(.heap_mbytes, .timings, .cache))' \
            ${{ env.data_dir }}/$FULL_DATA_FILE > temp.json && \
            mv temp.json ${{ env.data_dir }}/$FULL_DATA_FILE

      - name: Create name for data artifact
        id: artifact_name
        env:
          base_branch_artifact_name: ${{ env.BASE_BRANCH_ARTIFACT_NAME }}
          merge_branch_artifact_name: ${{ env.MERGE_BRANCH_ARTIFACT_NAME }}
        # After expression substitution this reads one of the two variables above.
        run: echo "name=$${{ matrix.commit }}_artifact_name" >> $GITHUB_OUTPUT

      - name: Upload data
        uses: actions/upload-artifact@v4
        with:
          name: ${{ steps.artifact_name.outputs.name }}
          path: ${{ env.data_dir }}

      - name: Compile diff tool
        if: ${{ matrix.commit == 'merge_branch' }}
        run: |
          # Taking advantage that ocamlopt is installed on this runner: compile the diff tool here and share it with the next job where it's needed.
          # All GH runners are hosted on x86 machines and all jobs in this workflow declare the same OS, so this should workTM.
          opam exec -- ocamlopt -o create_diff .github/fuzzy-ci-helpers/create_diff.ml

      - name: Upload diff tool
        if: ${{ matrix.commit == 'merge_branch' }}
        uses: actions/upload-artifact@v4
        with:
          name: diff_tool
          path: create_diff

  # Downloads the data sets of both branches plus the `create_diff` binary
  # uploaded by the `data` job, and writes the full and distilled diffs.
  diff:
    name: Generate diffs
    runs-on: ubuntu-22.04
    needs: data
    outputs:
      # The output key is spelled `diff_exits`; the `output` job reads it under
      # exactly that name.
      diff_exits: ${{ steps.full_responses_diff.outputs.diff_exists }}
    env:
      base_data_dir: base_data
      merge_data_dir: merge_data
      diff_dir: diff
    steps:
      - name: Download base branch data
        uses: actions/download-artifact@v4
        with:
          path: ${{ env.base_data_dir }}
          name: ${{ env.BASE_BRANCH_ARTIFACT_NAME }}

      - name: Download merge branch data
        uses: actions/download-artifact@v4
        with:
          path: ${{ env.merge_data_dir }}
          name: ${{ env.MERGE_BRANCH_ARTIFACT_NAME }}

      - name: Create diff dir
        run: mkdir -p "${diff_dir}"

      - name: Download diff tool
        uses: actions/download-artifact@v4
        with:
          name: diff_tool

      - name: Give diff tool execute permissions
        run: chmod +x create_diff

      - name: Generate full responses diff
        id: full_responses_diff
        run: |
          # jq program: for every index of the base data, emit a header line and
          # the entry for the base branch, a separator, the same for the merge
          # branch, and a terminating separator.
          pairing_program='def process_json($branch; $data):
            ($branch + ": " + $data.cmd + " (id=" + ($data.sample_id | tostring) + ")"), $data;
            range($data1|length) as $idx |
            process_json("base branch"; $data1[$idx]),
            "--input-separator--",
            process_json("merge branch"; $data2[$idx]),
            "--diff-cmd-separator--"'

          jq -r -n \
            --slurpfile data1 "$base_data_dir/$FULL_DATA_FILE" \
            --slurpfile data2 "$merge_data_dir/$FULL_DATA_FILE" \
            "$pairing_program" |
            ./create_diff "--input-separator--" "--diff-cmd-separator--" "$diff_dir/$FULL_DIFF_FILE"

          # A diff "exists" when the resulting file is present and non-empty.
          diff_exists=false
          if [ -s "$diff_dir/$FULL_DIFF_FILE" ]; then
            diff_exists=true
          fi
          echo "diff_exists=$diff_exists" | tee -a $GITHUB_OUTPUT

      - name: Generate distilled data diff
        # Without a full responses diff there won't be a distilled data diff either.
        if: steps.full_responses_diff.outputs.diff_exists == 'true'
        run: |
          # Same pairing program as in the previous step, applied to the distilled data.
          pairing_program='def process_json($branch; $data):
            ($branch + ": " + $data.cmd + " (id=" + ($data.sample_id | tostring) + ")"), $data;
            range($data1|length) as $idx |
            process_json("base branch"; $data1[$idx]),
            "--input-separator--",
            process_json("merge branch"; $data2[$idx]),
            "--diff-cmd-separator--"'

          jq -r -n \
            --slurpfile data1 "$base_data_dir/$DISTILLED_DATA_FILE" \
            --slurpfile data2 "$merge_data_dir/$DISTILLED_DATA_FILE" \
            "$pairing_program" |
            ./create_diff "--input-separator--" "--diff-cmd-separator--" "$diff_dir/$DISTILLED_DIFF_FILE"

      - name: Upload diff(s)
        uses: actions/upload-artifact@v4
        with:
          path: ${{ env.diff_dir }}
          name: ${{ env.DIFF_ARTIFACT_NAME }}

  # Decides whether the job passes: compares the sha256 of the current full
  # responses diff with the hash recorded in the latest approval comment (if the
  # approval label is set), forwards a label-deletion instruction when they
  # differ, and fails whenever an unapproved diff exists.
  output:
    name: Evaluate diffs
    runs-on: ubuntu-22.04
    needs: diff
    env:
      earlier_diff_was_approved: ${{ contains(github.event.pull_request.labels.*.name, 'fuzzy-diff-looks-good') }}
      # The `diff` job exposes this output under the key `diff_exits`.
      current_diff_exists: ${{ needs.diff.outputs.diff_exits }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Download current diff(s)
        if: ${{ env.current_diff_exists == 'true' }}
        uses: actions/download-artifact@v4
        with:
          name: ${{ env.DIFF_ARTIFACT_NAME }}

      - name: Retreive hash of approved diff
        if: ${{ env.earlier_diff_was_approved == 'true' }}
        id: approved_diff
        env:
          # Passed through the environment instead of being spliced into the script text.
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          # Approval comments are recognised by the first 50 bytes of msg.txt.
          msg_start=$(head -c 50 .github/fuzzy-ci-helpers/msg.txt)

          next_page_endpoint="$GH_API_COMMENTS?per_page=100&page=1"
          latest_comment="{}"

          # Walk all comment pages, carrying the newest matching comment found so
          # far into the next iteration so that it is not lost when a later page
          # contains no approval comment.
          while [ -n "$next_page_endpoint" ]; do
            latest_comment=$(
              curl -s -D "headers.txt" -H "Authorization: Bearer $GITHUB_TOKEN" "$next_page_endpoint" |
              jq --arg msg_start "$msg_start" --argjson latest "$latest_comment" '
                map(
                  select(
                    (.body | startswith($msg_start)) and .user.login == "github-actions[bot]"
                  )
                ) + [$latest] | max_by(.created_at)'
            )

            # Extract the rel="next" URL from the Link response header; the result
            # is empty on the last page, which ends the loop.
            next_page_endpoint=$(
              grep -i '^link:' headers.txt |
              tr ',' '\n' |
              grep 'rel="next"' |
              cut -d'<' -f2 |
              cut -d'>' -f1
            ) || true
          done

          hash=$(echo "$latest_comment" | jq '.body' -r | grep '256-sha' | awk '{print $NF}')
          echo "hash='$hash'" | tee -a $GITHUB_OUTPUT

      - name: Analyze current diff
        id: current_diff
        run: |
          hash=$(sha256sum "$FULL_DIFF_FILE" | awk '{print $1}')
          # Same quoting as the approved hash above, so that the two outputs compare equal.
          echo "hash='$hash'" | tee -a $GITHUB_OUTPUT

      - name: Write instruction to delete PR label
        # When this workflow is triggered by a PR from a fork, it doesn't have
        # the permissions to delete PR labels. Instead, we forward the
        # instruction to delete the label to fuzzy-ci-privileged.yml.
        if: ${{ env.earlier_diff_was_approved == 'true' && steps.approved_diff.outputs.hash != steps.current_diff.outputs.hash }}
        run: |
          echo ${{ steps.approved_diff.outputs.hash }}
          echo ${{ steps.current_diff.outputs.hash }}
          mkdir -p ./forward
          jq -n \
            --arg instruction "delete_label" \
            --arg endpoint "$GH_API_LABELS" \
            '{instruction: $instruction, endpoint: $endpoint}' > ./forward/instruction.json

      - name: Upload instruction to delete label
        if: ${{ env.earlier_diff_was_approved == 'true' &&  steps.approved_diff.outputs.hash != steps.current_diff.outputs.hash }}
        uses: actions/upload-artifact@v4
        with:
          name: forwarded_instructions
          path: forward/

      - name: Return
        id: return
        run: |
          print_head_of_diffs () {
            echo "--------beginning of full responses diff head--------"
            head -n 100 "$FULL_DIFF_FILE"
            echo "--------end of full responses diff head--------"
            echo "--------beginning of distilled data diff head--------"
            head -n 100 "$DISTILLED_DIFF_FILE"
            echo "--------end of distilled data diff head--------"
          }

          # FIXME (?): Are nested conditionals always so ugly in Bash, or is there a better way? Option types and the possibility to match would help a lot.
          LABEL_NAME=$(cat .github/fuzzy-ci-helpers/label_name.txt)
          # Explicit string comparisons: an empty value must not be treated as "true".
          if [ "$earlier_diff_was_approved" = "true" ]; then
            echo "Earlier diff was approved."
            if [ ${{ steps.current_diff.outputs.hash }} == ${{ steps.approved_diff.outputs.hash }} ]; then
              echo "This diff has been approved earlier. Everything ok."
              exit 0
            else
              print_head_of_diffs
              printf "The diff has changed since it was approved. So I'm removing the $LABEL_NAME label. If the new diff looks good, please set the label again.\n\
              There's a head of the new diffs printed above. The whole diffs can be downloaded from $CURRENT_ACTION_URL .\n\
              Previous sha256: ${{ steps.approved_diff.outputs.hash }}\n\
              Current sha256: ${{ steps.current_diff.outputs.hash }}"
              echo "delete_label=true" >> $GITHUB_OUTPUT
              exit 1
            fi
          else
            if [ "$current_diff_exists" = "true" ]; then
              print_head_of_diffs
              printf "There's a head of the diffs printed above. The diffs can be downloaded from $CURRENT_ACTION_URL .\nIf it looks good, please set the $LABEL_NAME label on the PR."
              exit 1
            else
              echo "No diff. All good."
              exit 0
            fi
          fi

  # Runs when the `fuzzy-diff-looks-good` label is set on the PR: looks up the
  # diff artifact for the PR's current base/head commits, and either forwards a
  # "comment" instruction carrying the diff's sha256, or (if no such artifact
  # exists yet) forwards a "delete_label" instruction and fails.
  approve:
    name: Approve diff
    if: >
      github.event_name == 'pull_request' &&
      github.event.action == 'labeled' &&
      github.event.label.name == 'fuzzy-diff-looks-good'
    runs-on: ubuntu-22.04
    steps:
      - name: Retreive diff artifact meta-data
        id: diff_metadata
        env:
          # Passed through the environment instead of being spliced into the script text.
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          # Filter by name on the server side: the unfiltered listing is paginated,
          # so the wanted artifact may not be on the first page.
          all_artifacts=$(curl -sSL -H "Authorization: Bearer $GITHUB_TOKEN" "$GH_API_ARTIFACTS?name=$DIFF_ARTIFACT_NAME&per_page=100")
          diff_artifact=$(echo "$all_artifacts" | jq --arg name "$DIFF_ARTIFACT_NAME" 'first(.artifacts[] | select(.name == $name))')
          id=$(echo "$diff_artifact" | jq ".id")
          echo "id=$id" | tee -a $GITHUB_OUTPUT
          workflow_run=$(echo "$diff_artifact" | jq ".workflow_run | .id")
          echo "workflow_run=$workflow_run" | tee -a $GITHUB_OUTPUT
          # No matching artifact leaves $id empty; also treat a JSON null as missing.
          if [ -z "$id" ] || [ "$id" = "null" ]; then
            echo "exists=false" | tee -a $GITHUB_OUTPUT
          else
            echo "exists=true" | tee -a $GITHUB_OUTPUT
          fi

      - name: Write instruction to delete PR label
        # When this workflow is triggered by a PR from a fork, it doesn't have
        # the permissions to delete PR labels. Instead, we forward the
        # instruction to delete the label to fuzzy-ci-privileged.yml.
        if: ${{ steps.diff_metadata.outputs.exists == 'false' }}
        run: |
          mkdir -p ./forward
          jq -n \
            --arg instruction "delete_label" \
            --arg endpoint "$GH_API_LABELS" \
            '{instruction: $instruction, endpoint: $endpoint}' > ./forward/instruction.json

      - name: Upload instruction to delete label
        if: ${{ steps.diff_metadata.outputs.exists == 'false' }}
        uses: actions/upload-artifact@v4
        with:
          name: forwarded_instructions
          path: forward/

      - name: Fail due to diff not existing yet
        if: ${{ steps.diff_metadata.outputs.exists == 'false' }}
        run: |
          printf "You seem to have tried to approve a diff that doesn't exist yet.\nWait for the diff to have been generated and then try again."
          exit 1

      - name: Download diff
        env:
          id: ${{ steps.diff_metadata.outputs.id }}
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          # Doing this manually, since actions/download-artifact only works on the same workflow run on which the artifact was uploaded
          # `-O` names the output after the last URL segment, i.e. the file is saved as `zip`.
          curl -sSLO -H "Authorization: Bearer $GITHUB_TOKEN" "$GH_API_ARTIFACTS/$id/zip" -D headers.txt

      - name: Unzip downloaded diff
        run: |
          unzip zip || (echo "Download of diff artifact failed" && cat headers.txt && cat zip && exit 1)

      - name: Compute full responses diff hash
        id: diff_hash
        run: |
          hash=$(sha256sum "$FULL_DIFF_FILE" | awk '{print $1}')
          echo "hash=$hash" | tee -a $GITHUB_OUTPUT

      - name: Write instruction to comment on PR
        # When this workflow is triggered by a PR from a fork, it doesn't have
        # the permissions to comment on PRs. Instead, we forward the
        # instruction to comment on the PR to fuzzy-ci-privileged.yml.
        env:
          approved_diffs_workflow_run: ${{ steps.diff_metadata.outputs.workflow_run }}
          approved_diffs_hash: ${{ steps.diff_hash.outputs.hash }}
        run: |
          mkdir -p ./forward
          jq -n \
            --arg instruction "comment" \
            --arg endpoint "$GH_API_COMMENTS" \
            --arg artifacts_url "$ACTIONS_RUNS_ENDPOINT/$approved_diffs_workflow_run" \
            --arg hash "$approved_diffs_hash" \
            '{instruction: $instruction, endpoint: $endpoint, artifacts_url: $artifacts_url, hash: $hash}' > ./forward/instruction.json

      - name: Upload instruction to comment on PR
        uses: actions/upload-artifact@v4
        with:
          name: forwarded_instructions
          path: forward/