From ed096cf8f32069036ea534fbcf8b4a757678db10 Mon Sep 17 00:00:00 2001
From: Shubham Panth <GamingCraft52@gmail.com>
Date: Sun, 17 Dec 2023 18:22:18 +0100
Subject: [PATCH] Implemented Automate Plagiarism / Copy Detection (#1285)

* Implemented Automate Plagiarism / Copy Detection #1177

* Refine PR Workflow Trigger

As suggested by @SamDev-7 in issue #1177 , the filter has been implemented to trigger the PR only when changes occur within the 'games' folder.

* Refine PR Workflow Trigger

As suggested by @SamDev-7 in issue #1177 , the filter has been implemented to trigger the PR only when changes occur within the 'games' folder.

* Auto format + Added Support To Prettier

As @grymmy commented on #1177, added Prettier support and formatted the document.

* Implemented fast-diff on PlagarismChecker

As @grymmy discussed in #1177, we now use the fast-diff package to check for plagiarism, and the required packages have been added.

* install dependencies on workflow

* Fix PreprocessCode

* Few fix and try catch

* adding logs to understand the problem

* added async and await

* more async

* Improved PlagiarismChecker

Improved run time by filtering only the original code; the old version was also trimming the gallery code.

Improved CalculateSimilarity to ignore whitespace and now also show context on what is being flagged (can be used both by sprig reviewer and me for debug)

improved the output file for more context

* Adding Logs For Debugging

* Added Context

Added a log that shows both % + context meaning/word so we can see which one is being flagged wrong.

* Testing Lowering Threshold

* few improve

* more filter

* more filter?

* Removed fast-diff | Implement line by line check

* Implement line by line system

* Remove fast-diff , fix string error

* debug

* preprocessCode fixed!

* moving to 0% for debug purpose

* add fast-levenshtein for complex plagiarism

* Implement token based plagiarism

* Implementing Compare50

* checking path dir

* path issue

* compare50 auto-creates, making this crash! Fixed!

* adding more filter

* Update plagiarism_check.py

* Update plagiarism_check.py

* Update plagiarism_check.py

* Update plagiarism_check.py

* Update plagiarism_check.py

* Update plagiarism_check.py

* test

* Update plagiarism_check.py

* Update plagiarism_check.py

* Update plagiarism_check.py

* Update plagiarism_check.py

* Update plagiarism_check.py

* Update plagiarism_check.py

* fix

* test

* Update plagiarism_check.py

* Update plagiarism_check.py

* Update plagiarism_check.py

* removing dir before running new one

* add passes to only get information needed

* Extracting useful data from compare50

* added log as there is no output

* extract data from html

* Update check_plagiarism.yml

* filter plagiarism highest to lowest

* post result after getting result

* name syntax error

* show only filtered result on comment

* Implemented Automate Plagiarism / Copy Detection #1177

* Cleaning PR

* Remove Prettier

* Remove Prettier from yarn.lock
---
 .github/scripts/extract_percentages.py | 43 +++++++++++++++++
 .github/scripts/plagiarism_check.py    | 62 ++++++++++++++++++++++++
 .github/workflows/check_plagiarism.yml | 66 ++++++++++++++++++++++++++
 3 files changed, 171 insertions(+)
 create mode 100644 .github/scripts/extract_percentages.py
 create mode 100644 .github/scripts/plagiarism_check.py
 create mode 100644 .github/workflows/check_plagiarism.yml

diff --git a/.github/scripts/extract_percentages.py b/.github/scripts/extract_percentages.py
new file mode 100644
index 0000000000..28e1888041
--- /dev/null
+++ b/.github/scripts/extract_percentages.py
@@ -0,0 +1,75 @@
+"""Summarise compare50 match reports as a sorted list of similarity percentages."""
+from bs4 import BeautifulSoup
+import os
+import sys
+
+
+def extract_similarity_percentage(html_file):
+    """Return the similarity percentage shown in a compare50 match page.
+
+    Args:
+        html_file: Path to an HTML match report.
+
+    Returns:
+        The percentage as an int, or ``None`` when the expected heading or
+        percentage badge is missing or does not contain a number.
+    """
+    with open(html_file, 'r', encoding='utf-8') as file:
+        soup = BeautifulSoup(file, 'html.parser')
+
+    file_name_tag = soup.select_one("#textright > div > h4")
+    if file_name_tag is None:
+        return None
+
+    # The heading can exist without the badge; return None instead of
+    # raising AttributeError on a missing element.
+    percentage_tag = file_name_tag.find("span", class_="text-secondary small")
+    if percentage_tag is None:
+        return None
+
+    try:
+        return int(percentage_tag.text.strip().strip("()%"))
+    except ValueError:
+        return None
+
+
+def process_html_files(directory, threshold=10):
+    """Write matches at or above ``threshold`` to ``plagiarism_results.txt``.
+
+    Args:
+        directory: Folder holding the ``.html`` match reports to summarise.
+        threshold: Minimum similarity percentage (inclusive) to report.
+    """
+    results = {}
+    for filename in os.listdir(directory):
+        stem, extension = os.path.splitext(filename)
+        if extension != ".html":
+            continue
+        percentage = extract_similarity_percentage(os.path.join(directory, filename))
+        if percentage is not None:
+            # Swap only the extension so names containing ".html" elsewhere survive.
+            results[stem + '.js'] = percentage
+
+    # Highest similarity first; ties broken by name so the output is deterministic.
+    filtered_sorted_results = sorted(
+        ((file, percent) for file, percent in results.items() if percent >= threshold),
+        key=lambda item: (-item[1], item[0])
+    )
+
+    with open('plagiarism_results.txt', 'w', encoding='utf-8') as output_file:
+        output_file.write(f"\nFiltered and Sorted Results (Above {threshold}%):\n")
+        for file, percent in filtered_sorted_results:
+            output_file.write(f"{file}: {percent}%\n")
+
+
+def main():
+    """Read the report directory from the command line and summarise it."""
+    if len(sys.argv) != 2:
+        print("Usage: python extract_percentages.py <saved_dir_path>")
+        sys.exit(1)
+
+    process_html_files(sys.argv[1])
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/.github/scripts/plagiarism_check.py b/.github/scripts/plagiarism_check.py
new file mode 100644
index 0000000000..d1917aceb1
--- /dev/null
+++ b/.github/scripts/plagiarism_check.py
@@ -0,0 +1,80 @@
+"""Compare one submitted game against every other game with compare50."""
+import sys
+import subprocess
+import os
+import glob
+import shutil
+
+
+def run_compare50(single_file, directory, output_dir, saved_dir_base):
+    """Run compare50 for ``single_file`` against each ``*.js`` file in ``directory``.
+
+    Each pairing is compared in isolation; when the run leaves a
+    ``match_1.html`` report in ``output_dir`` it is moved to ``saved_dir_base``
+    and named after the file that was compared against.
+
+    Args:
+        single_file: Path of the submission under review.
+        directory: Folder whose top-level ``*.js`` files are the comparison set.
+        output_dir: Scratch folder handed to compare50; removed before each run.
+        saved_dir_base: Folder that collects the per-file match reports.
+
+    Raises:
+        OSError: If ``saved_dir_base`` cannot be created.
+    """
+    os.makedirs(saved_dir_base, exist_ok=True)
+
+    all_js_files = sorted(glob.glob(os.path.join(directory, "*.js")))
+    total_files = len(all_js_files)
+    submission_path = os.path.abspath(single_file)
+
+    for current_file_number, file in enumerate(all_js_files, start=1):
+        if os.path.abspath(file) == submission_path:
+            continue
+
+        print(f"Processing file {current_file_number} of {total_files}: {file}")
+
+        command = [
+            "compare50",
+            single_file,
+            file,
+            "--output", output_dir,
+            "--max-file-size", str(1024 * 1024 * 100),
+            "--passes", "text"
+        ]
+
+        # Handle failures per pairing: one bad comparison must not silently
+        # skip every remaining file.
+        try:
+            # Clear the previous run's output so a stale match_1.html is not reused.
+            if os.path.exists(output_dir):
+                shutil.rmtree(output_dir)
+
+            subprocess.run(command, check=True)
+
+            match_file = os.path.join(output_dir, "match_1.html")
+            if os.path.exists(match_file):
+                # Swap only the extension; str.replace would also rewrite a
+                # ".js" occurring earlier in the name.
+                new_filename = os.path.splitext(os.path.basename(file))[0] + '.html'
+                saved_file_path = os.path.join(saved_dir_base, new_filename)
+                print(f"Moving {match_file} to {saved_file_path}")
+                shutil.move(match_file, saved_file_path)
+        except subprocess.CalledProcessError as e:
+            print("Error in running Compare50:", e, file=sys.stderr)
+        except OSError as e:
+            print(f"An error occurred: {e}", file=sys.stderr)
+
+
+def main():
+    """Read the four positional arguments and start the comparison."""
+    if len(sys.argv) != 5:
+        print("Usage: python plagiarism_check.py <single_file> <directory> <output_dir> <saved_dir_base>")
+        sys.exit(1)
+
+    single_file, directory, output_dir, saved_dir_base = sys.argv[1:5]
+    run_compare50(single_file, directory, output_dir, saved_dir_base)
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/.github/workflows/check_plagiarism.yml b/.github/workflows/check_plagiarism.yml
new file mode 100644
index 0000000000..1da9ed82c3
--- /dev/null
+++ b/.github/workflows/check_plagiarism.yml
@@ -0,0 +1,79 @@
+name: Plagiarism Checker
+
+on:
+  pull_request:
+    paths:
+      - "games/**/*.js"
+
+jobs:
+  plagiarism-check:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          # Full history so the PR head can be diffed against its merge base.
+          fetch-depth: 0
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+      - name: Install Compare50 && beautifulsoup4
+        run: pip install compare50 beautifulsoup4
+
+      - name: Get list of changed files
+        id: changed-files
+        run: |
+          base_sha="${{ github.event.pull_request.base.sha }}"
+          head_sha="${{ github.event.pull_request.head.sha }}"
+          # Three-dot diff compares against the merge base, so games merged into
+          # the base branch after the PR branched off are not mistaken for
+          # part of the submission.
+          js_files=$(git diff --name-only --diff-filter=AM "$base_sha...$head_sha" | grep 'games/.*\.js$' | xargs)
+          echo "FILES=$js_files" >> "$GITHUB_ENV"
+
+      - name: Run Plagiarism Detection Script
+        # Skip when the PR only deletes or renames games (FILES is empty).
+        if: env.FILES != ''
+        # FILES is read from the shell environment instead of being templated
+        # into the command, so a crafted file name cannot inject shell syntax.
+        run: python .github/scripts/plagiarism_check.py $FILES games output_dir saved_dir
+
+      - name: Extract and Display Similarity Percentages
+        if: env.FILES != ''
+        run: python .github/scripts/extract_percentages.py saved_dir/
+
+      - name: Post Plagiarism Results Comment
+        if: success() && env.FILES != ''
+        uses: actions/github-script@v7
+        with:
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          script: |
+            const fs = require('fs');
+            const output = fs.readFileSync('plagiarism_results.txt', 'utf8');
+            await github.rest.issues.createComment({
+              issue_number: context.issue.number,
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              body: output
+            });
+
+      - name: Check for High Plagiarism Percentages
+        if: success() && env.FILES != ''
+        run: |
+          # Result lines look like "<file>: <NN>%". POSIX ERE has no \d, and the
+          # end anchor keeps the "(Above 10%):" header line from matching.
+          if grep -qE ': [0-9]{2,3}%$' plagiarism_results.txt; then
+            echo "Plagiarism percentage over threshold detected."
+            exit 1
+          fi
+
+      - name: Upload Compare50 Results as Artifacts
+        # Keep the reports even when the check above fails the job.
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: compare50-results
+          path: saved_dir/