From 3a6173ba460c12237d6192af3cf2456f70b0b47b Mon Sep 17 00:00:00 2001 From: DevIos01 Date: Tue, 19 Dec 2023 17:19:55 +0100 Subject: [PATCH] Implemented Better Logs For Plagiarism Checker. --- .github/scripts/extract_percentages.py | 40 +++++++++++++++++++------- .github/scripts/plagiarism_check.py | 36 ++++++++++++++++------- .github/workflows/check_plagiarism.yml | 27 ++++------------- 3 files changed, 61 insertions(+), 42 deletions(-) diff --git a/.github/scripts/extract_percentages.py b/.github/scripts/extract_percentages.py index 28e1888041..d62c7e3ea7 100644 --- a/.github/scripts/extract_percentages.py +++ b/.github/scripts/extract_percentages.py @@ -1,25 +1,36 @@ from bs4 import BeautifulSoup import os import sys +import time + +def log(message): + timestamp = time.strftime("%Y-%m-%d %H:%M:%S") + print(f"[{timestamp}] {message}") def extract_similarity_percentage(html_file): - with open(html_file, 'r', encoding='utf-8') as file: - soup = BeautifulSoup(file, 'html.parser') - file_name_tag = soup.select_one("#textright > div > h4") - if file_name_tag: - percentage_text = file_name_tag.find("span", class_="text-secondary small").text.strip("()%") - return int(percentage_text) - else: - return None + try: + with open(html_file, 'r', encoding='utf-8') as file: + soup = BeautifulSoup(file, 'html.parser') + file_name_tag = soup.select_one("#textright > div > h4") + if file_name_tag: + percentage_text = file_name_tag.find("span", class_="text-secondary small").text.strip("()%") + return int(percentage_text) + else: + return None + except Exception as e: + log(f"Error processing file {html_file}: {e}") + return None def process_html_files(directory, threshold=10): results = {} + log("Processing HTML files for plagiarism results...") for filename in os.listdir(directory): if filename.endswith(".html"): file_path = os.path.join(directory, filename) percentage = extract_similarity_percentage(file_path) if percentage is not None: results[filename.replace('.html', '.js')] = percentage + log(f"Extracted {percentage}% similarity from {filename}") filtered_sorted_results = sorted( ((file, percent) for file, percent in results.items() if percent >= threshold), @@ -27,17 +38,26 @@ def process_html_files(directory, threshold=10): ) with open('plagiarism_results.txt', 'w') as output_file: - output_file.write("\nFiltered and Sorted Results (Above 10%):\n") + log("Writing results to plagiarism_results.txt") + output_file.write("Filtered and Sorted Results (Above Threshold):\n") for file, percent in filtered_sorted_results: - output_file.write(f"{file}: {percent}%\n") + line = f"{file}: {percent}%\n" + output_file.write(line) + log(line.strip()) + if not filtered_sorted_results: + output_file.write("No results exceeding threshold.\n") + log("No results exceeding threshold.") def main(): if len(sys.argv) != 2: + log("Incorrect number of arguments provided.") print("Usage: python extract_percentages.py ") sys.exit(1) saved_dir_path = sys.argv[1] + log(f"Received saved directory path: {saved_dir_path}") process_html_files(saved_dir_path) + log("Extraction of plagiarism percentages completed.") if __name__ == "__main__": main() \ No newline at end of file diff --git a/.github/scripts/plagiarism_check.py b/.github/scripts/plagiarism_check.py index bdf38ffdae..4946bc127e 100644 --- a/.github/scripts/plagiarism_check.py +++ b/.github/scripts/plagiarism_check.py @@ -3,11 +3,17 @@ import os import glob import shutil +import time + +def log(message): + timestamp = time.strftime("%Y-%m-%d %H:%M:%S") + print(f"[{timestamp}] {message}") def run_compare50(single_file, directory, output_dir, saved_dir_base): try: if not os.path.exists(saved_dir_base): os.makedirs(saved_dir_base) + log("Created base directory for saved files.") all_js_files = glob.glob(os.path.join(directory, "*.js")) total_files = len(all_js_files) @@ -16,11 +22,13 @@ def run_compare50(single_file, directory, output_dir, saved_dir_base): for file in all_js_files: current_file_number += 1 if os.path.abspath(file) == os.path.abspath(single_file): + log(f"Skipping comparison for the same file: {file}") continue - print(f"Processing file {current_file_number} of {total_files}: {file}") + log(f"Processing file {current_file_number} of {total_files}: {file}") if os.path.exists(output_dir): shutil.rmtree(output_dir) + log(f"Cleaned existing output directory: {output_dir}") command = [ "compare50", @@ -32,23 +40,28 @@ def run_compare50(single_file, directory, output_dir, saved_dir_base): ] command_str = ' '.join(command) + log(f"Running command: {command_str}") subprocess.run(command_str, shell=True, check=True) + log("Compare50 command executed successfully.") match_file = os.path.join(output_dir, "match_1.html") if os.path.exists(match_file): new_filename = os.path.basename(file).replace('.js', '.html') saved_file_path = os.path.join(saved_dir_base, new_filename) - print(f"Moving {match_file} to {saved_file_path}") + log(f"Match found. Moving {match_file} to {saved_file_path}") shutil.move(match_file, saved_file_path) + else: + log(f"No match found for file: {file}") except subprocess.CalledProcessError as e: - print("Error in running Compare50:", e) + log(f"Error in running Compare50: {e}") except Exception as e: - print(f"An error occurred: {e}") + log(f"An error occurred: {e}") def main(): if len(sys.argv) != 5: + log("Incorrect number of arguments provided.") print("Usage: python plagiarism_check.py ") sys.exit(1) @@ -57,17 +70,18 @@ def main(): output_dir = sys.argv[3] saved_dir_base = sys.argv[4] - print(f"Received arguments:") - print(f"Single file: {single_file}") - print(f"Directory: {directory}") - print(f"Output directory: {output_dir}") - print(f"Saved directory base: {saved_dir_base}") + log(f"Starting plagiarism check with the following arguments:") + log(f"Single file: {single_file}") + log(f"Directory: {directory}") + log(f"Output directory: {output_dir}") + log(f"Saved directory base: {saved_dir_base}") - print(f"All files in directory '{directory}':") + log(f"Listing all JavaScript files in directory '{directory}':") for f in glob.glob(os.path.join(directory, "*.js")): - print(f) + log(f) run_compare50(single_file, directory, output_dir, saved_dir_base) + log("Plagiarism check completed.") if __name__ == "__main__": main() \ No newline at end of file diff --git a/.github/workflows/check_plagiarism.yml b/.github/workflows/check_plagiarism.yml index f268936e9a..7d8f52c43f 100644 --- a/.github/workflows/check_plagiarism.yml +++ b/.github/workflows/check_plagiarism.yml @@ -30,27 +30,18 @@ jobs: head_sha="${{ github.event.pull_request.head.sha }}" js_files=$(git diff --name-only --diff-filter=AM $base_sha..$head_sha | grep 'games/.*\.js$' | xargs) echo "FILES=$js_files" >> $GITHUB_ENV + - name: Run Plagiarism Detection Script - run: python .github/scripts/plagiarism_check.py ${{ env.FILES }} games output_dir saved_dir - name: Extract and Display Similarity Percentages run: python .github/scripts/extract_percentages.py saved_dir/ - - name: Post Plagiarism Results Comment - if: success() - uses: actions/github-script@v7 + - name: Upload Compare50 Results as Artifacts + uses: actions/upload-artifact@v3 with: - github-token: ${{ secrets.PR_COMMENT_PRIVATE_KEY }} - script: | - const fs = require('fs'); - const output = fs.readFileSync('plagiarism_results.txt', 'utf8'); - github.rest.issues.createComment({ - issue_number: context.issue.number, - owner: context.repo.owner, - repo: context.repo.repo, - body: output - }); + name: compare50-results + path: saved_dir/ - name: Check for High Plagiarism Percentages if: success() @@ -58,10 +49,4 @@ jobs: if grep -qE "(\d{2,3})%" plagiarism_results.txt; then echo "Plagiarism percentage over threshold detected." exit 1 - fi - - - name: Upload Compare50 Results as Artifacts - uses: actions/upload-artifact@v3 - with: - name: compare50-results - path: saved_dir/ \ No newline at end of file + fi \ No newline at end of file