diff --git a/.vscode/launch.json b/.vscode/launch.json index 91ae1bf..7fc9df2 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -7,7 +7,7 @@ "request": "attach", "connect": { "host": "localhost", - "port": "5678" + "port": 5678 }, "pathMappings": [ { diff --git a/Dockerfile.development b/Dockerfile.development index ea2fbf6..32a6153 100644 --- a/Dockerfile.development +++ b/Dockerfile.development @@ -12,24 +12,31 @@ ENV FLASK_APP=/dMLPA-flask/app.py ENV PYTHONPATH=/dMLPA-flask/dMLPA-flask ENV SESSION_FILE_DIR /var/local/dMLPA-flask/flask_sessions ENV SCRIPT_NAME=dMLPA-flask -ENV UPLOAD_FOLDER /var/local/dMLPA-flask/uploads -ENV FLASK_DEBUG=1 +ENV UPLOAD_FOLDER_1 /var/local/dMLPA-flask/probe_file +ENV UPLOAD_FOLDER_2 /var/local/dMLPA-flask/inputs +ENV OUTPUT_FOLDER /var/local/dMLPA-flask/outputs +ENV FLASK_DEBUG=1 # only for development # add and install requirements COPY ./requirements.txt . RUN pip3 install -r requirements.txt -USER 0 +USER 0 RUN mkdir -p /var/local/dMLPA-flask/logs/ -RUN mkdir -p /var/local/dMLPA-flask/uploads/ +RUN mkdir -p /var/local/dMLPA-flask/probe_file +RUN mkdir -p /var/local/dMLPA-flask/inputs +RUN mkdir -p /var/local/dMLPA-flask/outputs RUN mkdir -p /var/local/dMLPA-flask/flask_sessions/ RUN chmod 777 /var/local/dMLPA-flask/logs/ -RUN chmod 777 /var/local/dMLPA-flask/uploads/ +RUN chmod 777 /var/local/dMLPA-flask/probe_file +RUN chmod 777 /var/local/dMLPA-flask/inputs +RUN chmod 777 /var/local/dMLPA-flask/outputs RUN chmod 777 /var/local/dMLPA-flask/flask_sessions/ USER $CONTAINER_USER_ID RUN pip install debugpy +RUN apt-get update && apt-get install -y wkhtmltopdf # add app COPY . . 
diff --git a/dMLPA-flask/app.py b/dMLPA-flask/app.py index 4c3bce0..ac1ada9 100644 --- a/dMLPA-flask/app.py +++ b/dMLPA-flask/app.py @@ -28,7 +28,8 @@ from werkzeug.utils import secure_filename import zipfile - +import re +import config import logging log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" @@ -39,15 +40,18 @@ app = Flask(__name__) -app.config["UPLOAD_FOLDER"] = os.environ["UPLOAD_FOLDER"] +app.config["UPLOAD_FOLDER_1"] = os.environ["UPLOAD_FOLDER_1"] +app.config["UPLOAD_FOLDER_2"] = os.environ["UPLOAD_FOLDER_2"] +app.config["OUTPUT_FOLDER"] = os.environ["OUTPUT_FOLDER"] app.config["SECRET_KEY"] = "catchmeifyoucan" app.config["SESSION_TYPE"] = "filesystem" app.config["SESSION_PERMANENT"] = True app.config["SESSION_FILE_DIR"] = os.environ["SESSION_FILE_DIR"] app.config["UPLOAD_EXTENSIONS"] = [ ".xlsx", - ".bam", -] # TODO: If multiple file types, add to list i.e. [".txt", ".csv", ".xlsm", ".xlsx"] + ".xlsm", + ".xls" +] app.config["MAX_CONTENT_LENGTH"] = 2 * 1024 * 1024 # File has to be less than 2MB app.config["APPLICATION_ROOT"] = "/dMLPA-flask" app.config["WTF_CSRF_ENABLED"] = True @@ -67,18 +71,28 @@ def validate_input_file(form, field): "xlsx", "xlsm", "xls", - ] # TODO: If multiple file types, add to list i.e. ["xlsm", "xlsx"] + ] file = field.data if not file: - raise ValidationError("No file provided.") + raise ValidationError("No probe file provided.") if not file.filename: - raise ValidationError("No file selected.") + raise ValidationError("No probe file selected.") if not ( "." in file.filename and file.filename.rsplit(".", 1)[1].lower() in allowed_extensions ): raise ValidationError( - f"Invalid file type for input file '{file.filename}'. Allowed types are: xlsx" # TODO: Ammend file types as appropriate i.e. "xlsm, xlsx" + f"Invalid file type for probe file '{file.filename}'. 
" + "Allowed types are: xlsx, xlsm, xls" + ) + file_check = re.search(config.PROBEFILE_NAME_CONVENTION, file.filename) + if not file_check: + raise ValidationError( + f"File {file.filename} is not a valid probe file. " + "File name does not match expected naming convention for a probe " + "file which may result in processing errors. Please provide an " + "appropriate probe file which matches the expected naming " + "convention." ) @@ -87,19 +101,20 @@ def validate_input_files_array(form, field): "xlsx", "xlsm", "xls", - ] # TODO: Ammend file types as appropriate i.e. ["xlsm", "xlsx"] + ] files = field.data if not files: - raise ValidationError("No files provided.") + raise ValidationError("No results files provided.") for file in files: if not file.filename: - raise ValidationError("No files selected.") + raise ValidationError("No results files selected.") if not ( "." in file.filename and file.filename.rsplit(".", 1)[1].lower() in allowed_extensions ): raise ValidationError( - f"Invalid file type for input files '{file.filename}'. Allowed types are: bam" # TODO: Ammend file types as appropriate i.e. "xlsm, xlsx" + f"Invalid file type for results files '{file.filename}'." + "Allowed types are: xlsx, xlsm, xls" ) @@ -110,7 +125,7 @@ class ChangeForm(FlaskForm): render_kw={ "accept": ".xlsm, .xlsx, .xls", "id": "single_file", - }, # TODO: If multiple file types, add to list i.e. ".xlsm, .xlsx" + }, ) multiple_array_of_files = MultipleFileField( "Single/Multiple Patient Data Files (Excel):", @@ -119,7 +134,7 @@ class ChangeForm(FlaskForm): "accept": ".xlsm, .xlsx, .xls", "multiple": "True", "id": "multiple_array_of_files", - }, # TODO: If multiple file types, add to list i.e. 
".xlsm, .xlsx" + }, ) submit = SubmitField("Run dMLPA-flask") @@ -135,13 +150,15 @@ def form(backend_state="initial"): return render_template( "index.html", form=chgForm, - file_errors=chgForm.errors, + validation_errors=chgForm.errors, ) # Use FileHandler() to log to a file session["timestr"] = datetime.now().strftime("%Y%m%d-%H%M%S") - os.mkdir(os.path.join(app.config["UPLOAD_FOLDER"], session["timestr"])) + os.mkdir(os.path.join(app.config["UPLOAD_FOLDER_1"], session["timestr"])) + os.mkdir(os.path.join(app.config["UPLOAD_FOLDER_2"], session["timestr"])) + os.mkdir(os.path.join(app.config["OUTPUT_FOLDER"], session["timestr"])) file_handler = logging.FileHandler( f"/var/local/dMLPA-flask/logs/dMLPA-flask_error.log" ) @@ -170,48 +187,61 @@ def form(backend_state="initial"): # Create the paths to the uploaded files input_single_file_tmp_path = os.path.join( - app.config["UPLOAD_FOLDER"], session["timestr"], input_single_file_basename + app.config["UPLOAD_FOLDER_1"], session["timestr"], input_single_file_basename ) input_files_tmp_paths = [ - os.path.join(app.config["UPLOAD_FOLDER"], session["timestr"], basename) + os.path.join(app.config["UPLOAD_FOLDER_2"], session["timestr"], basename) for basename in input_files_basenames ] - zipped_output, input_errors, input_ok_flag = run_backend( + # Create the path to the output folder + output_path = os.path.join(app.config["OUTPUT_FOLDER"], session["timestr"]) + + zipped_output, results_ok_flag, probe_ok_flag, probe_errors, unprocessed_files = run_backend( input_single_file_tmp_path, input_files_tmp_paths, - app.config["UPLOAD_FOLDER"], + output_path, ) + + # Establishing count of files + no_files_submitted = len(input_files_basenames) + no_files_processed = no_files_submitted - len(unprocessed_files) + + # TODO add logging of any errors encountered with the probe or results flags + # Save the zipped output path to the session session["zipped_output"] = zipped_output - if input_ok_flag == False: + if probe_ok_flag == False: 
return render_template( "index.html", form=chgForm, - backend_state="initial", - errors=input_errors, # Pass the errors to the template - file_errors=chgForm.errors, + backend_state="failed", + single_file_name=input_file.filename, + validation_errors=chgForm.errors, + probe_errors=probe_errors, # Pass the errors to the template ) else: return render_template( "index.html", form=chgForm, - backend_state=backend_state, + backend_state="finished", + files_submitted = no_files_submitted, + files_processed = no_files_processed, single_file_name=input_file.filename, - multiple_array_of_files_names=", ".join( - [x.filename for x in input_files_array] - ), + multiple_array_of_files_names=input_files_basenames, zipped_folder_name=zipped_output, - file_errors=chgForm.errors, + results_flag=results_ok_flag, + errors=unprocessed_files, + validation_errors=chgForm.errors, ) return render_template( "index.html", form=chgForm, backend_state=backend_state, - file_errors=chgForm.errors, + validation_errors=chgForm.errors, ) @@ -225,7 +255,7 @@ def upload(self, file): secure_file_name = secure_filename(file_name) file.save( os.path.join( - app.config["UPLOAD_FOLDER"], + app.config["UPLOAD_FOLDER_1"], session["timestr"], secure_file_name, ) @@ -233,7 +263,7 @@ def upload(self, file): return str( os.path.join( - app.config["UPLOAD_FOLDER"], + app.config["UPLOAD_FOLDER_1"], session["timestr"], secure_file_name, ) @@ -257,7 +287,7 @@ def upload(self, files): secure_file_name = secure_filename(file_name) file.save( os.path.join( - app.config["UPLOAD_FOLDER"], + app.config["UPLOAD_FOLDER_2"], session["timestr"], secure_file_name, ) @@ -265,7 +295,7 @@ def upload(self, files): file_names.append( str( os.path.join( - app.config["UPLOAD_FOLDER"], + app.config["UPLOAD_FOLDER_2"], session["timestr"], secure_file_name, ) diff --git a/dMLPA-flask/backend.py b/dMLPA-flask/backend.py index dc628f3..b05242c 100644 --- a/dMLPA-flask/backend.py +++ b/dMLPA-flask/backend.py @@ -1,6 +1,13 @@ import os 
+import re import zipfile from datetime import datetime +from jinja2 import Environment, PackageLoader +import pdfkit +import pandas as pd +import config +from qc_data import get_test_codes, df_to_html, get_qc_status, summary_data +from cna import split_df_column, clinical_df, copy_number_analysis, cnv_plot, pms2_plot img_versioned = os.getenv( "IMG_VERSIONED" @@ -8,6 +15,17 @@ def run_backend(single_file, array_of_files, output_directory): + + # USER WARNINGS: Anything added to the string 'probe_errors' will be passed from the backend to + # the frontend and displayed in a warning to the user to give immediate feedback to the + # user about errors such as incorrect formatting of input files etc. + # 'unprocessed_results_files on the other hand records a list of file names that won't be processed + # and will be returned to the user after other files have been processed. + probe_file_ok_flag: bool = True + results_file_ok_flag: bool = True + probe_errors = "" + unprocessed_results_files: list[str] = [] + # Ensure the output directory exists, if not, create it os.makedirs(output_directory, exist_ok=True) @@ -15,27 +33,156 @@ def run_backend(single_file, array_of_files, output_directory): timestamp = datetime.now().strftime("%Y%m%d-%H%M%S") zip_name = os.path.join(output_directory, f"output_files_{timestamp}.zip") + # Creating empty list of files to zip + files_to_zip = [] + # Ensure all paths are absolute single_file = os.path.abspath(single_file) array_of_files = [os.path.abspath(file) for file in array_of_files] - # Prepare the files to be zipped - files_to_zip: list[str] = [single_file] + array_of_files + # Reading probe data into dataframe and formatting + try: + probe_df = pd.read_excel(single_file, sheet_name="probe_info") + probe_df.rename(columns={"Exon number": "Exon"}, inplace=True) + probe_df = split_df_column(probe_df, "Mapview (hg38)a", config.NEW_PROBE_DF_HEADERS) + probe_df["chr"] = probe_df["chr"].str.replace("chr", "").str.lstrip("0") + except 
Exception: #TODO Add cusom exception when adding logging. + error_message = ( + "Probe File Error: Spreadsheet headers don't match expected file format "\ + "resultantly result files could not be processed. Confirm that an "\ + "appropriate probe file was provided for analysis" + ) + probe_file_ok_flag = False + return zip_name, results_file_ok_flag, probe_file_ok_flag, error_message, unprocessed_results_files - # USER WARNINGS: Anything added to the list 'input_errors' will be passed from the backend to - # the frontend and displayed in a warning to the user. Used to give immediate feedback to the - # user about errors such as incorrect formatting of input files etc. - input_errors: list[str] = [] - input_ok_flag: bool = True - - # TODO: Add any relevant data validation here. Currently only checks if the file has "error_test" - # in the name, and if so, sends warning to frontend. This is just a placeholder for now. - for file_name in files_to_zip: - if "error_test" in file_name: - input_errors, input_ok_flag = [ - "TEST ERROR: File passed to trigger this warning for testing purposes" - ], False - break + # Iterating over files to generate reports + for file in array_of_files: + + # Getting input file basenames + file_name = os.path.basename(file) + probe_file = os.path.basename(single_file) + + file_check = re.search(config.FILE_NAME_CONVENTION, file_name) + + if not file_check: + unprocessed_results_files.append(file_name) + results_file_ok_flag = False + continue + + else: + # Reading data from results file in dataframes and formatting + metadata_df = pd.read_excel(file, sheet_name="Sample info") + raw_results_df = pd.read_excel(file, sheet_name="Gene filtered results") + raw_results_df.rename(columns={raw_results_df.columns[-1]:"dosage quotient"}, inplace=True) + + # controlling dataypes + probe_df["start position"] = probe_df["start position"].astype(int) + probe_df["end position"] = probe_df["end position"].astype(int) + + # creating results_clinical 
dataframe + results_clinical_df = clinical_df( + raw_results_df, + probe_df, + config.COLUMNS_RESULTS_DF, + config.COLUMNS_PROBE_DF + ) + + # check certain columns for any NaN entries + # This would indicated that for some probes in the results file + # the matching probe data couldn't be found in the probe file + columns_to_check = config.NEW_PROBE_DF_HEADERS + columns_with_nan = results_clinical_df[columns_to_check].columns[results_clinical_df[columns_to_check].isna().any()] + if not columns_with_nan.empty: + probe_file_ok_flag = False + probe_errors = ( + "Probe File Error: Required probe information for some probes " + "could not be found in the probe information file provided. " + "Confirm that an appropriate probe file was provided for analysis" + ) + return zip_name, results_file_ok_flag, probe_file_ok_flag, probe_errors, unprocessed_results_files + + # get list of genes in panel + gene_list = results_clinical_df["gene"].unique() + + # perform copy number analysis + cnv_summary_df, marked_df = copy_number_analysis(results_clinical_df, gene_list) + summary_html = df_to_html(cnv_summary_df) + + # generating summary dataframe with limited columns + summary_data_df = summary_data(marked_df) + + # creating empty dictionaries + summary_tables = {} + cnv_detail = {} + + for gene in gene_list: + # creating cnv summary data by gene + summary_data_gene_df = summary_data_df[summary_data_df["gene"] == gene] + summary_data_gene_df.columns = summary_data_gene_df.columns.str.replace("_", " ").str.title() + summary_tables[gene] = df_to_html(summary_data_gene_df) + + if not cnv_summary_df.empty: + # generating details of cnv by impacted gene if cnv's are detected + impacted_genes = cnv_summary_df["Gene"].unique() + if gene in impacted_genes: + # plotting gene data + results_gene_df = marked_df[marked_df["gene"] == gene] + if gene == "PMS2": + gene_plot = pms2_plot(results_gene_df, gene) + else: + gene_plot = cnv_plot(results_gene_df, gene) + + # creating table of probes 
involved in CNV + # impacted_probes_df = summary_data_gene_df.dropna(subset=["Variant"]) + impacted_probes_df = summary_data_gene_df.loc[summary_data_gene_df["Variant"] != ""] + impacted_probes_df = (impacted_probes_df.sort_values(by="Start Position")) + impacted_probes_html = df_to_html(impacted_probes_df) + + # saving to cnv_detail dict. + cnv_detail[gene] = [gene_plot, impacted_probes_html] + + # getting panel number and qc data from metadata + pan_number, r_code = get_test_codes(metadata_df) + qc_status = get_qc_status(metadata_df) + qc_data = df_to_html(metadata_df) + + # Creating ouput file name and path + # Input file name of results file is determined in GitHub repository: + # https://github.com/moka-guys/digital_mlpa + report = file_name.replace("_filtered_results.xlsx", ".pdf") \ + .replace("_filtered_results.xls", ".pdf") + report_path = os.path.join(output_directory, report) + + # Rendering template and saving as pdf + env = Environment(loader=PackageLoader("dMLPA-flask", "templates")) + template = env.get_template("report.html") + data_values = { + "app_version": config.DMLPA_FLASK_VERSION, + "production": config.RELEASED_TO_PRODUCTION, + "file_name": report, + "r_code": r_code, + "pan_number": pan_number, + "genes": gene_list, + "run_id": "", # TODO to be added once file name is confirmed, + "genome_build": config.GENOME_BUILD, + "patient_file": file_name, + "probe_file": probe_file, + "qc_status": qc_status, + "cnv_summary": summary_html, + "cnc_comment": config.CNC_CATEGORIES_DISCLAIMER, + "pms2_comment": config.PMS2_DISCLAIMER, + "cnv_detail": cnv_detail, + "report_notes": config.REPORTING_NOTE, + "qc_table": qc_data, + "summary_tables": summary_tables + } + + html_report = template.render(data_values) + + pdfkit.from_string(html_report, report_path) + + # Add report to list of files to be zipped + files_to_zip.append(report_path) # TODO: Add any relevant data processing here. Currently just copies the input files to the # output directory. 
This is just a placeholder for now. @@ -52,4 +199,4 @@ def run_backend(single_file, array_of_files, output_directory): f"File {file} does not exist." ) # Consider replacing with proper logging - return zip_name, input_errors, input_ok_flag + return zip_name, results_file_ok_flag, probe_file_ok_flag, probe_errors, unprocessed_results_files diff --git a/dMLPA-flask/cna.py b/dMLPA-flask/cna.py new file mode 100644 index 0000000..8bea129 --- /dev/null +++ b/dMLPA-flask/cna.py @@ -0,0 +1,322 @@ +""" +Module for conducting copy number analysis on pan filtered digital +MLPA output data. + +This module provides functions for merging the probe data with +the results data (which is necessary to return the desired output +data), manipulating the relevant dataframes, creating copy number +summary and detailed copy number reports. + +""" +import io +import base64 +import pandas as pd +import numpy as np +import matplotlib as mpl +import matplotlib.pyplot as plt +import matplotlib.ticker as ticker +import config + +def shared_columns(df1, df2): + """Identify shared dataframe columns""" + + # streamline column formatting + df1.columns = df1.columns.str.replace("_", " ").str.title() + df2.columns = df2.columns.str.replace("_", " ").str.title() + + # identifying shared columns + shared_columns = list(set(df1) & set(df2)) + + return shared_columns + +def split_df_column(df, column, headers): + """ + Split dataframe columns based on set regular expression + + Arguments: + - df (Pandas Dataframe) + - column (str): name of existing column + - headers (list): list of new column headers + + Return: + - df (Pandas Dataframe): updated dataframe + """ + df[headers] = df[column].str.split(r"[:\-]", expand=True) + return df + +def merge_df(df1, df2, df1_columns, df2_columns): + """Left join databases based on shared columns + + Arguments: + - df1 (Pandas Dataframe): 'left' database in the merge. 
+ Data of this database will be maintained for the 'shared + columns' + - df2 (Pandas Dataframe): 'right' database. Only Data in + the unique df2_columns will be maintained. + - df1_columns (list): Specific unique columns from df1 that + must be maintained + - df2_columns (list): Specific unique columns from df2 that + must be maintained + + Return: + - merged_df (Pandas Dataframe) + + """ + + # streamline column formatting + df1.columns = df1.columns.str.replace("_", " ").str.lower() + df2.columns = df2.columns.str.replace("_", " ").str.lower() + + # identifying shared columns + shared_columns = list(set(df1) & set(df2)) + + # left merge + merged_df = df1[shared_columns + df1_columns].merge( + df2[df2_columns], + how = "left", + on = "probe number" # can be changed based on desired column + ) + + return merged_df + +def clinical_df(df1, df2, df1_columns, df2_columns): + """ + Create dataframe of clinical relevant MLPA probes + + Calls merge_df (function) to left-join probe and results + dataframe. + + Arguments: + - df1 (Pandas Dataframe): 'left' database in the merge. + Data of this database will be maintained for the 'shared + columns' + - df2 (Pandas Dataframe): 'right' database. Only Data in + the unique df2_columns will be maintained. + - df1_columns (list): Specific unique columns from df1 that + must be maintained + - df2_columns (list): Specific unique columns from df2 that + must be maintained + + Return: + - clinical_df (Pandas Dataframe) + """ + + merged_df = merge_df(df1, df2, df1_columns, df2_columns) + + # restructuring df based on column order + merged_df = merged_df[config.DESIRED_COLUMN_ORDER] + + # Filtering results by probetype + results_clinical_df = merged_df[merged_df["probe type"] != "CTRL"] + + return results_clinical_df + +def copy_number_analysis(df, gene_list): + """ + TODO function string needed. + Currently set up to analyse for all genes in run. 
+ Might need to be broken down more in future + + """ + # Empty dataframe for all identified copy number variations + cnv_summary = pd.DataFrame() + + # Create 'variant' column with object dtype + df['variant'] = "" # Initialized with NaN values compatible with object dtype + df["variant"] = df["variant"].astype(object) #controls expected type and prevents FutureError observed in manual testing + + # TODO This is mostly from chatGPT - Need to understand what it's actually doing + for gene in gene_list: + + # Get results per gene + results_gene = df[df["gene"] == gene] + + if gene == "PMS2": + cnc_categories = config.PMS2_CATEGORIES + else: + cnc_categories = config.CNC_CATEGORIES + # iterate through CN categories and identify matching probe entries + for direction, parameters in cnc_categories.items(): + if direction != "Normal": + # print(direction, gene) # TODO Replace with logging + cnv_rows = np.where((parameters[0] <= results_gene["dosage quotient"]) & + (results_gene["dosage quotient"] <= parameters[1]))[0] + + # If no probe entries match CN category skipping next steps + if len(cnv_rows) == 0: + continue + + # If probe entries match CN category, identify uniform CN segments + else: + # gather information for appropriate probe entries + cnv_data = results_gene.iloc[cnv_rows] + # Assess based on probe order if probes are contiguous + cnv_groups = np.cumsum(np.concatenate(([1], np.diff(cnv_data["probe order"]) != 1))) + + # Splitting results per contiguous CN segment if multiple + cnv_list = {cnv_index: group for cnv_index, group in cnv_data.groupby(cnv_groups)} + + # Processing data of individual CN segments + for cnv_index, cnv_lines in cnv_list.items(): + cnv_lines = pd.DataFrame(cnv_lines) + + # Update the input DataFrame by adding a new column containing the CN category + probe_numbers = cnv_lines["probe number"] + for probe_number in probe_numbers: + df.loc[df["probe number"] == probe_number, 'variant'] = direction + + + # Obtainins summary data of 
contiguous CN segments + start_exon = cnv_lines["exon"].min() + end_exon = cnv_lines["exon"].max() + start_pos = cnv_lines["start position"].min() + end_pos = cnv_lines["end position"].max() + + # Calculate average dosage quotient across CN segment and standard deviation + copy_change_mean = round(cnv_lines["dosage quotient"].mean(), 3) + copy_change_sd = round(cnv_lines["dosage quotient"].std(), 3) + + # Calculating Estimated Copy number for CN segment. Autosomal Logic 2*1=2; 2*0.5=1; 2*1.5=3 + # TODO Does datatype need to be considered before calculation? + if "Ref to Scientist" in direction: + copy_number_est = "-" + else: + copy_number_est = round(np.mean(cnv_lines["normal copy number"] * cnv_lines["dosage quotient"])) + + + # summarize CN data into a df entry + cnv_aggregate = pd.DataFrame({ + "Gene": [gene], + "Variant": [direction], + "Exon(s)": f"{start_exon} - {end_exon}" if start_exon != end_exon else str(start_exon), + "Chr": cnv_lines["chr"].unique(), + "Probe Co-ordinates": f"{start_pos}-{end_pos}", + "Normal Copy Number": int(cnv_lines["normal copy number"].unique()), #Problem if there is more than one potential copy number? + "Mean Dosage Quotient": f"{copy_change_mean}+/-{copy_change_sd}", + "Est. Copy Number": [copy_number_est], + "Supporting Probes": [len(cnv_lines)] + }) + + # Adding df entry into existing CNA_summary dataframe + # TODO concat method is not ideal apparently. 
Consider alternative + cnv_summary = pd.concat([cnv_summary,cnv_aggregate], ignore_index=True) + + # organize summary by genomic co-ordinates + cnv_summary = cnv_summary.sort_values(by="Probe Co-ordinates") + + # returning output data to backend.py + return cnv_summary, df + +def cnv_plot(df, gene, extend=1000): + # create copy of dataframe + cnv_copy = df.copy() + + # pull specific data from columns + start = min(cnv_copy["start position"]) - extend + end = max(cnv_copy["end position"]) + extend + + # determining y-axis limit for DQ plot + ylim = (0, max(2.0, 1.25*cnv_copy["dosage quotient"].max())) + + #setting up plot layout + plt.figure(figsize=(10,4)) + + # setting axis + plt.xlim(start,end) + plt.ylim(*ylim) + plt.tick_params(axis='both', labelsize=8) + + # setting plot labels + plt.ylabel("Dosage Quotient", fontdict={'family': 'Sans', 'size': 8}) + plt.xlabel("Co-ordinates", fontdict={'family': 'Sans', 'size': 8}) + plt.title(gene, fontdict={'family': 'Sans', 'size': 10}) + + # formatting x-axis ticks + # but this is from chatGPT and i need to question if this is the best way to do this. 
+ plt.gca().xaxis.set_major_formatter(ticker.FuncFormatter(lambda x, _: '{:d}'.format(int(x)))) + + # adding shading to define ranges for normal, hetdel and hetdup + cnc_categories = config.CNC_CATEGORIES + plt.axhspan(cnc_categories["Normal"][0], cnc_categories["Normal"][1], facecolor = "#00FF0044", edgecolor = None) + plt.axhspan(cnc_categories["HetDel"][0], cnc_categories["HetDel"][1], facecolor = "#FF000044", edgecolor = None) + plt.axhspan(cnc_categories["HetDup"][0], cnc_categories["HetDup"][1], facecolor = "#318CE744", edgecolor = None) + + # plot DQ value + plt.plot(cnv_copy["start position"], cnv_copy["dosage quotient"], marker = "x", linestyle = "", color = "black") + + # plt.show() + + # Save plot as image + img = io.BytesIO() + plt.savefig(img, format="svg") + img.seek(0) + + # Encode SVG image as Base64 + img_base64 = base64.b64encode(img.getvalue()).decode() + + plt.close() + + return img_base64 + +def pms2_plot(df, gene, extend=1000): + # create copy of dataframe + cnv_copy = df.copy() + + # pull specific data from columns + start = min(cnv_copy["start position"]) - extend + end = max(cnv_copy["end position"]) + extend + + # determining y-axis limit for DQ plot + ylim = (0, max(2.0, 1.25*cnv_copy["dosage quotient"].max())) + + #setting up plot layout + plt.figure(figsize=(10,4)) + + # setting axis + plt.xlim(start,end) + plt.ylim(*ylim) + plt.tick_params(axis='both', labelsize=8) + + # setting plot labels + plt.ylabel("Dosage Quotient", fontdict={'family': 'Sans', 'size': 8}) + plt.xlabel("Co-ordinates", fontdict={'family': 'Sans', 'size': 8}) + plt.title(gene, fontdict={'family': 'Sans', 'size': 10}) + + # formatting x-axis ticks + # but this is from chatGPT and i need to question if this is the best way to do this. 
+ plt.gca().xaxis.set_major_formatter(ticker.FuncFormatter(lambda x, _: '{:d}'.format(int(x)))) + + # adding shading to define ranges for normal, hetdel and hetdup + cnc_categories = config.PMS2_GRAPH_CATEGORIES + plt.axhspan(cnc_categories["Normal"][0], cnc_categories["Normal"][1], facecolor = "#00FF0044", edgecolor = None) + plt.axhspan(cnc_categories["Del2"][0], cnc_categories["Del2"][1], facecolor = "#FF000044", edgecolor = None) + plt.axhspan(cnc_categories["Del1"][0], cnc_categories["Del1"][1], facecolor = "#FF7F7F44", edgecolor = None) + plt.axhspan(cnc_categories["Dup1"][0], cnc_categories["Dup1"][1], facecolor = "#98C5F144", edgecolor = None) + plt.axhspan(cnc_categories["Dup2"][0], cnc_categories["Dup2"][1], facecolor = "#318CE744", edgecolor = None) + + # adding boundary marker between PMS2 exon 11 and exon 12 + plt.axvline( + x=config.PMS2_PMS2CL_BOUNDARY, + # ymin = cnc_categories["Del2"][0], + # ymax = cnc_categories["Dup2"][1], + color='black', + linestyle = '--', + lw = 0.75 + ) + + # plot DQ value + plt.plot(cnv_copy["start position"], cnv_copy["dosage quotient"], marker = "x", linestyle = "", color = "black") + + # plt.show() + + # Save plot as image + img = io.BytesIO() + plt.savefig(img, format="svg") + img.seek(0) + + # Encode SVG image as Base64 + img_base64 = base64.b64encode(img.getvalue()).decode() + + plt.close() + + return img_base64 \ No newline at end of file diff --git a/dMLPA-flask/config.py b/dMLPA-flask/config.py index e69de29..65be8dd 100644 --- a/dMLPA-flask/config.py +++ b/dMLPA-flask/config.py @@ -0,0 +1,140 @@ + +# config.py + +# TODO Add config docstring + +''' ==================== Production release constants ==================== ''' + +DMLPA_FLASK_VERSION = "v1.0.0" + +RELEASED_TO_PRODUCTION = False + +''' ==================== app.py constants ==================== ''' + +PROBEFILE_NAME_CONVENTION = "PIF_D001-[a-zA-Z][0-9]" #TODO to be confirmed. 
+ +''' ==================== backend.py constants ==================== ''' + +# TODO to be confirmed +FILE_NAME_CONVENTION = "BP01-[0-9]+-dMLPA_R[0-9]+v[0-9]_Pan[0-9]+_[0-9]+_[a-zA-Z]+_[a-zA-Z]+[0-9]+_filtered_results.xlsx" + +GENOME_BUILD = "GRCh38" + +NEW_PROBE_DF_HEADERS = [ + "chr", + "start position", + "end position" +] + +COLUMNS_RESULTS_DF = [ + "dosage quotient" +] + +COLUMNS_PROBE_DF = [ + "probe number", + "chr", + "start position", + "end position", +] + +REPORTING_NOTE = { + # Generic message + "Ref to Scientist": ( + "Ref to Scientist: The dosage quotient for one or " + "more probes falls outside the typical ranges used " + "to evaluate copy number. Scientist review required" + ), +} + +CNC_CATEGORIES_DISCLAIMER = """The following DQ ranges are used to \ + assess copy number in most genes: Normal Copy Number 0.8-1.2; \ + Heterozygous Copy Number Loss 0.4-0.6; Heterozygous Copy Number \ + Gain 1.4-1.6.""" + +PMS2_DISCLAIMER = """OF NOTE, analysis of copy number variation in \ + PMS2 is complicated by the pseudogene PMS2CL. The following \ + alternative DQ ranges have been used to assess copy number as \ + a result: Normal Copy Number 0.86-1.14; Potential Copy Number Loss \ + 0.4-0.84; Potential Copy Number Gain: 1.16-1.6. 
Scientist Review \ + is Recommended for all PMS2 Cases.""" + +''' ==================== cna.py constants ==================== ''' + +DESIRED_COLUMN_ORDER = [ + "probe order", + "probe number", + "reference probe", + "chr", + "gene", + "exon", + "start position", + "end position", + "normal copy number", + "dosage quotient", + "probe type", + "additional information", +] + +CNC_CATEGORIES = { + # TODO How does code handle MDQ that is 3 dp in the excel + "Ref to Scientist 1*": [0, 0.399], + "HetDel": [0.400, 0.600], # theoretical value 0.5, represents CN 1 + "Ref to Scientist 2*": [0.601, 0.799], + "Normal": [0.800, 1.200], # theoretical value 1, represents CN 2 + "Ref to Scientist 3*": [1.201, 1.399], + "HetDup": [1.400, 1.600], # theoretical value 1.5, represents CN 3 + "Ref to Scientist 4*": [1.601, float("Inf")] +} + +PMS2_CATEGORIES = { + # TODO Stake holders to confirm thresholds for this + "Ref to Scientist 1*": [0, 0.399], + "Potential PMS2 CN loss": [0.40, 0.840], + "Ref to Scientist 2*": [0.841, 0.859], + "Normal": [0.860, 1.140], # theoretical value 1 + "Ref to Scientist 3*": [1.141, 1.159], + "Potential PMS2 CN gain": [1.160, 1.600], + "Ref to Scientist 4*": [1.601, float("Inf")] +} + +PMS2_GRAPH_CATEGORIES = { + # TODO Stake holders to confirm thresholds for this + "Del2": [0.4, 0.60, "#FF000044"], # theortical value of 0.5 + "Del1": [0.65, 0.84, "#FF7F7F44"], # theoretical value 0.75 + "Normal": [0.86, 1.14, "#00FF0044"], # theoretical value 1 + "Dup1": [1.16, 1.35, "#98C5F144"], # theoretical value 1.25, + "Dup2": [1.4, 1.6, "#318CE744"] # theoretical value of 1.5 +} + +PMS2_CATEGORIES_PREVIOUS = { + # TODO Stake holders to confirm thresholds for this + "Ref to Scientist 5*": [0, 0.39], + "PMS2 CN loss": [0.4, 0.60], # theortical value of 0.5 + "Ref to Scientist 6*": [0.61, 0.64], + "Potential PMS2 CN loss": [0.65, 0.84], # theoretical value 0.75 + "Ref to Scientist 7*": [0.85, 0.85], + "Normal": [0.86, 1.14], # theoretical value 1 + "Ref to Scientist 
8*": [1.15, 1.15], + "Potential PMS2 CN gain": [1.16, 1.35], # theoretical value 1.25, + "Ref to Scientist 9*": [1.36, 1.39], + "PMS2 CN gain": [1.4, 1.6], + "Ref to Scientist 10*": [1.61, float("Inf")] +} + +PMS2_PMS2CL_BOUNDARY = 5983991 # 1000bp into the intron proximal to the + # start of exon 12 (i.e. intron 11-12). Based on transcript + # NM_000535.7 build GRCH38.p14 + +''' ==================== qc_data.py constants ==================== ''' + +SUMMARY_DATA_COLUMNS = [ + "probe number", + "chr", + "gene", + "exon", + "start position", + "end position", + "normal copy number", + "dosage quotient", + "variant" +] diff --git a/dMLPA-flask/notes.py b/dMLPA-flask/notes.py new file mode 100644 index 0000000..81573e6 --- /dev/null +++ b/dMLPA-flask/notes.py @@ -0,0 +1,152 @@ +import pandas as pd +import numpy as np +from cna import split_df_column, merge_df, clinical_df, copy_number_analysis, cnv_plot +from qc_data import summary_data, df_to_html +import config + +from jinja2 import Environment, PackageLoader +import pdfkit + +# Test files to test script +# xls = "/home/epayne1/Documents/GSTT/test_input/BP01-16-dMLPA_R210v1_Pan5025_218513_SURNAME_Unaffected_filtered_results.xlsx"" +xls = "/home/epayne1/Documents/GSTT/test_input/BP01-16-dMLPA_R210v1_Pan5025_218513_SURNAME_Firstname16_filtered_results.xlsx" +probe = "/home/epayne1/Documents/GSTT/probe_file/PIF_D001-C1_Hereditary_Cancer_Panel_1-v02_modB1probe.xlsx" + +# Reading data in dataframes +metadata_df = pd.read_excel(xls, sheet_name="Sample info") +raw_results_df = pd.read_excel(xls, sheet_name="Gene filtered results") +probe_df = pd.read_excel(probe, sheet_name="probe_info") + +# Defining new column names +probe_column_headers = [ + "chr", + "start position", + "end position" +] + +# hard manipulation/formatting of df's +probe_df.rename(columns={"Exon number": "Exon"}, inplace=True) +probe_df = split_df_column(probe_df, "Mapview (hg38)a", probe_column_headers) +probe_df["chr"] = 
probe_df["chr"].str.replace("chr", "").str.lstrip("0") +raw_results_df.rename(columns={raw_results_df.columns[-1]:"dosage quotient"}, inplace=True) + +# creating results_clinical dataframe +results_clinical_df = clinical_df( + raw_results_df, + probe_df, + config.COLUMNS_RESULTS_DF, + config.COLUMNS_PROBE_DF + ) + +results_clinical_df["start position"] = results_clinical_df["start position"].astype(int) +results_clinical_df["end position"] = results_clinical_df["end position"].astype(int) +# results_clinical_df.to_excel("merged_excel.xlsx") + +# get list of genes +gene_list = results_clinical_df["gene"].unique() + +# perform copy number analysis +cnv_summary_df, marked_df = copy_number_analysis(results_clinical_df, gene_list) +# marked_df.to_excel("marked_df.xlsx") + + +# obtaining summary data per gene for appendix +summary_data_df = summary_data(marked_df) +# print(summary_data_df) + +summary_tables = {} +cnv_detail = {} +for gene in gene_list: + # summary data by gene + summary_data_gene = summary_data_df[summary_data_df["gene"] == gene] + # print(summary_data_gene) + summary_data_gene.columns = summary_data_gene.columns.str.replace("_", " ").str.title() + summary_tables[gene] = df_to_html(summary_data_gene) + # summary_tables[gene] = gene_summary_data(gene, summary_data_df) + + print(cnv_summary_df) + + if not cnv_summary_df.empty: + # cnv_detail by impacted gene + impacted_genes = cnv_summary_df["Gene"].unique() + if gene not in impacted_genes: + continue + else: + results_gene = marked_df[marked_df["gene"] == gene] + gene_plot = cnv_plot(results_gene, gene) + impacted_probes = summary_data_gene.dropna(subset=["Variant"]) + impacted_probes = summary_data_gene.loc[summary_data_gene["Variant"] != ""] + impacted_probes = (impacted_probes.sort_values(by="Start Position")) + print(impacted_probes) + + impacted_probes_html = df_to_html(impacted_probes) + + # saving to cnv_detail dict. 
+ cnv_detail[gene] = [gene_plot, impacted_probes_html] + +# env = Environment(loader=PackageLoader("dMLPA-flask", "templates")) +# template = env.get_template("report.html") +# data_values = { +# "file_name": "report.pdf", +# # "r_code": r_code, +# "pan_number": "pan_number", +# "genes": gene_list, +# "patient_file": xls, +# "probe_file": probe, +# "qc_status": "qc_status", +# "cnv_summary": "summary_html", +# "report_notes": "report_notes", +# "qc_table": "passed", +# "summary_tables": "summary_tables", +# "gene_plots": gene_plots +# } + +# html_report = template.render(data_values) + +# pdfkit.from_string(html_report) + +""" NOTES ON PLOT ANNOTATION """ + +def get_text_positions(text, x_data, y_data, txt_width, txt_height): + a = zip(y_data, x_data) + text_positions = list(y_data) + for index, (y, x) in enumerate(a): + local_text_positions = [i for i in a if i[0] > (y - txt_height) + and (abs(i[1] - x) < txt_width * 2) and i != (y,x)] + if local_text_positions: + sorted_ltp = sorted(local_text_positions) + if abs(sorted_ltp[0][0] - y) < txt_height: #True == collision + differ = np.diff(sorted_ltp, axis=0) + a[index] = (sorted_ltp[-1][0] + txt_height, a[index][1]) + text_positions[index] = sorted_ltp[-1][0] + txt_height*1.01 + for k, (j, m) in enumerate(differ): + #j is the vertical distance between words + if j > txt_height * 2: #if True then room to fit a word in + a[index] = (sorted_ltp[k][0] + txt_height, a[index][1]) + text_positions[index] = sorted_ltp[k][0] + txt_height + break + return text_positions + +def text_plotter(text, x_data, y_data, text_positions, txt_width,txt_height): + for z,x,y,t in zip(text, x_data, y_data, text_positions): + plt.annotate(str(z), xy=(x-txt_width/2, t), size=12) + if y != t: + plt.arrow(x, t,0,y-t, color='red',alpha=0.3, width=txt_width*0.1, + head_width=txt_width, head_length=txt_height*0.5, + zorder=0,length_includes_head=True) + +# Pulled from function + # # Annotate probes within het_del_dq_range or het_dup_dq_range with 
probe numbers + # for index, row in cnv_copy.iterrows(): + # dq_value = row["dosage quotient"] + # position = row["start position"] + # probe_number = row["probe number"] + + # if dq_value < normal_dq_range[-1] or dq_value > normal_dq_range[1]: + # plt.annotate(probe_number, xy=(position, dq_value), + # xytext=(position + 99, dq_value + 0.5), # Adjust xytext position + # rotation=89, + # # textcoords="offset points", # Offset in points + # # ha='center', va='center', # Horizontal and vertical alignment + # # bbox=dict(boxstyle='round,pad=-1.3', fc='yellow', alpha=0.5), # Bounding box style + # arrowprops=dict(arrowstyle='->', connectionstyle='arc,rad=-1', color='black')) # Arrow properties \ No newline at end of file diff --git a/dMLPA-flask/qc_data.py b/dMLPA-flask/qc_data.py new file mode 100644 index 0000000..201a4f1 --- /dev/null +++ b/dMLPA-flask/qc_data.py @@ -0,0 +1,104 @@ +""" +Module for pulling relevant run and raw results data from dMLPA +reporting files. + +This module provides functions for extracting virtual pan number, +obtaining QC status, converting DataFrame tables to specific HTML +format, and generating summary results data. + +Functions: +- get_pan_number(df): Return virtual pan number used for analysis. +- get_qc_status(df): Return QC pass or fail status. +- df_to_html(df): Convert DataFrame table to HTML without index. +- summary_data (df): Create a summary table of key probe data. 
+""" + +import re +import config + +def get_test_codes(df): + """ + Return virtual pan number and Rcode used for analysis + + Arguments: + - df(Pandas Dataframe): assumed to be a dataframe containing the + Pan(el) number within the second column header (samplename) + + Return Value: + - pan_number(Str): GSTT virtual gene pan(el) number used to filter + raw NGS output + - r_code (Str): Appropriate Rcode from the national genomic test + directory + """ + + # TODO: + # - confirm samplenaming convetion prior to release + # - Logging.info[panel number] and logging.error[pan number not found] + # - Print error onto report. + + # creating empty variables + pan_number = "" + r_code = "" + + # Get samplename from df + samplename = df.columns[1] + + # Searching for Pan number by regular expression + panel_number_match = re.search(r'Pan[0-9]+', samplename) + rcode_match = re.search(r'R[0-9]+', samplename) + + if panel_number_match: + pan_number = panel_number_match.group(0) + + if rcode_match: + r_code = rcode_match.group(0) + + return pan_number, r_code + + +def get_qc_status(df): + """ + Return qc pass or fail status + + Arguments: + - df(Pandas Dataframe): assumed to be a dataframe containing the run + qc status at a specific location. 
+ + Return Value: + - qc_status(Str): + """ + qc_status = df.loc[6, df.columns[1]].upper() + + return qc_status + +def df_to_html(df): + """Convert dataframe table to html without index""" + + #Saving dataframe to html format + html_format = df.to_html(classes="table", index=False) + + return html_format + +def summary_data(df): + """Create summary table of key probe data""" + + # creating copy of selected columns with Copy number estimate column + summary_data_df = df[config.SUMMARY_DATA_COLUMNS].copy() + summary_data_df["copy number est."] = "-" + summary_data_df["copy number est."] = summary_data_df["copy number est."].astype(object) + + # defining regular expression + variant_type = re.compile("Ref to Scientist.*", re.IGNORECASE) + + # creating condition for calculating copy number estimate + # TODO two methods below (both ChatGPT) decide which to keep + condition = ~summary_data_df['variant'].str.match(variant_type) + # condition = ~summary_data_df["variant"].apply(lambda x: bool(variant_type.match(x))) + + # calculating copy number estimate only where condition isn't met + summary_data_df.loc[condition, "copy number est."] = ( + summary_data_df["normal copy number"] * + summary_data_df["dosage quotient"] + ).round() + + return summary_data_df diff --git a/dMLPA-flask/templates/index.html b/dMLPA-flask/templates/index.html index 9fa76b3..805f128 100644 --- a/dMLPA-flask/templates/index.html +++ b/dMLPA-flask/templates/index.html @@ -24,98 +24,114 @@
The following files do not meet the expected naming convention for patient files and could not be processed:
+ + {% for error in errors %} +Expected naming convention for results files: "dMLPA_Rcode_PanNumber_Sample/TestID_Surname_Name"
+Please select the files for your analysis below, then click - the "Run dMLPA-flask" button.
-You submitted the following probe file:
+{{ single_file_name }}
+You submitted {{files_submitted}} patient file(s):
+ {% for file_name in multiple_array_of_files_names %} +{{ report_name }}
+If you have any issues with this tool please contact Synnovis bioinformatics support.
+Backend has failed with the following error:
+{{ probe_errors }}
+You submitted the following probe file:
+{{ single_file_name }}
+If you have any issues with this tool please contact Synnovis bioinformatics support.
+South East Genomics Laboratory Hub
+R Code:{{ r_code }}
+Pan Number: {{ pan_number }}
+Genes: + {%for gene in genes %} + {{ gene }}{% if not loop.last %}, {% endif %} + {% endfor %} +
+Run ID: {{ run_id }}
+Genome Build: {{ genome_build }}
+File-Name: {{ patient_file }}
+Probe-file: {{ probe_file }}
+QC {{ qc_status }}
+ {% else %} +QC {{ qc_status }}
+ {% endif %} +{{ cnv_summary }}
+ {% for key, value in report_notes.items() %} + {% if key is in cnv_summary %} +{{ value}}
+ {% endif %} +{{ cnc_comment }}
+ {% if 'PMS2' in genes %} +{{ pms2_comment }}
+ {% endif %} + + {% endfor %} + {% else %} +NO COPY NUMBER VARIANTS DETECTED
+{{ cnc_comment }}
+ {% if 'PMS2' is in genes %} +{{ psm2_comment }}
+ {% endif %} + {% endif %} + + + +{{ gene }}_plot
+ +{{ data[1] }}
+NO COPY NUMBER VARIANTS DETECTED
+ {% endif %} +{{ qc_table }}
+ +{{ gene }}
+{{ summary_table }}
+ + {% endfor %} +