diff --git a/bedboss/bedclassifier/bedclassifier.py b/bedboss/bedclassifier/bedclassifier.py index 420cea0..f678f2a 100644 --- a/bedboss/bedclassifier/bedclassifier.py +++ b/bedboss/bedclassifier/bedclassifier.py @@ -57,6 +57,16 @@ def get_bed_type(bed: str, no_fail: Optional[bool] = True) -> Tuple[str, str]: raise BedTypeException( reason=f"Bed type could not be determined due to CSV parse error {e}" ) + except UnicodeDecodeError as e: + if no_fail: + _LOGGER.warning( + f"Unable to parse bed file {bed}, due to error {e}, setting bed_type = unknown_bedtype" + ) + return "unknown_bedtype", "unknown_bedtype" + else: + raise BedTypeException( + reason=f"Bed type could not be determined due to CSV parse error {e}" + ) if df is not None: df = df.dropna(axis=1) diff --git a/scripts/bedclassifier_tuning/bedclassify.py b/scripts/bedclassifier_tuning/bedclassify.py index cc608d3..ae5c2ea 100644 --- a/scripts/bedclassifier_tuning/bedclassify.py +++ b/scripts/bedclassifier_tuning/bedclassify.py @@ -8,6 +8,7 @@ from typing import Optional from bedboss.bedclassifier import get_bed_type +from bedboss.exceptions import BedTypeException _LOGGER = logging.getLogger("bedboss") @@ -53,6 +54,7 @@ def __init__( ) # Use existing Pipeline Manager or Construct New one # Want to use Pipeline Manager to log work AND cleanup unzipped gz files. + self.pm = pm # if pm is not None: # self.pm = pm # self.pm_created = False @@ -86,9 +88,16 @@ def __init__( with open(unzipped_input_file, "wb") as f_out: shutil.copyfileobj(f_in, f_out) self.input_file = unzipped_input_file - # self.pm.clean_add(unzipped_input_file) + if self.pm: + self.pm.clean_add(unzipped_input_file) + + try: + self.bed_type, self.bed_type_named = get_bed_type(self.input_file) + except BedTypeException as e: + _LOGGER.warning(msg=f"FAILED {bed_digest} Exception {e}") + self.bed_type = "unknown_bedtype" + self.bed_type_named = "unknown_bedtype" - self.bed_type, self.bed_type_named = get_bed_type(self.input_file) # return f"bed{bedtype}+{n}", bed_type_named if self.input_type is not None: @@ -126,8 +135,8 @@ def __init__( # self.pm.report_result(key="bedtype", value=self.bed_type) - # if self.pm_created is True: - # self.pm.stop_pipeline() + if self.pm: + self.pm.stop_pipeline() def main(): @@ -143,6 +152,14 @@ def main(): # gse_list = gse_obj.get_gse_all() # gse_obj.generate_file("data/output.txt", gse_list=gse_list) + logs_dir = os.path.join(os.path.abspath("results"), "logs") + pm = pypiper.PipelineManager( + name="bedclassifier", + outfolder=logs_dir, + recover=True, + ) + pm.start_pipeline() + # for geo in gse_list: geofetcher_obj = Geofetcher( filter="\.(bed|narrowPeak|broadPeak)\.", @@ -185,12 +202,11 @@ def main(): output_dir=os.path.abspath("results"), input_type=bed_type_from_geo, psm=psm, + pm=pm, gsm=geo_accession, ) - # Get list of Bed Files and Download them - - # Open Bed Files, Classify them, Report them. + pm.stop_pipeline() if __name__ == "__main__":