Skip to content

Commit

Permalink
fix row skipping during file opening
Browse files Browse the repository at this point in the history
  • Loading branch information
donaldcampbelljr committed Sep 17, 2024
1 parent e8be1fe commit ccf0c5c
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 2 deletions.
25 changes: 23 additions & 2 deletions bedboss/refgenome_validator/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,32 @@ def _read_file_pd(*args, **kwargs) -> pd.DataFrame:
f"Skipped {row_count} rows while standardization. File: '{args}'"
)
df = df.dropna(axis=1)
return df
for index, row in df.iterrows():
if (
isinstance(row[0], str)
and isinstance(row[1], int)
and isinstance(row[2], int)
):
return df
else:
if isinstance(row[1], str):
try:
_ = int(row[1])
df[1] = pd.to_numeric(df[1])
except Exception:
row_count += 1
break
if isinstance(row[2], str):
try:
_ = int(row[2])
df[2] = pd.to_numeric(df[2])
except Exception:
row_count += 1
break
return df
except (pd.errors.ParserError, pd.errors.EmptyDataError) as _:
if row_count <= max_rows:
row_count += 1
# if can't open file after 5 attempts try to open it with gzip
return _read_gzipped_file(*args)


Expand Down
6 changes: 6 additions & 0 deletions scripts/ref_genome_validating/validate_genome.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,11 @@ def main():
chrom_sizes_file="/home/drc/GITHUB/bedboss/bedboss/bedboss/refgenome_validator/chrom_sizes/ucsc_mm10.chrom.sizes",
)

# The alias must identify this model: it matches the variable name and the
# chrom.sizes file, following the same pattern as the other models here
# (previously copy-pasted as "ucsc_mm10").
ucsc_mm39 = GenomeModel(
    genome_alias="ucsc_mm39",
    chrom_sizes_file="/home/drc/GITHUB/bedboss/bedboss/bedboss/refgenome_validator/chrom_sizes/ucsc_mm39.chrom.sizes",
)

ucsc_pantro6 = GenomeModel(
genome_alias="ucsc_pantro6",
chrom_sizes_file="/home/drc/GITHUB/bedboss/bedboss/bedboss/refgenome_validator/chrom_sizes/ucsc_panTro6.chrom.sizes",
Expand All @@ -117,6 +122,7 @@ def main():
all_genome_models.append(ucsc_hg38)
all_genome_models.append(ncbi_hg38)
all_genome_models.append(ucsc_mm10)
all_genome_models.append(ucsc_mm39)
all_genome_models.append(ucsc_hg19)
all_genome_models.append(ensembl_hg38)
all_genome_models.append(ucsc_pantro6)
Expand Down

0 comments on commit ccf0c5c

Please sign in to comment.