From 95255d503201a68a4c8e0e2102106b27fd9ce76d Mon Sep 17 00:00:00 2001 From: RosCraddock <109593931+RosCraddock@users.noreply.github.com> Date: Sun, 14 Jan 2024 18:04:04 +0000 Subject: [PATCH] Squashing changes to Documentation --- docs/source/usage.rst | 126 ++++++++----- src/tinypeel/Peeling/PeelingIO.py | 64 ++++--- src/tinypeel/Peeling/PeelingInfo.py | 27 ++- src/tinypeel/Peeling/PeelingUpdates.py | 12 +- src/tinypeel/tinypeel.py | 241 +++++++++++++++++++++---- 5 files changed, 347 insertions(+), 123 deletions(-) diff --git a/docs/source/usage.rst b/docs/source/usage.rst index dfbca27..7254fe5 100644 --- a/docs/source/usage.rst +++ b/docs/source/usage.rst @@ -14,24 +14,29 @@ Input Arguments :: Input Options: - -pedigree [PEDIGREE [PEDIGREE ...]] + -ped_file [PEDIGREE ...] Pedigree file(s) (see format below). - -genotypes [GENOTYPES [GENOTYPES ...]] - Genotype File(s) (see format below). - -seqfile [SEQFILE [SEQFILE ...]] + -geno_file [GENOTYPES ...] + Genotype file(s) (see format below). + -seq_file [SEQFILE ...] Sequence allele read count file(s) (see format below). - -bfile [BFILE [BFILE ...]] + -plink_file [BFILE ...] Plink (binary) file(s). - -startsnp STARTSNP The first marker to consider. The first marker is "1". - -stopsnp STOPSNP The last marker to consider. + -start_snp START_SNP + The first marker to consider. The first marker is "1". Default: 1. + -stop_snp STOP_SNP The last marker to consider. Default: all markers considered. + -alt_allele_prob_file [ALT_ALLELE_PROB_FILE ...] + The alternative allele probabilities per metafounder(s). Default: 0.5 per locus + -main_metafounder + The metafounder to use where parents are unknown with input "0". Default: MF_1. -|Software| requires a pedigree file (``-pedigree``) and one or more genomic data files to run the analysis. +|Software| requires a pedigree file (``-ped_file``) and one or more genomic data files to run the analysis. -|Software| supports the following genomic data files: genotype files in the AlphaGenes format (``-genotypes``), sequence allele read in the AlphaGenes format (``-seqfile``), and binary Plink files (``-bfile``). Use of binary Plink files requires the package ``alphaplinkpython``, which can be installed via ``pip``, but is only stable for Linux. There are known issues with this package, so we do not advocate its use at the moment. +|Software| supports the following genomic data files: genotype files in the AlphaGenes format (``-geno_file``), sequence allele read in the AlphaGenes format (``-seq_file``), and binary Plink files (``-plink_file``). Use of binary Plink files requires the package ``alphaplinkpython``, which can be installed via ``pip``, but is only stable for Linux. There are known issues with this package, so we do not advocate its use at the moment. -Use the ``-startsnp`` and ``-stopsnp`` to run the analysis only on a subset of markers. +Use the ``-start_snp`` and ``-stop_snp`` to run the analysis only on a subset of markers. -The input options in the form of ``[xxx [xxx ...]]`` can take in more than one input file seperated by space. +The input options in the form of ``[xxx ...]`` can take in more than one input file separated by space. Output Arguments ---------------- @@ -39,47 +44,49 @@ Output Arguments :: Output options: - -out PREFIX The output file prefix. All file outputs will be stored + -out_file PREFIX The output file prefix. All file outputs will be stored as "PREFIX.dosage.txt" and so on. - -writekey WRITEKEY Determines the order in which individuals are ordered + -out_id_order OUT_ID_ORDER + Determines the order in which individuals are ordered in the output file based on their order in the corresponding input file. Individuals not in the input file are placed at the end of the file and sorted in alphanumeric order. These individuals can be suppressed - with the "-onlykeyed" option. Options: id, pedigree, + with the "-out_id_only" option. Options: id, pedigree, genotypes, sequence, segregation. Default: id. - -onlykeyed Flag to suppress the individuals not present in - the file used with "-writekey". It also suppresses "dummy" + -out_id_only Flag to suppress the individuals not present in + the file used with "-out_id_order". It also suppresses "dummy" individuals. - -iothreads IOTHREADS Number of threads to use for input/output. Default: 1. + -n_io_thread N_IO_THREAD + Number of threads to use for input/output. Default: 1. Peeling output options: -no_dosage Flag to suppress the dosage files. - -no_params Flag to suppress writing the model parameter files. + -no_param Flag to suppress writing the model parameter files. -seg_prob Flag to enable writing out the segregation probabilities. -phased_geno_prob Flag to enable writing out the phased genotype probabilities. -geno_prob Flag to enable writing out the genotype probabilities. -hap Flag to call and write out the haplotypes. -geno Flag to call and write out the genotypes. - -geno_threshold [GENO_THRESHOLD [GENO_THRESHOLD ...]] + -geno_threshold [GENO_THRESHOLD ...] Genotype calling threshold(s). Multiple space separated values allowed. Value less than 1/3 will be replaced by 1/3. - -hap_threshold [HAPS_CALL_THRESHOLD [HAPS_CALL_THRESHOLD]...] + -hap_threshold [HAP_THRESHOLD ...] Haplotype calling threshold(s). Multiple space separated values allowed. Value less than 1/2 will be replaced by 1/2. - -binary_call_files Flag to write out the called genotype files as a + -binary_call_file Flag to write out the called genotype files as a binary plink output [Not yet implemented]. -By default |Software| produces a dosage file and two model parameter files (genotype error rate and recombination rate). Creation of these files can be suppressed with the ``-no_dosage``, and ``-no_params`` options. |Software| can also write out the phased genotype probability file (.phased_geno_prob.txt) with the `-phased_geno_prob` argument and the segregation probability file (.seg_prob.txt) with the `-seg_prob` argument. +By default |Software| produces a dosage file and two model parameter files (genotype error rate and recombination rate). Creation of these files can be suppressed with the ``-no_dosage``, and ``-no_param`` options. |Software| can also write out the phased genotype probability file (.phased_geno_prob.txt) with the `-phased_geno_prob` argument and the segregation probability file (.seg_prob.txt) with the `-seg_prob` argument. The ``-geno_threshold`` and ``-hap_threshold`` arguments respectively control control which genotypes and haplotypes are called. A threshold of 0.9 will give calls only if the probability mass for one genotype (or haplotype) is higher than 0.9. Using a higher-value will increase the accuracy of called genotypes (or haplotypes), but will result in fewer called genotypes (or haplotypes). Since there are three genotypes states and two haplotype states, "best-guess" genotypes and haplotypes are respectively called with a threshold less than ``1/3`` and ``1/2``. -``-binary_call_files`` option can be used to change the output to a plink binary format. +``-binary_call_file`` option can be used to change the output to a plink binary format. -The order in which individuals are output can be changed by using the ``writekey`` option. This option changes the order in which individuals are written out to the order in which they were observed in the corresponding file. The ```-onlykeyed`` option suppresses the output of dummy individuals (not recommended for hybrid peeling). +The order in which individuals are output can be changed by using the ``out_id_order`` option. This option changes the order in which individuals are written out to the order in which they were observed in the corresponding file. The ```-out_id_only`` option suppresses the output of dummy individuals (not recommended for hybrid peeling). -The argument ``-iothreads`` controls the number of threads/processes used by |Software|. |Software| uses additional threads to parse and format input and output files. Setting this option to a value greater than 1 is only recommended for very large files (i.e. >10,000 individuals). +The argument ``-n_io_thread`` controls the number of threads/processes used by |Software|. |Software| uses additional threads to parse and format input and output files. Setting this option to a value greater than 1 is only recommended for very large files (i.e. >10,000 individuals). Peeling arguments ------------------ @@ -87,37 +94,44 @@ Peeling arguments :: Mandatory peeling arguments: - -runtype RUNTYPE Program run type. Either "single" or "multi". + -method METHOD Program run type. Either "single" or "multi". Optional peeling arguments: - -ncycles NCYCLES Number of peeling cycles. Default: 5. - -maxthreads MAXTHREADS + -n_cycle N_CYCLE Number of peeling cycles. Default: 5. + -n_thread N_THREAD Number of threads to use. Default: 1. - -length LENGTH Estimated length of the chromosome in Morgans. + -rec_length REC_LENGTH + Estimated recombination length of the chromosome in Morgans. [Default 1.00] Peeling control arguments: - -esterrors Flag to re-estimate the genotyping error rates after + -est_geno_error_prob Flag to re-estimate the genotyping error rates after each peeling cycle. - -estmaf Flag to re-estimate the minor allele frequency after + -est_seq_error_prob Flag to re-estimate the sequencing error rates after each peeling cycle. - -nophasefounders A flag phase a heterozygous allele in one of the + -est_rec_prob Flag to re-estimate the recombination rates after + each peeling cycle. + -est_alt_allele_prob Flag to re-estimate the alternative allele probabilities after + each peeling cycle. + -no_phase_founder A flag phase a heterozygous allele in one of the founders (if such an allele can be found). - -sexchrom A flag to that this is a sex chromosome. Sex needs to + -sex_chrom A flag to indicate that input data is for a sex chromosome. Sex needs to be given in the pedigree file. This is currently an experimental option. Genotype probability arguments: - -error ERROR Genotyping error rate. [Default 0.01] - -seqerror SEQERROR Assumed sequencing error rate. [Default 0.001] + -geno_error_prob GENO_ERROR_PROB + Genotyping error rate. [Default 0.0001] + -seq_error_prob SEQ_ERROR_PROB + Sequencing error rate. [Default 0.001] -``-runtype`` controls whether the program is run in "single-locus" or "multi-locus" model. Single locus mode does not use linkage information to perform imputation. It is fast, but not very accurate. Multi-locus mode runs multi-locus iterative peeling which uses linkage information to increase accuracy and calculate segregation values. +``-method`` controls whether the program is run in "single-locus" or "multi-locus" model. Single locus mode does not use linkage information to perform imputation. It is fast, but not very accurate. Multi-locus mode runs multi-locus iterative peeling which uses linkage information to increase accuracy and calculate segregation values. For hybrid peeling, where a large amount (millions of segregating sites) of sequence allele read counts needs to be imputed, first run the program in multi-locus mode to generate a segregation file, and then run the program in single-locus mode with a known segregation file. -The ``-error``, ``-seqerror`` and ``-length`` arguments control some of the model parameters used in the model. ``-seqerror`` must not be zero. |Software| is robust to deviations in genotyping error rate and sequencing error rate so it is not recommended to use these options unless large deviations from the default are known. Changing the ``-length`` argument to match the genetic map length can increase accuracy in some situations. +The ``-geno_error_prob``, ``-seq_error_prob`` and ``-rec_length`` arguments control some of the model parameters used in the model. ``-seq_error_prob`` must not be zero. |Software| is robust to deviations in genotyping error rate and sequencing error rate so it is not recommended to use these options unless large deviations from the default are known. Changing the ``-length`` argument to match the genetic map length can increase accuracy in some situations. -The ``-esterrors`` option estimated the genotyping error rate based on observed information, this option is generally not necessary and can increase runtime. ``-estmaf`` estimates the minor allele frequency after each peeling cycle. This option can be useful if there are a large number of non-genotyped founders. +The ``-est_geno_error_prob`` and ``-est_seq_error_prob`` options estimate the genotyping error rate and the sequencing error rate based on miss-match between observed and inferred states. This option is generally not necessary and can increase runtime. ``-est_alt_allele_prob`` estimates the alternative allele probabilities after each peeling cycle. This option can be useful if there are a large number of non-genotyped founders. If both ``-alt_allele_prob_file`` and ``-est_alt_allele_prob`` are used, the inputted alternative allele probabilities are used as a starting point for alternative allele probabilities estimation. Hybrid peeling arguments ------------------------ @@ -125,12 +139,12 @@ Hybrid peeling arguments :: Single locus arguments: - -segfile SEGFILE A segregation probabilities file for hybrid peeling. - -segmapfile SEGMAPFILE + -seg_file SEG_FILE A segregation probabilities file for hybrid peeling. + -seg_map_file SEG_MAP_FILE A map file for loci in the segregation probabilities file. - -mapfile MAPFILE A map file for all loci in hybrid peeling. + -map_file MAP_FILE A map file for all loci in hybrid peeling. -In order to run hybrid peeling the user needs to supply a ``-mapfile`` which gives the genetic positions for the SNPs in the sequence allele read counts data supplied, a ``-segmapfile`` which gives the genetic position for the SNPs in the segregation file, and a ``-segfile`` which gives the segregation values generated via multi-locus iterative peeling. These arguments are not required for running in multi-locus mode. +In order to run hybrid peeling the user needs to supply a ``-map_file`` which gives the genetic positions for the SNPs in the sequence allele read counts data supplied, a ``-seg_map_file`` which gives the genetic position for the SNPs in the segregation file, and a ``-seg_file`` which gives the segregation values generated via multi-locus iterative peeling. These arguments are not required for running in multi-locus mode. ============ File formats @@ -142,7 +156,7 @@ Input file formats Pedigree file ============= -Each line of a pedigree file has three values, the individual's id, their father's id, and their mother's id. "0" represents an unknown id. +Each line of a pedigree file has three values, the individual's id, their father's id, and their mother's id. "0" represents an unknown id. Individuals with one unknown parent get internally assigned a dummy/unknown parent. Hence all individuals have both or none parents known. Individuals with two unknown parents are considered as founders and are internally allocated to a metafounder (unknown parent group) "MF_1" (or defined by the user through ``-main_metafounder``). Users can provide additional metafounders as shown below - these must start with "MF_". Example: @@ -153,6 +167,15 @@ Example: id3 id1 id2 id4 id1 id2 +or + +:: + + id1 MF_1 MF_1 + id2 MF_2 MF_2 + id3 id1 id2 + id4 id1 id2 + Genotype file ============= @@ -204,6 +227,19 @@ Example: 1 snp_c 65429279 1 snp_d 107421759 +Alternative Allele Probability File +=================================== + +The alternative allele probability file allows for user-defined population alternative allele probabilities. This file contains the metafounder group denoted MF_x, where x is by default "1" but see ``-main_metafounder``, followed by alternative allele probabilities for all the markers. In case of multiple metafounders, provide multiple rows in the file. The default starting alternative allele probabilities are 0.5 for each marker. If you don't have information for some markers, provide 0.5 for these in the file. + +Example: +:: + MF_1 0.30 0.21 0.44 0.24 + +Or +:: + MF_1 0.30 0.21 0.44 0.24 + MF_2 0.40 0.34 0.25 0.40 Output file formats ------------------- @@ -300,9 +336,9 @@ Example: Model parameter files ===================== -|Software| outputs three model parameter files, ``.maf``, ``.seqError``, ``.genoError``. These give the minor allele frequency, sequencing error rates, and genotyping error rates used. All three files contain a single column with an entry for each marker. +|Software| outputs three model parameter files, ``.alt_allele_prob.txt``, ``.seq_error_prob.txt``, ``.geno_error_prob.txt``, ``.rec_prob.txt``. These give the minor allele frequency, sequencing error rates, genotyping error rates and recombination rates used. All three files contain a single column with an entry for each marker. -Example ``.maf`` file for four loci: +Example ``.alt_allele_prob.txt`` file for four loci: :: diff --git a/src/tinypeel/Peeling/PeelingIO.py b/src/tinypeel/Peeling/PeelingIO.py index 93ac799..9133165 100644 --- a/src/tinypeel/Peeling/PeelingIO.py +++ b/src/tinypeel/Peeling/PeelingIO.py @@ -66,33 +66,43 @@ def readInSeg(pedigree, fileName, start=None, stop=None): def writeOutParamaters(peelingInfo): args = InputOutput.args - np.savetxt(args.out + ".genoError", peelingInfo.genoError, fmt="%f") - np.savetxt(args.out + ".seqError", peelingInfo.seqError, fmt="%f") - # np.savetxt(args.out + ".trans", peelingInfo.transmissionRate, fmt = "%f") - np.savetxt(args.out + ".maf", peelingInfo.maf, fmt="%f") + np.savetxt(args.out_file + ".geno_error_prob.txt", peelingInfo.genoError, fmt="%f") + np.savetxt(args.out_file + ".seq_error_prob.txt", peelingInfo.seqError, fmt="%f") + np.savetxt( + args.out_file + ".rec_prob.txt", np.empty((1, 1)), fmt="%f" + ) # not be realized, just as a placeholder + # np.savetxt(args.out_file + ".trans", peelingInfo.transmissionRate, fmt = "%f") + np.savetxt(args.out_file + ".alt_allele_prob.txt", peelingInfo.maf, fmt="%f") -def writeGenotypes(pedigree, genoProbFunc): +def writeGenotypes(pedigree, genoProbFunc, isSexChrom): args = InputOutput.args if not args.no_dosage: - writeDosages(pedigree, genoProbFunc, args.out + ".dosage.txt") + writeDosages(pedigree, genoProbFunc, isSexChrom, args.out_file + ".dosage.txt") if args.phased_geno_prob: - writePhasedGenoProbs(pedigree, genoProbFunc, args.out + ".phased_geno_prob.txt") + writePhasedGenoProbs( + pedigree, genoProbFunc, args.out_file + ".phased_geno_prob.txt" + ) if args.geno_prob: - writeGenoProbs(pedigree, genoProbFunc, args.out + ".geno_prob.txt") + writeGenoProbs(pedigree, genoProbFunc, args.out_file + ".geno_prob.txt") if args.geno_threshold and args.geno: for thresh in args.geno_threshold: if thresh < 1 / 3: thresh = 1 / 3 - if args.binary_call_files: + if args.binary_call_file: writeBinaryCalledGenotypes( - pedigree, genoProbFunc, args.out + ".called." + str(thresh), thresh + pedigree, + genoProbFunc, + isSexChrom, + args.out_file + ".called." + str(thresh), + thresh, ) else: writeCalledGenotypes( pedigree, genoProbFunc, - args.out + ".geno_" + str(thresh) + ".txt", + isSexChrom, + args.out_file + ".geno_" + str(thresh) + ".txt", thresh, ) @@ -100,13 +110,13 @@ def writeGenotypes(pedigree, genoProbFunc): for thresh in args.hap_threshold: if thresh < 1 / 2: thresh = 1 / 2 - if args.binary_call_files: + if args.binary_call_file: pass # this function is not applied else: writeCalledPhase( pedigree, genoProbFunc, - args.out + ".hap_" + str(thresh) + ".txt", + args.out_file + ".hap_" + str(thresh) + ".txt", thresh, ) @@ -146,18 +156,18 @@ def writeGenoProbs(pedigree, genoProbFunc, outputFile): ) -def writeDosages(pedigree, genoProbFunc, outputFile): +def writeDosages(pedigree, genoProbFunc, isSexChrom, outputFile): with open(outputFile, "w+") as f: for idx, ind in pedigree.writeOrder(): matrix = np.dot(np.array([0, 1, 1, 2]), genoProbFunc(ind.idn)) - if InputOutput.args.sexchrom and ind.sex == 0: + if isSexChrom and ind.sex == 0: matrix *= 2 f.write(ind.idx + " " + " ".join(map("{:.4f}".format, matrix)) + "\n") -def writeCalledGenotypes(pedigree, genoProbFunc, outputFile, thresh): +def writeCalledGenotypes(pedigree, genoProbFunc, isSexChrom, outputFile, thresh): with open(outputFile, "w+") as f: for idx, ind in pedigree.writeOrder(): matrix = genoProbFunc(ind.idn) @@ -168,7 +178,7 @@ def writeCalledGenotypes(pedigree, genoProbFunc, outputFile, thresh): ) calledGenotypes = np.argmax(matrixCollapsedHets, axis=0) setMissing(calledGenotypes, matrixCollapsedHets, thresh) - if InputOutput.args.sexchrom and ind.sex == 0: + if isSexChrom and ind.sex == 0: doubleIfNotMissing(calledGenotypes) f.write(ind.idx + " " + " ".join(map(str, calledGenotypes)) + "\n") @@ -198,7 +208,7 @@ def writeCalledPhase(pedigree, genoProbFunc, outputFile, thresh): f.write(ind.idx + " " + " ".join(map(str, maternal_haplotype)) + "\n") -def writeBinaryCalledGenotypes(pedigree, genoProbFunc, outputFile, thresh): +def writeBinaryCalledGenotypes(pedigree, genoProbFunc, isSexChrom, outputFile, thresh): for idx, ind in pedigree.writeOrder(): matrix = genoProbFunc(ind.idn) matrixCollapsedHets = np.array( @@ -206,7 +216,7 @@ def writeBinaryCalledGenotypes(pedigree, genoProbFunc, outputFile, thresh): ) calledGenotypes = np.argmax(matrixCollapsedHets, axis=0) setMissing(calledGenotypes, matrixCollapsedHets, thresh) - if InputOutput.args.sexchrom and ind.sex == 0: + if isSexChrom and ind.sex == 0: doubleIfNotMissing(calledGenotypes) ind.genotypes = calledGenotypes.astype(np.int8) @@ -231,27 +241,29 @@ def setMissing(calledGenotypes, matrix, thresh): def fullOutput(pedigree, peelingInfo, args): InputOutput.writeIdnIndexedMatrix( - pedigree, peelingInfo.penetrance, args.out + ".penetrance" + pedigree, peelingInfo.penetrance, args.out_file + ".penetrance" ) InputOutput.writeIdnIndexedMatrix( - pedigree, peelingInfo.anterior, args.out + ".anterior" + pedigree, peelingInfo.anterior, args.out_file + ".anterior" ) InputOutput.writeIdnIndexedMatrix( - pedigree, peelingInfo.posterior, args.out + ".posterior" + pedigree, peelingInfo.posterior, args.out_file + ".posterior" ) InputOutput.writeFamIndexedMatrix( pedigree, peelingInfo.posteriorSire_minusFam, - args.out + ".posteriorSire_minusFam", + args.out_file + ".posteriorSire_minusFam", ) InputOutput.writeFamIndexedMatrix( - pedigree, peelingInfo.posteriorDam_minusFam, args.out + ".posteriorDam_minusFam" + pedigree, + peelingInfo.posteriorDam_minusFam, + args.out_file + ".posteriorDam_minusFam", ) InputOutput.writeFamIndexedMatrix( - pedigree, peelingInfo.posteriorSire_new, args.out + ".posteriorSire_new" + pedigree, peelingInfo.posteriorSire_new, args.out_file + ".posteriorSire_new" ) InputOutput.writeFamIndexedMatrix( - pedigree, peelingInfo.posteriorDam_new, args.out + ".posteriorDam_new" + pedigree, peelingInfo.posteriorDam_new, args.out_file + ".posteriorDam_new" ) diff --git a/src/tinypeel/Peeling/PeelingInfo.py b/src/tinypeel/Peeling/PeelingInfo.py index 653b2df..7aee8ae 100644 --- a/src/tinypeel/Peeling/PeelingInfo.py +++ b/src/tinypeel/Peeling/PeelingInfo.py @@ -21,7 +21,7 @@ def createPeelingInfo(pedigree, args, createSeg=True, phaseFounder=False): peelingInfo = jit_peelingInformation( nInd=pedigree.maxIdn, nFam=pedigree.maxFam, nLoci=nLoci, createSeg=createSeg ) - peelingInfo.isSexChrom = args.sexchrom + peelingInfo.isSexChrom = args.sex_chrom # Information about the peeling positions are handled elsewhere. peelingInfo.positions = None @@ -34,10 +34,10 @@ def createPeelingInfo(pedigree, args, createSeg=True, phaseFounder=False): peelingInfo.segregationTensorXY = ProbMath.generateSegregationXYChrom(e=1e-06) peelingInfo.segregationTensorXX = ProbMath.generateSegregationXXChrom(e=1e-06) - peelingInfo.genoError[:] = args.error - peelingInfo.seqError[:] = args.seqerror + peelingInfo.genoError[:] = args.geno_error_prob + peelingInfo.seqError[:] = args.seq_error_prob setupTransmission( - args.length, peelingInfo + args.rec_length, peelingInfo ) # Sets up the transmission rates using a custom position list and a total chromosome length. for ind in pedigree: @@ -69,22 +69,29 @@ def createPeelingInfo(pedigree, args, createSeg=True, phaseFounder=False): if ind.isGenotypedFounder() and phaseFounder and ind.genotypes is not None: loci = getHetMidpoint(ind.genotypes) if loci is not None: - e = args.error + e = args.geno_error_prob peelingInfo.penetrance[ind.idn, :, loci] = np.array( [e / 3, e / 3, 1 - e, e / 3], dtype=np.float32 ) if args.penetrance is not None: - if args.sexchrom: + if args.sex_chrom: print( - "Using an external penetrance file and the sexchrom option is highly discouraged. Please do not use." + "Using an external penetrance file and the sex_chrom option is highly discouraged. Please do not use." ) - if args.esterrors: + if args.est_geno_error_prob: print( - "External penetrance file included, but esterrors flag used. The two options are incompatible. esterrors set to false." + "External penetrance file included, but est_geno_error_prob flag used. The two options are incompatible. est_geno_error_prob set to false." ) - args.esterrors = False + args.est_geno_error_prob = False + + if args.est_seq_error_prob: + print( + "External penetrance file included, but est_seq_error_prob flag used. The two options are incompatible. est_seq_error_prob set to false." + ) + args.est_seq_error_prob = False + for pen in args.penetrance: addPenetranceFromExternalFile(pedigree, peelingInfo, pen, args) # updateMaf(pedigree, peelingInfo) diff --git a/src/tinypeel/Peeling/PeelingUpdates.py b/src/tinypeel/Peeling/PeelingUpdates.py index 1d87c10..3f5fee6 100644 --- a/src/tinypeel/Peeling/PeelingUpdates.py +++ b/src/tinypeel/Peeling/PeelingUpdates.py @@ -129,9 +129,11 @@ def addIndividualToUpdate(d, p, LLp, LLpp): # -def updatePenetrance(pedigree, peelingInfo): - peelingInfo.genoError = updateGenoError(pedigree, peelingInfo) - peelingInfo.seqError = updateSeqError(pedigree, peelingInfo) +def updatePenetrance(pedigree, peelingInfo, args): + if args.est_geno_error_prob: + peelingInfo.genoError = updateGenoError(pedigree, peelingInfo) + if args.est_seq_error_prob: + peelingInfo.seqError = updateSeqError(pedigree, peelingInfo) if peelingInfo.isSexChrom: print( @@ -152,7 +154,7 @@ def updatePenetrance(pedigree, peelingInfo): if ( ind.isGenotypedFounder() - and (not InputOutput.args.nophasefounders) + and (not InputOutput.args.no_phase_founder) and ind.genotypes is not None ): loci = PeelingInfo.getHetMidpoint(ind.genotypes) @@ -169,7 +171,7 @@ def updateGenoError(pedigree, peelingInfo): # We use a max value of 5% and a min value of .0001 percent to make sure the values are reasonable counts = np.full(pedigree.nLoci, 1, dtype=np.float32) - errors = np.full(pedigree.nLoci, 0.01, dtype=np.float32) + errors = np.full(pedigree.nLoci, 0.0001, dtype=np.float32) for ind in pedigree: updateGenoError_ind( diff --git a/src/tinypeel/tinypeel.py b/src/tinypeel/tinypeel.py index 209144d..c09db16 100644 --- a/src/tinypeel/tinypeel.py +++ b/src/tinypeel/tinypeel.py @@ -15,10 +15,10 @@ def runPeelingCycles(pedigree, peelingInfo, args, singleLocusMode=False): # Right now maf _only_ uses the penetrance so can be estimated once. - if args.estmaf: + if args.est_alt_allele_prob: PeelingUpdates.updateMaf(pedigree, peelingInfo) - for i in range(args.ncycles): + for i in range(args.n_cycle): print("Cycle ", i) peelingCycle(pedigree, peelingInfo, args=args, singleLocusMode=singleLocusMode) peelingInfo.iteration += 1 @@ -27,9 +27,8 @@ def runPeelingCycles(pedigree, peelingInfo, args, singleLocusMode=False): # if args.esttransitions: # print("Estimating the transmission rate is currently a disabled option") # PeelingUpdates.updateSeg(peelingInfo) #Option currently disabled. - - if args.esterrors: - PeelingUpdates.updatePenetrance(pedigree, peelingInfo) + if args.est_geno_error_prob or args.est_seq_error_prob: + PeelingUpdates.updatePenetrance(pedigree, peelingInfo, args) def peelingCycle(pedigree, peelingInfo, args, singleLocusMode=False): @@ -191,15 +190,15 @@ def generateSingleLocusSegregation(peelingInfo, pedigree, args): if args.segfile is not None: # This just gets the locations in the map files. snpMap = np.array( - InputOutput.readMapFile(args.mapfile, args.startsnp, args.stopsnp)[2] + InputOutput.readMapFile(args.map_file, args.startsnp, args.stopsnp)[2] ) - segMap = np.array(InputOutput.readMapFile(args.segmapfile)[2]) + segMap = np.array(InputOutput.readMapFile(args.seg_map_file)[2]) loci, distance = getLociAndDistance(snpMap, segMap) start = np.min(loci) stop = np.max(loci) - seg = InputOutput.readInSeg(pedigree, args.segfile, start=start, stop=stop) + seg = InputOutput.readInSeg(pedigree, args.seg_file, start=start, stop=stop) loci -= start # Re-align to seg file. for i in range(len(distance)): segLoc0 = loci[i, 0] @@ -213,6 +212,147 @@ def generateSingleLocusSegregation(peelingInfo, pedigree, args): peelingInfo.segregation[:, :, :] = 0.25 +def get_probability_options(): + parse_dictionary = dict() + parse_dictionary["geno_error_prob"] = lambda parser: parser.add_argument( + "-geno_error_prob", + default=0.0001, + required=False, + type=float, + help="Genotyping error rate. Default: 0.0001.", + ) + parse_dictionary["seq_error_prob"] = lambda parser: parser.add_argument( + "-seq_error_prob", + default=0.001, + required=False, + type=float, + help="Sequencing error rate. Default: 0.001.", + ) + + return parse_dictionary + + +def get_input_options(): + parse_dictionary = dict() + parse_dictionary["bfile"] = lambda parser: parser.add_argument( + "-plink_file", + default=None, + required=False, + type=str, + nargs="*", + help="plink (binary) file(s).", + ) + parse_dictionary["genotypes"] = lambda parser: parser.add_argument( + "-geno_file", + default=None, + required=False, + type=str, + nargs="*", + help="Genotype file(s) in AlphaGenes format.", + ) + parse_dictionary["reference"] = lambda parser: parser.add_argument( + "-reference", + default=None, + required=False, + type=str, + nargs="*", + help="A haplotype reference panel in AlphaGenes format.", + ) + parse_dictionary["seqfile"] = lambda parser: parser.add_argument( + "-seq_file", + default=None, + required=False, + type=str, + nargs="*", + help="Sequence allele read count file(s).", + ) + parse_dictionary["pedigree"] = lambda parser: parser.add_argument( + "-ped_file", + default=None, + required=False, + type=str, + nargs="*", + help="Pedigree file(s) in AlphaGenes format.", + ) + parse_dictionary["phasefile"] = lambda parser: parser.add_argument( + "-hap_file", + default=None, + required=False, + type=str, + nargs="*", + help="A haplotype file in AlphaGenes format.", + ) + parse_dictionary["startsnp"] = lambda parser: parser.add_argument( + "-start_snp", + default=None, + required=False, + type=int, + help="The first marker to consider. The first marker in the file is marker '1'. Default: 1.", + ) + parse_dictionary["stopsnp"] = lambda parser: parser.add_argument( + "-stop_snp", + default=None, + required=False, + type=int, + help="The last marker to consider. Default: all markers considered.", + ) + parse_dictionary["seed"] = lambda parser: parser.add_argument( + "-seed", + default=None, + required=False, + type=int, + help="A random seed to use for debugging.", + ) + + return parse_dictionary + + +def get_output_options(): + parse_dictionary = dict() + + parse_dictionary["writekey"] = lambda parser: parser.add_argument( + "-out_id_order", + default="id", + required=False, + type=str, + help='Determines the order in which individuals are ordered in the output file based on their order in the corresponding input file. Individuals not in the input file are placed at the end of the file and sorted in alphanumeric order. These inividuals can be surpressed with the "-out_id_only" option. Options: id, pedigree, genotypes, sequence, segregation. Defualt: id.', + ) + parse_dictionary["onlykeyed"] = lambda parser: parser.add_argument( + "-out_id_only", + action="store_true", + required=False, + help='Flag to surpress the individuals who are not present in the file used with -out_id_order. Also surpresses "dummy" individuals.', + ) + parse_dictionary["iothreads"] = lambda parser: parser.add_argument( + "-n_io_thread", + default=1, + required=False, + type=int, + help="Number of threads to use for io. Default: 1.", + ) + + return parse_dictionary + + +def get_multithread_options(): + parse_dictionary = dict() + parse_dictionary["iothreads"] = lambda parser: parser.add_argument( + "-n_io_thread", + default=1, + required=False, + type=int, + help="Number of threads to use for input and output. Default: 1.", + ) + parse_dictionary["maxthreads"] = lambda parser: parser.add_argument( + "-n_thread", + default=1, + required=False, + type=int, + help="Maximum number of threads to use for analysis. Default: 1.", + ) + return parse_dictionary + + # ACTUAL PROGRAM BELOW @@ -220,12 +360,12 @@ def getArgs(): parser = argparse.ArgumentParser(description="") core_parser = parser.add_argument_group("Core arguments") core_parser.add_argument( - "-out", required=True, type=str, help="The output file prefix." + "-out_file", required=True, type=str, help="The output file prefix." ) core_peeling_parser = parser.add_argument_group("Mandatory peeling arguments") core_peeling_parser.add_argument( - "-runtype", + "-method", default=None, required=False, type=str, @@ -236,7 +376,7 @@ def getArgs(): input_parser = parser.add_argument_group("Input Options") InputOutput.add_arguments_from_dictionary( input_parser, - InputOutput.get_input_options(), + get_input_options(), options=[ "bfile", "genotypes", @@ -264,7 +404,7 @@ def getArgs(): help="Flag to enable writing out the segregation probabilities.", ) output_parser.add_argument( - "-no_params", + "-no_param", action="store_true", required=False, help="Flag to suppress writing the model parameter files.", @@ -306,7 +446,7 @@ def getArgs(): help="Flag to call and write out the genotypes.", ) output_parser.add_argument( - "-binary_call_files", + "-binary_call_file", action="store_true", required=False, help="Flag to write out the called genotype files as a binary plink output [Not yet implemented].", @@ -320,7 +460,7 @@ def getArgs(): InputOutput.add_arguments_from_dictionary( output_parser, - InputOutput.get_output_options(), + get_output_options(), options=["writekey", "onlykeyed"], ) @@ -328,24 +468,24 @@ def getArgs(): multithread_parser = parser.add_argument_group("Multithreading Options") InputOutput.add_arguments_from_dictionary( multithread_parser, - InputOutput.get_multithread_options(), + get_multithread_options(), options=["iothreads", "maxthreads"], ) peeling_parser = parser.add_argument_group("Optional peeling arguments") peeling_parser.add_argument( - "-ncycles", + "-n_cycle", default=5, required=False, type=int, help="Number of peeling cycles. Default: 5.", ) peeling_parser.add_argument( - "-length", + "-rec_length", default=1.0, required=False, type=float, - help="Estimated length of the chromosome in Morgans. [Default 1.00]", + help="Estimated recombination length of the chromosome in Morgans. [Default 1.00]", ) peeling_parser.add_argument( "-penetrance", @@ -357,53 +497,59 @@ def getArgs(): ) # help='An optional external penetrance file. This will overwrite the default penetrance values.') InputOutput.add_arguments_from_dictionary( peeling_parser, - InputOutput.get_probability_options(), - options=["error", "seqerror"], + get_probability_options(), + options=["geno_error_prob", "seq_error_prob"], ) peeling_control_parser = parser.add_argument_group("Peeling control arguments") peeling_control_parser.add_argument( - "-esterrors", + "-est_geno_error_prob", action="store_true", required=False, help="Flag to re-estimate the genotyping error rates after each peeling cycle.", ) peeling_control_parser.add_argument( - "-estmaf", + "-est_seq_error_prob", + action="store_true", + required=False, + help="Flag to re-estimate the sequencing error rates after each peeling cycle.", + ) + peeling_control_parser.add_argument( + "-est_alt_allele_prob", action="store_true", required=False, - help="Flag to re-estimate the minor allele frequency after each peeling cycle.", + help="Flag to re-estimate the alternative allele probabilities after each peeling cycle.", ) peeling_control_parser.add_argument( - "-nophasefounders", + "-no_phase_founder", action="store_true", required=False, help="A flag phase a heterozygous allele in one of the founders (if such an allele can be found).", ) peeling_control_parser.add_argument( - "-sexchrom", + "-sex_chrom", action="store_true", required=False, - help="A flag to that this is a sex chromosome. Sex needs to be given in the pedigree file. This is currently an experimental option.", + help="A flag to indicate that input data is for a sex chromosome. Sex needs to be given in the pedigree file. This is currently an experimental option.", ) singleLocus_parser = parser.add_argument_group("Hybrid peeling arguments") singleLocus_parser.add_argument( - "-mapfile", + "-map_file", default=None, required=False, type=str, - help="a map file for genotype data.", + help="a map file for all loci in hybrid peeling.", ) singleLocus_parser.add_argument( - "-segmapfile", + "-seg_map_file", default=None, required=False, type=str, - help="a map file for the segregation estimates for hybrid peeling.", + help="a map file for loci in the segregation probabilities file.", ) singleLocus_parser.add_argument( - "-segfile", + "-seg_file", default=None, required=False, type=str, @@ -416,15 +562,32 @@ def getArgs(): def main(): args = getArgs() + if args.start_snp: + args.start_snp -= 1 + if args.stop_snp: + args.stop_snp -= 1 + args.bfile = args.plink_file + args.genotypes = args.geno_file + args.phasefile = args.hap_file + args.seqfile = args.seq_file + args.pedigree = args.ped_file + args.startsnp = args.start_snp + args.stopsnp = args.stop_snp + args.writekey = args.out_id_order + args.onlykeyed = args.out_id_only + args.iothreads = args.n_io_thread + args.maxthreads = args.n_thread + args.segfile = args.seg_file + pedigree = Pedigree.Pedigree() InputOutput.readInPedigreeFromInputs(pedigree, args) - singleLocusMode = args.runtype == "single" - if args.runtype == "multi" and args.segfile: + singleLocusMode = args.method == "single" + if args.method == "multi" and args.segfile: print("Running in multi-locus mode, external segfile ignored") peelingInfo = PeelingInfo.createPeelingInfo( - pedigree, args, phaseFounder=(not args.nophasefounders) + pedigree, args, phaseFounder=(not args.no_phase_founder) ) if singleLocusMode: @@ -432,12 +595,16 @@ def main(): generateSingleLocusSegregation(peelingInfo, pedigree, args) runPeelingCycles(pedigree, peelingInfo, args, singleLocusMode=singleLocusMode) - PeelingIO.writeGenotypes(pedigree, genoProbFunc=peelingInfo.getGenoProbs) - if not args.no_params: + PeelingIO.writeGenotypes( + pedigree, + genoProbFunc=peelingInfo.getGenoProbs, + isSexChrom=peelingInfo.isSexChrom, + ) + if not args.no_param: PeelingIO.writeOutParamaters(peelingInfo) if not singleLocusMode and args.seg_prob: InputOutput.writeIdnIndexedMatrix( - pedigree, peelingInfo.segregation, args.out + ".seg_prob.txt" + pedigree, peelingInfo.segregation, args.out_file + ".seg_prob.txt" )