ENH Add citation subcommand

Make it as easy as possible for people to cite us
BigDataBiology · Oct 22, 2023 · 5ea5d59 · 5ea5d59
1 parent 13096c1
commit 5ea5d59
Show file tree

Hide file tree

Showing 6 changed files with 179 additions and 2 deletions.
diff --git a/CITATION.md b/CITATION.md
@@ -4,3 +4,10 @@ If you use this software in a publication please cite:
 >  improves metagenome-assembled genomes in microbiome datasets across
 >  different environments. *Nat Commun* **13,** 2326 (2022).
 >  https://doi.org/10.1038/s41467-022-29843-y
+
+And
+
+> Pan, S., Zhao, XM., Coelho, LP. SemiBin2: Self-Supervised Contrastive Learning
+> Leads to Better MAGs for Short- and Long-Read Sequencing. *Bioinformatics* **39**
+> (39 Suppl 1): i21–i29 (2023). https://doi.org/10.1093/bioinformatics/btad209
+
diff --git a/ChangeLog b/ChangeLog
@@ -1,4 +1,5 @@
 Unreleased
+	* citation: Add citation subcommand
 	* SemiBin1: Introduce separate SemiBin1 command
 
 Version 2.0.1 Oct 21 2023 by BigDataBiology

diff --git a/SemiBin/citation.py b/SemiBin/citation.py
@@ -0,0 +1,121 @@
+BIBTEX = '''\
+@ARTICLE{Pan2022semibin,
+  title     = "A deep siamese neural network improves metagenome-assembled
+               genomes in microbiome datasets across different environments",
+  author    = "Pan, Shaojun and Zhu, Chengkai and Zhao, Xing-Ming and Coelho,
+               Luis Pedro",
+  abstract  = "Metagenomic binning is the step in building metagenome-assembled
+               genomes (MAGs) when sequences predicted to originate from the
+               same genome are automatically grouped together. The most
+               widely-used methods for binning are reference-independent,
+               operating de novo and enable the recovery of genomes from
+               previously unsampled clades. However, they do not leverage the
+               knowledge in existing databases. Here, we introduce SemiBin, an
+               open source tool that uses deep siamese neural networks to
+               implement a semi-supervised approach, i.e. SemiBin exploits the
+               information in reference genomes, while retaining the capability
+               of reconstructing high-quality bins that are outside the
+               reference dataset. Using simulated and real microbiome datasets
+               from several different habitats from GMGCv1 (Global Microbial
+               Gene Catalog), including the human gut, non-human guts, and
+               environmental habitats (ocean and soil), we show that SemiBin
+               outperforms existing state-of-the-art binning methods. In
+               particular, compared to other methods, SemiBin returns more
+               high-quality bins with larger taxonomic diversity, including
+               more distinct genera and species.",
+  journal   = "Nat. Commun.",
+  publisher = "Nature Publishing Group",
+  volume    =  13,
+  number    =  1,
+  pages     = "2326",
+  month     =  apr,
+  year      =  2022,
+  language  = "en",
+  doi       = "10.1038/s41467-022-29843-y"
+}
+
+@ARTICLE{Pan2023semibin2,
+  title    = "{SemiBin2}: self-supervised contrastive learning leads to better
+              {MAGs} for short- and long-read sequencing",
+  author   = "Pan, Shaojun and Zhao, Xing-Ming and Coelho, Luis Pedro",
+  abstract = "MOTIVATION: Metagenomic binning methods to reconstruct
+              metagenome-assembled genomes (MAGs) from environmental samples
+              have been widely used in large-scale metagenomic studies. The
+              recently proposed semi-supervised binning method, SemiBin,
+              achieved state-of-the-art binning results in several
+              environments. However, this required annotating contigs, a
+              computationally costly and potentially biased process. RESULTS:
+              We propose SemiBin2, which uses self-supervised learning to learn
+              feature embeddings from the contigs. In simulated and real
+              datasets, we show that self-supervised learning achieves better
+              results than the semi-supervised learning used in SemiBin1 and
+              that SemiBin2 outperforms other state-of-the-art binners.
+              Compared to SemiBin1, SemiBin2 can reconstruct 8.3-21.5\% more
+              high-quality bins and requires only 25\% of the running time and
+              11\% of peak memory usage in real short-read sequencing samples.
+              To extend SemiBin2 to long-read data, we also propose
+              ensemble-based DBSCAN clustering algorithm, resulting in
+              13.1-26.3\% more high-quality genomes than the second best binner
+              for long-read data. AVAILABILITY AND IMPLEMENTATION: SemiBin2 is
+              available as open source software at
+              https://github.com/BigDataBiology/SemiBin/ and the analysis
+              scripts used in the study can be found at
+              https://github.com/BigDataBiology/SemiBin2\_benchmark.",
+  journal  = "Bioinformatics",
+  volume   =  39,
+  number   = "39 Suppl 1",
+  pages    = "i21--i29",
+  month    =  jun,
+  year     =  2023,
+  language = "en",
+  doi      = "10.1093/bioinformatics/btad209"
+}
+'''
+
+RIS = '''\
+TY  - JOUR
+AU  - Pan, Shaojun
+AU  - Zhu, Chengkai
+AU  - Zhao, Xing-Ming
+AU  - Coelho, Luis Pedro
+PY  - 2022
+DA  - 2022/04/28
+TI  - A deep siamese neural network improves metagenome-assembled genomes in microbiome datasets across different environments
+JO  - Nature Communications
+SP  - 2326
+VL  - 13
+IS  - 1
+AB  - Metagenomic binning is the step in building metagenome-assembled genomes (MAGs) when sequences predicted to originate from the same genome are automatically grouped together. The most widely-used methods for binning are reference-independent, operating de novo and enable the recovery of genomes from previously unsampled clades. However, they do not leverage the knowledge in existing databases. Here, we introduce SemiBin, an open source tool that uses deep siamese neural networks to implement a semi-supervised approach, i.e. SemiBin exploits the information in reference genomes, while retaining the capability of reconstructing high-quality bins that are outside the reference dataset. Using simulated and real microbiome datasets from several different habitats from GMGCv1 (Global Microbial Gene Catalog), including the human gut, non-human guts, and environmental habitats (ocean and soil), we show that SemiBin outperforms existing state-of-the-art binning methods. In particular, compared to other methods, SemiBin returns more high-quality bins with larger taxonomic diversity, including more distinct genera and species.
+SN  - 2041-1723
+UR  - https://doi.org/10.1038/s41467-022-29843-y
+DO  - 10.1038/s41467-022-29843-y
+ID  - Pan2022
+ER  - 
+
+TY  - JOUR
+AU  - Pan, Shaojun
+AU  - Zhao, Xing-Ming
+AU  - Coelho, Luis Pedro
+T1  - SemiBin2: self-supervised contrastive learning leads to better MAGs for short- and long-read sequencing
+PY  - 2023
+Y1  - 2023/06/01
+DO  - 10.1093/bioinformatics/btad209
+JO  - Bioinformatics
+JA  - Bioinformatics
+VL  - 39
+IS  - Supplement_1
+SP  - i21
+EP  - i29
+SN  - 1367-4811
+AB  - Metagenomic binning methods to reconstruct metagenome-assembled genomes (MAGs) from environmental samples have been widely used in large-scale metagenomic studies. The recently proposed semi-supervised binning method, SemiBin, achieved state-of-the-art binning results in several environments. However, this required annotating contigs, a computationally costly and potentially biased process. We propose SemiBin2, which uses self-supervised learning to learn feature embeddings from the contigs. In simulated and real datasets, we show that self-supervised learning achieves better results than the semi-supervised learning used in SemiBin1 and that SemiBin2 outperforms other state-of-the-art binners. Compared to SemiBin1, SemiBin2 can reconstruct 8.3–21.5% more high-quality bins and requires only 25% of the running time and 11% of peak memory usage in real short-read sequencing samples. To extend SemiBin2 to long-read data, we also propose ensemble-based DBSCAN clustering algorithm, resulting in 13.1–26.3% more high-quality genomes than the second best binner for long-read data. SemiBin2 is available as open source software at https://github.com/BigDataBiology/SemiBin/ and the analysis scripts used in the study can be found at https://github.com/BigDataBiology/SemiBin2_benchmark.
+Y2  - 2023/10/22
+UR  - https://doi.org/10.1093/bioinformatics/btad209
+ER  - 
+'''
+
+
+CHICAGO = '''\
+Pan, Shaojun, Chengkai Zhu, Xing-Ming Zhao, and Luis Pedro Coelho. 2022. "A Deep Siamese Neural Network Improves Metagenome-Assembled Genomes in Microbiome Datasets across Different Environments." Nature Communications 13 (1): 2326. https://doi.org/10.1038/s41467-022-29843-y
+
+Pan, Shaojun, Xing-Ming Zhao, and Luis Pedro Coelho. 2023. "SemiBin2: Self-Supervised Contrastive Learning Leads to Better MAGs for Short- and Long-Read Sequencing." Bioinformatics  39 (39 Suppl 1): i21–29. https://doi.org/10.1093/bioinformatics/btad209
+'''
diff --git a/SemiBin/main.py b/SemiBin/main.py
@@ -136,6 +136,25 @@ def parse_args(args, is_semibin2):
                             action='store_true',
                             default=None)
 
+    citation = subparsers.add_parser('citation',
+            help='Print citation information')
+    cite_format = citation.add_mutually_exclusive_group()
+    cite_format.add_argument('--bibtex',
+                        action='store_const',
+                        dest='cite_format',
+                        const='bibtex',
+                        help='Print bibTeX formatted citations')
+    cite_format.add_argument('--ris',
+                        action='store_const',
+                        dest='cite_format',
+                        const='ris',
+                        help='Print RIS formatted citations (can be used with Endnote)')
+    cite_format.add_argument('--chicago',
+                        action='store_const',
+                        dest='cite_format',
+                        const='chicago',
+                        help='Print Chicago-style citation (default)')
+
     training_mandatory = train_semi.add_argument_group('Mandatory arguments')
     training_mandatory.add_argument('-i', '--input-fasta',
                    required=True,
@@ -299,7 +318,7 @@ def parse_args(args, is_semibin2):
         p.add_argument('--orf-finder',
                        required=False,
                        type=str,
-                       help='ORF finder used to estimate the number of bins (prodigal/fraggenescan)',
+                       help='ORF finder used to estimate the number of bins (fast-naive/prodigal/fraggenescan)',
                        dest='orf_finder',
                        default=('fast-naive' if is_semibin2 else 'prodigal'))
         p.add_argument('--prodigal-output-faa',
@@ -1308,7 +1327,7 @@ def main2(args=None, is_semibin2=True):
         sh.setFormatter(logging.Formatter('[%(asctime)s] %(levelname)s: %(message)s'))
         logger.addHandler(sh)
 
-    if args.cmd not in ['download_GTDB', 'check_install']:
+    if args.cmd not in ['citation', 'download_GTDB', 'check_install']:
         os.makedirs(args.output, exist_ok=True)
         fh = logging.FileHandler(os.path.join(args.output, "SemiBinRun.log"))
         fh.setFormatter(logging.Formatter('[%(asctime)s] %(levelname)s: %(message)s'))
@@ -1329,6 +1348,18 @@ def main2(args=None, is_semibin2=True):
     if is_semibin2 and getattr(args, 'training_type', None) == 'semi':
         logger.info('Currently using semi-supervised mode. This is generally only useful for backwards compability.')
 
+    if args.cmd == 'citation':
+        from . import citation
+        if args.cite_format == 'bibtex':
+            print(citation.BIBTEX)
+        elif args.cite_format == 'ris':
+            print(citation.RIS)
+        elif args.cite_format == 'chicago':
+            print(citation.CHICAGO)
+        else:
+            print(citation.CHICAGO)
+            print(f'\nUse `SemiBin2 citation --help` to see all available citation formats')
+        sys.exit(0)
     if args.cmd in ['single_easy_bin', 'multi_easy_bin', 'train', 'train_semi', 'bin', 'train_self', 'bin_long']:
         import torch
         if args.engine == 'cpu':

diff --git a/docs/subcommands.md b/docs/subcommands.md
@@ -298,3 +298,14 @@ The separator character cannot occur in any of your samples, so if any sample co
 * `--compression` (since version `1.6`): whether to compress the output (defaults to `gz` if using `SemiBin2`)
 
 
+### citation
+
+_Available since version 2.1_
+
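+Prints citation information for the SemiBin papers and exits. Output is Chicago style unless one of the (mutually exclusive) format flags below is given.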
+
+#### Optional arguments
+
+* `--bibtex`: Use BibTeX format
+* `--ris`: Use RIS format (for Endnote and other tools)
+* `--chicago`: Use Chicago format (default)
diff --git a/test/test_args.py b/test/test_args.py
@@ -220,3 +220,9 @@ def test_write_prerecluster():
                 is_semibin2=is_semibin2)
         validate_normalize_args(logging, args)
         assert not args.write_pre_reclustering_bins
+
+def test_bibtex():
+    for is_semibin2 in [False, True]:
+        args = parse_args(['citation', '--bibtex'], is_semibin2=is_semibin2)
+        assert args.cmd == 'citation'
+        assert args.cite_format == 'bibtex'